In [68]:
import pandas as pd
import numpy as np
import re

from keras.layers import Embedding, Dense, LSTM, Dense, Input, concatenate
from keras.models import Model
from keras.utils import plot_model

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

In [80]:
df = pd.read_csv('data/new_labeled_tweets.csv')
splits = df['tweettext'].str.split(' ')
to_remove = splits.apply(lambda x: len(x)).sort_values(ascending=False)[:10].index
df = df.drop(to_remove).reset_index(drop=True)

In [81]:
df['tweettext'] = df['tweettext'].str.replace('<quoted_status>', '<quoted_status> ')
df['tweettext'] = df['tweettext'].str.replace('<hashtag>', '<hashtag> ')
df['tweettext'] = df['tweettext'].apply(lambda x: re.sub( '\s+', ' ', x ).strip())

In [82]:
df_clean_split = df['tweettext'].str.split(' ', expand=True)
words = df_clean_split.stack().unique()
max_sequence = df_clean_split.shape[1]

In [84]:
tockenizer = Tokenizer(words.shape[0]) 
tockenizer.fit_on_texts(df['tweettext'])
sequences = tockenizer.texts_to_sequences(df['tweettext'])

In [85]:
word_index = tockenizer.word_index
data = pad_sequences(sequences, maxlen=max_sequence)

In [86]:
embeddings_index = {}
f = open('data/glove.twitter.27B.50d.txt', encoding="utf8")

for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
    
f.close()

print('Found %s word vectors.' % len(embeddings_index))

Found 1193514 word vectors.


In [87]:
embedding_matrix = np.zeros((len(word_index) + 1, 50))

for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

In [88]:
X_text_train = data[:20000,:]
X_text_test = data[20000:,:]

In [89]:
number_data = df[['retweets', 'likes', 'replies', 'hashtag_count', 'mention_count', 'url_count']].values

X_number_train = number_data[:20000,:]
X_number_test = number_data[20000:,:]

In [90]:
y_vals = df['BotOrNot'].values

y_train = y_vals[:20000].reshape(-1,1)
y_test = y_vals[20000:].reshape(-1,1)

In [91]:
embedding_layer = Embedding(len(word_index) + 1,
                            50,
                            weights=[embedding_matrix],
                            input_length=max_sequence,
                            trainable=False)

In [92]:
main_input = Input(shape=(max_sequence,), dtype='int32', name='main_input')
embedded_sequences = embedding_layer(main_input)

lstm_out = LSTM(32)(embedded_sequences)

auxiliary_output = Dense(1, activation='sigmoid', name='aux_output')(lstm_out)

auxiliary_input = Input(shape=(6,), name='aux_input')

x = concatenate([lstm_out, auxiliary_input])

x = Dense(128, activation='relu')(x)
x = Dense(64, activation='relu')(x)

main_output = Dense(1, activation='sigmoid', name='main_output')(x)

In [93]:
model = Model(inputs=[main_input, auxiliary_input], outputs=[main_output, auxiliary_output])

In [94]:
model.compile(optimizer='adam',
              loss={'main_output': 'binary_crossentropy', 'aux_output': 'binary_crossentropy'},
              loss_weights={'main_output': 1., 'aux_output': 0.2},
              metrics=['accuracy'])

In [95]:
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
main_input (InputLayer)         (None, 291)          0                                            
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, 291, 50)      1324350     main_input[0][0]                 
__________________________________________________________________________________________________
lstm_2 (LSTM)                   (None, 32)           10624       embedding_2[0][0]                
__________________________________________________________________________________________________
aux_input (InputLayer)          (None, 6)            0                                            
__________________________________________________________________________________________________
concatenat

In [97]:
model.fit({'main_input': X_text_train, 'aux_input': X_number_train},
          {'main_output': y_train, 'aux_output': y_train},
          validation_data=[{'main_input': X_text_test, 'aux_input': X_number_test}, {'main_output': y_test, 'aux_output': y_test}],
          epochs=50, 
          batch_size=512)

Train on 20000 samples, validate on 7609 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50


Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50

KeyboardInterrupt: 