In [1]:
import pandas as pd
import numpy as np
import re

from keras.layers import Embedding, Dense, LSTM, Dense, Input, concatenate
from keras.models import Model
from keras.utils import plot_model

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

Using TensorFlow backend.


In [2]:
# Load the labelled tweets; the path is relative to the notebook's working directory.
df = pd.read_csv(filepath_or_buffer='new_labeled_tweets1.csv')

In [3]:
# Discard rows without text, then remove the 10 tweets with the most
# space-separated tokens so they do not dictate the padded sequence length.
# NOTE: re-running this cell removes a further 10 rows each time.
df = df.dropna(subset=['text'])
splits = df['text'].str.split(' ')
token_counts = splits.map(len)
to_remove = token_counts.sort_values(ascending=False).head(10).index
df = df.drop(to_remove).reset_index(drop=True)

In [4]:
# Make sure the special markers are followed by a space so they split into
# their own tokens. `regex=False` pins these to literal replacements, so the
# result does not depend on the pandas version's default for `regex`.
df['text'] = df['text'].str.replace('<quoted_status>', '<quoted_status> ', regex=False)
df['text'] = df['text'].str.replace('<hashtag>', '<hashtag> ', regex=False)

# Collapse every run of whitespace into a single space and trim the ends.
# The pattern is a raw string: '\s' in a normal string literal is an invalid
# escape sequence that newer Python versions warn about.
df['text'] = df['text'].str.replace(r'\s+', ' ', regex=True).str.strip()

In [5]:
# One column per token position; shorter tweets are padded with missing values.
df_clean_split = df['text'].str.split(' ', expand=True)

# Distinct whitespace-separated tokens (stack() skips the missing padding cells).
all_tokens = df_clean_split.stack()
words = all_tokens.unique()

# Width of the split frame == token count of the longest remaining tweet.
_, max_sequence = df_clean_split.shape

In [6]:
# Fit a word-level tokenizer on the cleaned tweets and encode every tweet as a
# list of integer word ids. The vocabulary cap is the number of distinct
# whitespace-separated tokens found above.
# NOTE(review): Keras' Tokenizer applies its own default `filters` and
# lower-casing, so its tokens may not match the plain whitespace split used to
# compute `words` / `max_sequence` -- confirm this is intended.
tockenizer = Tokenizer(num_words=len(words))
tockenizer.fit_on_texts(df['text'])
sequences = tockenizer.texts_to_sequences(df['text'])

In [7]:
# word -> integer id mapping learned by the tokenizer.
word_index = tockenizer.word_index

# Bring every encoded tweet to exactly `max_sequence` ids.
data = pad_sequences(sequences=sequences, maxlen=max_sequence)

In [8]:
# Build a word -> vector lookup from the pre-trained GloVe Twitter embeddings.
# Each line of the file is "<word> <v1> <v2> ... <vN>".
embeddings_index = {}

# The context manager closes the file even if a malformed line makes the
# float conversion below raise.
with open('glove.twitter.27B.200d.txt', encoding="utf8") as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))

Found 1193514 word vectors.


In [10]:
# Row i holds the 200-d GloVe vector of the word whose tokenizer id is i.
# One extra row is allocated because the ids in `word_index` are used directly
# as row indices. Rows start out as zeros, so words that are missing from the
# embedding index keep an all-zero vector.
embedding_matrix = np.zeros((len(word_index) + 1, 200))

for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is None:
        continue
    embedding_matrix[i] = embedding_vector

In [23]:
# Train/test split shared by the text input, the numeric input and the labels.
# The first TRAIN_SIZE rows are used for training, the remaining rows for
# validation.
#
# The cut-off used to be 5000000, but the training log further down reports
# "Train on 1100000 samples, validate on 399508 samples", i.e. fewer rows in
# total than that cut-off. With such a dataset every *_test array would be
# empty, so the value is restored to the one the logged run was produced with
# and is defined once so the three slices cannot drift apart.
# NOTE(review): the rows are split in file order without shuffling -- confirm
# that the CSV is not sorted by label or time.
TRAIN_SIZE = 1100000
assert 0 < TRAIN_SIZE < len(data), (
    'TRAIN_SIZE=%d must leave a non-empty train and test split (%d rows available)'
    % (TRAIN_SIZE, len(data)))

# Padded word-id sequences (model input 'main_input').
X_text_train = data[:TRAIN_SIZE, :]
X_text_test = data[TRAIN_SIZE:, :]
X_train = np.array(X_text_train)
X_test = np.array(X_text_test)

# Six per-tweet count features (model input 'aux_input').
number_data = df[['retweet_count', 'favorite_count', 'reply_count', 'hashtag_count', 'mention_count', 'url_count']].values

X_number_train = number_data[:TRAIN_SIZE, :]
X_number_test = number_data[TRAIN_SIZE:, :]

# Labels, reshaped to column vectors to match the (batch, 1) model outputs.
y_vals = df['BotOrNot'].values

y_train = y_vals[:TRAIN_SIZE].reshape(-1, 1)
y_test = y_vals[TRAIN_SIZE:].reshape(-1, 1)

y_train1 = np.array(y_train)
y_test1 = np.array(y_test)

In [26]:
# Embedding layer initialised from the GloVe matrix; the vectors stay
# trainable, so they are fine-tuned together with the rest of the network.
embedding_layer = Embedding(
    input_dim=len(word_index) + 1,
    output_dim=200,
    weights=[embedding_matrix],
    input_length=max_sequence,
    trainable=True,
)

In [27]:
# Text branch: padded word ids -> embeddings -> LSTM summary vector.
main_input = Input(name='main_input', shape=(max_sequence,), dtype='int32')
embedded_sequences = embedding_layer(main_input)
lstm_out = LSTM(units=32)(embedded_sequences)

# Auxiliary head that predicts the label from the text branch alone.
auxiliary_output = Dense(units=1, activation='sigmoid', name='aux_output')(lstm_out)

# Numeric branch: the six per-tweet count features.
auxiliary_input = Input(name='aux_input', shape=(6,))

# Join both branches and pass them through two fully connected ReLU layers.
x = concatenate([lstm_out, auxiliary_input])
for hidden_units in (128, 64):
    x = Dense(units=hidden_units, activation='relu')(x)

# Main head: prediction from text and numeric features combined.
main_output = Dense(units=1, activation='sigmoid', name='main_output')(x)

In [28]:
# Two-input / two-output model; the order of the lists fixes the order in
# which Keras expects inputs and returns predictions.
model = Model(
    inputs=[main_input, auxiliary_input],
    outputs=[main_output, auxiliary_output],
)

In [29]:
# Both heads are trained with binary cross-entropy; the auxiliary head
# contributes to the total loss with a weight of 0.2 against 1.0 for the
# main head.
model.compile(
    optimizer='adam',
    loss={head: 'binary_crossentropy' for head in ('main_output', 'aux_output')},
    loss_weights={'main_output': 1., 'aux_output': 0.2},
    metrics=['accuracy'],
)

In [None]:
# Train both heads against the same labels and validate on the held-out rows.
# `validation_data` is passed as an (inputs, targets) tuple, which is the
# documented form; a list is not unpacked reliably by every Keras version.
model.fit({'main_input': X_text_train, 'aux_input': X_number_train},
          {'main_output': y_train, 'aux_output': y_train},
          validation_data=({'main_input': X_text_test, 'aux_input': X_number_test},
                           {'main_output': y_test, 'aux_output': y_test}),
          epochs=20,
          batch_size=16384)

Instructions for updating:
Use tf.cast instead.
Train on 1100000 samples, validate on 399508 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20

In [None]:
# Classification report for the main head on the training split.
from sklearn.metrics import classification_report

# The model has two inputs and two outputs, so both inputs must be supplied
# and predict() returns one array per output, in the order the outputs were
# passed to Model(): main head first, auxiliary head second.
main_probs, aux_probs = model.predict({'main_input': X_train, 'aux_input': X_number_train})

# Each head emits a single sigmoid probability per row. Taking argmax over
# that one column would always yield class 0, so threshold at 0.5 instead and
# flatten to a 1-D vector of 0/1 labels.
predicted_classes = (main_probs > 0.5).astype('int32').ravel()

target_names = ["Class {}".format(i) for i in range(2)]
print(classification_report(y_train1.ravel(), predicted_classes, target_names=target_names))