In [32]:
import pandas as pd
import numpy as np
import re

from keras.layers import Embedding, Dense, LSTM, Dense, Input, concatenate
from keras.models import Model
from keras.utils import plot_model

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

In [33]:
df = pd.read_csv('new_labeled_tweets1.csv', nrows=500000)

In [3]:
df = df.dropna(subset=['text'])
splits = df['text'].str.split(' ')
to_remove = splits.apply(lambda x: len(x)).sort_values(ascending=False)[:10].index
df = df.drop(to_remove).reset_index(drop=True)

In [4]:
df['text'] = df['text'].str.replace('<quoted_status>', '<quoted_status> ')
df['text'] = df['text'].str.replace('<hashtag>', '<hashtag> ')
df['text'] = df['text'].apply(lambda x: re.sub( '\s+', ' ', x ).strip())

In [5]:
df_clean_split = df['text'].str.split(' ', expand=True)
words = df_clean_split.stack().unique()
max_sequence = df_clean_split.shape[1]

In [6]:
tockenizer = Tokenizer(words.shape[0]) 
tockenizer.fit_on_texts(df['text'])
sequences = tockenizer.texts_to_sequences(df['text'])

In [7]:
word_index = tockenizer.word_index
data = pad_sequences(sequences, maxlen=max_sequence)

In [8]:
word_index = tockenizer.word_index
data = pad_sequences(sequences, maxlen=max_sequence)

In [9]:
embeddings_index = {}
f = open('glove.twitter.27B.50d.txt', encoding="utf8")

for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
    
f.close()

print('Found %s word vectors.' % len(embeddings_index))

Found 1193514 word vectors.


In [10]:
embedding_matrix = np.zeros((len(word_index) + 1, 50))

for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

In [11]:
X_text_train = data[:300000,:]
X_text_test = data[300000:,:]

In [12]:
number_data = df[['retweet_count', 'favorite_count', 'reply_count', 'hashtag_count', 'mention_count', 'url_count']].values

X_number_train = number_data[:300000,:]
X_number_test = number_data[300000:,:]

In [13]:
y_vals = df['BotOrNot'].values

y_train = y_vals[:300000].reshape(-1,1)
y_test = y_vals[300000:].reshape(-1,1)

In [14]:
embedding_layer = Embedding(len(word_index) + 1,
                            50,
                            weights=[embedding_matrix],
                            input_length=max_sequence,
                            trainable=False)

In [15]:
main_input = Input(shape=(max_sequence,), dtype='int32', name='main_input')
embedded_sequences = embedding_layer(main_input)

lstm_out = LSTM(32)(embedded_sequences)

#auxiliary_output = Dense(1, activation='sigmoid', name='aux_output')(lstm_out)

#auxiliary_input = Input(shape=(6,), name='aux_input')

#x = concatenate([lstm_out, auxiliary_input])
x =lstm_out

x = Dense(128, activation='relu')(x)
x = Dense(64, activation='relu')(x)

main_output = Dense(1, activation='sigmoid', name='main_output')(x)

Instructions for updating:
Colocations handled automatically by placer.


In [16]:
model = Model(inputs=main_input, outputs=main_output)

In [17]:
model.compile(optimizer='adam',
              loss={'main_output': 'binary_crossentropy'},
              loss_weights={'main_output': 1.},
              metrics=['accuracy'])

In [18]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
main_input (InputLayer)      (None, 152)               0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 152, 50)           9512500   
_________________________________________________________________
lstm_1 (LSTM)                (None, 32)                10624     
_________________________________________________________________
dense_1 (Dense)              (None, 128)               4224      
_________________________________________________________________
dense_2 (Dense)              (None, 64)                8256      
_________________________________________________________________
main_output (Dense)          (None, 1)                 65        
Total params: 9,535,669
Trainable params: 23,169
Non-trainable params: 9,512,500
_____________________________________________________________

In [19]:
model.fit({'main_input': X_text_train},
          {'main_output': y_train},
          validation_data=[{'main_input': X_text_test}, {'main_output': y_test}],
          epochs=20, 
          batch_size=1024)

Instructions for updating:
Use tf.cast instead.
Train on 300000 samples, validate on 199833 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x2cefae58128>

In [26]:
#train_X.reshape(train_X.shape[1:])
from sklearn.metrics import classification_report
Y_pred_train = model.predict(np.array(X_text_train))
#predicted_classes.shape , y_test.shape, y_train.shape
#from sklearn.preprocessing import MinMaxScaler
target_names = ["Class {}".format(i) for i in range(2)]
Y_pred_train = np.argmax(np.round(Y_pred_train),axis=1)
print(classification_report(np.array(y_train), Y_pred_train, target_names=target_names))

              precision    recall  f1-score   support

     Class 0       0.45      1.00      0.62    134672
     Class 1       0.00      0.00      0.00    165328

   micro avg       0.45      0.45      0.45    300000
   macro avg       0.22      0.50      0.31    300000
weighted avg       0.20      0.45      0.28    300000



In [23]:
#train_X.reshape(train_X.shape[1:])
from sklearn.metrics import classification_report
Y_pred_test = model.predict(np.array(X_text_test))
#predicted_classes.shape , y_test.shape, y_train.shape
#from sklearn.preprocessing import MinMaxScaler
target_names = ["Class {}".format(i) for i in range(2)]
Y_pred_test = np.argmax(np.round(Y_pred_test),axis=1)
print(classification_report(np.array(y_test), Y_pred_test, target_names=target_names))

              precision    recall  f1-score   support

     Class 0       0.45      1.00      0.62     89870
     Class 1       0.00      0.00      0.00    109963

   micro avg       0.45      0.45      0.45    199833
   macro avg       0.22      0.50      0.31    199833
weighted avg       0.20      0.45      0.28    199833



In [22]:
def get_accuracy(AL, y, verbose=1):
    
    try:
        AL = np.array(AL)
        y = np.array(y)

        AL = AL.reshape(-1)
        y = y.reshape(-1)

        AL = AL > 0.5
        AL = AL.astype(int)

        y = y > 0.5
        y = y.astype(int)

        total = AL.shape[0]

        TP = np.sum(np.logical_and(AL==1, y==1))
        TN = np.sum(np.logical_and(AL==0, y==0))

        FP = np.sum(np.logical_and(AL==1, y==0))
        FN = np.sum(np.logical_and(AL==0, y==1))

        P = TP / (TP + FP)
        R = TP / (TP + FN)
        F1 = (2 * P * R) / (P + R)


        acc = np.sum(AL == y)/total


        if verbose == 1:
            print("\nAccuracy: {} \n".format(acc))
            print("True Positive: {} \nTrue Negative: {}\nFalse Positive: {} \nFalse Negative: {}\n".format(TP, TN, FP, FN))
            print("Precision: {} \nRecall: {} \nF1 Score: {}\n".format(P, R, F1))
        
        return acc
    except:
        return 0

In [27]:
get_accuracy(Y_pred_train, y_train)


Accuracy: 0.4489066666666667 

True Positive: 0 
True Negative: 134672
False Positive: 0 
False Negative: 165328

Precision: nan 
Recall: 0.0 
F1 Score: nan





0.4489066666666667

In [43]:
get_accuracy( Y_pred_test, y_test)


Accuracy: 0.44972552080987627 

True Positive: 0 
True Negative: 89870
False Positive: 0 
False Negative: 109963

Precision: nan 
Recall: 0.0 
F1 Score: nan





0.44972552080987627

In [41]:
from sklearn import metrics
print("Accuracy:",metrics.accuracy_score(Y_pred_test, y_test))

Accuracy: 0.44972552080987627


In [46]:
number_data[0]

array([0., 0., 0., 0., 0., 0.])