In [1]:
import pandas as pd
import numpy as np

from keras.layers import Embedding, Dense, LSTM, Dense, Input, concatenate
from keras.models import Model, load_model
from keras.utils import plot_model

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

Using TensorFlow backend.


In [2]:
def get_accuracy(AL, y):
    AL = AL.reshape(-1)
    y = y.reshape(-1)
    
    AL = AL > 0.5
    AL = AL.astype(int)
    
    total = AL.shape[0]
    
    TP = np.sum(np.logical_and(AL==1, y==1))
    TN = np.sum(np.logical_and(AL==0, y==0))

    FP = np.sum(np.logical_and(AL==1, y==0))
    FN = np.sum(np.logical_and(AL==0, y==1))

    P = TP / (TP + FP)
    R = TP / (TP + TN)
    F1 = (2 * P * R) / (P + R)


    acc = np.sum(AL == y)/total


    print("\nAccuracy: {} \n".format(acc))
    print("True Positive: {} \nTrue Negative: {}\nFalse Positive: {} \nFalse Negative: {}\n".format(TP, TN, FP, FN))
    print("Precision: {} \nRecall: {} \nF1 Score: {}\n".format(P, R, F1))

In [19]:
df = pd.read_csv('../../../data/collection_data/profile/storage/raw/userData.csv')
bot_accounts = pd.read_csv('bots.csv')
clean_accounts = pd.read_csv('users.csv')

In [20]:
df['username'] = df['username'].str.lower()
bot_accounts['username'] = bot_accounts['username'].str.lower()

In [21]:
bot_details = df[df['username'].isin(bot_accounts['username'])]
bot_details = bot_details.reset_index(drop=True)

clean_details = df[df['username'].isin(clean_accounts['username'])]
clean_details = clean_details.reset_index(drop=True)
clean_details = clean_details[:bot_details.shape[0]]

bot_details['BotOrNot'] = 1
clean_details['BotOrNot'] = 0

combined_df = clean_details.append(bot_details, ignore_index=True)

new_df = combined_df.sample(frac=1).reset_index(drop=True)
new_df['created'] = pd.to_datetime(new_df['created'])
new_df['age'] = (pd.Timestamp.today() - new_df['created']).apply(lambda x: x.days)

In [22]:
combined_df = combined_df.drop_duplicates().reset_index(drop=True)

In [23]:
new_df = new_df[['username', 'age', 'has_location', 'is_verified', 'total_tweets', 'total_following', 'total_followers', 'total_likes', 'has_avatar', 'has_background', 'is_protected', 'profile_modified', 'BotOrNot']]
bot_df = new_df[new_df['BotOrNot'] == 1].reset_index(drop=True)
human_df = new_df[new_df['BotOrNot'] == 0][:bot_df.shape[0]]
new_df = pd.concat([bot_df, human_df]).reset_index(drop=True)
new_df = new_df.sample(frac=1).reset_index(drop=True)

In [38]:
cersi_df = pd.read_csv('Twitter-Bot-or-Not/cersi_processed.csv')[:2000]

In [39]:
training_df = new_df.drop('username', axis=1)[:int(new_df.shape[0] * 0.8)]
test_df = new_df.drop('username', axis=1)[int(new_df.shape[0] * 0.8):]

In [40]:
training_df = pd.concat([training_df, cersi_df.drop('username', axis=1)])

In [41]:
columns_to_standardize = ['age', 'total_tweets', 'total_following', 'total_followers', 'total_likes']

training_df_mean = training_df[columns_to_standardize].mean()
training_df_std = training_df[columns_to_standardize].std()

training_df[columns_to_standardize] = (training_df[columns_to_standardize] - training_df_mean)/training_df_std
test_df[columns_to_standardize] = (test_df[columns_to_standardize] - training_df_mean)/training_df_std

# training_df_mean = training_df.mean()
# training_df_std = training_df.std()

# training_df = (training_df - training_df_mean)/training_df_std
# test_df = (test_df - training_df_mean)/training_df_std

# max_vals = training_df.max()

# training_df = training_df/max_vals
# test_df = test_df/max_vals

In [42]:
X_train = training_df.drop(['BotOrNot', 'is_protected'], axis=1).values
y_train = training_df['BotOrNot'].values.reshape(-1,1)

X_test = test_df.drop(['BotOrNot', 'is_protected'], axis=1).values
y_test = test_df['BotOrNot'].values.reshape(-1,1)

In [43]:
cersi = load_model('cersi_2017.h5')
ytrain_cersi = cersi.predict(X_train)
get_accuracy(ytrain_cersi, y_train)


ytest_cersi = cersi.predict(X_test)
get_accuracy(ytest_cersi, y_test)

cersi.fit(x=X_train, y=y_train, batch_size=16, epochs=20, validation_data=(X_test, y_test))

ytrain_cersi = cersi.predict(X_train)
get_accuracy(ytrain_cersi, y_train)


ytest_cersi = cersi.predict(X_test)
get_accuracy(ytest_cersi, y_test)


Accuracy: 0.6460650128314799 

True Positive: 1764 
True Negative: 1257
False Positive: 916 
False Negative: 739

Precision: 0.6582089552238806 
Recall: 0.5839126117179742 
F1 Score: 0.6188388002104893


Accuracy: 0.6746268656716418 

True Positive: 171 
True Negative: 281
False Positive: 52 
False Negative: 166

Precision: 0.7668161434977578 
Recall: 0.37831858407079644 
F1 Score: 0.5066666666666666

Train on 4676 samples, validate on 670 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20

Accuracy: 0.886655260906758 

True Positive: 2187 
True Negative: 1959
False Positive: 214 
False Negative: 316

Precision: 0.9108704706372345 
Recall: 0.5274963820549927 
F1 Score: 0.6680922559951121


Accuracy: 0.8238805970149253 

True Positive: 257 
True Negative: 295
False Positive: 38 
False Negative: 80

Pr

In [44]:
inp = Input(shape=[10])

another = Dense(500, activation='relu')(inp)
another = Dense(200, activation='relu')(another)
another = Dense(1, activation='sigmoid')(another)

mod = Model(inp, another)
mod.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [45]:
mod.fit(x=X_train, y=y_train, batch_size=32, epochs=20, validation_data=(X_test, y_test))

Train on 4676 samples, validate on 670 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7f844a84a320>

In [46]:
ytrain_mod = mod.predict(X_train)
get_accuracy(ytrain_mod, y_train)


ytest_mod = mod.predict(X_test)
get_accuracy(ytest_mod, y_test)


Accuracy: 0.8714713430282293 

True Positive: 2270 
True Negative: 1805
False Positive: 368 
False Negative: 233

Precision: 0.8605003790750568 
Recall: 0.5570552147239264 
F1 Score: 0.6762997169670788


Accuracy: 0.8238805970149253 

True Positive: 278 
True Negative: 274
False Positive: 59 
False Negative: 59

Precision: 0.8249258160237388 
Recall: 0.5036231884057971 
F1 Score: 0.6254218222722159



In [None]:
#need to fix the cleaners