In [45]:
# Standard library
import pickle

# Numerical / tabular stack
import numpy as np
import pandas as pd

# Class re-balancing (imbalanced-learn)
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import EditedNearestNeighbours

# Models and metrics (scikit-learn)
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.metrics import roc_auc_score

In [73]:
# Profile columns that are z-scored with the training-set mean and std
# before being fed to the classifier.
columns_to_standardize = [
    'friends_count',
    'statuses_count',
    'listed_count',
    'followers_count',
    'favourites_count',
]

In [89]:
def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, or NaN when the denominator is zero."""
    return numerator / denominator if denominator else float("nan")


def get_accuracy(AL, y, verbose=1):
    """Score binary predictions against binary labels and return the accuracy.

    Both inputs are flattened and thresholded at 0.5, so either hard labels
    or scores may be passed.

    Parameters
    ----------
    AL : array-like
        Predicted labels or scores; values > 0.5 count as positive.
    y : array-like
        Reference labels or scores; values > 0.5 count as positive.
    verbose : int, default 1
        When 1, print accuracy, AUC, the confusion-matrix counts, precision,
        recall and F1.

    Returns
    -------
    float or int
        Fraction of positions where the thresholded prediction equals the
        thresholded label. Returns 0 (with a warning) when the inputs cannot
        be scored at all: non-numeric data, empty input, or differing lengths.
    """
    import warnings

    try:
        pred = (np.asarray(AL).reshape(-1) > 0.5).astype(int)
        true = (np.asarray(y).reshape(-1) > 0.5).astype(int)
    except (TypeError, ValueError) as exc:
        warnings.warn("get_accuracy: inputs could not be thresholded ({}); returning 0".format(exc))
        return 0

    total = pred.shape[0]
    if total == 0 or true.shape[0] != total:
        warnings.warn(
            "get_accuracy: need non-empty inputs of equal length, got {} predictions "
            "and {} labels; returning 0".format(total, true.shape[0])
        )
        return 0

    # AUC is undefined when the labels contain a single class. That must not
    # discard the accuracy, which is still well defined.
    try:
        auc = roc_auc_score(true, pred)
    except ValueError:
        auc = float("nan")

    TP = int(np.sum((pred == 1) & (true == 1)))
    TN = int(np.sum((pred == 0) & (true == 0)))

    FP = int(np.sum((pred == 1) & (true == 0)))
    FN = int(np.sum((pred == 0) & (true == 1)))

    # Ratios with an empty denominator (e.g. no positive predictions) are
    # reported as NaN instead of triggering a divide-by-zero warning.
    P = _safe_ratio(TP, TP + FP)
    R = _safe_ratio(TP, TP + FN)
    F1 = _safe_ratio(2 * P * R, P + R)

    acc = np.sum(pred == true) / total

    if verbose == 1:
        print("\nAccuracy: {} \n".format(acc))
        print("AUC: {}".format(auc))
        print("True Positive: {} \nTrue Negative: {}\nFalse Positive: {} \nFalse Negative: {}\n".format(TP, TN, FP, FN))
        print("Precision: {} \nRecall: {} \nF1 Score: {}\n".format(P, R, F1))

    return acc
        
    

In [90]:
def load_data(users_bots_path='username_botometer.csv',
              profiles_path='../../data/new_profiles_cleaned.csv',
              election_accounts_path='../../data/collection_data/profile/extractedUsers.csv'):
    '''
    Load the user profiles and the botometer scores, keeping only the rows
    whose (lower-cased) username appears in the election-accounts file.

    Parameters:
    ___________

    users_bots_path (str):
    CSV with a 'username' column and the botometer score

    profiles_path (str):
    CSV with a 'username' column and the user info

    election_accounts_path (str):
    CSV whose single column lists the election account usernames

    Returns:
    ________

    df (pandas dataframe):
    Dataframe containing userinfo

    users_bots (pandas dataframe):
    dataframe containing score from botometer
    '''

    users_bots = pd.read_csv(users_bots_path)
    df = pd.read_csv(profiles_path)
    election_accounts = pd.read_csv(election_accounts_path)
    # The accounts file is expected to hold exactly one column; renaming it
    # fails if the file has more.
    election_accounts.columns = ['username']

    # Compare usernames case-insensitively across all three sources.
    election_accounts['username'] = election_accounts['username'].str.lower()
    df['username'] = df['username'].str.lower()
    users_bots['username'] = users_bots['username'].str.lower()

    users_bots = users_bots[users_bots['username'].isin(election_accounts['username'])].reset_index(drop=True)
    df = df[df['username'].isin(election_accounts['username'])].reset_index(drop=True)

    return df, users_bots

In [201]:
# Load the profile table and the botometer scores, both restricted to election accounts.
df, users_bots = load_data()

In [202]:
# Botometer scores for every scored account, with usernames lower-cased.
# NOTE: this re-read replaces the election-filtered `users_bots` from load_data().
users_bots = pd.read_csv('username_botometer.csv')
users_bots['username'] = users_bots['username'].str.lower()
training_df = pd.read_csv('training_set.csv')

# Scored accounts that are not in the training set, joined to their profile
# features in `df` and reduced to one row per username.
# NOTE(review): training usernames are not lower-cased before the comparison;
# confirm training_set.csv already stores them in lower case.
non_training_df = (
    users_bots
    .loc[~users_bots['username'].isin(training_df['username'])]
    .reset_index(drop=True)
    .merge(df)
    .drop_duplicates('username')
    .reset_index(drop=True)
)
print(non_training_df.shape)

# The username is an identifier, not a feature, so it is dropped from the model inputs.
training_df = training_df.drop(columns='username')
print(training_df.shape)

test_df = pd.read_csv('test_set.csv').drop(columns='username')
print(test_df.shape)

(44580, 12)
(26475, 11)
(1394, 11)


In [203]:
# The scaling statistics come from the training set only and are reused for
# every other frame, so all inputs share the scale the model is trained on.
training_df_mean = training_df[columns_to_standardize].mean()
training_df_std = training_df[columns_to_standardize].std()

# z-score the count columns in place: (value - training mean) / training std
training_df[columns_to_standardize] = training_df[columns_to_standardize].sub(training_df_mean).div(training_df_std)
test_df[columns_to_standardize] = test_df[columns_to_standardize].sub(training_df_mean).div(training_df_std)
non_training_df[columns_to_standardize] = non_training_df[columns_to_standardize].sub(training_df_mean).div(training_df_std)


# Feature matrices and label vectors; the labels are kept as (n, 1) columns.
X_train = training_df.drop(columns=['BotOrNot']).values
y_train = training_df['BotOrNot'].values.reshape(-1, 1)

X_test = test_df.drop(columns=['BotOrNot']).values
y_test = test_df['BotOrNot'].values.reshape(-1, 1)

# For the non-training accounts the botometer score is the reference target.
X_non = non_training_df.drop(columns=['username', 'botometer']).values
y_non = non_training_df['botometer'].values

In [204]:
# Write the training-set mean and std of the standardized columns to CSV
# (presumably so the same scaling can be reapplied outside this notebook).
training_df_mean.to_csv('mean.csv')
training_df_std.to_csv('std.csv')

In [205]:
# Rebalance the training set before fitting: SMOTE over-samples first, then
# Edited Nearest Neighbours under-samples the result.
# SMOTE is stochastic, so it is seeded (like the classifier below) to make the
# resampled set, and therefore the fitted model, reproducible across runs.
s = SMOTE(random_state=0)
smote_X, smote_y = s.fit_resample(X_train, y_train.reshape(-1))

e = EditedNearestNeighbours()
r_X, r_y = e.fit_resample(smote_X, smote_y)

a = AdaBoostClassifier(n_estimators=150, random_state=0)

a.fit(r_X, r_y)

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=150, random_state=0)

In [206]:
# r_X, r_y = X_train, y_train.reshape(-1)
# a = RandomForestClassifier(n_estimators=100, max_depth=2)
# a.fit(r_X, r_y)

In [207]:
# Predict on the resampled training data the model was fitted on.
y_predict = a.predict(r_X)

In [222]:
# Metrics on the resampled training set (r_X, r_y): an in-sample fit check, not a held-out estimate.
get_accuracy(y_predict, r_y)


Accuracy: 0.855826678199996 

AUC: 0.8545674252215391
True Positive: 19618 
True Negative: 22926
False Positive: 3059 
False Negative: 4108

Precision: 0.8651056136173215 
Recall: 0.8268566129983984 
F1 Score: 0.8455487791737604



0.855826678199996

In [209]:
# Predict labels for the test set loaded from test_set.csv.
y_test_predict = a.predict(X_test)

In [210]:
# Metrics on the test set; y_test is an (n, 1) column and is flattened inside get_accuracy.
get_accuracy(y_test_predict, y_test)


Accuracy: 0.8773314203730272 

AUC: 0.7452668632860813
True Positive: 14 
True Negative: 1209
False Positive: 162 
False Negative: 9

Precision: 0.07954545454545454 
Recall: 0.6086956521739131 
F1 Score: 0.1407035175879397



0.8773314203730272

In [211]:
# Predict labels for the scored accounts that were not part of the training set.
non_training_predict = a.predict(X_non)

In [212]:
# Compare predictions with botometer scores binarised at 0.5.
# NOTE(review): assumes botometer scores lie on a 0-1 scale — confirm.
get_accuracy(non_training_predict, (y_non > 0.5).astype(int))


Accuracy: 0.8544638851502916 

AUC: 0.788356421032658
True Positive: 664 
True Negative: 37428
False Positive: 6229 
False Negative: 259

Precision: 0.09632960974902074 
Recall: 0.7193932827735645 
F1 Score: 0.1699078812691914



0.8544638851502916

In [213]:
# def predict_all(df, best_model, training_df_mean, training_df_std):
#     '''
#     Parameters:
#     ___________
#     df (pandas dataframe):
#     Dataframe containing userinfo 
    
#     best_model (model):
#    a Keras model that turned out best
    
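#     training_df_mean (pandas Series): per-column means of the training set
#     training_df_std (pandas Series): per-column standard deviations of the training set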
    
    
#     Returns:
#     ________
#     df_with_predictions (Dataframe with predictions from the best model)
#     '''
#     global columns_to_standardize
    
#     df[columns_to_standardize] = (df[columns_to_standardize] - training_df_mean)/training_df_std
#     df_as_X = df.drop('username', axis=1).values
#     predicted_df = best_model.predict(df_as_X)
    
#     df['predicted'] = predicted_df
#     print("The percentage of bots is: {}".format((sum(df['predicted'] > 0.5)/len(df)) * 100))
#     return df

In [214]:
# Standardize the full profile table with the training-set statistics.
# NOTE: this overwrites the columns of `df` in place, so re-running this cell
# on its own standardizes the data a second time.
df[columns_to_standardize] = (df[columns_to_standardize] - training_df_mean)/training_df_std

In [215]:
# Predict a label for every profile. 'predicted' is dropped as well (when it
# already exists) so that re-running this cell does not feed the previous
# predictions back into the model as an extra feature column.
# NOTE(review): assumes the remaining columns match the training feature order — confirm.
df['predicted'] = a.predict(df.drop(['username', 'predicted'], axis=1, errors='ignore').values)

In [216]:
# Number of profiles predicted positive (vectorised sum rather than a
# Python-level loop over every row).
df['predicted'].sum()

274875

In [217]:
# Percentage of profiles predicted positive (vectorised sum rather than a
# Python-level loop over every row).
(df['predicted'].sum()/len(df))*100

10.408225858974456

In [219]:
# Undo the standardization so the exported columns are back on their original scale.
# The float round trip can land a hair below the original value (e.g. 4.999...),
# and astype(int) truncates toward zero, so round to the nearest integer first.
# Assumes the raw columns were integer counts, as the integer cast implies.
df[columns_to_standardize] = ((df[columns_to_standardize] * training_df_std) + training_df_mean).round().astype(int)

In [220]:
# Export the profiles with their predicted label; the row index is not written.
df.to_csv('profiles_with_bot_or_not.csv', index=False)

In [221]:
# Persist the fitted classifier. The context manager guarantees the file is
# flushed and closed even if pickling fails part-way.
filename = 'bot_detection.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(a, model_file)