In [2]:
import pandas as pd
import numpy as np

from keras.layers import Dense, Dense, Input
from keras.models import Model, load_model
from keras.utils import plot_model
from keras.callbacks import EarlyStopping
from keras.callbacks import ModelCheckpoint

from multiprocessing.pool import Pool

Using TensorFlow backend.


In [3]:
# Numeric profile columns that are z-scored (using the training-set mean and std)
# before being handed to the model, both for training and for prediction.
columns_to_standardize = ['friends_count', 'statuses_count', 'listed_count', 'followers_count', 'favourites_count']

In [4]:
def get_accuracy(AL, y, verbose=1):
    '''
    Compute the binary classification accuracy of predictions against labels.
    Both inputs are flattened and binarised at 0.5 before being compared.

    Parameters:
    ___________
    AL (array-like):
    Predicted scores

    y (array-like):
    Ground-truth labels (or scores)

    verbose (int):
    Prints accuracy, confusion-matrix counts, precision, recall and F1 if set to 1

    Returns:
    ________
    acc (float):
    Fraction of predictions that match the labels. 0 is returned when the
    inputs cannot be compared (non-numeric or ragged data, different lengths,
    or no samples at all)
    '''
    try:
        predicted = (np.array(AL).reshape(-1) > 0.5).astype(int)
        actual = (np.array(y).reshape(-1) > 0.5).astype(int)
    except (TypeError, ValueError):
        # Ragged / non-numeric input: keep the historical "0 on failure" contract,
        # but no longer hide unrelated errors behind a bare except.
        return 0

    total = predicted.shape[0]
    if total == 0 or actual.shape[0] != total:
        return 0

    TP = int(np.sum(np.logical_and(predicted == 1, actual == 1)))
    TN = int(np.sum(np.logical_and(predicted == 0, actual == 0)))

    FP = int(np.sum(np.logical_and(predicted == 1, actual == 0)))
    FN = int(np.sum(np.logical_and(predicted == 0, actual == 1)))

    # Guard every ratio: with no predicted (or no actual) positives the
    # denominators are zero, which used to produce NaN and RuntimeWarnings.
    P = TP / (TP + FP) if (TP + FP) > 0 else 0.0
    R = TP / (TP + FN) if (TP + FN) > 0 else 0.0
    F1 = (2 * P * R) / (P + R) if (P + R) > 0 else 0.0

    acc = np.sum(predicted == actual) / total

    if verbose == 1:
        print("\nAccuracy: {} \n".format(acc))
        print("True Positive: {} \nTrue Negative: {}\nFalse Positive: {} \nFalse Negative: {}\n".format(TP, TN, FP, FN))
        print("Precision: {} \nRecall: {} \nF1 Score: {}\n".format(P, R, F1))

    return acc
        
    

In [7]:
def load_data(botometer_path='username_botometer.csv',
              profiles_path='../../data/new_profiles_cleaned.csv',
              election_accounts_path='../../data/collection_data/profile/extractedUsers.csv'):
    '''
    Load the profile data and the botometer scores, keeping only the accounts
    listed in the election accounts file. Usernames are lower-cased in all
    three sources before matching.

    Parameters:
    ___________
    botometer_path (str):
    CSV with a 'username' column and the botometer score

    profiles_path (str):
    CSV with a 'username' column and the userinfo

    election_accounts_path (str):
    Single-column CSV listing the usernames to keep

    Returns:
    ________

    df (pandas dataframe):
    Dataframe containing userinfo

    users_bots (pandas dataframe):
    dataframe containing score from botometer
    '''

    users_bots = pd.read_csv(botometer_path)
    df = pd.read_csv(profiles_path)
    election_accounts = pd.read_csv(election_accounts_path)
    # The file has a single column whose header is replaced by 'username'
    election_accounts.columns = ['username']

    election_accounts['username'] = election_accounts['username'].str.lower()
    df['username'] = df['username'].str.lower()
    users_bots['username'] = users_bots['username'].str.lower()

    users_bots = users_bots[users_bots['username'].isin(election_accounts['username'])].reset_index(drop=True)
    df = df[df['username'].isin(election_accounts['username'])].reset_index(drop=True)

    return df, users_bots

In [8]:
# Profile data and botometer scores, both restricted to the election accounts by load_data()
df, users_bots = load_data()

In [9]:
%matplotlib inline

In [14]:
import matplotlib.pyplot as plt

In [10]:
df.shape

(2640940, 11)

In [5]:
def apply_threshold(users_bots, bot_threshold, user_threshold):
    '''
    Split the scored accounts into confident bots and confident humans.
    Both comparisons are strict, so a score equal to a threshold is excluded.

    Parameters:
    ___________

    users_bots (pandas dataframe):
    Dataframe containing username and botometer score

    bot_threshold (float):
    Scores strictly above this value are treated as bots during training

    user_threshold (float):
    Scores strictly below this value are treated as humans during training

    Returns:
    ________
    bot_accounts, clean_accounts (pandas dataframes, each with a fresh index)
    '''
    scores = users_bots['botometer']
    is_bot = scores.gt(bot_threshold)
    is_human = scores.lt(user_threshold)
    return (
        users_bots.loc[is_bot].reset_index(drop=True),
        users_bots.loc[is_human].reset_index(drop=True),
    )

In [6]:
def get_training_test(df, bot_accounts, clean_accounts, ratio, verbose=1, users_bots=None):
    '''
    Build the standardized training set and the held-out test set.

    Parameters:
    ___________
    df (pandas dataframe):
    Dataframe containing user info

    bot_accounts (dataframe):
    Accounts labelled as bots (needs a 'username' column)

    clean_accounts (dataframe):
    Accounts labelled as humans (needs a 'username' column)

    ratio (int):
    How many clean accounts per bot accounts

    verbose (int):
    Prints the shapes of the intermediate dataframes if set to 1

    users_bots (pandas dataframe):
    Dataframe containing username and botometer score. Accounts in it that are
    not used for training form the test set. When omitted, the notebook-level
    `users_bots` variable is used (the historical behaviour)

    Returns:
    ________
    training_df_mean, training_df_std (pandas series):
    Mean and standard deviation of the standardized columns on the training set

    X_train, y_train, X_test, y_test (numpy arrays):
    Features and (n, 1) label columns for training and test
    '''
    if users_bots is None:
        # Backward compatibility: this function used to read the global directly.
        if 'users_bots' not in globals():
            raise NameError("name 'users_bots' is not defined; pass it via the users_bots argument")
        users_bots = globals()['users_bots']

    bot_details = df[df['username'].isin(bot_accounts['username'])].reset_index(drop=True)

    clean_details = df[df['username'].isin(clean_accounts['username'])].reset_index(drop=True)
    # Explicit copy so the label assignment below does not write into a slice
    clean_details = clean_details[:int(bot_details.shape[0] * ratio)].copy()

    bot_details['BotOrNot'] = 1
    clean_details['BotOrNot'] = 0

    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0
    combined_df = pd.concat([clean_details, bot_details], ignore_index=True)
    combined_df = combined_df.drop_duplicates()

    if verbose == 1:
        print("Combined Details: {}".format(combined_df.shape))
        print("Bot Details: {}".format(bot_details.shape))
        print("Clean Details: {}".format(clean_details.shape))

    # A single shuffle is enough; the former split/concat/reshuffle only
    # produced another random permutation of the same rows.
    new_df = combined_df.sample(frac=1).reset_index(drop=True)

    # Every scored account that is not part of the training set becomes test data
    to_see = users_bots[~users_bots['username'].isin(combined_df['username'])].reset_index(drop=True)
    comparision_df = df[df['username'].isin(to_see['username'])]
    test_df = pd.merge(comparision_df, to_see, on="username")
    test_df['botometer'] = (test_df['botometer'] > 0.5).astype(int)

    # Balance the test set: keep at most as many humans as there are bots
    total_bots = int(test_df['botometer'].sum())
    test_df = pd.concat([test_df[test_df['botometer'] == 0][:total_bots], test_df[test_df['botometer'] == 1]]).reset_index(drop=True)

    training_df = new_df.drop('username', axis=1)
    test_df = test_df.drop('username', axis=1)

    training_df_mean = training_df[columns_to_standardize].mean()
    training_df_std = training_df[columns_to_standardize].std()

    # The test set is scaled with the training statistics on purpose
    training_df[columns_to_standardize] = (training_df[columns_to_standardize] - training_df_mean)/training_df_std
    test_df[columns_to_standardize] = (test_df[columns_to_standardize] - training_df_mean)/training_df_std

    X_train = training_df.drop(['BotOrNot'], axis=1).values
    y_train = training_df['BotOrNot'].values.reshape(-1,1)

    X_test = test_df.drop(['botometer'], axis=1).values
    y_test = test_df['botometer'].values.reshape(-1,1)

    return training_df_mean, training_df_std, X_train, y_train, X_test, y_test

In [7]:
def perform_all(df, users_bots, bot_threshold, human_threshold, ratio, verbose=1, checkpoint_path='best_model.h5'):
    '''
    Build the datasets for the given thresholds, train the classifier and
    evaluate the best checkpoint on the training and validation data.

    Parameters:
    ___________
    df (pandas dataframe):
    Dataframe containing userinfo

    users_bots (pandas dataframe):
    dataframe containing score from botometer

    bot_threshold (float):
    Threshold above which we will classify as bot during training

    human_threshold (float):
    Threshold below which we will classify as humans during training

    ratio (int):
    How many clean accounts per bot accounts

    verbose (int):
    Display output and debug information if set to 1, doesn't if set to 0

    checkpoint_path (str):
    File the best model is saved to during training and reloaded from afterwards

    Returns:
    ________
    val_acc (float):
    accuracy of the validation set

    train_acc (float):
    accuracy of the training set

    best_model (model):
    Keras model that turned out best

    training_df_mean (pandas series):
    Per-column mean used to standardize the training data

    training_df_std (pandas series):
    Per-column standard deviation used to standardize the training data
    '''
    import os

    bot_accounts, clean_accounts = apply_threshold(users_bots, bot_threshold, human_threshold)

    # NOTE(review): get_training_test is not handed `users_bots` here, so it builds
    # the validation pool from the notebook-level `users_bots` variable, which is
    # not necessarily the argument passed to this function.
    training_df_mean, training_df_std, X_train, y_train, X_test, y_test = get_training_test(df, bot_accounts, clean_accounts, ratio, verbose=verbose)

    # Size the input layer from the data instead of assuming exactly 10 features
    inp = Input(shape=[X_train.shape[1]])

    another = Dense(500, activation='relu')(inp)
    another = Dense(200, activation='relu')(another)
    another = Dense(1, activation='sigmoid')(another)

    mod = Model(inp, another)

    # Remove any checkpoint left behind by a previous run, so that load_model
    # below fails loudly instead of silently returning a stale model when no
    # checkpoint was written during this fit.
    if os.path.exists(checkpoint_path):
        os.remove(checkpoint_path)

    # NOTE(review): patience (200) is larger than the number of epochs (20), so
    # early stopping cannot trigger with these settings.
    es = EarlyStopping(monitor='val_loss', mode='min', verbose=verbose, patience=200)
    # NOTE(review): 'val_acc' matches the metric name in this notebook's training
    # log; other Keras versions may report 'val_accuracy' - confirm before upgrading.
    mc = ModelCheckpoint(checkpoint_path, monitor='val_acc', mode='max', verbose=verbose, save_best_only=True)

    mod.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

    # The held-out set serves as validation data and also drives checkpoint selection
    mod.fit(x=X_train, y=y_train, batch_size=32, epochs=20, validation_data=(X_test, y_test), callbacks=[es, mc], verbose=verbose)

    best_model = load_model(checkpoint_path)

    ytrain_mod = best_model.predict(X_train)
    train_acc = get_accuracy(ytrain_mod, y_train, verbose=verbose)

    ytest_mod = best_model.predict(X_test)
    val_acc = get_accuracy(ytest_mod, y_test, verbose=verbose)
    return val_acc, train_acc, best_model, training_df_mean, training_df_std

In [9]:
def find_hyperparameters():
    '''
    Grid-search the bot threshold, human threshold and clean/bot ratio.

    Every combination is trained with perform_all on the notebook-level `df`
    and `users_bots`. The results collected so far are rewritten to
    'current_results.csv' after each run, so an interrupted search keeps
    its progress.

    Returns:
    ________
    results (pandas dataframe):
    One row per configuration with columns bot, human, ratio, val_acc, train_acc
    '''
    bot_threshold = [0.39, 0.5, 0.6, 0.7, 0.8, 0.9]
    human_threshold = [0.4, 0.3, 0.2, 0.15, 0.1, 0.05]
    ratios = [1, 2]
    columns = ['bot', 'human', 'ratio', 'val_acc', 'train_acc']

    # Rows are collected in a list: DataFrame.append was removed in pandas 2.0
    records = []

    i = 1
    total = len(bot_threshold) * len(human_threshold) * len(ratios)

    for bot in bot_threshold:
        for human in human_threshold:
            for ratio in ratios:
                print("{} of {}".format(i, total))
                val_acc, train_acc, _, _, _ = perform_all(df, users_bots, bot, human, ratio, verbose=0)

                records.append({'bot': bot, 'human': human, 'ratio': ratio, 'val_acc': val_acc, 'train_acc': train_acc})
                # Persist after every configuration so partial results survive a crash
                pd.DataFrame(records, columns=columns).to_csv('current_results.csv', index=False)
                i = i + 1

    return pd.DataFrame(records, columns=columns)
                
# find_hyperparameters()

In [11]:
# Reload the grid-search results that find_hyperparameters() writes to disk
results = pd.read_csv('current_results.csv')

In [12]:
# Mean of validation and training accuracy for each configuration
results['val_train_avg'] = (results['val_acc'] + results['train_acc'])/2

In [13]:
def predict_all(df, best_model, training_df_mean, training_df_std):
    '''
    Score every account in df with the trained model.

    The input dataframe is left untouched: standardization and the new
    'predicted' column are applied to a copy, so the function can be called
    repeatedly on the same dataframe.

    Parameters:
    ___________
    df (pandas dataframe):
    Dataframe containing userinfo

    best_model (model):
    Keras model that turned out best

    training_df_mean (pandas series):
    Per-column mean of the training data

    training_df_std (pandas series):
    Per-column standard deviation of the training data

    Returns:
    ________
    df_with_predictions (Dataframe with predictions from the best model)
    '''
    # Work on a copy: standardizing the caller's dataframe in place would
    # standardize it a second time on the next call.
    scored = df.copy()
    scored[columns_to_standardize] = (scored[columns_to_standardize] - training_df_mean)/training_df_std

    features = scored.drop('username', axis=1)
    # A 'predicted' column left over from an earlier run is not a model feature
    features = features.drop(columns='predicted', errors='ignore')

    # Flatten the model output so it is stored as one plain column
    scored['predicted'] = np.asarray(best_model.predict(features.values)).reshape(-1)

    bot_count = int((scored['predicted'] > 0.5).sum())
    percentage = (bot_count / len(scored)) * 100 if len(scored) else 0.0
    print("The percentage of bots is: {}".format(percentage))
    return scored

In [14]:
# for idx, config in results.sort_values('val_train_avg', ascending=False)[:10].iterrows():
    
#     print(config)
#     val_acc, train_acc, best_model, training_df_mean, training_df_std = perform_all(df, users_bots, config['bot'], config['human'], config['ratio'])
#     df_with_predictions = predict_all(df, best_model, training_df_mean, training_df_std)
    
#     try:
#         df = df.drop('predicted', axis=1)
#     except:
#         pass

In [17]:
# Drop predictions left over from an earlier run; errors='ignore' makes this a
# no-op when the column is absent, without hiding unrelated errors.
df = df.drop(columns='predicted', errors='ignore')
# Train with bot threshold 0.75, human threshold 0.15 and one clean account per bot
val_acc, train_acc, best_model, training_df_mean, training_df_std = perform_all(df, users_bots, 0.75, 0.15, 1)

Combined Details: (3844, 12)
Bot Details: (1922, 12)
Clean Details: (1922, 12)
Train on 3844 samples, validate on 12780 samples
Epoch 1/20

Epoch 00001: val_acc improved from -inf to 0.70321, saving model to best_model.h5
Epoch 2/20

Epoch 00002: val_acc did not improve from 0.70321
Epoch 3/20

Epoch 00003: val_acc improved from 0.70321 to 0.70399, saving model to best_model.h5
Epoch 4/20

Epoch 00004: val_acc improved from 0.70399 to 0.70743, saving model to best_model.h5
Epoch 5/20

Epoch 00005: val_acc improved from 0.70743 to 0.71369, saving model to best_model.h5
Epoch 6/20

Epoch 00006: val_acc did not improve from 0.71369
Epoch 7/20

Epoch 00007: val_acc did not improve from 0.71369
Epoch 8/20

Epoch 00008: val_acc improved from 0.71369 to 0.71377, saving model to best_model.h5
Epoch 9/20

Epoch 00009: val_acc did not improve from 0.71377
Epoch 10/20

Epoch 00010: val_acc improved from 0.71377 to 0.71588, saving model to best_model.h5
Epoch 11/20

Epoch 00011: val_acc improved f

In [18]:
# Score every profile in df with the model trained above
df_with_predictions = predict_all(df, best_model, training_df_mean, training_df_std)

The percentage of bots is: 31.557589343188408
