In [3]:
import pandas as pd

In [146]:
# Botometer score cut-offs used to label accounts for training.
# Accounts scoring strictly below `user_threshold` are treated as humans and
# accounts scoring strictly above `bot_threshold` as bots; anything in
# between ends up in neither group.
user_threshold = 0.3
bot_threshold = 0.5
# ratio = 3

In [147]:
def load_data(bot_scores_path='username_botometer.csv',
              profiles_path='../../data/profiles_in_tweets.csv',
              election_accounts_path='../../data/collection_data/profile/extractedUsers.csv'):
    '''
    Load the user profiles and the Botometer scores, restricted to the
    election accounts. Usernames in all three inputs are lower-cased before
    matching, so the filtering is case-insensitive.

    Parameters:
    ___________

    bot_scores_path (str):
    CSV holding the botometer scores; must contain a 'username' column

    profiles_path (str):
    CSV holding the userinfo; must contain a 'username' column

    election_accounts_path (str):
    CSV listing the election accounts. Its columns are renamed to
    ['username'], so it must contain exactly one column

    Returns:
    ________

    df (pandas dataframe):
    Dataframe containing userinfo

    users_bots (pandas dataframe):
    dataframe containing score from botometer
    '''

    users_bots = pd.read_csv(bot_scores_path)
    df = pd.read_csv(profiles_path)
    election_accounts = pd.read_csv(election_accounts_path)
    election_accounts.columns = ['username']

    # Normalise case everywhere so the isin() matches below do not miss
    # accounts that only differ in capitalisation.
    election_accounts['username'] = election_accounts['username'].str.lower()
    df['username'] = df['username'].str.lower()
    users_bots['username'] = users_bots['username'].str.lower()

    # Keep only the rows that belong to an election account.
    users_bots = users_bots[users_bots['username'].isin(election_accounts['username'])].reset_index(drop=True)
    df = df[df['username'].isin(election_accounts['username'])].reset_index(drop=True)

    return df, users_bots

In [148]:
# Read the profile table and the botometer scores, both already filtered
# down to the election accounts by load_data().
df, users_bots = load_data()

In [149]:
def apply_threshold(users_bots, bot_threshold, user_threshold):
    '''
    Split the scored accounts into a bot group and a human group.

    Parameters:
    ___________

    users_bots (pandas dataframe):
    Dataframe with one row per account and a 'botometer' score column

    bot_threshold (float):
    Accounts scoring strictly above this value are returned as bots

    user_threshold (float):
    Accounts scoring strictly below this value are returned as humans

    Returns:
    ________
    bot_accounts, clean_accounts (pandas dataframes):
    The matching rows of users_bots, each re-indexed from 0. Rows whose
    score satisfies neither comparison appear in neither dataframe.
    '''
    scores = users_bots['botometer']
    is_bot = scores.gt(bot_threshold)
    is_human = scores.lt(user_threshold)

    clean_accounts = users_bots.loc[is_human].reset_index(drop=True)
    bot_accounts = users_bots.loc[is_bot].reset_index(drop=True)
    return bot_accounts, clean_accounts

In [150]:
# Label accounts from their botometer score: above bot_threshold -> bot,
# below user_threshold -> clean (human).
bot_accounts, clean_accounts = apply_threshold(users_bots, bot_threshold, user_threshold)

In [151]:
# (rows, columns) of the accounts whose score exceeded bot_threshold.
bot_accounts.shape

(1583, 2)

In [152]:
# Build the labelled dataset: profile rows whose username is in the bot list
# get BotOrNot = 1, rows whose username is in the clean list get BotOrNot = 0.
# TODO: this cell does several things and should be split into functions.

# Fixed seed so the shuffle below (and therefore the train/test split that is
# cut from the shuffled frame) is the same on every run.
shuffle_seed = 42

bot_details = (
    df[df['username'].isin(bot_accounts['username'])]
    .reset_index(drop=True)
    .assign(BotOrNot=1)
)
clean_details = (
    df[df['username'].isin(clean_accounts['username'])]
    .reset_index(drop=True)
    .assign(BotOrNot=0)
)
# Optional down-sampling of the clean class (disabled, needs `ratio`):
# clean_details = clean_details[:int(bot_details.shape[0] * ratio)]

# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
combined_df = pd.concat([clean_details, bot_details], ignore_index=True)

# Drop rows that are exact duplicates across every column.
combined_df = combined_df.drop_duplicates()

print("Combined Details: {}".format(combined_df.shape))
print("Bot Details: {}".format(bot_details.shape))
print("Clean Details: {}".format(clean_details.shape))

# A single seeded shuffle is enough; splitting by label, concatenating and
# shuffling a second time only produced another random permutation.
new_df = combined_df.sample(frac=1, random_state=shuffle_seed).reset_index(drop=True)

# Per-class views of the shuffled data, kept in case later cells use them.
bot_df = new_df[new_df['BotOrNot'] == 1].reset_index(drop=True)
human_df = new_df[new_df['BotOrNot'] == 0]

# The 'username' column is still present in new_df. Earlier drafts dropped it:
# training_df = new_df.drop('username', axis=1)
# test_df = test_df.drop('username', axis=1)

Combined Details: (27869, 12)
Bot Details: (513, 12)
Clean Details: (27356, 12)


In [163]:
# Row position separating the training part (first 95% of the rows) from the
# test part (remaining rows) of new_df.
split = int(len(new_df) * .95)

In [164]:
# Training set: the rows of new_df located before position `split`.
train_df = new_df.iloc[:split]

In [165]:
# Test set: the rows of new_df from position `split` to the end.
test_df = new_df.iloc[split:]

In [166]:
# Write the training rows without the dataframe index. `index` is documented
# as a bool, so pass False explicitly instead of relying on None being falsy.
train_df.to_csv('training_set.csv', index=False)

In [167]:
# Write the test rows without the dataframe index. `index` is documented
# as a bool, so pass False explicitly instead of relying on None being falsy.
test_df.to_csv('test_set.csv', index=False)