# Using bot_classifier on metoo data



### Setup

###### Import the classifier and mongo client 

In [9]:
from bots import bot_classifier
from pymongo import MongoClient
import numpy as np
import pandas as pd
import labels as l

###### Set up the DB connection

In [10]:
# Connect to the MongoDB server on localhost and select the database that
# the classifier will read from.
mongoDBConnectionSTR = "mongodb://localhost:27017"
client = MongoClient(mongoDBConnectionSTR)
db = client["twitter"]  # choose your DB name here

###### Set the field configuration and instantiate the object

In [11]:
# Semicolon-separated list of tweet-level fields passed to the classifier.
field_config = (
    "id_str;created_at;reply_count;retweet_count;"
    "in_reply_to_screen_name;text"
)

# Semicolon-separated list of user-level fields passed to the classifier.
# NOTE(review): "favourties_count" looks like a misspelling of Twitter's
# "favourites_count" user field -- confirm against bots.bot_classifier and the
# stored documents before changing it; the value is kept as-is here.
usr_field_config = (
    "screen_name;description;followers_count;friends_count;listed_count;"
    "favourties_count;statuses_count;created_at;verified"
)

# Build the classifier on top of the open DB handle and both field lists.
classifier = bot_classifier(db, field_config, usr_field_config)

### Manual Labeling Process, using logical suggestions to find and label bots

###### functions written to suggest bots, used to manually label bots

In [4]:
# Ask the classifier for bot suggestions to review and label by hand.
# NOTE(review): the meaning of the two 0 arguments is defined in
# bots.bot_classifier and is not visible here -- confirm before changing.
suspected_bots = classifier.suggest_bots(0, 0)

# Print each suggestion followed by blank lines so entries are easy to
# tell apart in the cell output.
for tweet in suspected_bots:
    print(tweet, end='\n\n\n')

In [5]:
# Second suggestion helper used for manual labeling; judging by its name it
# targets accounts with "bot" in the name (presumably the screen name --
# verify in bots.bot_classifier).
accounts_with_bot_in_name = classifier.find_names_with_bot(0, 0)

# Print each result followed by blank lines to separate the entries.
for tweet in accounts_with_bot_in_name:
    print(tweet, end='\n\n\n')

In [6]:
# Third suggestion helper used for manual labeling.
# NOTE(review): presumably selects accounts by follower/tweet counts, as the
# variable name suggests -- the actual criteria live in bots.bot_classifier.
accounts_with_low_followers_tweets = classifier.followers_tweets(0, 0)

# Print each result followed by blank lines to separate the entries.
for tweet in accounts_with_low_followers_tweets:
    print(tweet, end='\n\n\n')

### Data Processing and Classification

###### Train with initial data (1221 bots and humans), predict with test data (1211 bots and humans).
###### Each iteration predicts probabilities for 'step' number of tweets, takes the k most uncertain samples,
###### labels those, and then adds the predictions to the training set and repeats.
###### The training set grows by k each iteration; the number of iterations will be (end/step + 1) if you start at 1.

In [12]:
# --- Held-out test set ------------------------------------------------------
# Testing data that will never be used for training: about 10% bots, shuffled.
# The test-set intermediates get their own names so they are not silently
# overwritten by the training-set frames built further down.
test_human_df = classifier.make_dataframe_of_humans(
    0, 50000, 1100, classifier.bots_for_test_set
)
test_bot_df = classifier.make_dataframe_of_bots(
    0, 200000, len(classifier.bots_for_test_set), classifier.bots_for_test_set
)
test_df = classifier.mix(
    classifier.label(test_human_df, False),  # humans are labeled False
    classifier.label(test_bot_df, True),     # bots are labeled True
)

# --- Initial training set ---------------------------------------------------
# Same bot percentage as the test set; this set grows as the classifier runs.
# NOTE(review): both human frames are requested with the same
# (0, 50000, 1100) arguments -- confirm that make_dataframe_of_humans does not
# hand back the same rows for the test set and the training set.
human_df = classifier.make_dataframe_of_humans(
    0, 50000, 1100, classifier.bots_for_training
)
bot_df = classifier.make_dataframe_of_bots(
    0, 600000, len(classifier.bots_for_training), classifier.bots_for_training
)
human_df_labeled = classifier.label(human_df, False)
bot_df_labeled = classifier.label(bot_df, True)
train_df = classifier.mix(human_df_labeled, bot_df_labeled)

# Separate each mixed frame into its feature part (X) and label part (y).
X_test, y_test = classifier.split(test_df)
X_train, y_train = classifier.split(train_df)

# Kick off the iterative run.
# start and end are seq_no's, step is how many tweets are scored per
# iteration, k is the sample size added to the training set each iteration,
# and i is the iteration counter.
classifier.run(
    X_train,
    y_train,
    X_test,
    y_test,
    k=10000,
    start=0,
    end=1000000,
    step=100000,
    i=1,
)


X_train: (1221, 7) y_train: (1221, 1) X_test: (1211, 7) y_test: (1211, 1)
--------------------------------
Iteration: 1


y-test set: (1211, 1)


Accuracy rate for 94.880264 
--------------------------------


100%|██████████| 10000/10000 [00:00<00:00, 15734.33it/s]


X_train: (11221, 7) y_train: (11221, 1) X_test: (1211, 7) y_test: (1211, 1)
--------------------------------
Iteration: 2


y-test set: (1211, 1)


Accuracy rate for 95.953757 
--------------------------------


100%|██████████| 10000/10000 [00:00<00:00, 16923.05it/s]


X_train: (21221, 7) y_train: (21221, 1) X_test: (1211, 7) y_test: (1211, 1)
--------------------------------
Iteration: 3


y-test set: (1211, 1)


Accuracy rate for 95.953757 
--------------------------------


100%|██████████| 10000/10000 [00:00<00:00, 16963.82it/s]


X_train: (31221, 7) y_train: (31221, 1) X_test: (1211, 7) y_test: (1211, 1)
--------------------------------
Iteration: 4


y-test set: (1211, 1)


Accuracy rate for 95.871181 
--------------------------------


100%|██████████| 10000/10000 [00:00<00:00, 16038.08it/s]


X_train: (41221, 7) y_train: (41221, 1) X_test: (1211, 7) y_test: (1211, 1)
--------------------------------
Iteration: 5


y-test set: (1211, 1)


Accuracy rate for 95.871181 
--------------------------------


100%|██████████| 10000/10000 [00:00<00:00, 17796.03it/s]


X_train: (51221, 7) y_train: (51221, 1) X_test: (1211, 7) y_test: (1211, 1)
--------------------------------
Iteration: 6


y-test set: (1211, 1)


Accuracy rate for 95.871181 
--------------------------------


100%|██████████| 10000/10000 [00:00<00:00, 17298.43it/s]


X_train: (61221, 7) y_train: (61221, 1) X_test: (1211, 7) y_test: (1211, 1)
--------------------------------
Iteration: 7


y-test set: (1211, 1)


Accuracy rate for 95.871181 
--------------------------------


100%|██████████| 10000/10000 [00:00<00:00, 18501.81it/s]


X_train: (71221, 7) y_train: (71221, 1) X_test: (1211, 7) y_test: (1211, 1)
--------------------------------
Iteration: 8


y-test set: (1211, 1)


Accuracy rate for 95.953757 
--------------------------------


100%|██████████| 10000/10000 [00:00<00:00, 15833.08it/s]


X_train: (81221, 7) y_train: (81221, 1) X_test: (1211, 7) y_test: (1211, 1)
--------------------------------
Iteration: 9


y-test set: (1211, 1)


Accuracy rate for 95.953757 
--------------------------------


100%|██████████| 10000/10000 [00:00<00:00, 15676.26it/s]


X_train: (91221, 7) y_train: (91221, 1) X_test: (1211, 7) y_test: (1211, 1)
--------------------------------
Iteration: 10


y-test set: (1211, 1)


Accuracy rate for 95.375723 
--------------------------------


100%|██████████| 10000/10000 [00:00<00:00, 13875.11it/s]


X_train: (101221, 7) y_train: (101221, 1) X_test: (1211, 7) y_test: (1211, 1)
--------------------------------
Iteration: 11


y-test set: (1211, 1)


Accuracy rate for 95.706028 
--------------------------------


100%|██████████| 10000/10000 [00:00<00:00, 14738.11it/s]
