In [1]:
from sklearn.model_selection import GridSearchCV
from sklearn import tree
from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import LogisticRegression

In [2]:
import time
import datetime
def train(classifier, name, param_grid=None) :
    start_time = time.time()
    if param_grid == None :
        classifier.fit(X_train, y_train)
        results[name] = dict(model=classifier)
    else :
        grid = GridSearchCV(classifier, param_grid, cv=10, scoring='accuracy', n_jobs=2) # Do a 10-fold cross validation
        grid.fit(X, y) # fit the grid with data
        results[name] = dict(grid=grid, model=classifier)
    #total_time = datetime.datetime.fromtimestamp(time.time() - start_time)
    total_time = datetime.timedelta(seconds=time.time() - start_time)
    print("Training time : " + str(total_time))#.strftime('%H:%M:%S'))

In [3]:
import pandas as pd

#Load N features and add Label, and make label y
bot = pd.read_pickle("bot_features.pkl")
bot_features = bot['features']
bot_number = len(bot_features)
#print(bot_features)

hum = pd.read_pickle("hum_features.pkl")
hum_features = hum['features']
hum_number = len(hum_features)
#print(hum_features)

features = pd.concat([bot_features, hum_features])
feature_number = len(features)
print(features)

X = features.values
X = [list(feature.values()) for feature in X]

y = []
for i in range(0, bot_number) :
    y.append(False)
for i in range(bot_number, feature_number) :
    y.append(True)

print(len(X))
print(len(y))

0       {'friends/follower_ratio': 39.888888888888886,...
1       {'friends/follower_ratio': 0.16568559556786702...
2       {'friends/follower_ratio': 0.1044140625, 'bot_...
3       {'friends/follower_ratio': 0.5298765432098765,...
4       {'friends/follower_ratio': 183.5, 'bot_in_biog...
                              ...                        
1945    {'friends/follower_ratio': 0.00286612426035502...
1946    {'friends/follower_ratio': 0.11363636363636363...
1947    {'friends/follower_ratio': 0, 'bot_in_biograph...
1948    {'friends/follower_ratio': 1.0, 'bot_in_biogra...
1949    {'friends/follower_ratio': 0.00276783467666658...
Name: features, Length: 3038, dtype: object
3038
3038


In [4]:
# Divise dataset
def divide_dataset(X, y) :
    from sklearn.model_selection import train_test_split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
    return (X_train, X_test, y_train, y_test)

X_train, X_test, y_train, y_test = divide_dataset(X, y)

In [5]:
results = dict()

name = "k-NN"
classifier = KNeighborsClassifier(weights='uniform')
k_range = list(range(1, 31)) # list of parameter values to test
param_grid = dict(n_neighbors=k_range)
train(classifier, name, param_grid)

Training time : 0:00:03.488848


In [6]:
from sklearn.neural_network import MLPClassifier
name = "Neural net"
#classifier = MLPClassifier(alpha=1)
classifier = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(5, 2), random_state=1)
train(classifier, name)

Training time : 0:00:00.287048


ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


In [7]:
name = "Random forest"
classifier = RandomForestClassifier()
d_range = list(range(1, 31)) # list of parameter values to test
#s_range = list(range(2, 10))
param_grid = dict(max_depth=d_range)#, min_samples_split=s_range)
train(classifier, name, param_grid)

Training time : 0:00:24.162894


In [8]:
name = "Log. Regression"
classifier = LogisticRegression()
train(classifier, name)

Training time : 0:00:00.031913


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [9]:
from prettytable import PrettyTable
import operator
from sklearn import metrics
import math
t = PrettyTable(['Model', 'Best score', 'accuracy', 'precision', 'recall', 'F-M.', 'MCC', 'AUC'])
for clf_name, result in results.items() :
    model = result['model']
    if 'grid' in result :
        grid = result['grid']
        score = grid.best_score_
        # Compute false positives and false negatives
        model.__init__(**grid.best_params_)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        #print(result.best_estimator_)
    else : # For non grid_search models
        #training_error = clf.score(X_train, y_train)
        score = model.score(X_test, y_test)
        y_pred = model.predict(X_test)
    fpr, tpr, thresholds = metrics.roc_curve(y_test, y_pred)
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
    #print(clf_name + " tn=" + str(tn) + " fp=" + str(fp) + " fn=" + str(fn) + " tp=" + str(tp))
    accuracy = float(tp + tn) / (tp + tn + fp + fn)
    precision = float(tp) / (tp + fp)
    recall = float(tp) / (tp + fn) # a.k.a. sensitivity
    f_measure = float(2 * precision * recall) / (precision + recall)
    mcc = -1
    if fp!=0 and tp != 0 and tn != 0 and fn!= 0:
        mcc = float(tp * tn - fp * fn) / math.sqrt(float(tp+fn) * (tp+fp) * (tn+fp) * (tn+fn)) # Matthew Correlation Coefficient
    auc = metrics.auc(fpr, tpr)
    t.add_row([clf_name, round(score, 3), round(accuracy, 3), round(precision,3), round(recall,3), round(f_measure,3), round(mcc,3), round(auc,3)]) #fp, tn, fn, tp])

        
print(t.get_string(sort_key=operator.itemgetter(2, 1), sortby="Best score", reversesort=True))

+-----------------+------------+----------+-----------+--------+-------+-------+-------+
|      Model      | Best score | accuracy | precision | recall |  F-M. |  MCC  |  AUC  |
+-----------------+------------+----------+-----------+--------+-------+-------+-------+
|  Random forest  |    0.96    |  0.958   |    0.96   | 0.975  | 0.967 |  0.91 | 0.952 |
| Log. Regression |   0.918    |  0.918   |   0.908   | 0.968  | 0.937 | 0.824 |  0.9  |
|    Neural net   |   0.909    |  0.909   |   0.893   | 0.973  | 0.931 | 0.805 | 0.887 |
|       k-NN      |   0.883    |  0.894   |   0.894   | 0.945  | 0.919 | 0.771 | 0.876 |
+-----------------+------------+----------+-----------+--------+-------+-------+-------+


In [10]:
with open("./datasets/Un_crawled_data/unlabeled.tsv", "r") as f :
    TSV = f.readlines()
classes = {"human":1, "bot":0}
dataset = []
for label in TSV:
    #All user_ids in TSV are bot.
    user_id = int(label.split()[0])
    label = label.split()[1]
    dataset.append([user_id, classes[label]])
print("Length of dataset used for unlabeled sample")
print(len(dataset))

print("Info of unlabeled_data")
unlabeled = pd.read_pickle("unlabeled_features.pkl")
unlabeled_number = len(unlabeled)
print(unlabeled)

Length of dataset used for unlabeled sample
829
Info of unlabeled_data
             id      screen_name  \
0    2718436417        DJGruuvan   
1    3287012484     penelope20mn   
2      93816184       flipcritic   
3    3027809025       leitmarvin   
4    4462881555  ShitpostBot5000   
..          ...              ...   
344  4518999021  shakethatbrass2   
345  3231781692       tweegeemee   
346  3317501195    rightnowbbcr2   
347   989135184   rightnow6music   
348  2433869952     trashcanlife   

                                              features  
0    {'friends/follower_ratio': 0.01036855500047129...  
1    {'friends/follower_ratio': 2.735031447908205e-...  
2    {'friends/follower_ratio': 0.00011402278394249...  
3    {'friends/follower_ratio': 0.00455166135639508...  
4    {'friends/follower_ratio': 4.228058749289927e-...  
..                                                 ...  
344  {'friends/follower_ratio': 0.2, 'bot_in_biogra...  
345  {'friends/follower_ratio': 0.000626

In [11]:
import numpy as np
import random

def maskData(true_labels, percentage):

    mask = np.ones((1,len(true_labels)),dtype=bool)[0]
    labels = true_labels.copy()
    
    for l, enc in zip(np.unique(true_labels),range(0,len(np.unique(true_labels)))):
        
        deck = np.argwhere(true_labels == l).flatten()        
        random.shuffle(deck)
        
        mask[deck[:int(percentage * len(true_labels[true_labels == l]))]] = False

        labels[labels == l] = enc

    labels[mask] = -1
    
    return np.array(labels).astype(int)

In [12]:
from sklearn.semi_supervised import LabelPropagation

features = unlabeled["features"]

X_unlabeled = features.values
X_unlabeled = [list(feature.values()) for feature in X_unlabeled]

y_unlabeled = list()

for i in range(unlabeled_number):
    y_unlabeled.append(True)
y_unlabeled = np.array(y_unlabeled)

y_unlabeled = maskData(y_unlabeled, 0.05)
y_unlabeled = y_unlabeled.tolist()
y = y_test+y_unlabeled
X = X_test+X_unlabeled

print("Semi supervised Learning by Label Propagation (Ratio of labeled:unlabeled is about 7:3)")
model = LabelPropagation()
model.fit(X, y)
pred = np.array(model.predict(X))

Semi supervised Learning by Label Propagation (Ratio of labeled:unlabeled is about 7:3)


In [13]:
count= 0
hum=0
bot=0
labels = y
for i, j in zip(labels, pred) :
    if (i==j) :
        count+=1
    if i==1:
        hum+=1
    if i==0:
        bot+=1
print("Human/Bot ratio in Test data")
print(hum, bot)
print("semi-supervised learning result")
print(count/len(labels))

Human/Bot ratio in Test data
965 387
semi-supervised learning result
0.9948224852071006
