In [1]:
from sklearn.model_selection import GridSearchCV
from sklearn import tree
from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import LogisticRegression

In [2]:
import time
import datetime
def train(classifier, name, param_grid=None) :
    start_time = time.time()
    if param_grid == None :
        classifier.fit(X_train, y_train)
        results[name] = dict(model=classifier)
    else :
        grid = GridSearchCV(classifier, param_grid, cv=10, scoring='accuracy', n_jobs=2) # Do a 10-fold cross validation
        grid.fit(X, y) # fit the grid with data
        results[name] = dict(grid=grid, model=classifier)
    #total_time = datetime.datetime.fromtimestamp(time.time() - start_time)
    total_time = datetime.timedelta(seconds=time.time() - start_time)
    print("Training time : " + str(total_time))#.strftime('%H:%M:%S'))

In [3]:
import pandas as pd

#Load N features and add Label, and make label y
bot_features = pd.read_pickle("bot_features.pkl")
bot_number = len(bot_features)

hum_features = pd.read_pickle("hum_features.pkl")
hum_number = len(hum_features)

features = pd.concat([bot_features, hum_features])
feature_number = len(features)
X = features.values.tolist()
y = []
for i in range(0, bot_number) :
    y.append(False)
for i in range(bot_number, feature_number) :
    y.append(True)

print(len(X))
print(len(y))

3037
3037


In [4]:
# Divise dataset
def divide_dataset(X, y) :
    from sklearn.model_selection import train_test_split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
    return (X_train, X_test, y_train, y_test)

X_train, X_test, y_train, y_test = divide_dataset(X, y)

In [5]:
results = dict()

name = "k-NN"
classifier = KNeighborsClassifier(weights='uniform')
k_range = list(range(1, 31)) # list of parameter values to test
param_grid = dict(n_neighbors=k_range)
train(classifier, name, param_grid)

Training time : 0:00:02.053016


In [6]:
from sklearn.neural_network import MLPClassifier
name = "Neural net"
#classifier = MLPClassifier(alpha=1)
classifier = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(5, 2), random_state=1)
train(classifier, name)

Training time : 0:00:00.062830


In [7]:
from prettytable import PrettyTable
import operator
from sklearn import metrics
import math
t = PrettyTable(['Model', 'Best score', 'accuracy', 'precision', 'recall', 'F-M.', 'MCC', 'AUC'])
for clf_name, result in results.items() :
    model = result['model']
    if 'grid' in result :
        grid = result['grid']
        score = grid.best_score_
        # Compute false positives and false negatives
        model.__init__(**grid.best_params_)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        #print(result.best_estimator_)
    else : # For non grid_search models
        #training_error = clf.score(X_train, y_train)
        score = model.score(X_test, y_test)
        y_pred = model.predict(X_test)
    fpr, tpr, thresholds = metrics.roc_curve(y_test, y_pred)
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
    #print(clf_name + " tn=" + str(tn) + " fp=" + str(fp) + " fn=" + str(fn) + " tp=" + str(tp))
    accuracy = float(tp + tn) / (tp + tn + fp + fn)
    precision = float(tp) / (tp + fp)
    recall = float(tp) / (tp + fn) # a.k.a. sensitivity
    f_measure = float(2 * precision * recall) / (precision + recall)
    mcc = -1
    if fp!=0 and tp != 0 and tn != 0 and fn!= 0:
        mcc = float(tp * tn - fp * fn) / math.sqrt(float(tp+fn) * (tp+fp) * (tn+fp) * (tn+fn)) # Matthew Correlation Coefficient
    auc = metrics.auc(fpr, tpr)
    t.add_row([clf_name, round(score, 3), round(accuracy, 3), round(precision,3), round(recall,3), round(f_measure,3), round(mcc,3), round(auc,3)]) #fp, tn, fn, tp])

        
print(t.get_string(sort_key=operator.itemgetter(2, 1), sortby="Best score", reversesort=True))

+------------+------------+----------+-----------+--------+-------+-------+-------+
|   Model    | Best score | accuracy | precision | recall |  F-M. |  MCC  |  AUC  |
+------------+------------+----------+-----------+--------+-------+-------+-------+
| Neural net |   0.821    |  0.821   |   0.823   | 0.912  | 0.865 | 0.606 | 0.788 |
|    k-NN    |   0.808    |  0.817   |   0.835   | 0.885  | 0.859 | 0.599 | 0.792 |
+------------+------------+----------+-----------+--------+-------+-------+-------+
