In [1]:
#import statements
import pandas as pd
import numpy as np
from sklearn.preprocessing import QuantileTransformer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB
from sklearn import preprocessing
from sklearn import metrics
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

In [2]:
#load dataset into dataframe
df = pd.read_csv('MLTEST_clean.csv')

In [3]:
#convert benign/bot class labels into numeric values
NewLabel = []
for i in df["Label"]:
    if i =="BENIGN":
        NewLabel.append(0)
    else:
        NewLabel.append(1)
df["Label"]=NewLabel

In [4]:
#assign feature matrix (x) and response vector (y)
y = df['Label'].values
del df['Label']
x = df.values

In [5]:
#transform feature matrix (x) to map original values to a more uniform distribution
scaler = QuantileTransformer(n_quantiles=1000, random_state=42)
scaled_df = scaler.fit_transform(x)
x = pd.DataFrame(scaled_df)

In [6]:
#split data into train and test set
xTrain, xTest, yTrain, yTest = train_test_split(x, y, test_size = 0.2, random_state = 42)

In [7]:
#begin individual classifier training and evaluation
#AdaBoostClassifier
ada = AdaBoostClassifier(n_estimators=50, learning_rate=1)
ada.fit(xTrain,yTrain)
yPred = ada.predict(xTest)
print("AdaBoostClassifier Performance Metrics")
print("Accuracy Score: ", accuracy_score(yTest,yPred))
print("Precision Score: ", precision_score(yTest,yPred))
print("Recall Score: ", recall_score(yTest,yPred))
print("F1 Score: ", f1_score(yTest,yPred))
tn, fp, fn, tp = confusion_matrix(yTest, yPred).ravel()
fpr = (fp / (fp + tn)) * 100
print("False Positive Rate: " +str(fpr))
print("AdaBoostClassifier Confusion Matrix:")
print("True Positives: " +str(tp))
print("False Positives: " +str(fp))
print("True Negatives: " +str(tn))
print("False Negatives: " +str(fn))

AdaBoostClassifier Performance Metrics
Accuracy Score:  0.9997119136788624
Precision Score:  0.9856801909307876
Recall Score:  0.9880382775119617
F1 Score:  0.986857825567503
False Positive Rate: 0.015887726731100223
AdaBoostClassifier Confusion Matrix:
True Positives: 413
False Positives: 6
True Negatives: 37759
False Negatives: 5


In [8]:
#CART
cart = DecisionTreeClassifier(max_depth=3)
cart.fit(xTrain,yTrain)
yPred = cart.predict(xTest)
print("DecisionTreeClassifier Performance Metrics")
print("Accuracy Score: ", accuracy_score(yTest,yPred))
print("Precision Score: ", precision_score(yTest,yPred))
print("Recall Score: ", recall_score(yTest,yPred))
print("F1 Score: ", f1_score(yTest,yPred))
tn, fp, fn, tp = confusion_matrix(yTest, yPred).ravel()
fpr = (fp / (fp + tn)) * 100
print("False Positive Rate: " +str(fpr))
print("DecisionTreeClassifier Confusion Matrix:")
print("True Positives: " +str(tp))
print("False Positives: " +str(fp))
print("True Negatives: " +str(tn))
print("False Negatives: " +str(fn))

DecisionTreeClassifier Performance Metrics
Accuracy Score:  0.9953382395306812
Precision Score:  0.9651162790697675
Recall Score:  0.5956937799043063
F1 Score:  0.7366863905325445
False Positive Rate: 0.023831590096650337
DecisionTreeClassifier Confusion Matrix:
True Positives: 249
False Positives: 9
True Negatives: 37756
False Negatives: 169


In [9]:
#Naive Bayes
nb = GaussianNB()
nb.fit(xTrain, yTrain)
yPred = nb.predict(xTest)
print("Naive Bayes Performance Metrics")
print("Accuracy Score: ", accuracy_score(yTest,yPred))
print("Precision Score: ", precision_score(yTest,yPred))
print("Recall Score: ", recall_score(yTest,yPred))
print("F1 Score: ", f1_score(yTest,yPred))
tn, fp, fn, tp = confusion_matrix(yTest, yPred).ravel()
fpr = (fp / (fp + tn)) * 100
print("False Positive Rate: " +str(fpr))
print("Naive Bayes Confusion Matrix:")
print("True Positives: " +str(tp))
print("False Positives: " +str(fp))
print("True Negatives: " +str(tn))
print("False Negatives: " +str(fn))

Naive Bayes Performance Metrics
Accuracy Score:  0.6875049105622921
Precision Score:  0.033846153846153845
Recall Score:  1.0
F1 Score:  0.06547619047619048
False Positive Rate: 31.59539255924798
Naive Bayes Confusion Matrix:
True Positives: 418
False Positives: 11932
True Negatives: 25833
False Negatives: 0


In [10]:
#KNeighborsClassifier
neigh = KNeighborsClassifier(n_neighbors=3)
neigh.fit(xTrain, yTrain) 
yPred = neigh.predict(xTest)
print("KNN Performance Metrics")
print("Accuracy Score: ", accuracy_score(yTest,yPred))
print("Precision Score: ", precision_score(yTest,yPred))
print("Recall Score: ", recall_score(yTest,yPred))
print("F1 Score: ", f1_score(yTest,yPred))
tn, fp, fn, tp = confusion_matrix(yTest, yPred).ravel()
fpr = (fp / (fp + tn)) * 100
print("False Positive Rate: " +str(fpr))
print("KNN Confusion Matrix:")
print("True Positives: " +str(tp))
print("False Positives: " +str(fp))
print("True Negatives: " +str(tn))
print("False Negatives: " +str(fn))

KNN Performance Metrics
Accuracy Score:  0.9992143100332609
Precision Score:  0.9597156398104265
Recall Score:  0.9688995215311005
F1 Score:  0.9642857142857143
False Positive Rate: 0.04501522573811731
KNN Confusion Matrix:
True Positives: 405
False Positives: 17
True Negatives: 37748
False Negatives: 13


In [11]:
#RandomForestClassifier
rf = RandomForestClassifier(n_estimators=100, max_depth=None, random_state=None)
rf.fit(xTrain, yTrain)
yPred = rf.predict(xTest)
print("Random Forest Performance Metrics")
print("Accuracy Score: ", accuracy_score(yTest,yPred))
print("Precision Score: ", precision_score(yTest,yPred))
print("Recall Score: ", recall_score(yTest,yPred))
print("F1 Score: ", f1_score(yTest,yPred))
tn, fp, fn, tp = confusion_matrix(yTest, yPred).ravel()
fpr = (fp / (fp + tn)) * 100
print("False Positive Rate: " +str(fpr))
print("Random Forest Confusion Matrix:")
print("True Positives: " +str(tp))
print("False Positives: " +str(fp))
print("True Negatives: " +str(tn))
print("False Negatives: " +str(fn))

Random Forest Performance Metrics
Accuracy Score:  0.9996595343477463
Precision Score:  0.9975429975429976
Recall Score:  0.9712918660287081
F1 Score:  0.9842424242424243
False Positive Rate: 0.0026479544551833707
Random Forest Confusion Matrix:
True Positives: 406
False Positives: 1
True Negatives: 37764
False Negatives: 12
