In [1]:
from sklearn import preprocessing
from sklearn.externals import joblib
import sklearn as skl
from sklearn import tree
from collections import defaultdict
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

In [2]:
train_x = pd.read_pickle('train_x.pkl')
train_Y = pd.read_pickle('train_Y.pkl')
test_x = pd.read_pickle('test_x.pkl')
test_Y = pd.read_pickle('test_Y.pkl')
label_names = ['Benign','Bot','Brute Force-Web','Brute Force-XSS','DDOS attack-HOIC', 'DDOS attack-LOIC-UDP', 'DDoS attacks-LOIC-HTTP',
'DoS attacks-GoldenEye', 'DoS attacks-Hulk', 'DoS attacks-SlowHTTPTest', 'DoS attacks-Slowloris', 'FTP-BruteForce',
'Infilteration', 'Label', 'SQL Injection', 'SSH-Bruteforce']

In [3]:
# 5-class classification version
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, zero_one_loss, accuracy_score

DTclassifier = DecisionTreeClassifier(random_state=10)
DTclassifier.fit(train_x, train_Y)

pred_y = DTclassifier.predict(test_x)

results = confusion_matrix(test_Y, pred_y)
error = zero_one_loss(test_Y, pred_y)
accuracy_score = accuracy_score(test_Y, pred_y)

print("Decision Tree Classifier Results\n======================\nConfusion Matrix:\n{}\nError Value:{}\nAccuracy_Score:{}\n".format(results, error, accuracy_score))

Decision Tree Classifier Results
Confusion Matrix:
[[2506309      24       1   32724       1       1     239       1     286
        1     114       0  137528       0    1448       0]
 [  56389     882       0       0       0       0       0       0       0
        0       0       0       0       0       0       0]
 [    113       0       0       2       0       0       0       0       0
        0       0       0       4       0       5       0]
 [     29       0       0      25       0       0       0       0       0
        0       0       0       0       0       0       0]
 [ 137231       0       0       0       0       0       0       0       0
        0       0       0       0       0       0       0]
 [      0       0       0       0       0     274      41       0       0
        0       0       0       0       0       0       0]
 [  61508       0       0       0       0      91   13481       0       0
        0       0       0   40200       0       0       0]
 [   6728       0 

In [4]:
print('pred_y Labels by Count (Total: {})\n{}\n\n================\ntest_Y Labels by Count (Total: {})\n{}\n\n'.format(pd.Series(pred_y).sum(), pd.Series(pred_y).value_counts(), pd.Series(test_Y).sum(), pd.Series(test_Y).value_counts()))

pred_y Labels by Count (Total: 3238622)
0     2909897
12     183093
11      44812
3       32838
9       21324
8       18805
6       13764
14       1479
1         906
5         368
10        149
15         11
7           1
4           1
2           1
dtype: int64

test_Y Labels by Count (Total: 3736828)
0     2678677
4      137231
6      115280
8       92063
1       57271
11      38598
15      37256
12      32026
9       28104
7        8267
10       2155
5         315
2         124
3          54
14         15
13         13
Name: Label, dtype: int64




In [None]:
# Fitting Random Forest Classification to the Training set
from sklearn.ensemble import RandomForestClassifier
RFclassifier = RandomForestClassifier(n_estimators = 10, random_state = 10, n_jobs=-1, )
RFclassifier.fit(train_x, train_Y)

# Predicting the Test set results
y_pred = RFclassifier.predict(test_x)

results = confusion_matrix(test_Y, y_pred)
error = zero_one_loss(test_Y, y_pred)


print("Random Forest Decision Tree Classifier Results\n======================\nConfusion Matrix:\n{}\nError Value:{}\n".format(results, error))

Random Forest Decision Tree Classifier Results
Confusion Matrix:
[[2673351       0       0       0       0       0       0       0       0
        0       0       0    5326       0       0       0]
 [  29557   27714       0       0       0       0       0       0       0
        0       0       0       0       0       0       0]
 [     90       0      34       0       0       0       0       0       0
        0       0       0       0       0       0       0]
 [     30       0       0      24       0       0       0       0       0
        0       0       0       0       0       0       0]
 [ 104421       0       0       0   32810       0       0       0       0
        0       0       0       0       0       0       0]
 [      0       0       0       0       0     239      76       0       0
        0       0       0       0       0       0       0]
 [ 113641       0       0       0       0      48    1591       0       0
        0       0       0       0       0       0       0]
 [  

In [None]:
# save the model to disk
filename = 'finalized_model.sav'
joblib.dump(RFclassifier, filename)

['finalized_model.sav']

In [None]:
print('y_pred Labels by Count (Total: {})\n{}\n\n================\ntest_Y Labels by Count (Total: {})\n{}\n\n'.format(pd.Series(y_pred).sum(), pd.Series(y_pred).value_counts(), pd.Series(test_Y).sum(), pd.Series(test_Y).value_counts()))

y_pred Labels by Count (Total: 453929)
0     3143436
4       32810
1       27714
15      12461
12       5799
8        2249
6        1667
10        695
5         287
7         267
2          34
3          24
14          6
dtype: int64

test_Y Labels by Count (Total: 3736828)
0     2678677
4      137231
6      115280
8       92063
1       57271
11      38598
15      37256
12      32026
9       28104
7        8267
10       2155
5         315
2         124
3          54
14         15
13         13
Name: Label, dtype: int64




In [None]:
probability = RFclassifier.predict_proba(test_x)
RFresults_df = pd.DataFrame (data=probability,
                            columns=label_names)
RFresults_df['Attack Probability Total'] = RFresults_df.iloc[:, 1:].sum(axis=1)
RFresults_df.head(20)

Unnamed: 0,Benign,Bot,Brute Force-Web,Brute Force-XSS,DDOS attack-HOIC,DDOS attack-LOIC-UDP,DDoS attacks-LOIC-HTTP,DoS attacks-GoldenEye,DoS attacks-Hulk,DoS attacks-SlowHTTPTest,DoS attacks-Slowloris,FTP-BruteForce,Infilteration,Label,SQL Injection,SSH-Bruteforce,Attack Probability Total
0,0.9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.1,0.0,0.0,0.0,0.1
1,0.7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.1,0.0,0.0,0.0,0.1,0.1,0.0,0.0,0.3
2,0.8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2,0.0,0.0,0.0,0.2
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.860583,3.1e-05,9.22925e-06,1e-06,0.0,0.0,0.0,0.0,0.0,0.0,1.1e-05,0.0,0.139361,0.0,3.981482e-06,0.0,0.139417
5,0.99536,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00464,0.0,0.0,0.0,0.00464
6,0.99037,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00963,0.0,0.0,0.0,0.00963
7,0.994483,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.005517,0.0,0.0,0.0,0.005517
8,0.7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.3,0.0,0.0,0.0,0.3
9,0.833333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.166667,0.0,0.0,0.0,0.166667


In [None]:
RFresults_df.loc[35:40]

Unnamed: 0,Benign,Bot,Brute Force-Web,Brute Force-XSS,DDOS attack-HOIC,DDOS attack-LOIC-UDP,DDoS attacks-LOIC-HTTP,DoS attacks-GoldenEye,DoS attacks-Hulk,DoS attacks-SlowHTTPTest,DoS attacks-Slowloris,FTP-BruteForce,Infilteration,Label,SQL Injection,SSH-Bruteforce,Attack Probability Total
35,0.604953,0.0,2e-06,0.0,0.194909,0.0,0.000118,0.0,0.0,0.0,0.0,0.0,0.200018,0.0,2.439798e-07,0.0,0.395047
36,0.885911,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.114089,0.0,0.0,0.0,0.114089
37,0.645528,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.116504,0.1,0.083399,0.054549,0.0,0.0,2e-05,0.354472
38,0.999731,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000269,0.0,0.0,0.0,0.000269
39,0.9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.1,0.0,0.0,0.1
40,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
