# Importing the Data

**Goal**: Classify the type of attack, assuming a multiclass classification system has already raised an attack alert.

In [0]:
import matplotlib.pyplot as plt
%matplotlib inline 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns 

from sklearn.metrics import classification_report, accuracy_score

In [0]:
# Mount Google Drive so the project files are reachable from this runtime.
from google.colab import drive

mount_point = '/content/gdrive'
drive.mount(mount_point)

# Project folder on the mounted drive -- change this to your own project folder.
# NOTE(review): this is a relative path, so it resolves against the current
# working directory; confirm that directory is the parent of the mount point.
root_path = 'gdrive/My Drive/Research/'

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [0]:
# Cleaned data without non-attack values.
# Both files are read with their first CSV column as the row index.
features_csv = root_path+'Data/Features.csv'
target_csv = root_path+'Data/Target.csv'

X = pd.read_csv(features_csv, index_col=0)
Y = pd.read_csv(target_csv, index_col=0)

# Feature Engineering

In [0]:
# Score every feature on its own: cluster that single column with Birch and
# compare the resulting clusters with the true labels using the
# Fowlkes-Mallows score. `score[i]` is the score of the i-th column of X.
# (To silence warnings, put `%%capture warnings1` on the first line of the cell.)
from sklearn.cluster import Birch
from sklearn.metrics.cluster import fowlkes_mallows_score

# Loop-invariant values, computed once instead of once per feature.
# NOTE(review): the "- 1" requests one cluster fewer than the number of distinct
# attack_cat values. The data is described as already stripped of non-attack
# rows, so confirm this offset is still intended.
n_clusters = len(Y.attack_cat.unique()) - 1
true_labels = Y[Y.columns[0]].values

score = []
for i in range(len(X.columns)):  # one pass per feature column
  birch = Birch(n_clusters=n_clusters)
  pred = birch.fit_predict(X.iloc[:, [i]].values)  # [[i]] keeps the input 2-D
  score.append(fowlkes_mallows_score(true_labels, pred))
  




In [0]:
# Rank the features: pair each column position of X with its score and sort
# from the highest score to the lowest. `li` is a list of
# (column_position, score) tuples; ties keep their original column order
# because the sort is stable.
li = sorted(
    zip(range(len(X.columns)), score),
    key=lambda pair: pair[1],
    reverse=True,
)

print("Index of Feature , fowlkes_mallows_score\n")
for pair in li:
    print(pair)


Index of Feature , fowlkes_mallows_score

(9, 0.8749375133406012)
(32, 0.7848758646734929)
(1, 0.7848197753371678)
(6, 0.7803805243934263)
(14, 0.7800959020266208)
(15, 0.7800908073922788)
(30, 0.7734103792242389)
(17, 0.7674346483100938)
(16, 0.7614765747421549)
(0, 0.7355938628152945)
(28, 0.7316695228277279)
(29, 0.7298533147503066)
(5, 0.7190812053976248)
(19, 0.7157412249506845)
(18, 0.7115411942039832)
(2, 0.6985628990468433)
(34, 0.6942741606030993)
(35, 0.6942741606030993)
(22, 0.6937957072639206)
(11, 0.6937166552182957)
(27, 0.693705183475551)
(33, 0.6932420775295349)
(26, 0.6929806794515802)
(23, 0.6926566377166409)
(12, 0.6925181802679139)
(7, 0.6925062831048096)
(3, 0.6925045864743575)
(13, 0.692457283402267)
(20, 0.6923633863277526)
(31, 0.6923536279882361)
(8, 0.6923501887079928)
(4, 0.6923501814083546)
(21, 0.6922324749061872)
(24, 0.6843422304916076)
(25, 0.6843422304916076)
(10, 0.6617785136584845)
(39, 0.48990609404133845)
(42, 0.44227169417228707)
(37, 0.43998264349

In [0]:
# Create a copy of the X dataframe with its columns ordered by score
# (best-scoring feature first), following the ranking stored in `li`.

# Column positions in ranked order, and the matching column labels.
ranked_positions = [position for position, _ in li]
titles = [X.columns[position] for position in ranked_positions]

# Select all columns in one positional pass instead of filling an empty frame
# column by column; .copy() makes X1 independent of X.
X1 = X.iloc[:, ranked_positions].copy()




In [0]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier as rf
from sklearn.metrics import accuracy_score

In [0]:
# Baseline: random forest trained on X1 without recursive feature elimination.
# NOTE(review): no test_size is passed here, while the later cells pass
# test_size=0.2 -- confirm the two evaluations are meant to use different splits.
x_train, x_test, y_train, y_test = train_test_split(X1, Y, random_state=11)
clf = rf(random_state=0, n_jobs=-1)
# Flatten the target to a 1-D label array so fit() is not handed a column
# vector (assumes Y holds a single label column).
clf.fit(x_train, np.ravel(y_train))
y_pred = clf.predict(x_test)
print(classification_report(y_test,y_pred, digits=4))

  This is separate from the ipykernel package so we can avoid doing imports until


                precision    recall  f1-score   support

      Analysis     0.6726    0.1826    0.2872       619
      Backdoor     0.6203    0.0864    0.1517       567
           DoS     0.3189    0.2756    0.2957      4075
      Exploits     0.6335    0.8063    0.7095     11236
       Fuzzers     0.8961    0.8645    0.8800      6127
       Generic     0.9972    0.9855    0.9913     53886
Reconnaissance     0.9197    0.7374    0.8185      3385
     Shellcode     0.6927    0.6392    0.6649       388
         Worms     0.8824    0.3947    0.5455        38

      accuracy                         0.8902     80321
     macro avg     0.7370    0.5525    0.5938     80321
  weighted avg     0.8942    0.8902    0.8877     80321



In [0]:
# Recursive feature elimination: start with every column of the ranked frame,
# remove the last remaining column each round, and record the test accuracy.
# accuracy[k] is the score obtained with the first len(X2.columns) - k columns,
# so the smallest subset evaluated has two columns.
# (To silence warnings, put `%%capture warnings` on the first line of the cell.)

accuracy = []
X2 = X1.copy()  # snapshot of the ranked frame; the next cell slices it
n_features = len(X2.columns)

for n_dropped in range(n_features - 1):
  # Slice the snapshot instead of dropping columns from X1 in place, so X1
  # stays intact and this cell gives the same result when it is re-run.
  kept = X2.iloc[:, :n_features - n_dropped]
  x_train, x_test, y_train, y_test = train_test_split(kept, Y, test_size=0.2, random_state=11)
  clf = rf(random_state=0, n_jobs=-1)
  # Flatten the target to 1-D so fit() is not handed a column vector
  # (assumes Y holds a single label column).
  clf.fit(x_train, np.ravel(y_train))
  y_pred = clf.predict(x_test)
  accuracy.append(accuracy_score(y_test, y_pred))

  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.

In [0]:
# Find the elimination round with the highest accuracy (the first one on ties)
# and report it.
best_accuracy = max(accuracy)
index = accuracy.index(best_accuracy)

print(best_accuracy, "max accuracy")

print(index, "index")

# Keep the leading columns of the ranked frame: `index` columns fewer than the
# total number of columns in X.
n_kept = len(X.columns) - index
X3 = X2.iloc[:, 0:n_kept]

0.8918561401870614 max accuracy
11 index


# Performance Analysis

In [0]:
from sklearn.model_selection import train_test_split

# Split the selected features and the target, holding out 20% for testing;
# the fixed random_state keeps the split reproducible.
split = train_test_split(X3, Y, test_size=0.2, random_state=11)
x_train, x_test, y_train, y_test = split

In [0]:
from sklearn.ensemble import RandomForestClassifier as rf

# Train the final random forest on the selected feature subset.
clf = rf(random_state=0, n_jobs=-1)
# Flatten the target to a 1-D label array so fit() is not handed a column
# vector (assumes y_train holds a single label column).
clf.fit(x_train, np.ravel(y_train))

  This is separate from the ipykernel package so we can avoid doing imports until


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=-1,
                       oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [0]:
# Pair each selected feature name with its importance in the fitted forest and
# list the pairs from most to least important. An inline key is used so no
# helper function has to be (re)defined in this cell.
x = sorted(
    zip(X3.columns, clf.feature_importances_),
    key=lambda pair: pair[1],
    reverse=True,
)

for pair in x:
    print(pair)

('service', 0.1765598402579533)
('smeansz', 0.16628796144337787)
('sbytes', 0.13435838538288009)
('proto', 0.12213605560253778)
('dur', 0.10226708730122687)
('dbytes', 0.05873399243813171)
('Spkts', 0.04098363314077131)
('dttl', 0.028190288839289084)
('dmeansz', 0.01998284070421096)
('sttl', 0.01781309207622855)
('Sjit', 0.015610510426119056)
('Sintpkt', 0.013178305818837787)
('Dload', 0.011142449590782633)
('sloss', 0.010199852803191068)
('stcpb', 0.009482131374493713)
('Djit', 0.009222174889627621)
('cprtt', 0.008891099437190506)
('synack', 0.00886679858116798)
('ackdat', 0.008377823160348566)
('Dpkts', 0.008293633061989816)
('dloss', 0.00824919959846256)
('Dintpkt', 0.007330237187846854)
('dtcpb', 0.006326495903729833)
('ct_flw_http_mthd', 0.003414872439399072)
('trans_depth', 0.0015861099609539586)
('state', 0.0010106286549626544)
('swin', 0.0006541787860206982)
('is_ftp_login', 0.00030221479650586336)
('ct_state_ttl', 0.0002524809863647272)
('dwin', 0.00024192038138239614)
('ct_ft

In [0]:
# Score of the fitted classifier on the training split; shown as the cell output.
clf.score(x_train, y_train) 

0.9251165251764413

In [0]:
# Score of the fitted classifier on the held-out test split; shown as the cell output.
print("Accuracy")
clf.score(x_test, y_test) 

Accuracy


0.8918561401870614

In [0]:
# Predict the held-out split and print per-class precision, recall and F1
# with four decimal places.
y_pred = clf.predict(x_test)
report = classification_report(y_test, y_pred, digits=4)

print("Classification\n")
print(report)

Classification

                precision    recall  f1-score   support

      Analysis     0.6481    0.1414    0.2322       495
      Backdoor     0.6418    0.0917    0.1604       469
           DoS     0.3644    0.1765    0.2378      3235
      Exploits     0.6163    0.8714    0.7220      8944
       Fuzzers     0.8613    0.8316    0.8462      4846
       Generic     0.9964    0.9872    0.9918     43219
Reconnaissance     0.9313    0.7444    0.8275      2696
     Shellcode     0.5990    0.3769    0.4627       321
         Worms     0.5333    0.2500    0.3404        32

      accuracy                         0.8919     64257
     macro avg     0.6880    0.4968    0.5357     64257
  weighted avg     0.8913    0.8919    0.8835     64257

