# Importing the Data

**Goal**: Classify attacks assuming a multiclass classification system has alerted an attack.

In [0]:
import matplotlib.pyplot as plt
%matplotlib inline 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns 

from sklearn.metrics import classification_report, accuracy_score

In [0]:
from google.colab import drive

drive.mount('/content/gdrive')
root_path = 'gdrive/My Drive/Research/'  #change dir to your project folder

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [0]:
#cleaned data without non-attack values
X = pd.read_csv(root_path+'Data/Features.csv', index_col=0)
Y = pd.read_csv(root_path+'Data/Target.csv', index_col=0)

# Feature Engineering

In [0]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=11)

In [0]:
from sklearn.ensemble import RandomForestClassifier as rf
clf = rf(random_state=0, n_jobs=-1)
clf.fit(x_train, y_train)

  This is separate from the ipykernel package so we can avoid doing imports until


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=-1,
                       oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [0]:
x = list(zip(X.columns, clf.feature_importances_))

def sortSecond(val): 
    return val[1] 
  
x.sort(key = sortSecond, reverse=True) 

for i in range(len(X.columns)):
    print(x[i])

('service', 0.1481878906660714)
('ct_srv_dst', 0.11396218558954178)
('sbytes', 0.10484271865442937)
('smeansz', 0.09830263634516959)
('proto', 0.06986076860310732)
('ct_src_ ltm', 0.057807285216489854)
('dbytes', 0.05617395375535115)
('Sload', 0.04453260476991735)
('ct_srv_src', 0.043562440271205496)
('dmeansz', 0.022809307921695825)
('ct_dst_src_ltm', 0.019538244733776203)
('ct_dst_sport_ltm', 0.016209165757063895)
('dttl', 0.014957181183652347)
('Stime', 0.014144569609000865)
('sttl', 0.014032426571740652)
('synack', 0.013128448804589446)
('dur', 0.01169588775380155)
('dloss', 0.010531467652292655)
('Ltime', 0.01017775183514636)
('ct_dst_ltm', 0.009678233710764332)
('Sjit', 0.008943068987135764)
('Djit', 0.008934598054916174)
('Sintpkt', 0.008842369199198927)
('cprtt', 0.008443010153880243)
('ackdat', 0.008396351610242193)
('Dload', 0.007703895931185758)
('Spkts', 0.006890119698709007)
('ct_src_dport_ltm', 0.006741063750287929)
('sloss', 0.006275593583805317)
('Dintpkt', 0.0057632448

# Performance Analysis

In [0]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(X[['service', 'ct_srv_dst', 'sbytes', 'smeansz', 'proto', 'ct_src_ ltm' , 'dbytes']], Y, test_size=0.2, random_state=11)

In [0]:
from sklearn.ensemble import RandomForestClassifier as rf
clf = rf(random_state=0, n_jobs=-1)
clf.fit(x_train, y_train)

  This is separate from the ipykernel package so we can avoid doing imports until


In [0]:
print("Accuracy")
clf.score(x_train, y_train) 

Accuracy


0.9217472162349334

In [0]:
y_pred = clf.predict(x_test)

print("Classification\n")
print(classification_report(y_test,y_pred, digits=4))

Classification

                precision    recall  f1-score   support

      Analysis     0.6693    0.1717    0.2733       495
      Backdoor     0.7031    0.0959    0.1689       469
           DoS     0.3269    0.2244    0.2661      3235
      Exploits     0.6294    0.8301    0.7159      8944
       Fuzzers     0.8976    0.8553    0.8760      4846
       Generic     0.9963    0.9879    0.9921     43219
Reconnaissance     0.9135    0.7556    0.8270      2696
     Shellcode     0.7143    0.7009    0.7075       321
         Worms     0.6765    0.7188    0.6970        32

      accuracy                         0.8934     64257
     macro avg     0.7252    0.5934    0.6138     64257
  weighted avg     0.8944    0.8934    0.8883     64257

