# Importing the Data

**Goal**: Classify attacks assuming a multiclass classification system has alerted an attack.

In [0]:



import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
 

from sklearn.metrics import classification_report, accuracy_score

In [2]:
from google.colab import drive

drive.mount('/content/gdrive')
root_path = 'gdrive/My Drive/Research/'  #change dir to your project folder

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [0]:
#cleaned data without non-attack values
X = pd.read_csv(root_path+'Data/Features.csv', index_col=0)
Y = pd.read_csv(root_path+'Data/Target.csv', index_col=0)

# Mini-Experiments

In [4]:
Y.head()

Unnamed: 0,attack_cat
0,Exploits
1,Exploits
2,Reconnaissance
3,Exploits
4,Exploits


In [5]:
len(Y.attack_cat.unique())

9

In [6]:
X[X.columns[1]].values

array([-0.45662979,  1.81115277,  1.81115277, ...,  4.07893533,
        4.07893533,  4.07893533])

In [7]:
X.iloc[:, [1]].values

array([[-0.45662979],
       [ 1.81115277],
       [ 1.81115277],
       ...,
       [ 4.07893533],
       [ 4.07893533],
       [ 4.07893533]])

In [8]:
print(X.shape[1])
print(len(X.columns))
print(X.columns[1])

43
43
state


In [9]:
Y.iloc[:, [0]].values

array([['Exploits'],
       ['Exploits'],
       ['Reconnaissance'],
       ...,
       ['DoS'],
       ['DoS'],
       ['Exploits']], dtype=object)

# Feature Engineering

In [0]:
%%capture warnings1
# cluster and score
from sklearn.cluster import KMeans
from sklearn.metrics.cluster import homogeneity_score as hs

score = []
for i in range(len(X.columns)): # loop number of features
  K = KMeans(n_clusters=len(Y[Y.columns[0]].unique()), random_state=0)
  pred = K.fit_predict(X.iloc[:, [i]].values)
  s = hs(Y[Y.columns[0]].values,pred)
  score.append(s)
  


In [11]:
# Rank the features and sort

s2 = score
np.asarray(s2)

s1 = []
for i in range(len(X.columns)):
  s1.append(i)
  
np.asarray(s1)

li = list(zip(s1, s2))

def sortSecond(val): 
    return val[1] 
  
li.sort(key = sortSecond, reverse=True) 


print("Index of Feature , Homogeneity Score\n")
for i in range(len(X.columns)):
    print(li[i])


Index of Feature , Homogeneity Score

(9, 0.5207695153723191)
(41, 0.4142744905105768)
(36, 0.41036338781504883)
(37, 0.4072794076716928)
(40, 0.39718819516257775)
(42, 0.3883290516706941)
(38, 0.37111109211908616)
(39, 0.3120358646245406)
(18, 0.29787944624575063)
(10, 0.2361394483802898)
(19, 0.22955413365459518)
(32, 0.2000892968235508)
(1, 0.1998217658223299)
(28, 0.1950739634786004)
(6, 0.19401906633262922)
(14, 0.18929063032108048)
(15, 0.18927970359288043)
(30, 0.18854984970054992)
(17, 0.17539604737631084)
(16, 0.17449540882453976)
(29, 0.17183249944109008)
(2, 0.1583913539321163)
(0, 0.12225474910637356)
(27, 0.10973537879125332)
(5, 0.0986455027153642)
(22, 0.09076964544739254)
(33, 0.048443291198256175)
(20, 0.046190906656726864)
(23, 0.04142751524576079)
(24, 0.04013969067939793)
(25, 0.04012608127444784)
(11, 0.031092726479068546)
(12, 0.02753047090082387)
(26, 0.012965173965768295)
(13, 0.012207367989602026)
(8, 0.009434214074816706)
(34, 0.009117026092838924)
(35, 0.0091

In [0]:
# Create a copy of X dataframe with columns sorted by score

titles = []

for i in range(len(X.columns)):
  p = X.columns[li[i][0]]
  titles.append(p)


X1 = pd.DataFrame(columns=titles)

for i in range(len(X.columns)):
  X1[X1.columns[i]] = X[X.columns[li[i][0]]]




In [0]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier as rf
from sklearn.metrics import accuracy_score

In [14]:
# RF w/out recursive Feature Elimination
x_train, x_test, y_train, y_test = train_test_split(X1, Y, random_state=11)
clf = rf(random_state=0, n_jobs=-1)
clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)
print(classification_report(y_test,y_pred, digits=4))

  This is separate from the ipykernel package so we can avoid doing imports until


                precision    recall  f1-score   support

      Analysis     0.6786    0.1842    0.2897       619
      Backdoor     0.6282    0.0864    0.1519       567
           DoS     0.3142    0.2707    0.2908      4075
      Exploits     0.6362    0.8067    0.7114     11236
       Fuzzers     0.9013    0.8673    0.8840      6127
       Generic     0.9969    0.9862    0.9915     53886
Reconnaissance     0.9252    0.7492    0.8279      3385
     Shellcode     0.6712    0.6314    0.6507       388
         Worms     0.6667    0.1579    0.2553        38

      accuracy                         0.8911     80321
     macro avg     0.7132    0.5267    0.5615     80321
  weighted avg     0.8947    0.8911    0.8884     80321



In [0]:
%%capture warnings
# Recursive Feature Elemination from # of features to 0 and keep the accuracy score of each

accuracy = []
X2 = X1.copy()

# for i in range(len(X1.columns)-1,-1,-1):
for i in range(len(X1.columns)-1): 
  x_train, x_test, y_train, y_test = train_test_split(X1, Y, test_size=0.2, random_state=11)
  X1.drop(X1.columns[len(X1.columns)-1], axis=1, inplace=True)
  clf = rf(random_state=0, n_jobs=-1)
  clf.fit(x_train, y_train)
  y_pred = clf.predict(x_test)
  accuracy.append(accuracy_score(y_test, y_pred)) 

In [16]:
index = accuracy.index(max(accuracy))
# for i in accuracy:
#   print(i)
  

# print(accuracy)

print(max(accuracy), "max accuracy")

print(index, "index")


X3 = X2.iloc[:,0:len(X.columns)-index]

0.8915760150645066 max accuracy
2 index


# Performance Analysis

In [0]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(X3, Y, test_size=0.2, random_state=11)

In [18]:
from sklearn.ensemble import RandomForestClassifier as rf
clf = rf(random_state=0, n_jobs=-1)
clf.fit(x_train, y_train)

  This is separate from the ipykernel package so we can avoid doing imports until


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=-1,
                       oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [25]:
# li = list(zip(X.columns, clf.feature_importances_))

# def sortSecond(val): 
#     return val[1] 
  
# li.sort(key = sortSecond, reverse=True) 

# for i in range(len(.columns)):
#     print(x[i], i)

clf.feature_importances_

array([2.03675159e-01, 4.16681788e-03, 6.47812366e-02, 2.59724883e-02,
       7.17125795e-02, 1.53060187e-02, 5.23185101e-02, 8.18104247e-02,
       9.83033222e-02, 1.51814140e-02, 2.04643464e-02, 4.92809642e-04,
       3.04782211e-03, 5.33874625e-03, 1.97141050e-03, 3.70779076e-03,
       1.56066681e-04, 4.83686077e-03, 1.00897807e-02, 4.46436924e-03,
       7.34774692e-03, 8.98146792e-03, 7.84842804e-02, 6.03297423e-03,
       2.17552436e-02, 1.35595071e-02, 2.50902740e-03, 2.64854141e-03,
       5.39558673e-03, 1.24060882e-02, 1.14237024e-02, 1.53146404e-02,
       2.04945905e-02, 1.04619200e-02, 7.49547585e-03, 6.98262134e-03,
       5.29239136e-05, 4.25944759e-05, 1.67926414e-02, 7.89069446e-03,
       5.61297570e-02])

In [26]:
clf.score(x_train, y_train) 

0.9283496611237774

In [27]:
print("Accuracy")
clf.score(x_test, y_test) 

Accuracy


0.8915760150645066

In [28]:
y_pred = clf.predict(x_test)

print("Classification\n")
print(classification_report(y_test,y_pred, digits=4))

Classification

                precision    recall  f1-score   support

      Analysis     0.7479    0.1798    0.2899       495
      Backdoor     0.5244    0.0917    0.1561       469
           DoS     0.3153    0.2665    0.2888      3235
      Exploits     0.6346    0.8123    0.7125      8944
       Fuzzers     0.8992    0.8690    0.8838      4846
       Generic     0.9969    0.9861    0.9915     43219
Reconnaissance     0.9310    0.7407    0.8250      2696
     Shellcode     0.6817    0.6137    0.6459       321
         Worms     1.0000    0.2188    0.3590        32

      accuracy                         0.8916     64257
     macro avg     0.7479    0.5309    0.5725     64257
  weighted avg     0.8951    0.8916    0.8886     64257

