# Importing the Data

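**Goal**: Classify attacks, assuming a multiclass classification system has already raised an attack alert. (Note: this notebook loads the `Lung-Cancer.csv` benchmark, so the "attack" wording may have been carried over from another benchmark notebook; confirm.)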

In [0]:
import matplotlib.pyplot as plt
%matplotlib inline 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns 

from sklearn.metrics import classification_report, accuracy_score

In [2]:
from google.colab import drive

# Mount Google Drive inside the Colab VM so the benchmark files are readable.
MOUNT_POINT = '/content/gdrive'
drive.mount(MOUNT_POINT)

# Build the data folder from the absolute mount point so that loading does not
# depend on the kernel's current working directory. The trailing '/' matters:
# later cells build file paths by plain string concatenation.
root_path = MOUNT_POINT + '/My Drive/Research/Benchmarks/'  #change dir to your project folder

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [0]:
# Load the benchmark CSV once and split it by column position:
# column 0 becomes the target frame Y, columns 1..56 become the feature frame X.
# (Original note: "cleaned data without non-attack values" - NOTE(review): that
# wording may come from a different dataset; confirm what cleaning was applied.)
# NOTE(review): read_csv treats the first line of the file as the header by
# default; if Lung-Cancer.csv has no header row, pass header=None - confirm.
N_COLUMNS = 57  # 1 target column + 56 feature columns
data = pd.read_csv(root_path+'Lung-Cancer.csv', usecols=list(range(N_COLUMNS)))

# Y stays a one-column DataFrame because later cells address it as Y[Y.columns[0]].
Y = data.iloc[:, [0]]
X = data.iloc[:, 1:N_COLUMNS]

In [0]:
# Preprocessing: entries equal to '?' are treated as missing and replaced by
# the most frequent value of their column.
from sklearn.impute import SimpleImputer

imp = SimpleImputer(strategy='most_frequent', missing_values='?')

# Wrapping the imputed values in a new DataFrame gives X default integer
# column labels (0, 1, 2, ...) instead of the labels read from the CSV.
X = pd.DataFrame(imp.fit_transform(X))

# Feature Engineering

In [5]:
# cluster and score
from sklearn.cluster import MiniBatchKMeans
from sklearn.metrics.cluster import completeness_score

# Each feature is scored on its own: the single column is clustered with
# mini-batch k-means and the clustering is compared with the class labels
# through the completeness score. `score[k]` belongs to column position k of X.
class_labels = Y[Y.columns[0]]

# NOTE(review): the cluster count is one fewer than the number of distinct
# labels - confirm this is intentional for this dataset.
n_clusters = len(class_labels.unique()) - 1

score = []
for position in range(len(X.columns)):
    single_feature = X.iloc[:, [position]].values
    kmeans = MiniBatchKMeans(n_clusters=n_clusters, random_state=0)
    assignments = kmeans.fit_predict(single_feature)
    score.append(completeness_score(class_labels.values, assignments))
  

# Rank the features and sort

def sortSecond(val):
    """Return the second item of a pair; used as the sort key (the score)."""
    return val[1]


# Pair every column position of X with its score. The former np.asarray(...)
# calls were removed: their results were discarded, so they had no effect.
s1 = list(range(len(X.columns)))  # column positions 0..n-1
s2 = score                        # per-feature scores, same order as s1

# li holds (column position, score) pairs, best score first. list.sort is
# stable, so features with equal scores keep their original column order.
li = list(zip(s1, s2))
li.sort(key=sortSecond, reverse=True)



    
# Create a copy of X dataframe with columns sorted by score

# Column positions of X in ranked order (li holds (position, score) pairs),
# and the matching column labels.
ranked_positions = [position for position, _ in li]
titles = [X.columns[position] for position in ranked_positions]

# Select all ranked columns in one positional step instead of inserting them
# one by one through label lookup; positional selection also stays correct if
# two columns ever share a label. .copy() makes X1 independent of X.
X1 = X.iloc[:, ranked_positions].copy()
  
  

# imports
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier as rf
from sklearn.metrics import accuracy_score

# Recursive Feature Elimination: starting from all ranked features, drop the
# lowest-ranked column one at a time and record the test accuracy of a random
# forest at each step. accuracy[k] is the score with the k lowest-ranked
# columns removed, so the feature count runs from all columns down to 2
# (a single-feature model is never evaluated, as in the original loop bound).
#
# NOTE(review): the feature count is later chosen by maximising accuracy on
# this test split, and the final model is evaluated on the same split
# (same test_size and random_state), so the reported test metrics are
# optimistically biased - consider a separate validation split.

accuracy = []
X2 = X1.copy()  # full ranked copy used afterwards to build the selected X

n_features = len(X1.columns)

# The row split depends only on the number of rows and random_state, so it is
# computed once and the columns are sliced per step. X1 is no longer shrunk in
# place, which keeps this code safe to re-execute.
x_train_all, x_test_all, y_train, y_test = train_test_split(X1, Y, test_size=0.2, random_state=11)

# Y is a one-column DataFrame; flatten the training target to shape
# (n_samples,) so fit() does not emit a DataConversionWarning.
y_train_1d = np.ravel(y_train)

for n_kept in range(n_features, 1, -1):
    x_train = x_train_all.iloc[:, :n_kept]
    x_test = x_test_all.iloc[:, :n_kept]
    clf = rf(random_state=0, n_jobs=-1)
    clf.fit(x_train, y_train_1d)
    y_pred = clf.predict(x_test)
    accuracy.append(accuracy_score(y_test, y_pred))
  

  
# best score calculation
# accuracy[k] was measured with the k lowest-ranked columns removed, so the
# position of the first maximum is the number of trailing columns to drop.
# On ties, list.index picks the earliest entry, i.e. the larger feature set.
index = accuracy.index(max(accuracy))

# Keep the leading (best-ranked) columns of the full ranked copy. The width is
# taken from X2 rather than from X, because X is overwritten here and using it
# would shrink the selection further every time this statement is re-run.
X = X2.iloc[:, 0:len(X2.columns)-index]



# Performance Analysis

In [0]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows of the selected feature set for testing; the fixed
# random_state makes the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    random_state=11,
    test_size=0.2,
)

In [7]:
from sklearn.ensemble import RandomForestClassifier as rf

# Train the final random forest on the selected features.
clf = rf(random_state=0, n_jobs=-1)

# y_train is a one-column DataFrame; flatten it to shape (n_samples,) so that
# fit() receives a 1-D target and does not emit a DataConversionWarning.
clf.fit(x_train, np.ravel(y_train))

  This is separate from the ipykernel package so we can avoid doing imports until


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=-1,
                       oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [8]:

# List the selected features from most to least important according to the
# fitted forest, one (column label, importance) pair per line, followed by the
# number of features listed.
ranked_importances = sorted(
    zip(X.columns, clf.feature_importances_),
    key=lambda pair: pair[1],
    reverse=True,
)

for pair in ranked_importances:
    print(pair)

# The count line is only printed when at least one feature was listed.
if ranked_importances:
    print("features", len(ranked_importances))

(19, 0.17723848884229967)
(32, 0.09129573002887643)
(52, 0.07793108732871752)
(2, 0.07611612937583602)
(1, 0.07400528489754174)
(13, 0.06788089541488282)
(5, 0.0588517201459393)
(18, 0.053545156225784506)
(16, 0.04846559661465729)
(26, 0.037743654661687435)
(22, 0.029270236551321917)
(38, 0.028051507004386585)
(36, 0.02705646511599468)
(51, 0.02287023865971236)
(48, 0.02015753669889009)
(39, 0.01960662232913542)
(55, 0.017276824909781652)
(47, 0.015789473684210527)
(14, 0.012321976111449796)
(50, 0.01111969111969112)
(8, 0.00994764397905759)
(45, 0.009612138454243715)
(9, 0.008124488124488124)
(53, 0.002965250965250966)
(44, 0.002694386694386696)
(6, 6.177606177606442e-05)
(42, 0.0)
(7, 0.0)
(46, 0.0)
(0, 0.0)
(49, 0.0)
features 31


In [9]:
# Mean accuracy of the fitted classifier on the training split.
train_accuracy = clf.score(x_train, y_train)
print(train_accuracy, "train")

0.9583333333333334 train


In [10]:
# Predict the held-out rows and print per-class precision / recall / F1,
# formatted with four decimals.
y_pred = clf.predict(x_test)
report = classification_report(y_test, y_pred, digits=4)

print("Classification\n")
print(report)

Classification

              precision    recall  f1-score   support

           1     1.0000    1.0000    1.0000         2
           2     1.0000    0.7500    0.8571         4
           3     0.5000    1.0000    0.6667         1

    accuracy                         0.8571         7
   macro avg     0.8333    0.9167    0.8413         7
weighted avg     0.9286    0.8571    0.8707         7

