# Importing the Data

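**Goal**: Classify attacks, assuming a multiclass classification system has already raised an attack alert. *Note: the data loaded below is the `Lung-Cancer.csv` benchmark file, so in this notebook the pipeline is run on that benchmark rather than on attack data.*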

In [0]:
import matplotlib.pyplot as plt
%matplotlib inline 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from sklearn.metrics import classification_report, accuracy_score

In [2]:
from google.colab import drive

# Mount Google Drive inside the Colab VM at /content/gdrive.
drive.mount('/content/gdrive')

# Absolute path under the mount point, so that loading data does not depend on
# the kernel's current working directory (a relative 'gdrive/...' path only
# resolves when the working directory is /content).
root_path = '/content/gdrive/My Drive/Research/Benchmarks/'  # change to your project folder

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [0]:
# Load the Lung-Cancer benchmark CSV once, then split it into the class label
# (column 0) and the 56 feature columns (columns 1-56).
# NOTE(review): read_csv treats the first row of the file as a header by
# default; if this CSV has no header row, pass header=None so the first
# sample is not consumed as column names - confirm against the file.
lung_df = pd.read_csv(root_path+'Lung-Cancer.csv', usecols=list(range(57)))
X = lung_df.iloc[:, 1:]    # features, still a DataFrame
Y = lung_df.iloc[:, [0]]   # label, kept as a one-column DataFrame

In [0]:
from sklearn.impute import SimpleImputer

# Preprocessing: missing entries are encoded as the string '?'. Replace each
# one with the most frequent value of its column.
imp = SimpleImputer(strategy='most_frequent', missing_values='?')

# The imputer's output is wrapped in a DataFrame again (default integer column labels).
X = pd.DataFrame(imp.fit_transform(X))

# Feature Selection (Boruta)

In [5]:
# Install the Boruta feature-selection package, which is imported in the next cell.
!pip install boruta



In [6]:
from boruta import BorutaPy as BP
from sklearn.ensemble import RandomForestClassifier

# Switch from pandas objects to plain arrays: a 2-D feature matrix and a
# flattened 1-D label vector.
X = X.values
Y = Y.values.ravel()

# Base estimator for Boruta: a shallow random forest that uses every core and
# balanced class weights.
rff = RandomForestClassifier(
    max_depth=5,
    class_weight='balanced',
    n_jobs=-1,
)

# Boruta wrapper around the forest; n_estimators='auto' lets Boruta pick the
# number of trees, and verbose=2 prints the per-iteration progress.
feat_selector = BP(
    rff,
    n_estimators='auto',
    random_state=1,
    verbose=2,
)

# Search for all relevant features.
# NOTE(review): the selector is fitted on the full dataset, before the
# train/test split performed later in the notebook, so the held-out rows
# influence which features are kept - consider fitting on the training
# split only.
feat_selector.fit(X, Y)

# Boolean mask of the selected features.
print(feat_selector.support_)

# Ranking assigned to every feature.
print(feat_selector.ranking_)

# Reduce X to the selected feature columns.
X = feat_selector.transform(X)

Iteration: 	1 / 100
Confirmed: 	0
Tentative: 	56
Rejected: 	0
Iteration: 	2 / 100
Confirmed: 	0
Tentative: 	56
Rejected: 	0
Iteration: 	3 / 100
Confirmed: 	0
Tentative: 	56
Rejected: 	0
Iteration: 	4 / 100
Confirmed: 	0
Tentative: 	56
Rejected: 	0
Iteration: 	5 / 100
Confirmed: 	0
Tentative: 	56
Rejected: 	0
Iteration: 	6 / 100
Confirmed: 	0
Tentative: 	56
Rejected: 	0
Iteration: 	7 / 100
Confirmed: 	0
Tentative: 	56
Rejected: 	0
Iteration: 	8 / 100
Confirmed: 	0
Tentative: 	11
Rejected: 	45
Iteration: 	9 / 100
Confirmed: 	4
Tentative: 	7
Rejected: 	45
Iteration: 	10 / 100
Confirmed: 	4
Tentative: 	7
Rejected: 	45
Iteration: 	11 / 100
Confirmed: 	4
Tentative: 	7
Rejected: 	45
Iteration: 	12 / 100
Confirmed: 	4
Tentative: 	5
Rejected: 	47
Iteration: 	13 / 100
Confirmed: 	4
Tentative: 	5
Rejected: 	47
Iteration: 	14 / 100
Confirmed: 	4
Tentative: 	5
Rejected: 	47
Iteration: 	15 / 100
Confirmed: 	4
Tentative: 	5
Rejected: 	47
Iteration: 	16 / 100
Confirmed: 	4
Tentative: 	5
Rejected: 	47


In [0]:
# Wrap the selected-feature array in a DataFrame again so that later cells can
# refer to X.columns.
X = pd.DataFrame(data=X)

# Performance Analysis

In [0]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for testing; the fixed random_state keeps the
# split the same on every run.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=11,
)

In [9]:
from sklearn.ensemble import RandomForestClassifier as rf

# Random forest with library-default hyperparameters, a fixed seed and all
# cores, fitted on the training split.
clf = rf(n_jobs=-1, random_state=0)

# Left as a bare last expression so the cell displays the fitted estimator.
clf.fit(x_train, y_train)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=-1,
                       oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [10]:

def sortSecond(val):
    """Sort key: return the second item of a (column, importance) pair."""
    return val[1]


# Pair each column label with its importance in the fitted forest, ordered
# from most to least important.
x = sorted(zip(X.columns, clf.feature_importances_), key=sortSecond, reverse=True)

# Print every pair, then the total number of features after the last one.
for position, pair in enumerate(x, start=1):
    print(pair)
    if position == len(X.columns):
        print("features", position)

(3, 0.41490908307243257)
(0, 0.23706592496700923)
(1, 0.17854278327181086)
(2, 0.16948220868874733)
features 4


In [11]:
# Score on the training split, i.e. the same rows the forest was fitted on.
train_accuracy = clf.score(x_train, y_train)
print(train_accuracy, "train")

0.9166666666666666 train


In [12]:
# Predict the held-out samples and build the per-class report
# (precision / recall / F1, four decimal places).
y_pred = clf.predict(x_test)
report = classification_report(y_test, y_pred, digits=4)

print("Classification\n")
print(report)

Classification

              precision    recall  f1-score   support

           1     1.0000    0.5000    0.6667         2
           2     0.7500    0.7500    0.7500         4
           3     0.5000    1.0000    0.6667         1

    accuracy                         0.7143         7
   macro avg     0.7500    0.7500    0.6944         7
weighted avg     0.7857    0.7143    0.7143         7

