# Importing the Data

**Goal**: Classify attacks assuming a multiclass classification system has alerted an attack.

In [0]:
import matplotlib.pyplot as plt
%matplotlib inline 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns 

from sklearn.metrics import classification_report, accuracy_score

In [0]:
from google.colab import drive

drive.mount('/content/gdrive')
root_path = 'gdrive/My Drive/Research/Benchmarks/'  #change dir to your project folder

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/gdrive


In [0]:
#cleaned data without non-attack values
X = pd.read_csv(root_path+'Lung-Cancer.csv',  usecols=[i for i in range(1,57)])
Y = pd.read_csv(root_path+'Lung-Cancer.csv',  usecols=[0])

In [0]:
#preprocessing - replace missing values with the most frequent values in the columns
from sklearn.impute import SimpleImputer

imp = SimpleImputer(missing_values='?', strategy='most_frequent')
X = imp.fit_transform(X)
X = pd.DataFrame(X)

In [0]:
# Data Balancing
from collections import Counter
print('Original dataset shape %s' % Counter(Y))
from imblearn.over_sampling import SMOTE
sm = SMOTE(random_state=0)
X, Y = sm.fit_resample(X, Y)
print('Resampled dataset shape %s' % Counter(Y))
X, Y = pd.DataFrame(X), pd.DataFrame(Y)

Original dataset shape Counter({'1': 1})
Resampled dataset shape Counter({1: 13, 2: 13, 3: 13})


  y = column_or_1d(y, warn=True)


# Feature Engineering

In [0]:
# RF RFE includes cross validation to choose the best number of features.

from sklearn.ensemble import RandomForestClassifier as rf
from sklearn.feature_selection import RFECV
rfc = rf(n_jobs=-1,random_state=0)
rfe = RFECV(rfc)
X1,Y1 = X.copy(), Y.copy()
X1 = rfe.fit_transform(X1, Y1.values.ravel())
X = pd.DataFrame(X1)

  y = column_or_1d(y, warn=True)


# Performance Analysis

In [0]:
# must have X and Y

from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier as rf

from sklearn.preprocessing import LabelBinarizer
from sklearn.metrics import roc_auc_score, f1_score, precision_score, recall_score, accuracy_score

# handle multiclass classification
def multiclass_roc_auc_score(y_test, y_pred, average="weighted"):
  lb = LabelBinarizer()
  lb.fit(y_test)
  y_test = lb.transform(y_test)
  y_pred = lb.transform(y_pred)
  return roc_auc_score(y_test, y_pred, average=average)

# define metrics
accuracy = []
precision = []
recall =[]
f1=[]
roc_auc=[]

# begin cross-validation
kf = StratifiedKFold(n_splits=5, random_state=0)
for train, test in kf.split(X,Y):
  # classifyer
  r = rf(random_state=0, n_jobs=-1) 
  # train test split
  X1 = X.iloc[train]
  X2 = X.iloc[test]
  Y1 = Y.iloc[train]
  Y2 = Y.iloc[test]
  # fit
  r.fit(X1,Y1)
  # predict
  Y_pred = r.predict(X2)
  Y_pred = pd.DataFrame(Y_pred)

  # metrics
  accuracy.append(accuracy_score(Y2, Y_pred))
  f1.append(f1_score(Y2, Y_pred, average="weighted"))
  precision.append(precision_score(Y2, Y_pred, average="weighted"))
  recall.append(recall_score(Y2, Y_pred, average="weighted"))
  roc_auc.append(multiclass_roc_auc_score(Y2, Y_pred, average="weighted"))

# print averages
print("Average Accuracy: ",np.mean(accuracy))
print("Average Precision: ",np.mean(precision))
print("Average Recall: ",np.mean(recall))
print("Average F1: ",np.mean(f1))
print("Average ROC_AUC: ", np.mean(roc_auc))
print("Features Selected ", X.shape[1])



Average Accuracy:  0.7666666666666666
Average Precision:  0.7733333333333333
Average Recall:  0.7666666666666666
Average F1:  0.7442857142857143
Average ROC_AUC:  0.825
Features Selected  9
