# Importing the Data

**Goal**: Classify attacks assuming a multiclass classification system has alerted an attack.

In [0]:
import matplotlib.pyplot as plt
%matplotlib inline 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns 

from sklearn.metrics import classification_report, accuracy_score

In [2]:
from google.colab import drive

drive.mount('/content/gdrive')
root_path = 'gdrive/My Drive/Research/Benchmarks/'  #change dir to your project folder

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [0]:
data_set = 'breast-cancer-wisconsin_data'  #@param {type: "string"}

#cleaned data without non-attack values
X = pd.read_csv(root_path+data_set+'.csv', header=None, usecols=[i for i in range(10)])
Y = pd.read_csv(root_path+data_set+'.csv', header=None, usecols=[10])

In [0]:
#preprocessing - replace missing values with the most frequent values in the columns
from sklearn.impute import SimpleImputer

imp = SimpleImputer(missing_values='?', strategy='most_frequent')
X = imp.fit_transform(X)
X = pd.DataFrame(X)

In [0]:
from sklearn.preprocessing import StandardScaler
X=X.astype('float')
scaler = StandardScaler()
X = scaler.fit_transform(X)
X = pd.DataFrame(X)

# Feature Engineering

In [6]:
# cluster and score
from sklearn.cluster import KMeans
from sklearn.metrics.cluster import homogeneity_score as hs


score = []
for i in range(len(X.columns)): # loop number of features
  K = KMeans(n_clusters=len(Y[Y.columns[0]].unique())-1, random_state=0)
  pred = K.fit_predict(X.iloc[:, [i]].values)
  s = hs(Y[Y.columns[0]].values,pred)
  score.append(s)
  

# Rank the features and sort

s2 = score
np.asarray(s2)

s1 = []
for i in range(len(X.columns)):
  s1.append(i)
  
np.asarray(s1)

li = list(zip(s1, s2))

def sortSecond(val): 
    return val[1] 
  
li.sort(key = sortSecond, reverse=True) 



    
# Create a copy of X dataframe with columns sorted by score

titles = []

for i in range(len(X.columns)):
  p = X.columns[li[i][0]]
  titles.append(p)


X1 = pd.DataFrame(columns=titles)

for i in range(len(X.columns)):
  X1[X1.columns[i]] = X[X.columns[li[i][0]]]
  
  

# imports
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier as rf
from sklearn.metrics import accuracy_score

# Recursive Feature Elemination from # of features to 0 and keep the accuracy score of each

accuracy = []
X2 = X1.copy()

# for i in range(len(X1.columns)-1,-1,-1):
for i in range(len(X1.columns)-1): 
  x_train, x_test, y_train, y_test = train_test_split(X1, Y, test_size=0.2, random_state=11)
  X1.drop(X1.columns[len(X1.columns)-1], axis=1, inplace=True)
  clf = rf(random_state=0, n_jobs=-1)
  clf.fit(x_train, y_train)
  y_pred = clf.predict(x_test)
  accuracy.append(accuracy_score(y_test, y_pred)) 
  

  
# best score calcuation
index = accuracy.index(max(accuracy))

X = X2.iloc[:,0:len(X.columns)-index]



# Performance Analysis

In [0]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=11)

In [8]:
from sklearn.ensemble import RandomForestClassifier as rf
clf = rf(random_state=0, n_jobs=-1)
clf.fit(x_train, y_train)

  This is separate from the ipykernel package so we can avoid doing imports until


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=-1,
                       oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [9]:

x = list(zip(X.columns, clf.feature_importances_))

def sortSecond(val): 
    return val[1] 
  
x.sort(key = sortSecond, reverse=True) 

for i in range(len(X.columns)):
    print(x[i])
    if (i==len(X.columns)-1):
      print("features",i+1)

(7, 0.24076392878800681)
(2, 0.20619147337252333)
(6, 0.1884823566081754)
(3, 0.17740021357862001)
(5, 0.08576891478326722)
(1, 0.037997467126167084)
(8, 0.025594233558914672)
(0, 0.020021078539182703)
(4, 0.008966447566199177)
(9, 0.008813886078943737)
features 10


In [10]:
print(clf.score(x_train, y_train), "train")

0.998211091234347 train


In [11]:
y_pred = clf.predict(x_test)

print("Classification\n")
print(classification_report(y_test,y_pred, digits=4))

Classification

              precision    recall  f1-score   support

           2     0.9759    0.9643    0.9701        84
           4     0.9474    0.9643    0.9558        56

    accuracy                         0.9643       140
   macro avg     0.9616    0.9643    0.9629       140
weighted avg     0.9645    0.9643    0.9643       140

