In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import mutual_info_regression
from sklearn.feature_selection import mutual_info_classif
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
import warnings
warnings.filterwarnings('ignore')

In [2]:
data = pd.read_csv('/Users/shanmugapriyan/Downloads/csv_preprocessed.csv')

In [3]:
X = data.drop(columns=["Attack Type"])

In [4]:
Y = data["Attack Type"]

In [5]:
def mutual_info(X, Y):
  mutual_info_arr = mutual_info_classif(X, Y)
  series_info = pd.Series(mutual_info_arr)
  series_info.index = X.columns
  series_top = series_info.sort_values(ascending=False)[:20]
  return series_top

In [6]:
# result = mutual_info(X, Y)
# result.plot.bar(figsize=(20,8))

In [7]:
# new_data = data[result.keys()]
# new_data.head()

In [8]:
# Hard coding now to skip the mutual information step
new_data = data[["SrcWin", "DstWin", "dHops", "dTtl", "TotBytes", "SrcBytes", "sMeanPktSz", "DstGap", "SrcGap", "dTos", "DstTCPBase", "SrcTCPBase", "TcpRtt", "Proto_udp", "DstBytes", "AckDat" , "dMeanPktSz", "Proto_tcp", "SynAck", "Load"]]
new_data.head()

Unnamed: 0,SrcWin,DstWin,dHops,dTtl,TotBytes,SrcBytes,sMeanPktSz,DstGap,SrcGap,dTos,DstTCPBase,SrcTCPBase,TcpRtt,Proto_udp,DstBytes,AckDat,dMeanPktSz,Proto_tcp,SynAck,Load
0,3.442533e-07,6.217791e-08,-0.973059,-0.00732,-0.117591,-0.099523,0.16466,-0.006334,-0.00383,-0.094739,-4.275809e-14,3.187841e-12,-0.267969,0.0,-0.066388,-0.388062,-0.287785,0.0,-0.044552,-0.008291
1,3.442533e-07,6.217791e-08,-0.973059,-0.00732,-0.117591,-0.099523,0.16466,-0.006334,-0.00383,-0.094739,-4.275809e-14,3.187841e-12,-0.267969,0.0,-0.066388,-0.388062,-0.287785,0.0,-0.044552,-0.008291
2,3.442533e-07,6.217791e-08,-0.973059,-0.00732,8.131316,9.877388,8.044846,-0.006334,-0.00383,-0.094739,-4.275809e-14,3.187841e-12,-0.267969,1.0,0.224388,-0.388062,0.972879,0.0,-0.044552,-0.007624
3,3.442533e-07,6.217791e-08,-0.973059,-0.00732,7.224216,8.73438,8.598644,-0.006334,-0.00383,-0.094739,-4.275809e-14,3.187841e-12,-0.267969,1.0,0.259298,-0.388062,0.922515,0.0,-0.044552,-0.007698
4,3.442533e-07,6.217791e-08,-0.973059,-0.00732,9.162383,11.165238,8.769752,-0.006334,-0.00383,-0.094739,-4.275809e-14,3.187841e-12,-0.267969,1.0,0.201274,-0.388062,0.81159,0.0,-0.044552,-0.007541


In [9]:
def concat_column_for_plot(pca_data, column_name):
  for_plot = pd.concat([pca_data, data[column_name]], axis = 1)
  return for_plot

In [10]:
new_data = concat_column_for_plot(new_data, "Attack Type")

In [11]:
X_train, X_test, y_train, y_test = train_test_split(new_data.loc[:, new_data.columns != 'Attack Type'], new_data['Attack Type'],
                                                    stratify=new_data['Attack Type'],
                                                    test_size=0.15)

X_train = pd.DataFrame(X_train, columns=new_data.columns.to_list()[:-1])
X_test = pd.DataFrame(X_test, columns=new_data.columns.to_list()[:-1])
y_train = pd.DataFrame(y_train, columns=['Attack Type'])
y_test = pd.DataFrame(y_test, columns=['Attack Type'])

print("Training dataset size:", X_train.shape)
print("Testing dataset size:", X_test.shape)
print("Training target size:", y_train.shape)
print("Testing target size:", y_test.shape)

Training dataset size: (1033323, 20)
Testing dataset size: (182352, 20)
Training target size: (1033323, 1)
Testing target size: (182352, 1)


In [12]:
def get_pca_df(scaled_data, no_of_components):
  pca = PCA(n_components=no_of_components)
  Principal_components=pca.fit_transform(scaled_data)
  column_names = ["PC "+str(i) for i in range(1, no_of_components+1)]
  pca_df = pd.DataFrame(data = Principal_components, columns = column_names)
  return pca_df, pca

In [13]:
from sklearn.model_selection import StratifiedKFold
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [14]:
accuracy_scores = []
precision_scores = []
recall_scores = []
f1_scores = []
Y = new_data['Attack Type']
X_train, pca = get_pca_df(X_train, 15)
for train_index, test_index in skf.split(X_train, y_train):
    X1_train, X1_test = X_train.iloc[train_index], X_train.iloc[test_index]
    y1_train, y1_test = y_train.iloc[train_index], y_train.iloc[test_index]
    # Initialize kNN classifier
    knn = KNeighborsClassifier(n_neighbors=19)
    # Train the classifier
    knn.fit(X1_train, y1_train)
    # Predict on the test set
    y_pred = knn.predict(X1_test)
    # Calculate evaluation metrics and store them
    accuracy_scores.append(accuracy_score(y1_test, y_pred))
    precision_scores.append(precision_score(y1_test, y_pred, average='weighted'))
    recall_scores.append(recall_score(y1_test, y_pred, average='weighted'))
    f1_scores.append(f1_score(y1_test, y_pred, average='weighted'))

In [15]:
x_test = pca.transform(X_test)


In [16]:
x_test

array([[-5.65514375e-01,  1.99562033e-01,  1.29644220e-03, ...,
         2.13513484e-01, -1.00236638e-01, -2.22777964e-02],
       [-5.65514374e-01,  1.99562032e-01,  1.29644234e-03, ...,
         2.13513485e-01, -1.00236637e-01, -2.22777965e-02],
       [ 3.17940889e+00, -1.90947067e+00,  1.10608083e-01, ...,
        -1.39797310e+00, -1.67440195e+00,  3.73384319e-01],
       ...,
       [ 1.00360785e+00, -1.22096187e+00, -2.00502025e-01, ...,
         5.90617636e-01,  3.81359870e-01,  2.84443356e-01],
       [-2.08280012e-01, -2.14409782e-01, -1.92131711e-01, ...,
         1.14540587e+00, -3.48267721e-01,  1.62946016e-01],
       [-5.65514375e-01,  1.99562033e-01,  1.29644220e-03, ...,
         2.13513484e-01, -1.00236638e-01, -2.22777964e-02]])

In [17]:
y1_pred = knn.predict(x_test)

In [18]:
accuracy_score(y_test, y1_pred)

0.9863999298060894

In [19]:
accuracy_scores

[0.9858902087920064,
 0.9859966612633972,
 0.9862144049548787,
 0.9860449812255642,
 0.9859530445554137]