In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import mutual_info_regression
from sklearn.feature_selection import mutual_info_classif
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA

In [None]:
!unzip /content/drive/MyDrive/Colab\ Notebooks/csv_preprocessed.zip

Archive:  /content/drive/MyDrive/Colab Notebooks/csv_preprocessed.zip
  inflating: csv_preprocessed.csv    
  inflating: __MACOSX/._csv_preprocessed.csv  


In [2]:
data = pd.read_csv('csv_preprocessed.csv')

In [None]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1215675 entries, 0 to 1215674
Data columns (total 63 columns):
 #   Column        Non-Null Count    Dtype  
---  ------        --------------    -----  
 0   Dur           1215675 non-null  float64
 1   sTos          1215675 non-null  float64
 2   dTos          1215675 non-null  float64
 3   sTtl          1215675 non-null  float64
 4   dTtl          1215675 non-null  float64
 5   sHops         1215675 non-null  float64
 6   dHops         1215675 non-null  float64
 7   TotPkts       1215675 non-null  float64
 8   SrcPkts       1215675 non-null  float64
 9   DstPkts       1215675 non-null  float64
 10  TotBytes      1215675 non-null  float64
 11  SrcBytes      1215675 non-null  float64
 12  DstBytes      1215675 non-null  float64
 13  Offset        1215675 non-null  float64
 14  sMeanPktSz    1215675 non-null  float64
 15  dMeanPktSz    1215675 non-null  float64
 16  Load          1215675 non-null  float64
 17  SrcLoad       1215675 non-n

In [3]:
X = data.drop(columns=["Attack Type"])

In [4]:
Y = data["Attack Type"]

In [5]:
def mutual_info(X, Y):
  mutual_info_arr = mutual_info_classif(X, Y)
  series_info = pd.Series(mutual_info_arr)
  series_info.index = X.columns
  series_top = series_info.sort_values(ascending=False)[:20]
  return series_top

In [6]:
# result = mutual_info(X, Y)
# result.plot.bar(figsize=(20,8))

In [7]:
# new_data = data[result.keys()]
# new_data.head()

In [8]:
# Hard coding now to skip the mutual information step
new_data = data[["SrcWin", "DstWin", "dHops", "dTtl", "TotBytes", "SrcBytes", "sMeanPktSz", "DstGap", "SrcGap", "dTos", "DstTCPBase", "SrcTCPBase", "TcpRtt", "Proto_udp", "DstBytes", "AckDat" , "dMeanPktSz", "Proto_tcp", "SynAck", "Load"]]
new_data.head()

Unnamed: 0,SrcWin,DstWin,dHops,dTtl,TotBytes,SrcBytes,sMeanPktSz,DstGap,SrcGap,dTos,DstTCPBase,SrcTCPBase,TcpRtt,Proto_udp,DstBytes,AckDat,dMeanPktSz,Proto_tcp,SynAck,Load
0,3.442533e-07,6.217791e-08,-0.973059,-0.00732,-0.117591,-0.099523,0.16466,-0.006334,-0.00383,-0.094739,-4.275809e-14,3.187841e-12,-0.267969,0.0,-0.066388,-0.388062,-0.287785,0.0,-0.044552,-0.008291
1,3.442533e-07,6.217791e-08,-0.973059,-0.00732,-0.117591,-0.099523,0.16466,-0.006334,-0.00383,-0.094739,-4.275809e-14,3.187841e-12,-0.267969,0.0,-0.066388,-0.388062,-0.287785,0.0,-0.044552,-0.008291
2,3.442533e-07,6.217791e-08,-0.973059,-0.00732,8.131316,9.877388,8.044846,-0.006334,-0.00383,-0.094739,-4.275809e-14,3.187841e-12,-0.267969,1.0,0.224388,-0.388062,0.972879,0.0,-0.044552,-0.007624
3,3.442533e-07,6.217791e-08,-0.973059,-0.00732,7.224216,8.73438,8.598644,-0.006334,-0.00383,-0.094739,-4.275809e-14,3.187841e-12,-0.267969,1.0,0.259298,-0.388062,0.922515,0.0,-0.044552,-0.007698
4,3.442533e-07,6.217791e-08,-0.973059,-0.00732,9.162383,11.165238,8.769752,-0.006334,-0.00383,-0.094739,-4.275809e-14,3.187841e-12,-0.267969,1.0,0.201274,-0.388062,0.81159,0.0,-0.044552,-0.007541


In [9]:
def concat_column_for_plot(pca_data, column_name):
  for_plot = pd.concat([pca_data, data[column_name]], axis = 1)
  return for_plot

In [10]:
new_data = concat_column_for_plot(new_data, "Attack Type")

In [11]:
X_train, X_test, y_train, y_test = train_test_split(new_data.loc[:, new_data.columns != 'Attack Type'], new_data['Attack Type'],
                                                    stratify=new_data['Attack Type'],
                                                    test_size=0.15)

X_train = pd.DataFrame(X_train, columns=new_data.columns.to_list()[:-1])
X_test = pd.DataFrame(X_test, columns=new_data.columns.to_list()[:-1])
y_train = pd.DataFrame(y_train, columns=['Attack Type'])
y_test = pd.DataFrame(y_test, columns=['Attack Type'])

print("Training dataset size:", X_train.shape)
print("Testing dataset size:", X_test.shape)
print("Training target size:", y_train.shape)
print("Testing target size:", y_test.shape)

Training dataset size: (1033323, 20)
Testing dataset size: (182352, 20)
Training target size: (1033323, 1)
Testing target size: (182352, 1)


In [12]:
def get_pca_df(scaled_data, no_of_components):

  pca = PCA(n_components=no_of_components)
  Principal_components=pca.fit_transform(scaled_data)
  column_names = ["PC "+str(i) for i in range(1, no_of_components+1)]
  pca_df = pd.DataFrame(data = Principal_components, columns = column_names)
  return pca_df, pca

In [13]:
from sklearn.model_selection import StratifiedKFold
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [None]:

accuracy_scores = []
precision_scores = []
recall_scores = []
f1_scores = []
Y = new_data['Attack Type']
X_train, pca = get_pca_df(X_train, 15)


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [16]:
x_test = pca.transform(X_test)

In [None]:
param_grid = {
    'n_neighbors': [9, 19, 29, 23],
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan']
}
knn = KNeighborsClassifier()

# Perform grid search
grid_search = GridSearchCV(estimator=knn, param_grid=param_grid, cv=5)
grid_search.fit(X_train, y_train)


  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)


In [None]:
print("Best parameters: ", grid_search.best_params_)
print("Best score: ", grid_search.best_score_)

# Evaluate the best model on the test set
best_model = grid_search.best_estimator_
accuracy = best_model.score(X_test, y_test)
print("Test accuracy: ", accuracy)

In [None]:
#y1_pred = knn.predict(x_test)

In [None]:
#accuracy_score(y_test, y1_pred)

In [None]:
#accuracy_scores