In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import mutual_info_regression
from sklearn.feature_selection import mutual_info_classif
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.ensemble import AdaBoostClassifier


In [33]:
data = pd.read_csv('csv_preprocessed.csv')
data.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1215675 entries, 0 to 1215674
Data columns (total 63 columns):
 #   Column        Non-Null Count    Dtype  
---  ------        --------------    -----  
 0   Dur           1215675 non-null  float64
 1   sTos          1215675 non-null  float64
 2   dTos          1215675 non-null  float64
 3   sTtl          1215675 non-null  float64
 4   dTtl          1215675 non-null  float64
 5   sHops         1215675 non-null  float64
 6   dHops         1215675 non-null  float64
 7   TotPkts       1215675 non-null  float64
 8   SrcPkts       1215675 non-null  float64
 9   DstPkts       1215675 non-null  float64
 10  TotBytes      1215675 non-null  float64
 11  SrcBytes      1215675 non-null  float64
 12  DstBytes      1215675 non-null  float64
 13  Offset        1215675 non-null  float64
 14  sMeanPktSz    1215675 non-null  float64
 15  dMeanPktSz    1215675 non-null  float64
 16  Load          1215675 non-null  float64
 17  SrcLoad       1215675 non-n

In [17]:
X = data.drop(columns=["Attack Type"])
Y = data["Attack Type"]

def mutual_info(X, Y):
  mutual_info_arr = mutual_info_classif(X, Y)
  series_info = pd.Series(mutual_info_arr)
  series_info.index = X.columns
  series_top = series_info.sort_values(ascending=False)[:20]
  return series_top

new_data = data[["SrcWin", "DstWin", "dHops", "dTtl", "TotBytes", "SrcBytes", "sMeanPktSz", "DstGap", "SrcGap", "dTos", "DstTCPBase", "SrcTCPBase", "TcpRtt", "Proto_udp", "DstBytes", "AckDat" , "dMeanPktSz", "Proto_tcp", "SynAck", "Load"]]

def concat_column_for_plot(pca_data, column_name):
  for_plot = pd.concat([pca_data, data[column_name]], axis = 1)
  return for_plot

new_data = concat_column_for_plot(new_data, "Attack Type")

X_train, X_test, y_train, y_test = train_test_split(new_data.loc[:, new_data.columns != 'Attack Type'], new_data['Attack Type'],
                                                    stratify=new_data['Attack Type'],
                                                    test_size=0.15)

X_train = pd.DataFrame(X_train, columns=new_data.columns.to_list()[:-1])
X_test = pd.DataFrame(X_test, columns=new_data.columns.to_list()[:-1])
y_train = pd.DataFrame(y_train, columns=['Attack Type'])
y_test = pd.DataFrame(y_test, columns=['Attack Type'])

def get_pca_df(scaled_data, no_of_components):

  pca = PCA(n_components=no_of_components)
  Principal_components=pca.fit_transform(scaled_data)
  column_names = ["PC "+str(i) for i in range(1, no_of_components+1)]
  pca_df = pd.DataFrame(data = Principal_components, columns = column_names)
  return pca_df, pca

In [18]:
from sklearn.model_selection import StratifiedKFold
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [30]:
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import numpy as np


In [31]:
class WeightedKNN:
    def __init__(self, k=5):
        self.k = k

    def fit(self, X_train, y_train):
        self.X_train = X_train
        self.y_train = y_train

    def euclidean_distance(self, x1, x2):
        return np.sqrt(np.sum((x1 - x2) ** 2))

    def predict(self, X_test):
        y_pred = [self._predict(x) for x in X_test]
        return np.array(y_pred)

    def _predict(self, x):
        distances = [self.euclidean_distance(x, x_train) for x_train in self.X_train]
        k_indices = np.argsort(distances)[:self.k]
        k_nearest_labels = [self.y_train[i] for i in k_indices]
        k_distances = [distances[i] for i in k_indices]

        # Calculate weights as inverse of distance
        weights = [1/d if d != 0 else float('inf') for d in k_distances]
        
        # Use weighted voting to predict the class
        class_votes = {}
        for label, weight in zip(k_nearest_labels, weights):
            class_votes[label] = class_votes.get(label, 0) + weight
        return max(class_votes, key=class_votes.get)


In [22]:
accuracy_scores = []
precision_scores = []
recall_scores = []
f1_scores = []
Y = new_data['Attack Type']
X_train, pca = get_pca_df(X_train, 15)


In [23]:
model = WeightedKNN(k=3)
model.fit(X_train, y_train)

In [34]:
x_test = pca.transform(X_test)
x_test = pd.DataFrame(x_test, columns=X_train.columns)
x_test.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 182352 entries, 0 to 182351
Data columns (total 15 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   PC 1    182352 non-null  float64
 1   PC 2    182352 non-null  float64
 2   PC 3    182352 non-null  float64
 3   PC 4    182352 non-null  float64
 4   PC 5    182352 non-null  float64
 5   PC 6    182352 non-null  float64
 6   PC 7    182352 non-null  float64
 7   PC 8    182352 non-null  float64
 8   PC 9    182352 non-null  float64
 9   PC 10   182352 non-null  float64
 10  PC 11   182352 non-null  float64
 11  PC 12   182352 non-null  float64
 12  PC 13   182352 non-null  float64
 13  PC 14   182352 non-null  float64
 14  PC 15   182352 non-null  float64
dtypes: float64(15)
memory usage: 20.9 MB


In [32]:
predictions = model.predict(x_test)

print("Predictions:", predictions)


TypeError: unsupported operand type(s) for -: 'str' and 'str'

In [38]:
adaboost_classifier = AdaBoostClassifier(estimator=model, n_estimators=50, algorithm='SAMME', random_state=42)
adaboost_classifier.fit(X_train, y_train)


  y = column_or_1d(y, warn=True)


ValueError: WeightedKNN doesn't support sample_weight.