In [9]:
!pip install imbalanced-learn > /dev/null 2>&1

In [10]:
import pandas as pd
import random
import numpy as np
import matplotlib.pyplot as plt
import os
import pickle
from joblib import dump, load

from imblearn.over_sampling import SMOTE

from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import classification_report, confusion_matrix

from xgboost import XGBClassifier

## 0.03

In [11]:
train = pd.read_csv("/notebooks/FinalDataset/top003_train_encoded.csv")
test = pd.read_csv("/notebooks/FinalDataset/top003_test_encoded.csv")

x_train = train.drop('attack_cat', axis=1)
y_train = train['attack_cat']

x_test = test.drop('attack_cat', axis=1)
y_test = test['attack_cat']

del train, test

print(x_train.shape)
print(y_train.shape)
print(x_test.shape)
print(y_test.shape)

(123504, 22)
(123504,)
(41169, 22)
(41169,)


### Unbalanced

In [12]:
proportions = y_train.value_counts(normalize=True) * 100
print(proportions)

model = XGBClassifier(eval_metric="mlogloss", objective="multi:softprob",n_jobs=-1,random_state=42)
model.fit(x_train, y_train, eval_set=[(x_test, y_test)], early_stopping_rounds=10, verbose=False)
model.save_model("/notebooks/Models/XGB/XGB_003_Unbalanced.ubj")

model.load_model("/notebooks/Models/XGB/XGB_003_Unbalanced.ubj")
y_pred = model.predict(x_test)
print(classification_report(y_test, y_pred))

5    35.720301
3    27.098717
4    14.728268
2     9.930852
6     8.494462
0     1.629097
1     1.413719
7     0.887421
8     0.097163
Name: attack_cat, dtype: float64




              precision    recall  f1-score   support

           0       0.79      0.18      0.29       665
           1       0.92      0.08      0.15       583
           2       0.54      0.10      0.17      4088
           3       0.62      0.94      0.75     11057
           4       0.94      0.87      0.90      6056
           5       1.00      0.98      0.99     14755
           6       0.93      0.78      0.85      3496
           7       0.76      0.82      0.79       415
           8       0.70      0.69      0.69        54

    accuracy                           0.82     41169
   macro avg       0.80      0.60      0.62     41169
weighted avg       0.83      0.82      0.79     41169



### Balanced

In [13]:
oversample = SMOTE(sampling_strategy='not majority', random_state=42)
x_train, y_train = oversample.fit_resample(x_train, y_train)
x_train, y_train = shuffle(x_train, y_train, random_state=42)

proportions = y_train.value_counts(normalize=True) * 100
print(proportions)

model = XGBClassifier(eval_metric="mlogloss", objective="multi:softprob",n_jobs=-1,random_state=42)
model.fit(x_train, y_train, eval_set=[(x_test, y_test)], early_stopping_rounds=10, verbose=False)
model.save_model("/notebooks/Models/XGB/XGB_003_Balanced.ubj")

model.load_model("/notebooks/Models/XGB/XGB_003_Balanced.ubj")
y_pred = model.predict(x_test)
print(classification_report(y_test, y_pred))

1    11.111111
5    11.111111
0    11.111111
6    11.111111
7    11.111111
8    11.111111
2    11.111111
4    11.111111
3    11.111111
Name: attack_cat, dtype: float64




              precision    recall  f1-score   support

           0       0.20      0.27      0.23       665
           1       0.11      0.24      0.15       583
           2       0.36      0.73      0.48      4088
           3       0.88      0.53      0.66     11057
           4       0.95      0.85      0.90      6056
           5       1.00      0.98      0.99     14755
           6       0.85      0.83      0.84      3496
           7       0.51      0.92      0.66       415
           8       0.35      0.72      0.47        54

    accuracy                           0.78     41169
   macro avg       0.58      0.67      0.60     41169
weighted avg       0.85      0.78      0.80     41169



## 0.02

In [14]:
train = pd.read_csv("/notebooks/FinalDataset/top002_train_encoded.csv")
test = pd.read_csv("/notebooks/FinalDataset/top002_test_encoded.csv")

x_train = train.drop('attack_cat', axis=1)
y_train = train['attack_cat']

x_test = test.drop('attack_cat', axis=1)
y_test = test['attack_cat']

del train, test

print(x_train.shape)
print(y_train.shape)
print(x_test.shape)
print(y_test.shape)

(123504, 154)
(123504,)
(41169, 154)
(41169,)


## Unbalanced

In [15]:
proportions = y_train.value_counts(normalize=True) * 100
print(proportions)

model = XGBClassifier(eval_metric="mlogloss", objective="multi:softprob",n_jobs=-1,random_state=42)
model.fit(x_train, y_train, eval_set=[(x_test, y_test)], early_stopping_rounds=10, verbose=False)
model.save_model("/notebooks/Models/XGB/XGB_002_Unbalanced.ubj")

model.load_model("/notebooks/Models/XGB/XGB_002_Unbalanced.ubj")
y_pred = model.predict(x_test)
print(classification_report(y_test, y_pred))

5    35.720301
3    27.098717
4    14.728268
2     9.930852
6     8.494462
0     1.629097
1     1.413719
7     0.887421
8     0.097163
Name: attack_cat, dtype: float64




              precision    recall  f1-score   support

           0       0.76      0.19      0.31       665
           1       0.92      0.08      0.15       583
           2       0.53      0.13      0.20      4088
           3       0.63      0.93      0.75     11057
           4       0.95      0.87      0.91      6056
           5       1.00      0.98      0.99     14755
           6       0.94      0.78      0.85      3496
           7       0.73      0.82      0.77       415
           8       0.66      0.70      0.68        54

    accuracy                           0.82     41169
   macro avg       0.79      0.61      0.62     41169
weighted avg       0.83      0.82      0.80     41169



### Balanced

In [16]:
oversample = SMOTE(sampling_strategy='not majority', random_state=42)
x_train, y_train = oversample.fit_resample(x_train, y_train)
x_train, y_train = shuffle(x_train, y_train, random_state=42)

proportions = y_train.value_counts(normalize=True) * 100
print(proportions)

model = XGBClassifier(eval_metric="mlogloss", objective="multi:softprob",n_jobs=-1,random_state=42)
model.fit(x_train, y_train, eval_set=[(x_test, y_test)], early_stopping_rounds=10, verbose=False)
model.save_model("/notebooks/Models/XGB/XGB_002_Balanced.ubj")

model.load_model("/notebooks/Models/XGB/XGB_002_Balanced.ubj")
y_pred = model.predict(x_test)
print(classification_report(y_test, y_pred))

1    11.111111
5    11.111111
0    11.111111
6    11.111111
7    11.111111
8    11.111111
2    11.111111
4    11.111111
3    11.111111
Name: attack_cat, dtype: float64




              precision    recall  f1-score   support

           0       0.31      0.23      0.27       665
           1       0.11      0.20      0.14       583
           2       0.37      0.76      0.49      4088
           3       0.87      0.54      0.67     11057
           4       0.95      0.86      0.90      6056
           5       1.00      0.98      0.99     14755
           6       0.85      0.83      0.84      3496
           7       0.51      0.93      0.66       415
           8       0.38      0.80      0.51        54

    accuracy                           0.79     41169
   macro avg       0.59      0.68      0.61     41169
weighted avg       0.85      0.79      0.80     41169



## 0.01

In [17]:
train = pd.read_csv("/notebooks/FinalDataset/top001_train_encoded.csv")
test = pd.read_csv("/notebooks/FinalDataset/top001_test_encoded.csv")

x_train = train.drop('attack_cat', axis=1)
y_train = train['attack_cat']

x_test = test.drop('attack_cat', axis=1)
y_test = test['attack_cat']

del train, test

print(x_train.shape)
print(y_train.shape)
print(x_test.shape)
print(y_test.shape)

(123504, 168)
(123504,)
(41169, 168)
(41169,)


### Unbalanced

In [18]:
proportions = y_train.value_counts(normalize=True) * 100
print(proportions)

model = XGBClassifier(eval_metric="mlogloss", objective="multi:softprob",n_jobs=-1,random_state=42)
model.fit(x_train, y_train, eval_set=[(x_test, y_test)], early_stopping_rounds=10, verbose=False)
model.save_model("/notebooks/Models/XGB/XGB_001_Unbalanced.ubj")

model.load_model("/notebooks/Models/XGB/XGB_001_Unbalanced.ubj")
y_pred = model.predict(x_test)
print(classification_report(y_test, y_pred))

5    35.720301
3    27.098717
4    14.728268
2     9.930852
6     8.494462
0     1.629097
1     1.413719
7     0.887421
8     0.097163
Name: attack_cat, dtype: float64




              precision    recall  f1-score   support

           0       0.78      0.19      0.30       665
           1       0.89      0.08      0.15       583
           2       0.55      0.11      0.18      4088
           3       0.63      0.94      0.75     11057
           4       0.94      0.87      0.91      6056
           5       1.00      0.98      0.99     14755
           6       0.92      0.78      0.84      3496
           7       0.72      0.78      0.75       415
           8       0.66      0.61      0.63        54

    accuracy                           0.82     41169
   macro avg       0.79      0.59      0.61     41169
weighted avg       0.83      0.82      0.79     41169



### Balanced

In [19]:
oversample = SMOTE(sampling_strategy='not majority', random_state=42)
x_train, y_train = oversample.fit_resample(x_train, y_train)
x_train, y_train = shuffle(x_train, y_train, random_state=42)

proportions = y_train.value_counts(normalize=True) * 100
print(proportions)

model = XGBClassifier(eval_metric="mlogloss", objective="multi:softprob",n_jobs=-1,random_state=42)
model.fit(x_train, y_train, eval_set=[(x_test, y_test)], early_stopping_rounds=10, verbose=False)
model.save_model("/notebooks/Models/XGB/XGB_001_Balanced.ubj")

model.load_model("/notebooks/Models/XGB/XGB_001_Balanced.ubj")
y_pred = model.predict(x_test)
print(classification_report(y_test, y_pred))

1    11.111111
5    11.111111
0    11.111111
6    11.111111
7    11.111111
8    11.111111
2    11.111111
4    11.111111
3    11.111111
Name: attack_cat, dtype: float64




              precision    recall  f1-score   support

           0       0.36      0.23      0.28       665
           1       0.16      0.19      0.17       583
           2       0.36      0.77      0.49      4088
           3       0.85      0.56      0.68     11057
           4       0.96      0.86      0.91      6056
           5       1.00      0.98      0.99     14755
           6       0.87      0.81      0.84      3496
           7       0.54      0.92      0.68       415
           8       0.40      0.72      0.51        54

    accuracy                           0.79     41169
   macro avg       0.61      0.67      0.62     41169
weighted avg       0.85      0.79      0.81     41169

