In [1]:
!pip install imbalanced-learn > /dev/null 2>&1

In [2]:
import pandas as pd
import random
import numpy as np
import matplotlib.pyplot as plt
import os
import pickle
from joblib import dump, load

from imblearn.over_sampling import SMOTE

from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.tree import DecisionTreeClassifier

## 0.03

In [3]:
# Read the encoded "top003" train and test CSV files from disk.
train = pd.read_csv("/notebooks/FinalDataset/top003_train_encoded.csv")
test = pd.read_csv("/notebooks/FinalDataset/top003_test_encoded.csv")

# Take the `attack_cat` label column as the target; every other column is a feature.
y_train = train['attack_cat']
x_train = train.drop(columns='attack_cat')

y_test = test['attack_cat']
x_test = test.drop(columns='attack_cat')

# The combined frames are not needed once features and labels are separated.
del train, test

# One shape per line, in the order: x_train, y_train, x_test, y_test.
print(x_train.shape, y_train.shape, x_test.shape, y_test.shape, sep="\n")

(123504, 22)
(123504,)
(41169, 22)
(41169,)


### Unbalanced

In [4]:
# Class distribution of the training labels, expressed as percentages.
proportions = y_train.value_counts(normalize=True).mul(100)
print(proportions)

# Fit a decision tree on the current x_train / y_train (this cell does no
# resampling) and persist the fitted model.
model = DecisionTreeClassifier(random_state=42).fit(x_train, y_train)
dump(model, "/notebooks/Models/DT/DT_003_Unbalanced")

# Score the model as it was written to disk: reload it, then predict on the
# test split and print the per-class report.
model = load("/notebooks/Models/DT/DT_003_Unbalanced")
y_pred = model.predict(x_test)
print(classification_report(y_true=y_test, y_pred=y_pred))

5    35.720301
3    27.098717
4    14.728268
2     9.930852
6     8.494462
0     1.629097
1     1.413719
7     0.887421
8     0.097163
Name: attack_cat, dtype: float64
              precision    recall  f1-score   support

           0       0.56      0.17      0.26       665
           1       0.47      0.08      0.13       583
           2       0.39      0.22      0.28      4088
           3       0.63      0.85      0.72     11057
           4       0.92      0.85      0.88      6056
           5       0.99      0.98      0.99     14755
           6       0.91      0.76      0.83      3496
           7       0.71      0.71      0.71       415
           8       0.66      0.54      0.59        54

    accuracy                           0.80     41169
   macro avg       0.69      0.57      0.60     41169
weighted avg       0.80      0.80      0.79     41169



### Balanced

In [5]:
# Oversample every class except the largest one with SMOTE, then shuffle the
# resampled rows.
# The resampled data is kept under its own names (x_train_bal / y_train_bal).
# Overwriting x_train / y_train would make this cell non-idempotent and would
# silently feed balanced data to the "Unbalanced" cell if that cell were
# re-run afterwards.
oversample = SMOTE(sampling_strategy='not majority', random_state=42)
x_train_bal, y_train_bal = oversample.fit_resample(x_train, y_train)
x_train_bal, y_train_bal = shuffle(x_train_bal, y_train_bal, random_state=42)

# Class distribution after resampling, as percentages.
proportions = y_train_bal.value_counts(normalize=True) * 100
print(proportions)

# Fit a decision tree on the resampled training data and persist it.
model = DecisionTreeClassifier(random_state=42)
model.fit(x_train_bal, y_train_bal)
dump(model, "/notebooks/Models/DT/DT_003_Balanced")

# Reload the persisted model and evaluate it on the untouched test split
# (the test data is never resampled).
model = load("/notebooks/Models/DT/DT_003_Balanced")
y_pred = model.predict(x_test)
print(classification_report(y_test, y_pred))

1    11.111111
5    11.111111
0    11.111111
6    11.111111
7    11.111111
8    11.111111
2    11.111111
4    11.111111
3    11.111111
Name: attack_cat, dtype: float64
              precision    recall  f1-score   support

           0       0.14      0.26      0.18       665
           1       0.08      0.22      0.12       583
           2       0.35      0.68      0.46      4088
           3       0.86      0.52      0.65     11057
           4       0.95      0.84      0.89      6056
           5       0.99      0.98      0.99     14755
           6       0.87      0.80      0.83      3496
           7       0.63      0.76      0.69       415
           8       0.39      0.74      0.51        54

    accuracy                           0.77     41169
   macro avg       0.59      0.64      0.59     41169
weighted avg       0.84      0.77      0.79     41169



## 0.02

In [6]:
# Read the encoded "top002" train and test CSV files from disk.
train = pd.read_csv("/notebooks/FinalDataset/top002_train_encoded.csv")
test = pd.read_csv("/notebooks/FinalDataset/top002_test_encoded.csv")

# Take the `attack_cat` label column as the target; every other column is a feature.
y_train = train['attack_cat']
x_train = train.drop(columns='attack_cat')

y_test = test['attack_cat']
x_test = test.drop(columns='attack_cat')

# The combined frames are not needed once features and labels are separated.
del train, test

# One shape per line, in the order: x_train, y_train, x_test, y_test.
print(x_train.shape, y_train.shape, x_test.shape, y_test.shape, sep="\n")

(123504, 154)
(123504,)
(41169, 154)
(41169,)


### Unbalanced

In [7]:
# Class distribution of the training labels, expressed as percentages.
proportions = y_train.value_counts(normalize=True).mul(100)
print(proportions)

# Fit a decision tree on the current x_train / y_train (this cell does no
# resampling) and persist the fitted model.
model = DecisionTreeClassifier(random_state=42).fit(x_train, y_train)
dump(model, "/notebooks/Models/DT/DT_002_Unbalanced")

# Score the model as it was written to disk: reload it, then predict on the
# test split and print the per-class report.
model = load("/notebooks/Models/DT/DT_002_Unbalanced")
y_pred = model.predict(x_test)
print(classification_report(y_true=y_test, y_pred=y_pred))

5    35.720301
3    27.098717
4    14.728268
2     9.930852
6     8.494462
0     1.629097
1     1.413719
7     0.887421
8     0.097163
Name: attack_cat, dtype: float64
              precision    recall  f1-score   support

           0       0.57      0.18      0.28       665
           1       0.48      0.08      0.14       583
           2       0.35      0.32      0.33      4088
           3       0.65      0.78      0.71     11057
           4       0.91      0.85      0.88      6056
           5       0.99      0.99      0.99     14755
           6       0.90      0.77      0.83      3496
           7       0.68      0.71      0.69       415
           8       0.59      0.56      0.57        54

    accuracy                           0.80     41169
   macro avg       0.68      0.58      0.60     41169
weighted avg       0.80      0.80      0.79     41169



### Balanced

In [8]:
# Oversample every class except the largest one with SMOTE, then shuffle the
# resampled rows.
# The resampled data is kept under its own names (x_train_bal / y_train_bal).
# Overwriting x_train / y_train would make this cell non-idempotent and would
# silently feed balanced data to the "Unbalanced" cell if that cell were
# re-run afterwards.
oversample = SMOTE(sampling_strategy='not majority', random_state=42)
x_train_bal, y_train_bal = oversample.fit_resample(x_train, y_train)
x_train_bal, y_train_bal = shuffle(x_train_bal, y_train_bal, random_state=42)

# Class distribution after resampling, as percentages.
proportions = y_train_bal.value_counts(normalize=True) * 100
print(proportions)

# Fit a decision tree on the resampled training data and persist it.
model = DecisionTreeClassifier(random_state=42)
model.fit(x_train_bal, y_train_bal)
dump(model, "/notebooks/Models/DT/DT_002_Balanced")

# Reload the persisted model and evaluate it on the untouched test split
# (the test data is never resampled).
model = load("/notebooks/Models/DT/DT_002_Balanced")
y_pred = model.predict(x_test)
print(classification_report(y_test, y_pred))

1    11.111111
5    11.111111
0    11.111111
6    11.111111
7    11.111111
8    11.111111
2    11.111111
4    11.111111
3    11.111111
Name: attack_cat, dtype: float64
              precision    recall  f1-score   support

           0       0.20      0.18      0.19       665
           1       0.08      0.11      0.09       583
           2       0.35      0.68      0.46      4088
           3       0.80      0.57      0.67     11057
           4       0.93      0.85      0.89      6056
           5       0.99      0.98      0.99     14755
           6       0.88      0.78      0.82      3496
           7       0.59      0.71      0.64       415
           8       0.41      0.67      0.51        54

    accuracy                           0.78     41169
   macro avg       0.58      0.61      0.58     41169
weighted avg       0.83      0.78      0.79     41169



## 0.01

In [9]:
# Read the encoded "top001" train and test CSV files from disk.
train = pd.read_csv("/notebooks/FinalDataset/top001_train_encoded.csv")
test = pd.read_csv("/notebooks/FinalDataset/top001_test_encoded.csv")

# Take the `attack_cat` label column as the target; every other column is a feature.
y_train = train['attack_cat']
x_train = train.drop(columns='attack_cat')

y_test = test['attack_cat']
x_test = test.drop(columns='attack_cat')

# The combined frames are not needed once features and labels are separated.
del train, test

# One shape per line, in the order: x_train, y_train, x_test, y_test.
print(x_train.shape, y_train.shape, x_test.shape, y_test.shape, sep="\n")

(123504, 168)
(123504,)
(41169, 168)
(41169,)


### Unbalanced

In [10]:
# Class distribution of the training labels, expressed as percentages.
proportions = y_train.value_counts(normalize=True).mul(100)
print(proportions)

# Fit a decision tree on the current x_train / y_train (this cell does no
# resampling) and persist the fitted model.
model = DecisionTreeClassifier(random_state=42).fit(x_train, y_train)
dump(model, "/notebooks/Models/DT/DT_001_Unbalanced")

# Score the model as it was written to disk: reload it, then predict on the
# test split and print the per-class report.
model = load("/notebooks/Models/DT/DT_001_Unbalanced")
y_pred = model.predict(x_test)
print(classification_report(y_true=y_test, y_pred=y_pred))

5    35.720301
3    27.098717
4    14.728268
2     9.930852
6     8.494462
0     1.629097
1     1.413719
7     0.887421
8     0.097163
Name: attack_cat, dtype: float64
              precision    recall  f1-score   support

           0       0.57      0.16      0.25       665
           1       0.37      0.07      0.12       583
           2       0.33      0.33      0.33      4088
           3       0.65      0.76      0.70     11057
           4       0.91      0.84      0.87      6056
           5       0.99      0.98      0.99     14755
           6       0.90      0.76      0.83      3496
           7       0.64      0.63      0.63       415
           8       0.50      0.52      0.51        54

    accuracy                           0.79     41169
   macro avg       0.65      0.56      0.58     41169
weighted avg       0.79      0.79      0.79     41169



### Balanced

In [11]:
# Oversample every class except the largest one with SMOTE, then shuffle the
# resampled rows.
# The resampled data is kept under its own names (x_train_bal / y_train_bal).
# Overwriting x_train / y_train would make this cell non-idempotent and would
# silently feed balanced data to the "Unbalanced" cell if that cell were
# re-run afterwards.
# NOTE(review): any later cell that expected x_train / y_train to hold the
# balanced data must use x_train_bal / y_train_bal instead — confirm none exists.
oversample = SMOTE(sampling_strategy='not majority', random_state=42)
x_train_bal, y_train_bal = oversample.fit_resample(x_train, y_train)
x_train_bal, y_train_bal = shuffle(x_train_bal, y_train_bal, random_state=42)

# Class distribution after resampling, as percentages.
proportions = y_train_bal.value_counts(normalize=True) * 100
print(proportions)

# Fit a decision tree on the resampled training data and persist it.
model = DecisionTreeClassifier(random_state=42)
model.fit(x_train_bal, y_train_bal)
dump(model, "/notebooks/Models/DT/DT_001_Balanced")

# Reload the persisted model and evaluate it on the untouched test split
# (the test data is never resampled).
model = load("/notebooks/Models/DT/DT_001_Balanced")
y_pred = model.predict(x_test)
print(classification_report(y_test, y_pred))

1    11.111111
5    11.111111
0    11.111111
6    11.111111
7    11.111111
8    11.111111
2    11.111111
4    11.111111
3    11.111111
Name: attack_cat, dtype: float64
              precision    recall  f1-score   support

           0       0.26      0.17      0.21       665
           1       0.12      0.08      0.10       583
           2       0.34      0.69      0.45      4088
           3       0.77      0.58      0.66     11057
           4       0.93      0.85      0.89      6056
           5       0.99      0.98      0.99     14755
           6       0.89      0.77      0.82      3496
           7       0.57      0.68      0.62       415
           8       0.33      0.59      0.43        54

    accuracy                           0.78     41169
   macro avg       0.58      0.60      0.57     41169
weighted avg       0.82      0.78      0.79     41169

