In [1]:
!pip install imbalanced-learn > /dev/null 2>&1

In [2]:
import pandas as pd
import random
import numpy as np
import matplotlib.pyplot as plt
import os
import pickle
from joblib import dump, load

from imblearn.over_sampling import SMOTE

from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier

## 0.03

In [3]:
# Read the pre-encoded top003 train/test CSVs and split the `attack_cat` label
# column off from the remaining feature columns. `pop` removes the column from
# the frame and returns it, so no intermediate full-frame variable is kept around.
x_train = pd.read_csv("/notebooks/FinalDataset/top003_train_encoded.csv")
y_train = x_train.pop('attack_cat')

x_test = pd.read_csv("/notebooks/FinalDataset/top003_test_encoded.csv")
y_test = x_test.pop('attack_cat')

# Sanity check: one shape per line, in the order x_train, y_train, x_test, y_test.
print(*(part.shape for part in (x_train, y_train, x_test, y_test)), sep="\n")

(123504, 22)
(123504,)
(41169, 22)
(41169,)


### Unbalanced

In [4]:
# Share of each class in the training labels, in percent.
proportions = y_train.value_counts(normalize=True).mul(100)
print(proportions)

# Fit a random forest on the training data as it currently stands (this cell does
# no resampling) and persist the fitted estimator with joblib.
model = RandomForestClassifier(n_jobs=-1, random_state=42).fit(x_train, y_train)
dump(model, "/notebooks/Models/RF/RF_003_Unbalanced")

# Evaluate the estimator as read back from disk, so the report below describes
# the saved artifact rather than only the in-memory object.
model = load("/notebooks/Models/RF/RF_003_Unbalanced")
y_pred = model.predict(x_test)
print(metrics.classification_report(y_test, y_pred))

5    35.720301
3    27.098717
4    14.728268
2     9.930852
6     8.494462
0     1.629097
1     1.413719
7     0.887421
8     0.097163
Name: attack_cat, dtype: float64
              precision    recall  f1-score   support

           0       0.75      0.18      0.29       665
           1       0.61      0.07      0.13       583
           2       0.41      0.18      0.25      4088
           3       0.63      0.88      0.73     11057
           4       0.90      0.87      0.89      6056
           5       1.00      0.98      0.99     14755
           6       0.91      0.77      0.83      3496
           7       0.71      0.72      0.71       415
           8       0.60      0.50      0.55        54

    accuracy                           0.81     41169
   macro avg       0.73      0.57      0.60     41169
weighted avg       0.81      0.81      0.79     41169



### Balanced

In [5]:
# Oversample every class except the largest one with SMOTE, then shuffle the rows.
# Only the training split is resampled; x_test / y_test are used unchanged below.
#
# The resampled data is stored under new *_bal names instead of overwriting
# x_train / y_train. That keeps this cell idempotent (a re-run always starts from
# the data as loaded, rather than reshuffling already-resampled data) and prevents
# the "Unbalanced" cell from silently training on resampled data if it is run
# afterwards. Trade-off: both the original and the resampled training sets stay
# in memory until the next load cell rebinds x_train / y_train.
oversample = SMOTE(sampling_strategy='not majority', random_state=42)
x_train_bal, y_train_bal = shuffle(
    *oversample.fit_resample(x_train, y_train), random_state=42
)

# Class shares (in percent) after resampling.
proportions = y_train_bal.value_counts(normalize=True) * 100
print(proportions)

# Fit on the resampled training data and persist the fitted estimator.
model = RandomForestClassifier(random_state=42, n_jobs=-1)
model.fit(x_train_bal, y_train_bal)
dump(model, "/notebooks/Models/RF/RF_003_Balanced")

# Evaluate the estimator as read back from disk on the untouched test split.
model = load("/notebooks/Models/RF/RF_003_Balanced")
y_pred = model.predict(x_test)
print(classification_report(y_test, y_pred))

1    11.111111
5    11.111111
0    11.111111
6    11.111111
7    11.111111
8    11.111111
2    11.111111
4    11.111111
3    11.111111
Name: attack_cat, dtype: float64
              precision    recall  f1-score   support

           0       0.14      0.27      0.19       665
           1       0.08      0.22      0.12       583
           2       0.36      0.67      0.47      4088
           3       0.86      0.54      0.66     11057
           4       0.94      0.85      0.90      6056
           5       1.00      0.98      0.99     14755
           6       0.87      0.80      0.83      3496
           7       0.62      0.80      0.70       415
           8       0.36      0.69      0.47        54

    accuracy                           0.77     41169
   macro avg       0.58      0.65      0.59     41169
weighted avg       0.85      0.77      0.79     41169



## 0.02

In [6]:
# Read the pre-encoded top002 train/test CSVs and split the `attack_cat` label
# column off from the remaining feature columns. `pop` removes the column from
# the frame and returns it, so no intermediate full-frame variable is kept around.
x_train = pd.read_csv("/notebooks/FinalDataset/top002_train_encoded.csv")
y_train = x_train.pop('attack_cat')

x_test = pd.read_csv("/notebooks/FinalDataset/top002_test_encoded.csv")
y_test = x_test.pop('attack_cat')

# Sanity check: one shape per line, in the order x_train, y_train, x_test, y_test.
print(*(part.shape for part in (x_train, y_train, x_test, y_test)), sep="\n")

(123504, 154)
(123504,)
(41169, 154)
(41169,)


### Unbalanced

In [7]:
# Share of each class in the training labels, in percent.
proportions = y_train.value_counts(normalize=True).mul(100)
print(proportions)

# Fit a random forest on the training data as it currently stands (this cell does
# no resampling) and persist the fitted estimator with joblib.
model = RandomForestClassifier(n_jobs=-1, random_state=42).fit(x_train, y_train)
dump(model, "/notebooks/Models/RF/RF_002_Unbalanced")

# Evaluate the estimator as read back from disk, so the report below describes
# the saved artifact rather than only the in-memory object.
model = load("/notebooks/Models/RF/RF_002_Unbalanced")
y_pred = model.predict(x_test)
print(metrics.classification_report(y_test, y_pred))

5    35.720301
3    27.098717
4    14.728268
2     9.930852
6     8.494462
0     1.629097
1     1.413719
7     0.887421
8     0.097163
Name: attack_cat, dtype: float64
              precision    recall  f1-score   support

           0       0.77      0.19      0.30       665
           1       0.73      0.08      0.15       583
           2       0.36      0.26      0.30      4088
           3       0.65      0.83      0.73     11057
           4       0.90      0.87      0.89      6056
           5       1.00      0.98      0.99     14755
           6       0.91      0.77      0.84      3496
           7       0.70      0.71      0.70       415
           8       0.51      0.35      0.42        54

    accuracy                           0.81     41169
   macro avg       0.73      0.56      0.59     41169
weighted avg       0.81      0.81      0.80     41169



### Balanced

In [8]:
# Oversample every class except the largest one with SMOTE, then shuffle the rows.
# Only the training split is resampled; x_test / y_test are used unchanged below.
#
# The resampled data is stored under new *_bal names instead of overwriting
# x_train / y_train. That keeps this cell idempotent (a re-run always starts from
# the data as loaded, rather than reshuffling already-resampled data) and prevents
# the "Unbalanced" cell from silently training on resampled data if it is run
# afterwards. Trade-off: both the original and the resampled training sets stay
# in memory until the next load cell rebinds x_train / y_train.
oversample = SMOTE(sampling_strategy='not majority', random_state=42)
x_train_bal, y_train_bal = shuffle(
    *oversample.fit_resample(x_train, y_train), random_state=42
)

# Class shares (in percent) after resampling.
proportions = y_train_bal.value_counts(normalize=True) * 100
print(proportions)

# Fit on the resampled training data and persist the fitted estimator.
model = RandomForestClassifier(random_state=42, n_jobs=-1)
model.fit(x_train_bal, y_train_bal)
dump(model, "/notebooks/Models/RF/RF_002_Balanced")

# Evaluate the estimator as read back from disk on the untouched test split.
model = load("/notebooks/Models/RF/RF_002_Balanced")
y_pred = model.predict(x_test)
print(classification_report(y_test, y_pred))

1    11.111111
5    11.111111
0    11.111111
6    11.111111
7    11.111111
8    11.111111
2    11.111111
4    11.111111
3    11.111111
Name: attack_cat, dtype: float64
              precision    recall  f1-score   support

           0       0.24      0.21      0.22       665
           1       0.07      0.12      0.09       583
           2       0.36      0.65      0.46      4088
           3       0.81      0.59      0.68     11057
           4       0.93      0.86      0.89      6056
           5       1.00      0.98      0.99     14755
           6       0.87      0.80      0.83      3496
           7       0.55      0.84      0.66       415
           8       0.47      0.69      0.56        54

    accuracy                           0.78     41169
   macro avg       0.59      0.64      0.60     41169
weighted avg       0.83      0.78      0.80     41169



## 0.01

In [9]:
# Read the pre-encoded top001 train/test CSVs and split the `attack_cat` label
# column off from the remaining feature columns. `pop` removes the column from
# the frame and returns it, so no intermediate full-frame variable is kept around.
x_train = pd.read_csv("/notebooks/FinalDataset/top001_train_encoded.csv")
y_train = x_train.pop('attack_cat')

x_test = pd.read_csv("/notebooks/FinalDataset/top001_test_encoded.csv")
y_test = x_test.pop('attack_cat')

# Sanity check: one shape per line, in the order x_train, y_train, x_test, y_test.
print(*(part.shape for part in (x_train, y_train, x_test, y_test)), sep="\n")

(123504, 168)
(123504,)
(41169, 168)
(41169,)


### Unbalanced

In [10]:
# Share of each class in the training labels, in percent.
proportions = y_train.value_counts(normalize=True).mul(100)
print(proportions)

# Fit a random forest on the training data as it currently stands (this cell does
# no resampling) and persist the fitted estimator with joblib.
model = RandomForestClassifier(n_jobs=-1, random_state=42).fit(x_train, y_train)
dump(model, "/notebooks/Models/RF/RF_001_Unbalanced")

# Evaluate the estimator as read back from disk, so the report below describes
# the saved artifact rather than only the in-memory object.
model = load("/notebooks/Models/RF/RF_001_Unbalanced")
y_pred = model.predict(x_test)
print(metrics.classification_report(y_test, y_pred))

5    35.720301
3    27.098717
4    14.728268
2     9.930852
6     8.494462
0     1.629097
1     1.413719
7     0.887421
8     0.097163
Name: attack_cat, dtype: float64
              precision    recall  f1-score   support

           0       0.80      0.17      0.28       665
           1       0.72      0.08      0.14       583
           2       0.35      0.23      0.28      4088
           3       0.64      0.84      0.73     11057
           4       0.90      0.87      0.88      6056
           5       1.00      0.98      0.99     14755
           6       0.92      0.77      0.84      3496
           7       0.70      0.63      0.66       415
           8       0.50      0.15      0.23        54

    accuracy                           0.80     41169
   macro avg       0.73      0.53      0.56     41169
weighted avg       0.81      0.80      0.79     41169



### Balanced

In [11]:
# Oversample every class except the largest one with SMOTE, then shuffle the rows.
# Only the training split is resampled; x_test / y_test are used unchanged below.
#
# The resampled data is stored under new *_bal names instead of overwriting
# x_train / y_train. That keeps this cell idempotent (a re-run always starts from
# the data as loaded, rather than reshuffling already-resampled data) and prevents
# the "Unbalanced" cell from silently training on resampled data if it is run
# afterwards. Trade-off: both the original and the resampled training sets stay
# in memory at the same time.
# NOTE(review): any later cell that expected x_train / y_train to hold the
# resampled data must use x_train_bal / y_train_bal instead — confirm none exists.
oversample = SMOTE(sampling_strategy='not majority', random_state=42)
x_train_bal, y_train_bal = shuffle(
    *oversample.fit_resample(x_train, y_train), random_state=42
)

# Class shares (in percent) after resampling.
proportions = y_train_bal.value_counts(normalize=True) * 100
print(proportions)

# Fit on the resampled training data and persist the fitted estimator.
model = RandomForestClassifier(random_state=42, n_jobs=-1)
model.fit(x_train_bal, y_train_bal)
dump(model, "/notebooks/Models/RF/RF_001_Balanced")

# Evaluate the estimator as read back from disk on the untouched test split.
model = load("/notebooks/Models/RF/RF_001_Balanced")
y_pred = model.predict(x_test)
print(classification_report(y_test, y_pred))

1    11.111111
5    11.111111
0    11.111111
6    11.111111
7    11.111111
8    11.111111
2    11.111111
4    11.111111
3    11.111111
Name: attack_cat, dtype: float64
              precision    recall  f1-score   support

           0       0.31      0.21      0.25       665
           1       0.13      0.09      0.11       583
           2       0.36      0.70      0.47      4088
           3       0.81      0.60      0.68     11057
           4       0.92      0.86      0.89      6056
           5       1.00      0.98      0.99     14755
           6       0.87      0.81      0.84      3496
           7       0.52      0.86      0.65       415
           8       0.37      0.56      0.44        54

    accuracy                           0.79     41169
   macro avg       0.59      0.63      0.59     41169
weighted avg       0.83      0.79      0.80     41169

