In [1]:
!pip install imbalanced-learn > /dev/null 2>&1

In [2]:
import pandas as pd
import random
import numpy as np
import matplotlib.pyplot as plt
import os
import pickle
import tensorflow as tf

from joblib import dump, load

from imblearn.over_sampling import SMOTE

from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import classification_report, confusion_matrix

from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import load_model

## 0.03

In [3]:
train = pd.read_csv("/notebooks/FinalDataset/top003_train_encoded.csv")
test = pd.read_csv("/notebooks/FinalDataset/top003_test_encoded.csv")

x_train = train.drop('attack_cat', axis=1)
y_train = train['attack_cat']

x_test = test.drop('attack_cat', axis=1)
y_test = test['attack_cat']

del train, test

print(x_train.shape)
print(y_train.shape)
print(x_test.shape)
print(y_test.shape)

(123504, 22)
(123504,)
(41169, 22)
(41169,)


### Unbalanced

In [4]:
y_train_ohe = to_categorical(y_train)
y_test_ohe = to_categorical(y_test)

proportions = y_train.value_counts(normalize=True) * 100
print(proportions)

model = Sequential([
    Dense(64, activation='relu', input_shape=(x_train.shape[1],)),
    Dense(64, activation='relu'),
    Dense(y_train_ohe.shape[1], activation='softmax')
])

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
model.fit(x_train, y_train_ohe, validation_data=(x_test, y_test_ohe), epochs=5000, callbacks=[early_stopping])
model.save('/notebooks/Models/MLP/MLP_003_Unbalanced.h5')

model = load_model("/notebooks/Models/MLP/MLP_003_Unbalanced.h5")
y_pred = model.predict(x_test)
y_pred = np.argmax(y_pred, axis=1)
print(classification_report(y_test, y_pred))

5    35.720301
3    27.098717
4    14.728268
2     9.930852
6     8.494462
0     1.629097
1     1.413719
7     0.887421
8     0.097163
Name: attack_cat, dtype: float64
Epoch 1/5000
Epoch 2/5000
Epoch 3/5000
Epoch 4/5000
Epoch 5/5000
Epoch 6/5000
Epoch 7/5000
Epoch 8/5000
Epoch 9/5000
Epoch 10/5000
Epoch 11/5000
Epoch 12/5000
Epoch 13/5000
Epoch 14/5000
Epoch 15/5000
Epoch 16/5000
Epoch 17/5000
Epoch 18/5000
Epoch 19/5000
Epoch 20/5000
Epoch 21/5000
Epoch 22/5000
Epoch 23/5000
Epoch 24/5000
Epoch 25/5000
Epoch 26/5000
Epoch 27/5000
Epoch 28/5000
Epoch 29/5000
Epoch 30/5000
Epoch 31/5000
Epoch 32/5000
Epoch 33/5000
              precision    recall  f1-score   support

           0       0.53      0.11      0.18       665
           1       0.00      0.00      0.00       583
           2       0.41      0.08      0.13      4088
           3       0.58      0.84      0.69     11057
           4       0.73      0.77      0.75      6056
           5       1.00      0.97      0.99     14755


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### Balanced

In [5]:
oversample = SMOTE(sampling_strategy='not majority', random_state=42)
x_train, y_train = oversample.fit_resample(x_train, y_train)
x_train, y_train = shuffle(x_train, y_train, random_state=42)

y_train_ohe = to_categorical(y_train)
y_test_ohe = to_categorical(y_test)

proportions = y_train.value_counts(normalize=True) * 100
print(proportions)

model = Sequential([
    Dense(64, activation='relu', input_shape=(x_train.shape[1],)),
    Dense(64, activation='relu'),
    Dense(y_train_ohe.shape[1], activation='softmax')
])

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
model.fit(x_train, y_train_ohe, validation_data=(x_test, y_test_ohe), epochs=5000, callbacks=[early_stopping])
model.save('/notebooks/Models/MLP/MLP_003_Balanced.h5')

model = load_model("/notebooks/Models/MLP/MLP_003_Balanced.h5")
y_pred = model.predict(x_test)
y_pred = np.argmax(y_pred, axis=1)
print(classification_report(y_test, y_pred))

1    11.111111
5    11.111111
0    11.111111
6    11.111111
7    11.111111
8    11.111111
2    11.111111
4    11.111111
3    11.111111
Name: attack_cat, dtype: float64
Epoch 1/5000
Epoch 2/5000
Epoch 3/5000
Epoch 4/5000
Epoch 5/5000
Epoch 6/5000
Epoch 7/5000
Epoch 8/5000
Epoch 9/5000
Epoch 10/5000
Epoch 11/5000
Epoch 12/5000
Epoch 13/5000
Epoch 14/5000
Epoch 15/5000
Epoch 16/5000
Epoch 17/5000
              precision    recall  f1-score   support

           0       0.09      0.27      0.13       665
           1       0.06      0.58      0.11       583
           2       0.34      0.13      0.19      4088
           3       0.84      0.29      0.43     11057
           4       0.70      0.77      0.73      6056
           5       1.00      0.97      0.99     14755
           6       0.54      0.62      0.57      3496
           7       0.11      0.62      0.19       415
           8       0.04      0.74      0.07        54

    accuracy                           0.62     41169
   macr

## 0.02

In [6]:
train = pd.read_csv("/notebooks/FinalDataset/top002_train_encoded.csv")
test = pd.read_csv("/notebooks/FinalDataset/top002_test_encoded.csv")

x_train = train.drop('attack_cat', axis=1)
y_train = train['attack_cat']

x_test = test.drop('attack_cat', axis=1)
y_test = test['attack_cat']

del train, test

print(x_train.shape)
print(y_train.shape)
print(x_test.shape)
print(y_test.shape)

(123504, 154)
(123504,)
(41169, 154)
(41169,)


## Unbalanced

In [7]:
y_train_ohe = to_categorical(y_train)
y_test_ohe = to_categorical(y_test)

proportions = y_train.value_counts(normalize=True) * 100
print(proportions)

model = Sequential([
    Dense(64, activation='relu', input_shape=(x_train.shape[1],)),
    Dense(64, activation='relu'),
    Dense(y_train_ohe.shape[1], activation='softmax')
])

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
model.fit(x_train, y_train_ohe, validation_data=(x_test, y_test_ohe), epochs=5000, callbacks=[early_stopping])
model.save('/notebooks/Models/MLP/MLP_002_Unbalanced.h5')

model = load_model("/notebooks/Models/MLP/MLP_002_Unbalanced.h5")
y_pred = model.predict(x_test)
y_pred = np.argmax(y_pred, axis=1)
print(classification_report(y_test, y_pred))

5    35.720301
3    27.098717
4    14.728268
2     9.930852
6     8.494462
0     1.629097
1     1.413719
7     0.887421
8     0.097163
Name: attack_cat, dtype: float64
Epoch 1/5000
Epoch 2/5000
Epoch 3/5000
Epoch 4/5000
Epoch 5/5000
Epoch 6/5000
Epoch 7/5000
Epoch 8/5000
Epoch 9/5000
Epoch 10/5000
Epoch 11/5000
Epoch 12/5000
Epoch 13/5000
Epoch 14/5000
Epoch 15/5000
Epoch 16/5000
Epoch 17/5000
Epoch 18/5000
Epoch 19/5000
Epoch 20/5000
Epoch 21/5000
Epoch 22/5000
Epoch 23/5000
Epoch 24/5000
Epoch 25/5000
Epoch 26/5000
Epoch 27/5000
Epoch 28/5000
Epoch 29/5000
Epoch 30/5000
Epoch 31/5000
Epoch 32/5000
Epoch 33/5000
Epoch 34/5000
Epoch 35/5000
Epoch 36/5000
Epoch 37/5000
Epoch 38/5000
Epoch 39/5000
Epoch 40/5000
              precision    recall  f1-score   support

           0       0.66      0.14      0.22       665
           1       0.50      0.01      0.01       583
           2       0.41      0.13      0.20      4088
           3       0.61      0.89      0.73     11057
          

### Balanced

In [8]:
oversample = SMOTE(sampling_strategy='not majority', random_state=42)
x_train, y_train = oversample.fit_resample(x_train, y_train)
x_train, y_train = shuffle(x_train, y_train, random_state=42)

y_train_ohe = to_categorical(y_train)
y_test_ohe = to_categorical(y_test)

proportions = y_train.value_counts(normalize=True) * 100
print(proportions)

model = Sequential([
    Dense(64, activation='relu', input_shape=(x_train.shape[1],)),
    Dense(64, activation='relu'),
    Dense(y_train_ohe.shape[1], activation='softmax')
])

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
model.fit(x_train, y_train_ohe, validation_data=(x_test, y_test_ohe), epochs=5000, callbacks=[early_stopping])
model.save('/notebooks/Models/MLP/MLP_002_Balanced.h5')

model = load_model("/notebooks/Models/MLP/MLP_002_Balanced.h5")
y_pred = model.predict(x_test)
y_pred = np.argmax(y_pred, axis=1)
print(classification_report(y_test, y_pred))

1    11.111111
5    11.111111
0    11.111111
6    11.111111
7    11.111111
8    11.111111
2    11.111111
4    11.111111
3    11.111111
Name: attack_cat, dtype: float64
Epoch 1/5000
Epoch 2/5000
Epoch 3/5000
Epoch 4/5000
Epoch 5/5000
Epoch 6/5000
Epoch 7/5000
Epoch 8/5000
Epoch 9/5000
Epoch 10/5000
Epoch 11/5000
Epoch 12/5000
Epoch 13/5000
Epoch 14/5000
Epoch 15/5000
Epoch 16/5000
              precision    recall  f1-score   support

           0       0.09      0.29      0.14       665
           1       0.06      0.25      0.10       583
           2       0.35      0.52      0.42      4088
           3       0.88      0.43      0.58     11057
           4       0.88      0.78      0.83      6056
           5       1.00      0.97      0.99     14755
           6       0.78      0.81      0.80      3496
           7       0.30      0.69      0.41       415
           8       0.06      0.89      0.11        54

    accuracy                           0.72     41169
   macro avg       0.

## 0.01

In [9]:
train = pd.read_csv("/notebooks/FinalDataset/top001_train_encoded.csv")
test = pd.read_csv("/notebooks/FinalDataset/top001_test_encoded.csv")

x_train = train.drop('attack_cat', axis=1)
y_train = train['attack_cat']

x_test = test.drop('attack_cat', axis=1)
y_test = test['attack_cat']

del train, test

print(x_train.shape)
print(y_train.shape)
print(x_test.shape)
print(y_test.shape)

(123504, 168)
(123504,)
(41169, 168)
(41169,)


### Unbalanced

In [10]:
y_train_ohe = to_categorical(y_train)
y_test_ohe = to_categorical(y_test)

proportions = y_train.value_counts(normalize=True) * 100
print(proportions)

model = Sequential([
    Dense(64, activation='relu', input_shape=(x_train.shape[1],)),
    Dense(64, activation='relu'),
    Dense(y_train_ohe.shape[1], activation='softmax')
])

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
model.fit(x_train, y_train_ohe, validation_data=(x_test, y_test_ohe), epochs=5000, callbacks=[early_stopping])
model.save('/notebooks/Models/MLP/MLP_001_Unbalanced.h5')

model = load_model("/notebooks/Models/MLP/MLP_001_Unbalanced.h5")
y_pred = model.predict(x_test)
y_pred = np.argmax(y_pred, axis=1)
print(classification_report(y_test, y_pred))

5    35.720301
3    27.098717
4    14.728268
2     9.930852
6     8.494462
0     1.629097
1     1.413719
7     0.887421
8     0.097163
Name: attack_cat, dtype: float64
Epoch 1/5000
Epoch 2/5000
Epoch 3/5000
Epoch 4/5000
Epoch 5/5000
Epoch 6/5000
Epoch 7/5000
Epoch 8/5000
Epoch 9/5000
Epoch 10/5000
Epoch 11/5000
Epoch 12/5000
Epoch 13/5000
Epoch 14/5000
Epoch 15/5000
Epoch 16/5000
Epoch 17/5000
Epoch 18/5000
Epoch 19/5000
Epoch 20/5000
Epoch 21/5000
Epoch 22/5000
Epoch 23/5000
Epoch 24/5000
Epoch 25/5000
Epoch 26/5000
Epoch 27/5000
Epoch 28/5000
              precision    recall  f1-score   support

           0       0.69      0.14      0.23       665
           1       1.00      0.00      0.00       583
           2       0.44      0.07      0.12      4088
           3       0.60      0.91      0.73     11057
           4       0.83      0.82      0.83      6056
           5       1.00      0.97      0.99     14755
           6       0.79      0.67      0.73      3496
           7    

### Balanced

In [11]:
oversample = SMOTE(sampling_strategy='not majority', random_state=42)
x_train, y_train = oversample.fit_resample(x_train, y_train)
x_train, y_train = shuffle(x_train, y_train, random_state=42)

y_train_ohe = to_categorical(y_train)
y_test_ohe = to_categorical(y_test)

proportions = y_train.value_counts(normalize=True) * 100
print(proportions)

model = Sequential([
    Dense(64, activation='relu', input_shape=(x_train.shape[1],)),
    Dense(64, activation='relu'),
    Dense(y_train_ohe.shape[1], activation='softmax')
])

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
model.fit(x_train, y_train_ohe, validation_data=(x_test, y_test_ohe), epochs=5000, callbacks=[early_stopping])
model.save('/notebooks/Models/MLP/MLP_001_Balanced.h5')

model = load_model("/notebooks/Models/MLP/MLP_001_Balanced.h5")
y_pred = model.predict(x_test)
y_pred = np.argmax(y_pred, axis=1)
print(classification_report(y_test, y_pred))

1    11.111111
5    11.111111
0    11.111111
6    11.111111
7    11.111111
8    11.111111
2    11.111111
4    11.111111
3    11.111111
Name: attack_cat, dtype: float64
Epoch 1/5000
Epoch 2/5000
Epoch 3/5000
Epoch 4/5000
Epoch 5/5000
Epoch 6/5000
Epoch 7/5000
Epoch 8/5000
Epoch 9/5000
Epoch 10/5000
Epoch 11/5000
Epoch 12/5000
Epoch 13/5000
Epoch 14/5000
Epoch 15/5000
Epoch 16/5000
Epoch 17/5000
              precision    recall  f1-score   support

           0       0.11      0.23      0.15       665
           1       0.06      0.33      0.10       583
           2       0.35      0.52      0.42      4088
           3       0.90      0.42      0.58     11057
           4       0.87      0.81      0.84      6056
           5       1.00      0.97      0.99     14755
           6       0.76      0.80      0.78      3496
           7       0.29      0.65      0.40       415
           8       0.06      0.76      0.12        54

    accuracy                           0.72     41169
   macr