In [11]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import QuantileTransformer
from sklearn.utils.class_weight import compute_class_weight
from imblearn.over_sampling import SMOTE
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import Adam

In [5]:
import pandas as pd
from datasets import load_dataset

ds = load_dataset("agrawalchaitany/cyberbert_dataset")
df = ds['train'].to_pandas()

  from .autonotebook import tqdm as notebook_tqdm
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Generating train split: 100%|██████████| 2522362/2522362 [00:26<00:00, 95065.83 examples/s] 


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2522362 entries, 0 to 2522361
Data columns (total 79 columns):
 #   Column                       Dtype  
---  ------                       -----  
 0   Destination Port             float64
 1   Flow Duration                float64
 2   Total Fwd Packets            float64
 3   Total Backward Packets       float64
 4   Total Length of Fwd Packets  float64
 5   Total Length of Bwd Packets  float64
 6   Fwd Packet Length Max        float64
 7   Fwd Packet Length Min        float64
 8   Fwd Packet Length Mean       float64
 9   Fwd Packet Length Std        float64
 10  Bwd Packet Length Max        float64
 11  Bwd Packet Length Min        float64
 12  Bwd Packet Length Mean       float64
 13  Bwd Packet Length Std        float64
 14  Flow Bytes/s                 float64
 15  Flow Packets/s               float64
 16  Flow IAT Mean                float64
 17  Flow IAT Std                 float64
 18  Flow IAT Max                 float64
 19  

In [8]:
df.columns

Index(['Destination Port', 'Flow Duration', 'Total Fwd Packets',
       'Total Backward Packets', 'Total Length of Fwd Packets',
       'Total Length of Bwd Packets', 'Fwd Packet Length Max',
       'Fwd Packet Length Min', 'Fwd Packet Length Mean',
       'Fwd Packet Length Std', 'Bwd Packet Length Max',
       'Bwd Packet Length Min', 'Bwd Packet Length Mean',
       'Bwd Packet Length Std', 'Flow Bytes/s', 'Flow Packets/s',
       'Flow IAT Mean', 'Flow IAT Std', 'Flow IAT Max', 'Flow IAT Min',
       'Fwd IAT Total', 'Fwd IAT Mean', 'Fwd IAT Std', 'Fwd IAT Max',
       'Fwd IAT Min', 'Bwd IAT Total', 'Bwd IAT Mean', 'Bwd IAT Std',
       'Bwd IAT Max', 'Bwd IAT Min', 'Fwd PSH Flags', 'Bwd PSH Flags',
       'Fwd URG Flags', 'Bwd URG Flags', 'Fwd Header Length',
       'Bwd Header Length', 'Fwd Packets/s', 'Bwd Packets/s',
       'Min Packet Length', 'Max Packet Length', 'Packet Length Mean',
       'Packet Length Std', 'Packet Length Variance', 'FIN Flag Count',
       'SYN Flag Co

In [9]:
df.head()

Unnamed: 0,Destination Port,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,...,min_seg_size_forward,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,54865.0,3.0,2.0,0.0,12.0,0.0,6.0,6.0,6.0,0.0,...,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,BENIGN
1,55054.0,109.0,1.0,1.0,6.0,6.0,6.0,6.0,6.0,0.0,...,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,BENIGN
2,55055.0,52.0,1.0,1.0,6.0,6.0,6.0,6.0,6.0,0.0,...,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,BENIGN
3,46236.0,34.0,1.0,1.0,6.0,6.0,6.0,6.0,6.0,0.0,...,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,BENIGN
4,54863.0,3.0,2.0,0.0,12.0,0.0,6.0,6.0,6.0,0.0,...,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,BENIGN


In [None]:
# Drop ID, timestamp, duplicate and low-variance features
drop_cols = [
    'Flow ID','Source IP','Source Port','Destination IP','Timestamp',
    'Destination Port',  
    'Fwd Header Length.1',
    'Average Packet Size', 'Packet Length Mean',
    'Fwd Packet Length Mean','Avg Fwd Segment Size',
    'Bwd Packet Length Mean','Avg Bwd Segment Size',
    'Flow Bytes/s','Flow Packets/s'
]
for col in drop_cols:
    if col in df.columns:
        df.drop(col, axis=1, inplace=True)
print('Shape after drop:', df.shape)

Shape after drop: (2522362, 69)


In [13]:
# Prepare binary classification: BENIGN vs Attack
df['is_attack'] = np.where(df['Label'] == 'BENIGN', 0, 1)
X = df.drop(['Label','is_attack'], axis=1)
y = df['is_attack']
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)
print('Binary train class distribution:', np.bincount(y_train))

Binary train class distribution: [1677187  340702]


In [14]:
# Scale features using QuantileTransformer
scaler = QuantileTransformer(output_distribution='normal')
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

In [15]:
# Compute class weights for binary classification
class_weights = compute_class_weight(
    class_weight='balanced', classes=np.unique(y_train), y=y_train
)
class_weights = dict(enumerate(class_weights))
print('Class weights:', class_weights)

Class weights: {0: np.float64(0.6015694731714472), 1: np.float64(2.96136946657196)}


In [16]:
# Build binary ANN
model_bin = Sequential([
    Dense(64, activation='relu', input_shape=(X_train_scaled.shape[1],)),
    Dropout(0.2),
    Dense(32, activation='relu'),
    Dropout(0.2),
    Dense(16, activation='relu'),
    Dropout(0.2),
    Dense(1, activation='sigmoid')
])
model_bin.compile(
    optimizer=Adam(learning_rate=0.001),
    loss='binary_crossentropy',
    metrics=['Recall', 'Precision']
)
es = EarlyStopping(monitor='val_recall', patience=5, mode='max', restore_best_weights=True)
history_bin = model_bin.fit(
    X_train_scaled, y_train,
    validation_data=(X_val_scaled, y_val),
    epochs=30, batch_size=1024,
    class_weight=class_weights,
    callbacks=[es]
)

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/30
[1m1971/1971[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 3ms/step - Precision: 0.7702 - Recall: 0.9284 - loss: 0.1696 - val_Precision: 0.9685 - val_Recall: 0.9812 - val_loss: 0.0289
Epoch 2/30
[1m  14/1971[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m7s[0m 4ms/step - Precision: 0.9583 - Recall: 0.9804 - loss: 0.0419    

  current = self.get_monitor_value(logs)


[1m1971/1971[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 3ms/step - Precision: 0.9472 - Recall: 0.9841 - loss: 0.0430 - val_Precision: 0.9591 - val_Recall: 0.9879 - val_loss: 0.0312
Epoch 3/30
[1m1971/1971[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 3ms/step - Precision: 0.9556 - Recall: 0.9861 - loss: 0.0361 - val_Precision: 0.9795 - val_Recall: 0.9866 - val_loss: 0.0182
Epoch 4/30
[1m1971/1971[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 3ms/step - Precision: 0.9622 - Recall: 0.9882 - loss: 0.0317 - val_Precision: 0.9802 - val_Recall: 0.9920 - val_loss: 0.0188
Epoch 5/30
[1m1971/1971[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 3ms/step - Precision: 0.9634 - Recall: 0.9889 - loss: 0.0301 - val_Precision: 0.9848 - val_Recall: 0.9903 - val_loss: 0.0182
Epoch 6/30
[1m1971/1971[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 3ms/step - Precision: 0.9679 - Recall: 0.9894 - loss: 0.0276 - val_Precision: 0.9802 - val_Recall: 0.9911 - val_loss

In [17]:
print(model_bin.evaluate(X_val_scaled, y_val, verbose=0))

[0.014066895470023155, 0.9934605956077576, 0.9869256019592285]


In [18]:
# Prepare multi-class classification
df_attacks = df[df['is_attack'] == 1].copy()
X2 = df_attacks.drop(['Label','is_attack'], axis=1)
y2 = df_attacks['Label']
X2_train, X2_val, y2_train, y2_val = train_test_split(
    X2, y2, test_size=0.2, stratify=y2, random_state=42
)
print('Multi-class train distribution:')
print(y2_train.value_counts())

Multi-class train distribution:
Label
DoS Hulk                      138279
DDoS                          102413
PortScan                       72655
DoS GoldenEye                   8229
FTP-Patator                     4746
DoS slowloris                   4308
DoS Slowhttptest                4182
SSH-Patator                     2575
Bot                             1562
Web Attack � Brute Force        1176
Web Attack � XSS                 522
Infiltration                      29
Web Attack � Sql Injection        17
Heartbleed                         9
Name: count, dtype: int64


In [19]:
# Scale features for multi-class
X2_train_scaled = scaler.fit_transform(X2_train)
X2_val_scaled = scaler.transform(X2_val)

In [None]:
# SMOTE oversampling
#smote = SMOTE(random_state=42)
#X2_res, y2_res = smote.fit_resample(X2_train_scaled, y2_train)
#print('After SMOTE:', pd.Series(y2_res).value_counts())

After SMOTE: Label
DDoS                          138279
DoS Hulk                      138279
PortScan                      138279
DoS slowloris                 138279
DoS GoldenEye                 138279
FTP-Patator                   138279
Web Attack � XSS              138279
Web Attack � Brute Force      138279
SSH-Patator                   138279
DoS Slowhttptest              138279
Bot                           138279
Web Attack � Sql Injection    138279
Infiltration                  138279
Heartbleed                    138279
Name: count, dtype: int64


In [25]:
# Encode labels
labels = sorted(y2_train.unique())
label_to_idx = {label: idx for idx, label in enumerate(labels)}
y2_res_idx = np.array([label_to_idx[l] for l in y2_train])
y2_val_idx = np.array([label_to_idx[l] for l in y2_val])
y2_res_cat = to_categorical(y2_res_idx, num_classes=len(labels))

In [26]:
# Compute class weights for multi-class
cw_multi = compute_class_weight(
    class_weight='balanced', classes=np.arange(len(labels)), y=y2_res_idx
)
cw_multi = dict(enumerate(cw_multi))
print('Multi-class weights:', cw_multi)

Multi-class weights: {0: np.float64(15.579934150356685), 1: np.float64(0.23762468771403183), 2: np.float64(2.957328611357047), 3: np.float64(0.1759909830332671), 4: np.float64(5.819191091070574), 5: np.float64(5.648991908741213), 6: np.float64(5.127656372283426), 7: np.float64(2703.9841269841268), 8: np.float64(839.167487684729), 9: np.float64(0.3349508931643678), 10: np.float64(9.450818307905687), 11: np.float64(20.693756073858115), 12: np.float64(1431.5210084033613), 13: np.float64(46.62041598248495)}


In [27]:
# Build multi-class ANN
model_multi = Sequential([
    Dense(128, activation='relu', input_shape=(X2_res.shape[1],)),
    Dropout(0.3),
    Dense(64, activation='relu'),
    Dropout(0.3),
    Dense(32, activation='relu'),
    Dropout(0.3),
    Dense(len(labels), activation='softmax')
])
model_multi.compile(
    optimizer=Adam(learning_rate=0.001),
    loss='categorical_crossentropy',
    metrics=['Recall', 'Precision']
)
es2 = EarlyStopping(monitor='val_recall', patience=5, mode='max', restore_best_weights=True)
history_multi = model_multi.fit(
    X2_train_scaled, y2_res_cat,
    validation_data=(X2_val_scaled, to_categorical(y2_val_idx, num_classes=len(labels))),
    epochs=30, batch_size=512,
    class_weight=cw_multi,
    callbacks=[es2]
)

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/30
[1m666/666[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4ms/step - Precision: 0.7593 - Recall: 0.2992 - loss: 2.3484 - val_Precision: 0.9810 - val_Recall: 0.8316 - val_loss: 0.3800
Epoch 2/30
[1m 39/666[0m [32m━[0m[37m━━━━━━━━━━━━━━━━━━━[0m [1m1s[0m 3ms/step - Precision: 0.8735 - Recall: 0.6676 - loss: 0.9938

  current = self.get_monitor_value(logs)


[1m666/666[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - Precision: 0.8900 - Recall: 0.6632 - loss: 1.3039 - val_Precision: 0.9899 - val_Recall: 0.8727 - val_loss: 0.3446
Epoch 3/30
[1m666/666[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - Precision: 0.8831 - Recall: 0.7259 - loss: 1.0813 - val_Precision: 0.9874 - val_Recall: 0.9385 - val_loss: 0.2241
Epoch 4/30
[1m666/666[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - Precision: 0.8947 - Recall: 0.7923 - loss: 0.8744 - val_Precision: 0.9840 - val_Recall: 0.9269 - val_loss: 0.2598
Epoch 5/30
[1m666/666[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - Precision: 0.9181 - Recall: 0.8334 - loss: 0.7944 - val_Precision: 0.9880 - val_Recall: 0.9332 - val_loss: 0.2113
Epoch 6/30
[1m666/666[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - Precision: 0.9368 - Recall: 0.8625 - loss: 0.6567 - val_Precision: 0.9906 - val_Recall: 0.9518 - val_loss: 0.1386
E

In [24]:
print(model_multi.evaluate(X2_val_scaled, to_categorical(y2_val_idx, num_classes=len(labels)), verbose=0))

[0.0129367271438241, 0.995233416557312, 0.9953736066818237]
