In [1]:
import torch
import torch.nn.functional as F
import torch.nn as nn
import torchvision
from torchvision import transforms
from torch.utils.data import TensorDataset, DataLoader
import pytorch_lightning as pl
import torchmetrics
import matplotlib.pyplot as plt
import wandb
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, recall_score, precision_score, f1_score

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
import hiplot as hip
import plotly.express as px
from sklearn.cluster import DBSCAN
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, ConfusionMatrixDisplay

In [3]:
component_columns = [
    "Attack ID", "Detect count", "Card", "Victim IP", "Port number",
    "Attack code", "Significant flag", "Packet speed", "Data speed",
    "Avg packet len", "Source IP count", "Time"
]

# Events
event_columns = [
    "Attack ID", "Card", "Victim IP", "Port number", "Attack code",
    "Detect count", "Significant flag", "Packet speed", "Data speed",
    "Avg packet len", "Avg source IP count", "Start time", "End time", "Whitelist flag", "Type"
]

df_A_components = pd.read_csv("../datasets/SCLDDoS2024_SetA_components.csv")
df_A_events = pd.read_csv("../datasets/SCLDDoS2024_SetA_events.csv")

df_D_components = pd.read_csv("../datasets/SCLDDoS2024_SetD_components.csv")
df_D_events = pd.read_csv("../datasets/SCLDDoS2024_SetD_events.csv")

df_B_components = pd.read_csv("../datasets/SCLDDoS2024_SetB_components.csv", header=None, names=component_columns)
df_B_events = pd.read_csv("../datasets/SCLDDoS2024_SetB_events.csv", header=None, names=event_columns)

df_C_components = pd.read_csv("../datasets/SCLDDoS2024_SetC_components.csv", header=None, names=component_columns)
df_C_events = pd.read_csv("../datasets/SCLDDoS2024_SetC_events.csv", header=None, names=event_columns)

In [4]:
df_A_events = df_A_events.drop(df_A_events[df_A_events['End time']=="0"].index)
df_B_events = df_B_events.drop(df_B_events[df_B_events['End time']=="0"].index)
df_C_events = df_C_events.drop(df_C_events[df_C_events['End time']=="0"].index)
df_D_events = df_D_events.drop(df_D_events[df_D_events['End time']=="0"].index)
df_A_events['Avg packet len'] = df_A_events['Avg packet len'].replace(0, 1)
df_B_events['Avg packet len'] = df_B_events['Avg packet len'].replace(0, 1)
df_C_events['Avg packet len'] = df_C_events['Avg packet len'].replace(0, 1)
df_D_events['Avg packet len'] = df_D_events['Avg packet len'].replace(0, 1)
df_A_events['Start time']=pd.to_datetime(df_A_events['Start time'])
df_A_events['End time']=pd.to_datetime(df_A_events['End time'])
df_A_components['Time']=pd.to_datetime(df_A_components['Time'])
df_B_events['Start time']=pd.to_datetime(df_B_events['Start time'])
df_B_events['End time']=pd.to_datetime(df_B_events['End time'])
df_B_components['Time']=pd.to_datetime(df_B_components['Time'])
df_C_events['Start time']=pd.to_datetime(df_C_events['Start time'])
df_C_events['End time']=pd.to_datetime(df_C_events['End time'])
df_C_components['Time']=pd.to_datetime(df_C_components['Time'])
df_D_events['Start time']=pd.to_datetime(df_D_events['Start time'])
df_D_events['End time']=pd.to_datetime(df_D_events['End time'])
df_D_components['Time']=pd.to_datetime(df_D_components['Time'])
df_A_events = df_A_events.drop(columns=['Significant flag', 'Whitelist flag'])
df_A_components = df_A_components.drop(columns=['Significant flag'])
df_B_events = df_B_events.drop(columns=['Significant flag', 'Whitelist flag'])
df_B_components = df_B_components.drop(columns=['Significant flag'])
df_C_events = df_C_events.drop(columns=['Significant flag', 'Whitelist flag'])
df_C_components = df_C_components.drop(columns=['Significant flag'])
df_D_events = df_D_events.drop(columns=['Significant flag', 'Whitelist flag'])
df_D_components = df_D_components.drop(columns=['Significant flag'])
df_A_events = df_A_events.drop(columns=['Card'])
df_B_events = df_B_events.drop(columns=['Card'])
df_C_events = df_C_events.drop(columns=['Card'])
df_D_events = df_D_events.drop(columns=['Card'])
df_A_events['Duration']=(df_A_events['End time'] - df_A_events['Start time']).dt.total_seconds()
df_B_events['Duration']=(df_B_events['End time'] - df_B_events['Start time']).dt.total_seconds()
df_C_events['Duration']=(df_C_events['End time'] - df_C_events['Start time']).dt.total_seconds()
df_D_events['Duration']=(df_D_events['End time'] - df_D_events['Start time']).dt.total_seconds()

In [5]:
csvs = [df_A_events, df_B_events, df_C_events, df_D_events]
for df in csvs:
    #TIME FEATURES
    df['Start time'] = pd.to_datetime(df['Start time'])
    df['End time'] = pd.to_datetime(df['End time'])
    df['Hour'] = df['Start time'].dt.hour
    df['Hour sin'] = np.sin(df['Hour'] * (2. * np.pi / 24))
    df['Hour cos'] = np.cos(df['Hour'] * (2. * np.pi / 24))
    df['Day of Week'] = df['Start time'].dt.dayofweek
    df['Duration'] = (df['End time'] - df['Start time']).dt.total_seconds()
    #events under 1 second would be 0 duration, so we set them to 0.5
    df.loc[df['Duration'] == 0, 'Duration'] = 0.5

    #DERIVED FEATURES
    df['Packet Rate'] = df['Packet speed'] / df['Duration']
    df['Data Rate'] = df['Data speed'] / df['Duration']
    df['Packet Size to Speed Ratio'] = df['Avg packet len'] / df['Packet speed']
    df['Data to Packet Ratio'] = df['Data speed'] / df['Packet speed']  

features = [
    "Port number",
    "Detect count",  "Packet speed", "Data speed",
    "Avg packet len", "Avg source IP count", "Duration",
    'Packet Rate', 'Data Rate', 'Packet Size to Speed Ratio', 'Data to Packet Ratio',
    'Hour sin', 'Hour cos', 'Day of Week'
]
train_df = pd.concat([df_A_events, df_B_events])
val_df = df_C_events

In [7]:
features = [
    "Port number",
    "Detect count",  "Packet speed", "Data speed",
    "Avg packet len", "Avg source IP count", "Duration",
    'Packet Rate', 'Data Rate', 'Packet Size to Speed Ratio', 'Data to Packet Ratio',
    'Hour sin', 'Hour cos', 'Day of Week',
]
train_df = pd.concat([df_A_events, df_B_events])
val_df = df_C_events
test_df = df_D_events
X_train = train_df[features]
y_train = train_df['Type']
X_val = val_df[features]
y_val = val_df['Type']
X_test = test_df[features]
y_test = test_df['Type']
le = LabelEncoder()
y_test = le.fit_transform(y_test)
y_train = le.fit_transform(y_train)
y_val = le.fit_transform(y_val)
scaler = StandardScaler()

### 8. futás (legmagasabb test accuracy)

In [8]:
batch_size = 256
dropout_prob = 0.3
hidden_dim = 32
lr = 0.001
num_epochs = 5

In [9]:
def create_dataloader(X, y, batch_size):
  scaler.fit(X)
  X = scaler.transform(X).astype('float32')
  X = torch.from_numpy(X)
  y = y.astype('long')
  y = torch.from_numpy(y)
  dataset = TensorDataset(X, y)
  dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True, num_workers = 15)
  return dataloader

train_loader = create_dataloader(X_train, y_train, batch_size)
val_loader = create_dataloader(X_val, y_val, batch_size)
test_loader = create_dataloader(X_test, y_test, batch_size)

In [10]:
class FeedForwardNet(pl.LightningModule):
    def __init__(self, num_features, hidden_dim, lr, num_classes=3, dropout_prob=0.3):
        super(FeedForwardNet, self).__init__()
        self.save_hyperparameters()
        self.lr = lr
        self.num_classes = num_classes

        self.layers = nn.Sequential(
            nn.Linear(num_features, hidden_dim),
            nn.BatchNorm1d(hidden_dim),
            nn.ReLU(),
            nn.Dropout(dropout_prob),

            nn.Linear(hidden_dim, hidden_dim),
            nn.BatchNorm1d(hidden_dim),
            nn.ReLU(),
            nn.Dropout(dropout_prob),

            nn.Linear(hidden_dim, num_classes)  # output layer (logits)
        )

        self.loss = nn.CrossEntropyLoss()
        self.acc = torchmetrics.classification.Accuracy(task="multiclass", num_classes=num_classes)

    def forward(self, x):
        return self.layers(x)

    def training_step(self, batch, batch_idx):
        loss, acc = self._shared_eval_step(batch)
        self.log_dict({"loss": loss, "acc": acc}, prog_bar=True)
        return loss

    def validation_step(self, batch, batch_idx):
        loss, acc = self._shared_eval_step(batch)
        self.log_dict({"val_loss": loss, "val_acc": acc}, prog_bar=False)
        return {"val_loss": loss, "val_acc": acc}

    def test_step(self, batch, batch_idx):
        loss, acc = self._shared_eval_step(batch)
        self.log_dict({"test_loss": loss, "test_acc": acc})
        return {"test_loss": loss, "test_acc": acc}

    def _shared_eval_step(self, batch):
        x, y = batch
        logits = self(x)
        loss = self.loss(logits, y)
        preds = torch.argmax(logits, dim=1)
        acc = self.acc(preds, y)
        return loss, acc

    def configure_optimizers(self):
        return torch.optim.AdamW(self.parameters(), lr=self.lr)

In [11]:
wandb.login()

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter:

  ········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mbaczodomonkos[0m ([33mbaczodomonkos-budapesti-m-szaki-s-gazdas-gtudom-nyi-egyetem[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [12]:
torch.set_float32_matmul_precision('high')

In [13]:
model = FeedForwardNet(X_train.shape[1], hidden_dim, lr)
wandb_logger = pl.loggers.WandbLogger(project="ddos_classifier_best", log_model="all")
checkpoint_callback = pl.callbacks.ModelCheckpoint(monitor="val_acc", mode="max")
trainer = pl.Trainer(
    max_epochs=num_epochs,
    precision="16-mixed",
    accelerator="gpu",
    devices=1,
    logger=wandb_logger,
    callbacks=[checkpoint_callback],
)
trainer.fit(model, train_loader, val_loader)
trainer.test(model, test_loader)
wandb.finish()

Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]

  | Name   | Type               | Params | Mode 
------------------------------------------------------
0 | layers | Sequential         | 1.8 K  | train
1 | loss   | CrossEntropyLoss   | 0      | train
2 | acc    | MulticlassAccuracy | 0      | train
------------------------------------------------------
1.8 K     Trainable params
0         Non-trainable params
1.8 K     Total params
0.007     Total estimated model params size (MB)
12        Modules in train mode
0         Modules in eval mode


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

/opt/conda/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:476: Your `val_dataloader`'s sampler has shuffling enabled, it is strongly recommended that you turn shuffling off for val/test dataloaders.


Epoch 0: 100%|██████████| 1035/1035 [00:15<00:00, 68.83it/s, v_num=64wc, loss=0.102, acc=0.968]
Validation: |          | 0/? [00:00<?, ?it/s][A
Validation:   0%|          | 0/508 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|          | 0/508 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|          | 1/508 [00:00<00:07, 66.82it/s][A
Validation DataLoader 0:   0%|          | 2/508 [00:00<00:07, 68.81it/s][A
Validation DataLoader 0:   1%|          | 3/508 [00:00<00:06, 77.99it/s][A
Validation DataLoader 0:   1%|          | 4/508 [00:00<00:06, 82.62it/s][A
Validation DataLoader 0:   1%|          | 5/508 [00:00<00:05, 84.54it/s][A
Validation DataLoader 0:   1%|          | 6/508 [00:00<00:05, 83.92it/s][A
Validation DataLoader 0:   1%|▏         | 7/508 [00:00<00:05, 83.58it/s][A
Validation DataLoader 0:   2%|▏         | 8/508 [00:00<00:05, 83.77it/s][A
Validation DataLoader 0:   2%|▏         | 9/508 [00:00<00:05, 86.41it/s][A
Validation DataLoader 0:   2%|▏         | 10/508

`Trainer.fit` stopped: `max_epochs=5` reached.


Epoch 4: 100%|██████████| 1035/1035 [00:22<00:00, 46.01it/s, v_num=64wc, loss=0.0927, acc=0.952]


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]
/opt/conda/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:476: Your `test_dataloader`'s sampler has shuffling enabled, it is strongly recommended that you turn shuffling off for val/test dataloaders.


Testing DataLoader 0: 100%|██████████| 1710/1710 [00:16<00:00, 105.52it/s]


0,1
acc,▃▁▅▃▃▄▂▇▃▆▄▇█▃▄▅▇▅▇▅▁▆▇▃▅▁▅▅▆▁█▇▃▃▅▄▄▇▇▇
epoch,▁▁▁▁▁▁▁▁▁▁▁▃▃▃▃▃▃▃▃▅▅▅▅▅▅▅▆▆▆▆▆▆▆███████
loss,█▇▄▄▆▂▅▄▅▆▃▅▄▃▄▅▂▂▆▅▂▂▆▃▃▄▃▄▅▂▂▄▃▃▅▁▂▁▁▃
test_acc,▁
test_loss,▁
trainer/global_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▅▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇██
val_acc,▂█▇▁▁
val_loss,▁▃▃▇█

0,1
acc,0.96484
epoch,5.0
loss,0.0895
test_acc,0.92399
test_loss,0.42222
trainer/global_step,5175.0
val_acc,0.96516
val_loss,0.14908


In [14]:
all_preds = []
all_targets = []

model.eval()
with torch.no_grad():
    for batch in test_loader:
        x, y = batch
        logits = model(x)
        preds = torch.argmax(logits, dim=1)

        all_preds.append(preds.cpu())
        all_targets.append(y.cpu())

y_pred = torch.cat(all_preds).numpy()
y_true = torch.cat(all_targets).numpy()
print(classification_report(y_true, y_pred))

              precision    recall  f1-score   support

           0       0.83      0.57      0.67     14733
           1       0.93      1.00      0.96    394482
           2       0.67      0.06      0.10     28424

    accuracy                           0.92    437639
   macro avg       0.81      0.54      0.58    437639
weighted avg       0.91      0.92      0.90    437639

