In [1]:
!pip install -q wandb

[0m

In [2]:
import torch
import torch.nn.functional as F
import torch.nn as nn
import torchvision
from torchvision import transforms
from torch.utils.data import TensorDataset, DataLoader
import pytorch_lightning as pl
import torchmetrics
import matplotlib.pyplot as plt
import wandb
import pandas as pd
import numpy as np
import seaborn as sns
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, recall_score, precision_score, f1_score

In [None]:
os.environ["WANDB_MODE"] = "offline"

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

In [None]:
pl.seed_everything(42, workers=True)

In [None]:
os.makedirs("checkpoints", exist_ok=True)

In [3]:
train_df = pd.read_csv('./train_data.csv')
val_df = pd.read_csv('./validation_data.csv')
test_df = pd.read_csv('./test_data.csv')
feature_cols = ['Packet speed', 'Data speed', 'Packet Rate', 'Data Rate', 'Packet Size to Speed Ratio', 'Data to Packet Ratio',
                'Duration', 'Hour sin', 'Hour cos', 'Day of Week','Unique ports', 'Std packet speed', 'Max packet speed', 
                'Std data speed', 'Max data speed', 'Std packet length', 'Max packet length',
                'Std source IP count', 'Max source IP count', 'Mean time diff', 'Std time diff','pca_1','pca_2']

features_selected = ['Data to Packet Ratio', 'Packet Size to Speed Ratio', 'Packet speed', 'Data speed', 'Max packet length', 'Hour cos', 'Day of Week', 'Std packet length', 'Unique ports']
X_train = train_df[features_selected]
y_train = train_df['Type']
X_val = val_df[features_selected]
y_val = val_df['Type']
X_test = test_df[features_selected]
y_test = test_df['Type']
le = LabelEncoder()
y_test = le.fit_transform(y_test)
y_train = le.fit_transform(y_train)
y_val = le.fit_transform(y_val)
scaler = StandardScaler()

In [4]:
batch_size = 256
num_epochs = 20
lr = 0.002
hidden_dim = 64

In [None]:
num_workers = min(8, os.cpu_count())
pin_memory = torch.cuda.is_available()

In [5]:
def create_dataloader(X, y, batch_size=batch_size, num_workers=num_workers, pin_memory=pin_memory, fit_scaler=False):
    if fit_scaler:
        scaler.fit(X)
    X = scaler.transform(X).astype('float32')
    X = torch.from_numpy(X)
    y = y.astype('long')
    y = torch.from_numpy(y)
    dataset = TensorDataset(X, y)
    return DataLoader(dataset, batch_size=batch_size, shuffle=True, num_workers=num_workers, pin_memory=pin_memory)

train_loader = create_dataloader(X_train, y_train, fit_scaler=True)
val_loader = create_dataloader(X_val, y_val)
test_loader = create_dataloader(X_test, y_test)

In [6]:
class FeedForwardNet(pl.LightningModule):
    def __init__(self, num_features, hidden_dim, lr, num_classes=3, dropout_prob=0.3):
        super().__init__()
        self.save_hyperparameters()
        self.lr = lr
        self.num_classes = num_classes

        self.layers = nn.Sequential(
            nn.Linear(num_features, hidden_dim),
            nn.BatchNorm1d(hidden_dim),
            nn.ReLU(),
            nn.Dropout(dropout_prob),

            nn.Linear(hidden_dim, hidden_dim),
            nn.BatchNorm1d(hidden_dim),
            nn.ReLU(),
            nn.Dropout(dropout_prob),

            nn.Linear(hidden_dim, num_classes)
        )

        self.loss = nn.CrossEntropyLoss()
        self.acc = torchmetrics.classification.Accuracy(task="multiclass", num_classes=num_classes)

    def forward(self, x):
        return self.layers(x)

    def _shared_eval_step(self, batch):
        x, y = batch
        logits = self(x)
        loss = self.loss(logits, y)
        preds = torch.argmax(logits, dim=1)
        acc = self.acc(preds, y)
        return loss, acc

    def training_step(self, batch, batch_idx):
        loss, acc = self._shared_eval_step(batch)
        self.log_dict({"loss": loss, "acc": acc}, prog_bar=True)
        return loss

    def validation_step(self, batch, batch_idx):
        loss, acc = self._shared_eval_step(batch)
        self.log_dict({"val_loss": loss, "val_acc": acc}, prog_bar=True)

    def test_step(self, batch, batch_idx):
        loss, acc = self._shared_eval_step(batch)
        self.log_dict({"test_loss": loss, "test_acc": acc})

    def configure_optimizers(self):
        return torch.optim.AdamW(self.parameters(), lr=self.lr)

In [7]:
# wandb.login()

[34m[1mwandb[0m: Currently logged in as: [33mbaczodomonkos[0m ([33mbaczodomonkos-budapesti-m-szaki-s-gazdas-gtudom-nyi-egyetem[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [9]:
torch.set_float32_matmul_precision('high')
model = FeedForwardNet(num_features=X_train.shape[1], hidden_dim=hidden_dim, lr=lr).to(device)
wandb_logger = pl.loggers.WandbLogger(project="ddos_classifier_dl2", log_model="all")
checkpoint_callback = pl.callbacks.ModelCheckpoint(
    monitor="val_acc", mode="max", save_top_k=1, dirpath="checkpoints", filename="best_model"
)

trainer = pl.Trainer(
    max_epochs=num_epochs,
    accelerator="gpu" if torch.cuda.is_available() else "cpu",
    devices=1 if torch.cuda.is_available() else None,
    precision="16-mixed" if torch.cuda.is_available() else 32,
    deterministic=True,
    logger=wandb_logger,
    callbacks=[checkpoint_callback],
)

trainer.fit(model, train_loader, val_loader)
trainer.test(model, test_loader)

Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]

  | Name   | Type               | Params | Mode 
------------------------------------------------------
0 | layers | Sequential         | 5.3 K  | train
1 | loss   | CrossEntropyLoss   | 0      | train
2 | acc    | MulticlassAccuracy | 0      | train
------------------------------------------------------
5.3 K     Trainable params
0         Non-trainable params
5.3 K     Total params
0.021     Total estimated model params size (MB)
12        Modules in train mode
0         Modules in eval mode
/opt/conda/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:476: Your `val_dataloader`'s sampler has shuffling enabled, it is strongly recommended that you turn shuffling off for val/test dataloaders.
`Trainer.fit` stopped: `max_epochs=10` reached.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]
/opt/conda/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:476: Your `test_dataloader`'s sam

0,1
acc,▃▂▁▁▄▅▆▆▆▄▄▅▅▄▃█▇▆▅▆▆█▇▃▅▅▇▆▆▄▇▅▆▅▄▄▆▇█▃
epoch,▁▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▃▃▄▄▄▅▅▅▅▆▆▆▆▆▆▆▆▇▇▇▇▇██
loss,▅▃▂▃▅▃▂▃▂▄▃▂▁▂▆▂▃▃▄▄▂▃▂▂▂▄▆▃▃▄█▂▄▂▂▂▃▃▄▂
test_acc,▁
test_loss,▁
trainer/global_step,▁▁▂▂▂▂▂▂▃▃▃▃▄▄▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▆▇▇▇▇▇▇██
val_acc,█▁▂▁▁▂▁▂▅▂
val_loss,▁▄▂▄▃▃█▅▂▆

0,1
acc,0.98438
epoch,10.0
loss,0.08363
test_acc,0.91398
test_loss,0.42754
trainer/global_step,41370.0
val_acc,0.94312
val_loss,0.23242


In [None]:
wandb.finish()

In [10]:
model.eval()
all_preds, all_targets = [], []

with torch.no_grad():
    for batch in test_loader:
        x, y = batch
        logits = model(x)
        preds = torch.argmax(logits, dim=1)
        all_preds.append(preds.cpu())
        all_targets.append(y.cpu())

y_pred = torch.cat(all_preds).numpy()
y_true = torch.cat(all_targets).numpy()
print(classification_report(y_true, y_pred))

              precision    recall  f1-score   support

           0       0.76      0.36      0.49     14733
           1       0.92      1.00      0.96    394482
           2       0.54      0.01      0.01     28424

    accuracy                           0.91    437639
   macro avg       0.74      0.46      0.49    437639
weighted avg       0.89      0.91      0.88    437639



## Hiperparaméter optimalizáció

In [None]:
RUN_SWEEP = False