In [1]:
from pathlib import Path
import torch
import torchmetrics
from torch import nn
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import LabelEncoder
import pandas as pd
import numpy as np

## Create Dataset

### Define Dataset class

In [2]:
class CICIDSDataset(Dataset):
    """CIC-IDS-2017 Dataset."""

    def __init__(self, dataframe: pd.DataFrame, transform=None):
        self.dataframe = dataframe
        self.transform = transform

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()

        features = self.dataframe.iloc[idx, :-1]
        features = np.array(features).astype('float32')
        label = self.dataframe.iloc[idx, -1]

        sample = (features, label)

        if self.transform:
            sample = self.transform(sample)

        return sample

### Define Dataset transformer

In [3]:
class myToTensor:
    """Convert ndarrays in sample to Tensors."""

    def __call__(self, sample: tuple):
        features, label = sample
        features = torch.from_numpy(features)
        return (features, label)

### Load CSV & Standardize features & Convert labels

In [4]:
csv_dir_path = Path('../../datasets/CIC-IDS-2017/MachineLearningCSV/MachineLearningCVE')

df_train = pd.read_csv(csv_dir_path / 'train.csv')
df_test = pd.read_csv(csv_dir_path / 'test.csv')

# Standardize features
epsilon = 1e-7  # avoid zero division
feature_columns = df_train.columns[df_train.columns != 'Label']
df_train[feature_columns] = (df_train[feature_columns] - df_train[feature_columns].mean()) / (df_train[feature_columns].std() + epsilon)
df_test[feature_columns] = (df_test[feature_columns] - df_test[feature_columns].mean()) / (df_test[feature_columns].std() + epsilon)

# Convert categorical variables to discrete numbers
encoder = LabelEncoder()
encoder.fit(df_train['Label'])
df_train['Label'], df_test['Label'] = encoder.transform(df_train['Label']), encoder.transform(df_test['Label'])

print(f"Encoded classes: {encoder.classes_}")
df_train['Label'].value_counts()

Encoded classes: ['BENIGN' 'Bot' 'DDoS' 'DoS' 'FTP-Patator' 'PortScan' 'SSH-Patator' 'Web']


0    60000
4     7000
3     6000
2     6000
5     6000
6     5000
7     2000
1     1500
Name: Label, dtype: int64

In [5]:
# Ensure that training data doesn't have NaN
columns_with_nan = list(df_train.columns[df_train.isna().any()])
assert columns_with_nan == []

### Instantiate Dataset

In [6]:
train_dataset = CICIDSDataset(
    dataframe=df_train,
    transform=myToTensor()
)
test_dataset = CICIDSDataset(
    dataframe=df_test,
    transform=myToTensor()
)

### Create DataLoader

In [7]:
# NEVER FAIL TO SHUFFLE the dataset, as it is aligned at this point.
train_dataloader = DataLoader(
    dataset=train_dataset,
    batch_size=64,
    shuffle=True
)
test_dataloader = DataLoader(
    dataset=test_dataset,
    batch_size=64
)

## Create model

In [8]:
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using {device}")

Using cuda


In [9]:
n_features = len(df_train.columns) - 1  # num_of_features = num_of_all_columns - num_of_class_label
n_classes = len(encoder.classes_)

class AlertNet(nn.Module):
    def __init__(self):
        super(AlertNet, self).__init__()
        self.FCN_units = [n_features, 1024, 768, 512, 256, 128] # n_features is for input layer

        layers = []
        for idx in range(len(self.FCN_units)-1):
            layers += [
                nn.Linear(self.FCN_units[idx], self.FCN_units[idx+1]),
                nn.ReLU(),
                nn.BatchNorm1d(self.FCN_units[idx+1]),
                nn.Dropout(0.01)
            ]

        self.sequential_model = nn.Sequential(*layers)
        self.output_layer = nn.Sequential(
            nn.Linear(self.FCN_units[-1], n_classes),
            # nn.Softmax(dim=1)
        )

    def forward(self, x: torch.Tensor):
        x = self.sequential_model(x)
        x = self.output_layer(x)
        return x

In [10]:
model = AlertNet().to(device)
print(model)

AlertNet(
  (sequential_model): Sequential(
    (0): Linear(in_features=78, out_features=1024, bias=True)
    (1): ReLU()
    (2): BatchNorm1d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (3): Dropout(p=0.01, inplace=False)
    (4): Linear(in_features=1024, out_features=768, bias=True)
    (5): ReLU()
    (6): BatchNorm1d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (7): Dropout(p=0.01, inplace=False)
    (8): Linear(in_features=768, out_features=512, bias=True)
    (9): ReLU()
    (10): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (11): Dropout(p=0.01, inplace=False)
    (12): Linear(in_features=512, out_features=256, bias=True)
    (13): ReLU()
    (14): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (15): Dropout(p=0.01, inplace=False)
    (16): Linear(in_features=256, out_features=128, bias=True)
    (17): ReLU()
    (18): BatchNorm1d(128, eps=1e-05

### Define loss function and optimizer

In [11]:
loss_function = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-3)

### Training function

In [12]:
def train(dataloader: DataLoader, model: nn.Module, loss_function, optimizer):
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    print(f"[[ Train ]]")
    model.train()
    for batch, (X, y) in enumerate(dataloader):
        X, y = X.to(device), y.to(device)
        # feed the data to the network
        pred = model(X)
        loss = loss_function(pred, y)
        # adjust the weights
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if batch % 400 == 0:
            loss, current = loss.item(), batch*len(X)
            # Accuracy should not be considered as metrics that represent how good this model behaves
            # (because classes are substantially imbalanced), but here we're calculating accuracy for reference
            accuracy = (pred.argmax(dim=1) == y).type(torch.float).sum().item() / torch.numel(y)
            print(f"| position of this batch: {current:>5d}/{size:>5d} |")
            print(f"Loss: {loss:>7f}")
            # print(f"Accuracy in this batch (for reference): {(100*accuracy):>0.1f}%")
            print()

### Testing function

In [13]:
def test(dataloader: DataLoader, model: nn.Module, loss_function, n_classes: int):
    num_batches = len(dataloader)
    print(f"[[ Test ]]")
    model.eval()
    test_loss = 0.0
    predicted_labels_all = []
    correct_labels_all = []
    with torch.no_grad():
        for X, y in dataloader:
            # add correct labels to calculate F1 score later
            correct_labels_all += y.tolist()
            # make prediction
            X, y = X.to(device), y.to(device)
            pred = model(X)
            pred_labels = pred.argmax(dim=1)
            predicted_labels_all += pred_labels.tolist()
            # accumulate the output of loss function
            test_loss += loss_function(pred, y).item()
        test_loss /= num_batches
        f1_score_calculator = torchmetrics.F1Score(num_classes=n_classes, average='weighted')
        f1_score = f1_score_calculator(
            torch.from_numpy(
                np.array(predicted_labels_all)
            ),
            torch.from_numpy(
                np.array(correct_labels_all)
            )
        )
        print(f"F1 score (weighted): {f1_score}")
        print(f"Average loss: {test_loss:>8f}")
        print()

## Execute training

In [14]:
epochs = 10
for epoch in range(1, epochs+1):
    print(f"------------------------------ Epoch {epoch} ------------------------------")
    train(train_dataloader, model, loss_function, optimizer)
    test(test_dataloader, model, loss_function, n_classes)

------------------------------ Epoch 1 ------------------------------
[[ Train ]]
| position of this batch:     0/93500 |
Loss: 2.307563

| position of this batch: 25600/93500 |
Loss: 0.136688

| position of this batch: 51200/93500 |
Loss: 0.150344

| position of this batch: 76800/93500 |
Loss: 0.121287

[[ Test ]]
F1 score (weighted): 0.9751631021499634
Average loss: 0.066150

------------------------------ Epoch 2 ------------------------------
[[ Train ]]
| position of this batch:     0/93500 |
Loss: 0.160525

| position of this batch: 25600/93500 |
Loss: 0.060201

| position of this batch: 51200/93500 |
Loss: 0.015956

| position of this batch: 76800/93500 |
Loss: 0.029680

[[ Test ]]
F1 score (weighted): 0.9790273904800415
Average loss: 0.055066

------------------------------ Epoch 3 ------------------------------
[[ Train ]]
| position of this batch:     0/93500 |
Loss: 0.028439

| position of this batch: 25600/93500 |
Loss: 0.160505

| position of this batch: 51200/93500 |
Loss

### Save model

In [15]:
torch.save(model.state_dict(), 'alert_net_state_dict.pt')