In [1]:
import os

os.chdir('..')

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import torch

import tqdm

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import average_precision_score

from torch.utils.data import DataLoader

from src.nids.memae import MemAE, EntropyLoss

from data.utils import load_kddcup99

## Hyperparameters

Hyperparameters were set as reported in the original paper, where possible, or copied from the authors' [repository](https://github.com/donggong1/memae-anomaly-detection). However, it should be noted that the code for experiments on KDD99 was not reported.

In [3]:
# Experimental setup

random_seed = 42
test_size = 0.5

# Hyperparameters

# From the code
num_epochs = 100

# From the original paper: [...] trained using the optimizer Adam [17] with a learning rate of 0.0001
learning_rate = 1e-4
# learning_rate = 1e-3

# Not reported in the original paper. In the video dataset, the authors used a batch size of 14, which is small for IDS datasets
batch_size = 16

mem_dim = 50

# From the original paper "In practice, alpha = 0.0002 leads to desirable results in all our experiments"
alpha = 0.0002

# From the code
shrink_thres = 0.0025

# NOTE: Results should be averaged across 20 runs


In [4]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [5]:
np.random.seed(random_seed)
torch.manual_seed(random_seed);

## KDD99

### Subsample selection

In the original paper, the authors report:
"Following the settings in [48], 80% of the samples labeled as “attack” in the original dataset are treated as normal samples."
This statement does not make a lot of sense: treating "attack" samples as normal sample would lead an anomaly detection model to learn wrong behavior.

The referenced paper is:
[48]: <i>Bo Zong, Qi Song, Martin Renqiang Min, Wei Cheng, Cristian Lumezanu, Daeki Cho, and Haifeng Chen. Deep autoencoding gaussian mixture model for unsupervised anomaly detection. In International Conference on Learning Representations, 2018.</i>

This paper reports: "We keep all the data samples labeled as “normal” and randomly draw samples labeled as “attack” so that the ratio between “normal” and “attack” is 4 : 1. In this way, we obtain a dataset with anomaly ratio 0.2" This seems more sensible.




In [6]:
# TODO: Use percent10=False to load the full dataset
X, y = load_kddcup99(percent10=True)

idx_normal = np.argwhere(y == 0).T[0]
idx_anomaly = np.argwhere(y == 1).T[0]

idx_anomaly_keep = np.random.choice(idx_anomaly, size=int(0.25 * idx_normal.shape[0]), replace=False)

idx = np.concatenate((idx_normal, idx_anomaly_keep))
idx = np.sort(idx)

X = X.iloc[idx].reset_index(drop=True)
y = y.iloc[idx].reset_index(drop=True)

y.value_counts()

0    97278
1    24319
Name: count, dtype: int64

In [7]:
y = y.to_numpy()

In [8]:
categorical_cols = ['protocol_type', 'service', 'flag']

In [9]:
## NOTE: The authors may have used this (wrong) preprocessing procedure

# X_cat = X[categorical_cols].to_numpy()
# X_num = X.drop(columns=categorical_cols).to_numpy()

# encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
# X_cat = encoder.fit_transform(X_cat)

# X = np.concatenate((X_cat, X_num), axis=1)

# print(X.shape)

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=random_seed)

# NOTE: This is how correct preprocessing should be implemented, but old papers often use the wrong one (above)

encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

X_train_cat = X_train[categorical_cols].to_numpy()
X_train_num = X_train.drop(columns=categorical_cols).to_numpy()
X_train_cat = encoder.fit_transform(X_train_cat)


X_test_cat = X_test[categorical_cols].to_numpy()
X_test_num = X_test.drop(columns=categorical_cols).to_numpy()
X_test_cat = encoder.transform(X_test_cat)

X_train = np.concatenate((X_train_cat, X_train_num), axis=1).astype(np.float32)
X_test = np.concatenate((X_test_cat, X_test_num), axis=1).astype(np.float32)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

X_train_benign = X_train[y_train == 0]

print(X_train.shape)

(97277, 117)


In [11]:
train_loader = DataLoader(torch.from_numpy(X_train_benign), batch_size=batch_size, shuffle=True)
x_test_tensor = torch.from_numpy(X_test).to(device)

In [12]:
model = MemAE(
    num_features=X_train.shape[1],
    mem_dim=mem_dim,
    shrink_thres=shrink_thres
)

model.to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
mse_loss = torch.nn.MSELoss()
entropy_loss = EntropyLoss()
criterion = lambda pred, x: mse_loss(pred['output'], x) + alpha * entropy_loss(pred['att'])

In [13]:
for epoch in range(num_epochs):


    pbar = tqdm.tqdm(enumerate(train_loader), desc=f"Epoch {epoch + 1}/{num_epochs}")
    running_loss = 0.0
    for i, x_batch in pbar:
        x_batch = x_batch.to(device)

        optimizer.zero_grad()
        x_hat = model(x_batch)
        loss = criterion(x_hat, x_batch)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        pbar.set_postfix(loss=running_loss / (i + 1))

    with torch.no_grad():
        scores = model.evaluate_errors(x_test_tensor).cpu().numpy()
        average_precision = average_precision_score(y_test, scores)

    print(f"Epoch {epoch + 1}/{num_epochs} - Average precision: {average_precision:.4f}")

Epoch 1/100: 4861it [00:10, 483.19it/s, loss=0.461]


Epoch 1/100 - Average precision: 0.7005


Epoch 2/100: 4861it [00:09, 498.12it/s, loss=0.406]


Epoch 2/100 - Average precision: 0.7011


Epoch 3/100: 4861it [00:10, 483.82it/s, loss=0.377]


Epoch 3/100 - Average precision: 0.7049


Epoch 4/100: 4861it [00:09, 486.42it/s, loss=0.355]


Epoch 4/100 - Average precision: 0.7061


Epoch 5/100: 4861it [00:10, 463.19it/s, loss=0.338]


Epoch 5/100 - Average precision: 0.7182


Epoch 6/100: 4861it [00:10, 471.16it/s, loss=0.323]


Epoch 6/100 - Average precision: 0.7287


Epoch 7/100: 4861it [00:09, 495.72it/s, loss=0.308]


Epoch 7/100 - Average precision: 0.7635


Epoch 8/100: 4861it [00:09, 499.45it/s, loss=0.295]


Epoch 8/100 - Average precision: 0.7734


Epoch 9/100: 4861it [00:09, 494.76it/s, loss=0.283]


Epoch 9/100 - Average precision: 0.8083


Epoch 10/100: 4861it [00:09, 501.59it/s, loss=0.273]


Epoch 10/100 - Average precision: 0.8281


Epoch 11/100: 4861it [00:09, 488.36it/s, loss=0.264]


Epoch 11/100 - Average precision: 0.8485


Epoch 12/100: 4861it [00:09, 493.50it/s, loss=0.257]


Epoch 12/100 - Average precision: 0.8542


Epoch 13/100: 4861it [00:09, 489.88it/s, loss=0.251]


Epoch 13/100 - Average precision: 0.8685


Epoch 14/100: 4861it [00:09, 497.17it/s, loss=0.245]


Epoch 14/100 - Average precision: 0.8833


Epoch 15/100: 4861it [00:09, 496.40it/s, loss=0.241]


Epoch 15/100 - Average precision: 0.8918


Epoch 16/100: 4861it [00:09, 486.77it/s, loss=0.237]


Epoch 16/100 - Average precision: 0.8905


Epoch 17/100: 4861it [00:09, 496.44it/s, loss=0.234]


Epoch 17/100 - Average precision: 0.8963


Epoch 18/100: 4861it [00:09, 497.53it/s, loss=0.231]


Epoch 18/100 - Average precision: 0.8981


Epoch 19/100: 4861it [00:09, 497.35it/s, loss=0.228]


Epoch 19/100 - Average precision: 0.9055


Epoch 20/100: 4861it [00:09, 499.30it/s, loss=0.226]


Epoch 20/100 - Average precision: 0.9053


Epoch 21/100: 4861it [00:09, 496.56it/s, loss=0.224]


Epoch 21/100 - Average precision: 0.9064


Epoch 22/100: 4861it [00:09, 495.08it/s, loss=0.222]


Epoch 22/100 - Average precision: 0.9074


Epoch 23/100: 4861it [00:09, 498.47it/s, loss=0.22] 


Epoch 23/100 - Average precision: 0.9078


Epoch 24/100: 4861it [00:09, 505.27it/s, loss=0.218]


Epoch 24/100 - Average precision: 0.9086


Epoch 25/100: 4861it [00:09, 498.11it/s, loss=0.216]


Epoch 25/100 - Average precision: 0.9080


Epoch 26/100: 4861it [00:09, 499.91it/s, loss=0.215]


Epoch 26/100 - Average precision: 0.9083


Epoch 27/100: 4861it [00:09, 491.62it/s, loss=0.213]


Epoch 27/100 - Average precision: 0.9076


Epoch 28/100: 4861it [00:09, 488.52it/s, loss=0.211]


Epoch 28/100 - Average precision: 0.9095


Epoch 29/100: 4861it [00:09, 503.70it/s, loss=0.21] 


Epoch 29/100 - Average precision: 0.9094


Epoch 30/100: 4861it [00:09, 493.23it/s, loss=0.208]


Epoch 30/100 - Average precision: 0.9066


Epoch 31/100: 4861it [00:09, 487.06it/s, loss=0.207]


Epoch 31/100 - Average precision: 0.9085


Epoch 32/100: 4861it [00:10, 482.79it/s, loss=0.205]


Epoch 32/100 - Average precision: 0.9076


Epoch 33/100: 4861it [00:10, 477.88it/s, loss=0.204]


Epoch 33/100 - Average precision: 0.9079


Epoch 34/100: 4861it [00:10, 458.86it/s, loss=0.203]


Epoch 34/100 - Average precision: 0.9072


Epoch 35/100: 4861it [00:10, 454.51it/s, loss=0.201]


Epoch 35/100 - Average precision: 0.9080


Epoch 36/100: 4861it [00:10, 483.08it/s, loss=0.2]  


Epoch 36/100 - Average precision: 0.9105


Epoch 37/100: 4861it [00:10, 465.46it/s, loss=0.199]


Epoch 37/100 - Average precision: 0.9090


Epoch 38/100: 2200it [00:04, 453.25it/s, loss=0.226]


KeyboardInterrupt: 