In [1]:
import pandas as pd
import numpy as np
import os
import json
import torch
import torch.nn as nn
from model import get_flows
from config import whisper_config
from sklearn.metrics import accuracy_score
from sklearn.cluster import KMeans

MAX_LEN = whisper_config["n_fft"] * 2

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def fft_module(vec):
    ten = torch.tensor(vec)
    ten_fft = torch.fft.fft(ten, n=(whisper_config["n_fft"] // 2)+1)
    ten_power = torch.pow(ten_fft.real, 2) + torch.pow(ten_fft.imag, 2)
    ten_res = (ten_power.squeeze()+1).log2()
    ten_res = torch.where(torch.isnan(ten_res), torch.zeros_like(ten_res), ten_res)
    ten_res = torch.where(torch.isinf(ten_res), torch.zeros_like(ten_res), ten_res)
    return ten_res

def stft_module(vec):
    ten = torch.tensor(vec)
    # stft requirement: input_size > (n_fft // 2)
    # default return shape: (floor(n_fft/2)+1, n_frame, 2)
    ten_fft = torch.stft(ten, whisper_config["n_fft"])
    ten_power = torch.pow(ten_fft[:,:,0], 2) + torch.pow(ten_fft[:,:,1], 2)
    ten_res = ((ten_power.squeeze()+1).log2()).permute(1,0)
    ten_res = torch.where(torch.isnan(ten_res), torch.zeros_like(ten_res), ten_res)
    ten_res = torch.where(torch.isinf(ten_res), torch.zeros_like(ten_res), ten_res)
    # ten_res shape: (n_frame, floor(n_fft/2)+1)
    return ten_res

def transform(mp: dict, feature_type: str = "whisper", 
              data_type: str = "train", test_data_aug: bool = True):
    packet_data, flow_data = [], []
    packet_labels, flow_labels = [], []
    for key, flow in mp.items():
        vec = flow.vector()
        if feature_type == "bytes":
            vec = flow.pkts_bytes
        elif feature_type == "intervals":
            vec = flow.time_interval

        if len(vec) <= (whisper_config["n_fft"] // 2):
            # implement fft on short flows
            ten_res = fft_module(vec)
            if data_type == "test" and test_data_aug:
                # data shape for test data augmentation: (n_flow, n_sample, floor(n_fft/2)+1)
                packet_data.append([ten_res.tolist()])
            else:
                # data shape for no data augmentation: (n_flow, floor(n_fft/2)+1)
                packet_data.append(ten_res.tolist())
            packet_labels.append(flow.label)
        else:
            # flow level featrues
            ten_res = stft_module(vec)
            if data_type == "train":
                if (ten_res.size(0) > whisper_config["mean_win_train"]):
                    for _ in range(whisper_config["num_train_sample"]):
                        start_idx = torch.randint(0, ten_res.size(0)
                                    - whisper_config["mean_win_train"], (1,)).item()
                        ten_tmp = ten_res[start_idx:start_idx+whisper_config["mean_win_train"],:].mean(dim=0)
                        flow_data.append(ten_tmp.tolist())
                        flow_labels.append(flow.label)
                else:
                    flow_data.append(ten_res.mean(dim=0).tolist())
                    flow_labels.append(flow.label)
            else: # for test
                if test_data_aug:
                    tmp_data = []
                    if (ten_res.size(0) > whisper_config["mean_win_test"]):
                        # data augmentation for kmeans on flows with length > mean_win_test
                        for idx in range(0, ten_res.size(0) - whisper_config["mean_win_test"], 
                                        whisper_config["mean_win_test"]):
                            ten_tmp = ten_res[idx:idx+whisper_config["mean_win_test"],:].mean(dim=0)
                            tmp_data.append(ten_tmp.tolist())
                    else:
                        # no data augmentation for kmeans on flows with length < mean_win_test
                        tmp_data.append(ten_res.mean(dim=0).tolist())
                    flow_data.append(tmp_data)
                    # data shape for augmentation: (n_flow, n_sample, floor(n_fft/2)+1)
                else: # for other detection methods
                    flow_data.append(ten_res.mean(dim=0).tolist())
                    # data shape for no augmentation: (n_flow, floor(n_fft/2)+1)
                flow_labels.append(flow.label)
    return packet_data, packet_labels, flow_data, flow_labels

In [3]:
benign_filenames = [os.path.join("train_set", "benign" + str(i) + ".csv") 
                    for i in range(1, 3)]
attack_filenames = [os.path.join("attack_set", x) for x in 
                    os.listdir("attack_set") if x.endswith(".csv")]

In [15]:
feature_type = "intervals" # "bytes", "intervals", "whisper"
train_benign_filename = "dataset/benign_small.csv"

kmeans_save_path = os.path.join("model", feature_type, "kmeans", 
                    os.path.basename(train_benign_filename), "kmeans.json")
ae_save_dir = os.path.join("model", feature_type, "autoencoder", 
                os.path.basename(train_benign_filename))

In [16]:
train_df = pd.read_csv(train_benign_filename)
train_df["label"] = 1
train_packet_data, train_packet_labels, train_flow_data, train_flow_labels \
= transform(get_flows(train_df))

train_data = train_flow_data + train_packet_data
train_labels = train_flow_labels + train_packet_labels

In [17]:
from model import train_kmeans, train_ae, test_kmeans, test_ae, AutoEncoder, Dataset

ae_input_dim = whisper_config["n_fft"] // 2 + 1

In [18]:
train_kmeans(train_data, kmeans_save_path, whisper_config["val_K"])



In [19]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_aew = AutoEncoder(ae_input_dim).to(device)
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model_aew.parameters(), lr=0.01, weight_decay=1e-5)
train_ae(torch.tensor(train_data), torch.tensor(train_labels), ae_save_dir,
         model_aew, criterion, optimizer, device, num_epochs=50)

Epoch 1/50, Loss: 18.9881
Epoch 2/50, Loss: 4.8031
Epoch 3/50, Loss: 5.9244
Epoch 4/50, Loss: 4.7105
Epoch 5/50, Loss: 4.6361
Epoch 6/50, Loss: 4.3389
Epoch 7/50, Loss: 4.7074
Epoch 8/50, Loss: 4.5740
Epoch 9/50, Loss: 4.9693
Epoch 10/50, Loss: 4.1968
Epoch 11/50, Loss: 4.5997
Epoch 12/50, Loss: 4.0803
Epoch 13/50, Loss: 3.9684
Epoch 14/50, Loss: 4.1833
Epoch 15/50, Loss: 5.2319
Epoch 16/50, Loss: 4.3518
Epoch 17/50, Loss: 4.9230
Epoch 18/50, Loss: 4.3739
Epoch 19/50, Loss: 4.8055
Epoch 20/50, Loss: 4.4244
Epoch 21/50, Loss: 4.9329
Epoch 22/50, Loss: 5.1307
Epoch 23/50, Loss: 4.3462
Epoch 24/50, Loss: 4.2996
Epoch 25/50, Loss: 4.9082
Epoch 26/50, Loss: 4.3725
Epoch 27/50, Loss: 4.9464
Epoch 28/50, Loss: 5.0010
Epoch 29/50, Loss: 5.0134
Epoch 30/50, Loss: 5.4166
Epoch 31/50, Loss: 4.5327
Epoch 32/50, Loss: 4.8199
Epoch 33/50, Loss: 4.3712
Epoch 34/50, Loss: 4.9294
Epoch 35/50, Loss: 3.6745
Epoch 36/50, Loss: 4.0238
Epoch 37/50, Loss: 4.9755
Epoch 38/50, Loss: 4.3630
Epoch 39/50, Loss: 4

In [20]:
acc_kmeans_dict, acc_ae_dict = {}, {}
USE_DATA_AUG = True

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
criterion = nn.MSELoss()
model_ae = AutoEncoder(ae_input_dim)
model_ae.load_state_dict(torch.load(os.path.join(ae_save_dir, "model.pt")))
model_ae.to(device)
with open(os.path.join(ae_save_dir, "train_loss.json"), "r") as f:
        loss_list = json.load(f)
threshold = torch.tensor(loss_list).mean().item()

for test_filename in benign_filenames + attack_filenames:
    test_df = pd.read_csv(test_filename)
    test_df["label"] = 1 if test_filename in benign_filenames else -1
    test_packet_data, test_packet_labels, test_flow_data, test_flow_labels \
    = transform(get_flows(test_df), data_type="test", test_data_aug=USE_DATA_AUG)
    preds, ratios = test_kmeans(test_flow_data+test_packet_data, 
                    kmeans_save_path, whisper_config, scale=7)
    acc = accuracy_score(test_flow_labels+test_packet_labels, preds)
    print(f"kmeans      accuracy of {test_filename}: {acc}")
    acc_kmeans_dict[test_filename] = acc

    preds, ratios = test_ae(test_flow_data+test_packet_data,
                    model_ae, device, criterion, threshold, 
                    scale=5, test_data_aug=USE_DATA_AUG)
    acc = accuracy_score(test_flow_labels+test_packet_labels, preds)
    print(f"autoencoder accuracy of {test_filename}: {acc}")
    acc_ae_dict[test_filename] = acc


accuracy_save_path = os.path.join("result", "test", feature_type,
                os.path.basename(train_benign_filename), "all-accuracy.json")
os.makedirs(os.path.dirname(accuracy_save_path), exist_ok=True)
with open(accuracy_save_path, "w") as f:
    json.dump({
        "kmeans": acc_kmeans_dict,
        "autoencoder": acc_ae_dict
    }, f, indent=4)

kmeans      accuracy of train_set/benign1.csv: 0.9988014382740711
autoencoder accuracy of train_set/benign1.csv: 1.0
kmeans      accuracy of train_set/benign2.csv: 0.9982435597189696
autoencoder accuracy of train_set/benign2.csv: 0.9994145199063232
kmeans      accuracy of attack_set/LDoS_small.csv: 0.0
autoencoder accuracy of attack_set/LDoS_small.csv: 0.0
kmeans      accuracy of attack_set/osscan.csv: 0.006842619745845552
autoencoder accuracy of attack_set/osscan.csv: 0.002932551319648094
kmeans      accuracy of attack_set/infiltration.csv: 0.3333333333333333
autoencoder accuracy of attack_set/infiltration.csv: 0.16666666666666666
kmeans      accuracy of attack_set/HOIC_small.csv: 0.5767805804854879
autoencoder accuracy of attack_set/HOIC_small.csv: 0.49998250043748904
kmeans      accuracy of attack_set/BruteForce-Web.csv: 0.7700729927007299
autoencoder accuracy of attack_set/BruteForce-Web.csv: 0.5
kmeans      accuracy of attack_set/LOIC_UDP_small.csv: 0.88
autoencoder accuracy of at