# Ensemble unsupervised learning

In [1]:
import pandas as pd
import numpy as np
import os
import json
import torch
import torch.nn as nn
from config import whisper_config
from sklearn.metrics import accuracy_score
from sklearn.cluster import KMeans
from model import AutoEncoder, Dataset, train_kmeans, test_kmeans, train_ae, test_ae, get_flows, transform
from plot import plot_cdf, plot_line

MAX_LEN = whisper_config["n_fft"] * 2

  from .autonotebook import tqdm as notebook_tqdm


## Useful functions

In [11]:
benign_filenames = [os.path.join("train_set", "benign" + str(i) + ".csv") 
                    for i in range(1, 3)]
attack_filenames = [os.path.join("attack_set", x) for x in 
                    os.listdir("attack_set") if x.endswith(".csv")]

In [3]:
def test_ensemble(datac, dataw, labels, kmeans_load_path,
         aec_input_dim, aec_load_path, aew_input_dim, aew_load_path, 
         kmeans_scale=7, aec_scale=10, aew_scale=3,
         test_data_aug=False, vote_method="majority", plot_dir=None):
    
    kmeans_preds, kmeans_ratios, kmeans_loss_list = test_kmeans(dataw, kmeans_load_path, 
                                              whisper_config, scale=kmeans_scale)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    criterion = nn.MSELoss()

    model_aec = AutoEncoder(aec_input_dim, decoder_sigmoid=True)
    model_aec.load_state_dict(torch.load(os.path.join(aec_load_path, "model.pt")))
    model_aec.to(device)
    with open(os.path.join(aec_load_path, "train_loss.json"), "r") as f:
        loss_list = json.load(f)
    threshold = torch.tensor(loss_list).mean().item()
    aec_preds, aec_ratios, aec_loss_list = test_ae(datac, model_aec, device, criterion, threshold, 
                                    scale=aec_scale, test_data_aug=False, 
                                    decoder_sigmoid=True) 
    
    model_aew = AutoEncoder(aew_input_dim)
    model_aew.load_state_dict(torch.load(os.path.join(aew_load_path, "model.pt")))
    model_aew.to(device)
    with open(os.path.join(aew_load_path, "train_loss.json"), "r") as f:
        loss_list = json.load(f)
    threshold = torch.tensor(loss_list).mean().item()
    aew_preds, aew_ratios, aew_loss_list = test_ae(dataw, model_aew, device, criterion, threshold, 
                                    scale=aew_scale, test_data_aug=test_data_aug, 
                                    decoder_sigmoid=False)

    # preds = np.sign(np.array(kmeans_preds) + np.array(aec_preds) + np.array(aew_preds))
    preds = []
    weights = np.array([1, 1, 1]) / 3   # kmeans, aec, aew
    for idx in range(len(kmeans_preds)):
        if vote_method == "majority":
            preds.append(np.sign(kmeans_preds[idx] + aec_preds[idx] + aew_preds[idx]))
        elif vote_method == "positive":
            if kmeans_preds[idx] == -1 or aec_preds[idx] == -1 or aew_preds[idx] == -1:
                preds.append(-1)
            else:
                preds.append(1)
        else: # weighted
            # tmp_preds = np.array([kmeans_preds[idx], aec_preds[idx], aew_preds[idx]])
            # tmp_ratios = np.array([kmeans_ratios[idx], aec_ratios[idx], aew_ratios[idx]])
            pred = np.sign(kmeans_preds[idx] * kmeans_ratios[idx] + 
                           aec_preds[idx] * aec_ratios[idx] + 
                           aew_preds[idx] * aew_ratios[idx])
            preds.append(pred)

    return {
        "kmeans": accuracy_score(labels, kmeans_preds),
        "aec": accuracy_score(labels, aec_preds),
        "aew": accuracy_score(labels, aew_preds),
        "ensemble": accuracy_score(labels, preds)
    }, kmeans_loss_list, aec_loss_list, aew_loss_list

In [4]:
def get_ensemble_result(df_test, test_data_aug, use_short_flow, 
                        kmeans_load_path, aec_input_dim, aec_load_path, 
                        aew_input_dim, aew_load_path, vote_method="majority",
                        plot_dir=None):
    
    test_packet_data, test_packet_labels, test_flow_data, test_flow_labels  \
    = transform(get_flows(df_test), feature_type="encoding" 
                ,data_type="test", test_data_aug=test_data_aug)
    data_encoding = test_flow_data if not use_short_flow else test_flow_data + test_packet_data
    labels_encoding = test_flow_labels if not use_short_flow else test_flow_labels + test_packet_labels

    test_packet_data, test_packet_labels, test_flow_data, test_flow_labels \
    = transform(get_flows(df_test), data_type="test", test_data_aug=test_data_aug)
    data_whisper = test_flow_data if not use_short_flow else test_flow_data + test_packet_data
    labels_whisper = test_flow_labels if not use_short_flow else test_flow_labels + test_packet_labels

    assert len(labels_encoding) == len(labels_whisper), \
        print(f"len labels_encoding: {len(labels_encoding)}, len labels_whisper: {len(labels_whisper)}")
    for idx in range(len(labels_encoding)):
        assert labels_encoding[idx] == labels_whisper[idx]
    
    acc, kmeans_loss_list, aec_loss_list, aew_loss_list = \
                         test_ensemble(data_encoding, data_whisper, labels_whisper, 
                        kmeans_load_path, aec_input_dim, aec_load_path, aew_input_dim, 
                        aew_load_path, test_data_aug=test_data_aug, vote_method=vote_method)
    
    # if plot_dir is not None:
    #     plot_cdf(, kmeans_loss_list, "kmeans", plot_dir)

    return acc

In [7]:
USE_DATA_AUG = True
USE_SHORT_FLOW = True

accuracy_dict = {}
suffix = "-all" if USE_SHORT_FLOW else "-long"
train_benign_filename = "dataset/benign_small.csv"

aec_input_dim = MAX_LEN
aew_input_dim = whisper_config["n_fft"] // 2 + 1
kmeans_save_path = os.path.join("model", "whisper", "kmeans"+suffix, 
                    os.path.basename(train_benign_filename), "kmeans.json")
aec_save_dir = os.path.join("model", "autoencoding"+suffix, 
                        os.path.basename(train_benign_filename))
aew_save_dir = os.path.join("model", "whisper", "autoencoder"+suffix, 
                        os.path.basename(train_benign_filename))

## Train Frequency features + KMeans

In [8]:
train_df = pd.read_csv(train_benign_filename)
train_df["label"] = 1
train_packet_data, train_packet_labels, train_flow_data, train_flow_labels \
= transform(get_flows(train_df))

train_data = train_flow_data if not USE_SHORT_FLOW else train_flow_data + train_packet_data
train_labels = train_flow_labels if not USE_SHORT_FLOW else train_flow_labels + train_packet_labels

  return _VF.stft(input, n_fft, hop_length, win_length, window,  # type: ignore[attr-defined]


In [9]:
train_kmeans(train_data, kmeans_save_path, whisper_config["val_K"])



## Train Frequency features + AutoEncoders

In [63]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_aew = AutoEncoder(aew_input_dim).to(device)
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model_aew.parameters(), lr=0.01, weight_decay=1e-5)
train_ae(torch.tensor(train_data), torch.tensor(train_labels), aew_save_dir,
         model_aew, criterion, optimizer, device, num_epochs=50)

Epoch 1/50, Loss: 98.1166
Epoch 2/50, Loss: 23.9330
Epoch 3/50, Loss: 5.0799
Epoch 4/50, Loss: 1.9488
Epoch 5/50, Loss: 0.7033
Epoch 6/50, Loss: 2.9570
Epoch 7/50, Loss: 0.3255
Epoch 8/50, Loss: 2.7003
Epoch 9/50, Loss: 0.8728
Epoch 10/50, Loss: 0.5623
Epoch 11/50, Loss: 0.9623
Epoch 12/50, Loss: 0.4802
Epoch 13/50, Loss: 0.3182
Epoch 14/50, Loss: 0.6932
Epoch 15/50, Loss: 0.7875
Epoch 16/50, Loss: 0.2908
Epoch 17/50, Loss: 0.7281
Epoch 18/50, Loss: 0.2246
Epoch 19/50, Loss: 0.5675
Epoch 20/50, Loss: 0.5423
Epoch 21/50, Loss: 0.2764
Epoch 22/50, Loss: 1.2582
Epoch 23/50, Loss: 0.6044
Epoch 24/50, Loss: 0.3076
Epoch 25/50, Loss: 0.3734
Epoch 26/50, Loss: 0.3808
Epoch 27/50, Loss: 0.6041
Epoch 28/50, Loss: 0.3547
Epoch 29/50, Loss: 0.6665
Epoch 30/50, Loss: 0.6599
Epoch 31/50, Loss: 0.6247
Epoch 32/50, Loss: 0.2840
Epoch 33/50, Loss: 1.9269
Epoch 34/50, Loss: 2.5784
Epoch 35/50, Loss: 1.2231
Epoch 36/50, Loss: 0.4141
Epoch 37/50, Loss: 1.5400
Epoch 38/50, Loss: 1.2441
Epoch 39/50, Loss: 

## Train Time features + AutoEncoder

In [64]:
train_df = pd.read_csv(train_benign_filename)
train_df["label"] = 1
train_packet_data_, train_packet_labels_, train_flow_data_, train_flow_labels_ \
= transform(get_flows(train_df), feature_type="encoding")

train_data_ = train_flow_data_ if not USE_SHORT_FLOW else train_flow_data_ + train_packet_data_
train_labels_ = train_flow_labels_ if not USE_SHORT_FLOW else train_flow_labels_ + train_packet_labels_

In [77]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_aec = AutoEncoder(aec_input_dim, decoder_sigmoid=True).to(device)
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model_aec.parameters(), lr=0.01, weight_decay=1e-5)
train_ae(torch.tensor(train_data_), torch.tensor(train_labels_), aec_save_dir,
         model_aec, criterion, optimizer, device, num_epochs=50, decoder_sigmoid=True)

Epoch 1/50, Loss: 0.0816
Epoch 2/50, Loss: 0.0461
Epoch 3/50, Loss: 0.0254
Epoch 4/50, Loss: 0.0206
Epoch 5/50, Loss: 0.0150
Epoch 6/50, Loss: 0.0207
Epoch 7/50, Loss: 0.0185
Epoch 8/50, Loss: 0.0141
Epoch 9/50, Loss: 0.0125
Epoch 10/50, Loss: 0.0024
Epoch 11/50, Loss: 0.0128
Epoch 12/50, Loss: 0.0070
Epoch 13/50, Loss: 0.0020
Epoch 14/50, Loss: 0.0052
Epoch 15/50, Loss: 0.0029
Epoch 16/50, Loss: 0.0031
Epoch 17/50, Loss: 0.0041
Epoch 18/50, Loss: 0.0047
Epoch 19/50, Loss: 0.0039
Epoch 20/50, Loss: 0.0037
Epoch 21/50, Loss: 0.0063
Epoch 22/50, Loss: 0.0057
Epoch 23/50, Loss: 0.0041
Epoch 24/50, Loss: 0.0017
Epoch 25/50, Loss: 0.0058
Epoch 26/50, Loss: 0.0083
Epoch 27/50, Loss: 0.0056
Epoch 28/50, Loss: 0.0073
Epoch 29/50, Loss: 0.0030
Epoch 30/50, Loss: 0.0017
Epoch 31/50, Loss: 0.0021
Epoch 32/50, Loss: 0.0029
Epoch 33/50, Loss: 0.0051
Epoch 34/50, Loss: 0.0014
Epoch 35/50, Loss: 0.0028
Epoch 36/50, Loss: 0.0044
Epoch 37/50, Loss: 0.0027
Epoch 38/50, Loss: 0.0003
Epoch 39/50, Loss: 0.

# Test ensemble

In [12]:
vote_method = "positive"

accuracy_save_dir = os.path.join("result", "ensemble", vote_method,
                    os.path.basename(train_benign_filename))

for filename in benign_filenames + attack_filenames:
    test_df = pd.read_csv(filename)
    test_df["label"] = 1 if filename in benign_filenames else -1
    acc = get_ensemble_result(test_df, USE_DATA_AUG, USE_SHORT_FLOW, 
            kmeans_save_path, aec_input_dim, aec_save_dir, aew_input_dim, 
            aew_save_dir, vote_method=vote_method)

    print(f"accuracy of {filename}: {acc}")
    accuracy_dict[filename] = acc

accuracy_base_name = "flow-accuracy.json" if not USE_SHORT_FLOW else "all-accuracy.json"
accuracy_save_path = os.path.join(accuracy_save_dir, accuracy_base_name)
os.makedirs(os.path.dirname(accuracy_save_path), exist_ok=True)
with open(accuracy_save_path, "w") as f:
    json.dump(accuracy_dict, f, indent=4)

accuracy of train_set/benign1.csv: {'kmeans': 0.9982021574111066, 'aec': 0.9940071913703555, 'aew': 0.9970035956851778, 'ensemble': 0.9906112664802237}
accuracy of train_set/benign2.csv: {'kmeans': 0.9984387197501952, 'aec': 0.9923887587822015, 'aew': 0.997072599531616, 'ensemble': 0.9892661982825918}
accuracy of attack_set/LDoS_small.csv: {'kmeans': 1.0, 'aec': 0.026671387266182806, 'aew': 1.0, 'ensemble': 1.0}
accuracy of attack_set/SYNDoS.csv: {'kmeans': 0.0, 'aec': 0.0, 'aew': 0.0, 'ensemble': 0.0}
accuracy of attack_set/osscan.csv: {'kmeans': 0.006842619745845552, 'aec': 0.9775171065493646, 'aew': 0.006842619745845552, 'ensemble': 0.9843597262952102}
accuracy of attack_set/infiltration.csv: {'kmeans': 0.3333333333333333, 'aec': 0.0, 'aew': 0.3333333333333333, 'ensemble': 0.3333333333333333}
accuracy of attack_set/HOIC_small.csv: {'kmeans': 0.5024487756121939, 'aec': 0.0, 'aew': 0.6141929035482259, 'ensemble': 0.6141929035482259}
accuracy of attack_set/BruteForce-Web.csv: {'kmeans'