# Ensemble unsupervised learning

In [1]:
import pandas as pd
import numpy as np
import os
import json
import torch
import torch.nn as nn
from config import whisper_config
from sklearn.metrics import accuracy_score
from sklearn.cluster import KMeans
from model import AutoEncoder, Dataset, train_kmeans, test_kmeans, train_ae, test_ae, get_flows, transform
from model import get_metrics
from plot import plot_cdf, plot_line

# Twice the configured FFT size; used further down as `aec_input_dim`, the
# input width of the time-feature ("encoding") autoencoder.
MAX_LEN = whisper_config["n_fft"] * 2

  from .autonotebook import tqdm as notebook_tqdm


## Useful functions

In [2]:
# Benign capture files train_set/benign1.csv and train_set/benign2.csv.
benign_filenames = [os.path.join("train_set", f"benign{i}.csv")
                    for i in range(1, 3)]

# Every CSV capture found in attack_set/. os.listdir() returns entries in an
# arbitrary, platform-dependent order, so the names are sorted to keep the
# evaluation order (and the printed / saved results) reproducible.
attack_filenames = [os.path.join("attack_set", name)
                    for name in sorted(os.listdir("attack_set"))
                    if name.endswith(".csv")]

In [3]:
def test_ensemble(datac, dataw, labels, kmeans_load_path,
         aec_input_dim, aec_load_path, aew_input_dim, aew_load_path, 
         kmeans_scale=7, aec_scale=10, aew_scale=3,
         test_data_aug=False, vote_method="majority", plot_dir=None):
    """Run the three detectors on the test data and combine their votes.

    The detectors are a KMeans model on ``dataw``, an autoencoder on ``datac``
    (built with ``decoder_sigmoid=True``) and an autoencoder on ``dataw``.

    Args:
        datac: Test samples fed to the ``aec`` autoencoder.
        dataw: Test samples fed to KMeans and to the ``aew`` autoencoder.
        labels: Ground-truth labels; returned unchanged under ``"labels"``.
        kmeans_load_path: Path handed to ``test_kmeans``.
        aec_input_dim: Input dimension of the ``aec`` autoencoder.
        aec_load_path: Directory with ``model.pt`` / ``train_loss.json`` (aec).
        aew_input_dim: Input dimension of the ``aew`` autoencoder.
        aew_load_path: Directory with ``model.pt`` / ``train_loss.json`` (aew).
        kmeans_scale: ``scale`` argument forwarded to ``test_kmeans``.
        aec_scale: ``scale`` argument forwarded to ``test_ae`` for aec.
        aew_scale: ``scale`` argument forwarded to ``test_ae`` for aew.
        test_data_aug: Forwarded to ``test_ae`` for the aew model only.
        vote_method: Accepted for interface compatibility; currently unused,
            all three voting schemes are always computed.
        plot_dir: Accepted for interface compatibility; currently unused.

    Returns:
        dict with keys ``"labels"``, ``"kmeans"``, ``"aec"``, ``"aew"``,
        ``"preds_majority"``, ``"preds_positive"`` and ``"preds_weighted"``.
    """
    # Loss lists and thresholds returned by the detectors are not needed here.
    kmeans_preds, kmeans_ratios, _, _ = \
        test_kmeans(dataw, kmeans_load_path, whisper_config, scale=kmeans_scale)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    criterion = nn.MSELoss()

    model_aec, aec_train_threshold = _load_autoencoder(
        aec_load_path, aec_input_dim, device, decoder_sigmoid=True)
    # NOTE(review): the aec model is always evaluated with test_data_aug=False,
    # regardless of the `test_data_aug` argument -- confirm this is intentional.
    aec_preds, aec_ratios, _, _ = \
        test_ae(datac, model_aec, device, criterion, aec_train_threshold,
                scale=aec_scale, test_data_aug=False, decoder_sigmoid=True)

    model_aew, aew_train_threshold = _load_autoencoder(
        aew_load_path, aew_input_dim, device)
    aew_preds, aew_ratios, _, _ = \
        test_ae(dataw, model_aew, device, criterion, aew_train_threshold,
                scale=aew_scale, test_data_aug=test_data_aug, decoder_sigmoid=False)

    preds_majority, preds_positive, preds_weighted = [], [], []
    for idx in range(len(kmeans_preds)):
        votes = (kmeans_preds[idx], aec_preds[idx], aew_preds[idx])
        ratios = (kmeans_ratios[idx], aec_ratios[idx], aew_ratios[idx])

        # Majority: sign of the sum of the three votes.
        preds_majority.append(np.sign(votes[0] + votes[1] + votes[2]))

        # "Positive": a single -1 vote is enough to output -1.
        preds_positive.append(-1 if -1 in votes else 1)

        # Weighted: each vote is scaled by the ratio its detector reported.
        # np.sign yields 0 when the weighted sum is exactly zero.
        preds_weighted.append(np.sign(votes[0] * ratios[0] +
                                      votes[1] * ratios[1] +
                                      votes[2] * ratios[2]))
    return {
        "labels": labels,
        "kmeans": kmeans_preds,
        "aec": aec_preds,
        "aew": aew_preds,
        "preds_majority": preds_majority,
        "preds_positive": preds_positive,
        "preds_weighted": preds_weighted,
    }


def _load_autoencoder(load_path, input_dim, device, **model_kwargs):
    """Restore a saved AutoEncoder and the threshold derived from its training losses.

    Args:
        load_path: Directory containing ``model.pt`` and ``train_loss.json``.
        input_dim: First positional argument of ``AutoEncoder``.
        device: ``torch.device`` the weights are mapped to and the model is moved to.
        **model_kwargs: Extra keyword arguments forwarded to ``AutoEncoder``.

    Returns:
        ``(model, threshold)`` where ``threshold`` is the mean of the values
        stored in ``train_loss.json``.
    """
    model = AutoEncoder(input_dim, **model_kwargs)
    # map_location lets a checkpoint saved on a GPU be restored on a CPU-only
    # host (and vice versa) instead of failing while deserialising the tensors.
    state_dict = torch.load(os.path.join(load_path, "model.pt"), map_location=device)
    model.load_state_dict(state_dict)
    model.to(device)
    with open(os.path.join(load_path, "train_loss.json"), "r") as f:
        train_losses = json.load(f)
    threshold = torch.tensor(train_losses).mean().item()
    return model, threshold

In [4]:
def get_ensemble_result(df_test, test_data_aug, use_short_flow, 
                        kmeans_load_path, aec_input_dim, aec_load_path, 
                        aew_input_dim, aew_load_path, vote_method="majority",
                        plot_dir=None):
    """Build both feature views of ``df_test``, run the ensemble and score it.

    Args:
        df_test: Labelled test DataFrame passed to ``get_flows``.
        test_data_aug: Forwarded to ``transform`` and ``test_ensemble``.
        use_short_flow: If true, the packet-level samples returned by
            ``transform`` are appended to the flow-level samples.
        kmeans_load_path: Forwarded to ``test_ensemble``.
        aec_input_dim: Forwarded to ``test_ensemble``.
        aec_load_path: Forwarded to ``test_ensemble``.
        aew_input_dim: Forwarded to ``test_ensemble``.
        aew_load_path: Forwarded to ``test_ensemble``.
        vote_method: Forwarded to ``test_ensemble``.
        plot_dir: Currently unused (no plotting is performed here).

    Returns:
        dict mapping each of ``"kmeans"``, ``"aec"``, ``"aew"``,
        ``"preds_majority"``, ``"preds_positive"``, ``"preds_weighted"`` to
        the value returned by ``get_metrics`` for that prediction list.

    Raises:
        AssertionError: If the two feature views do not yield identical labels.
    """
    # View 1: "encoding" features (input of the aec autoencoder).
    test_packet_data, test_packet_labels, test_flow_data, test_flow_labels = \
        transform(get_flows(df_test), feature_type="encoding",
                  data_type="test", test_data_aug=test_data_aug)
    data_encoding = test_flow_data if not use_short_flow else test_flow_data + test_packet_data
    labels_encoding = test_flow_labels if not use_short_flow else test_flow_labels + test_packet_labels

    # View 2: transform()'s default feature type (input of KMeans and aew).
    test_packet_data, test_packet_labels, test_flow_data, test_flow_labels = \
        transform(get_flows(df_test), data_type="test", test_data_aug=test_data_aug)
    data_whisper = test_flow_data if not use_short_flow else test_flow_data + test_packet_data
    labels_whisper = test_flow_labels if not use_short_flow else test_flow_labels + test_packet_labels

    # Both views must describe the same samples in the same order, otherwise
    # the per-index vote combination in test_ensemble would be meaningless.
    # The message is a plain string: `assert cond, print(...)` would attach
    # None to the AssertionError because print() returns None.
    assert len(labels_encoding) == len(labels_whisper), (
        f"len labels_encoding: {len(labels_encoding)}, "
        f"len labels_whisper: {len(labels_whisper)}")
    for idx in range(len(labels_encoding)):
        assert labels_encoding[idx] == labels_whisper[idx], \
            f"label mismatch at index {idx}"

    data_dict = test_ensemble(data_encoding, data_whisper, labels_whisper,
                        kmeans_load_path, aec_input_dim, aec_load_path, aew_input_dim,
                        aew_load_path, test_data_aug=test_data_aug, vote_method=vote_method)

    metric_dict = {}
    for key in ["kmeans", "aec", "aew", "preds_majority", "preds_positive", "preds_weighted"]:
        metric_dict[key] = get_metrics(data_dict["labels"], data_dict[key])

    return metric_dict

In [5]:
# --- Experiment switches -----------------------------------------------------
USE_DATA_AUG = True
USE_SHORT_FLOW = True

# --- Inputs / result accumulator ---------------------------------------------
train_benign_filename = "dataset/benign_small.csv"
accuracy_dict = dict()

# Suffix appended to the model directory names, chosen by USE_SHORT_FLOW.
if USE_SHORT_FLOW:
    suffix = "-all"
else:
    suffix = "-long"

# --- Model input sizes -------------------------------------------------------
# presumably n_fft // 2 + 1 is the number of one-sided STFT bins -- TODO confirm
aew_input_dim = 1 + whisper_config["n_fft"] // 2
aec_input_dim = MAX_LEN

# --- Where trained artefacts live --------------------------------------------
# Every location is namespaced by the base name of the benign training file.
_train_tag = os.path.basename(train_benign_filename)
_whisper_root = os.path.join("model", "whisper")

kmeans_save_path = os.path.join(_whisper_root, "kmeans" + suffix, _train_tag, "kmeans.json")
aec_save_dir = os.path.join("model", "autoencoding" + suffix, _train_tag)
aew_save_dir = os.path.join(_whisper_root, "autoencoder" + suffix, _train_tag)

## Train Frequency features + KMeans

In [8]:
# Read the benign training capture; every row gets label 1.
train_df = pd.read_csv(train_benign_filename).assign(label=1)

# transform() is called with its default feature type here.
(train_packet_data, train_packet_labels,
 train_flow_data, train_flow_labels) = transform(get_flows(train_df))

# With USE_SHORT_FLOW the packet-level samples are appended to the flow-level ones.
if USE_SHORT_FLOW:
    train_data = train_flow_data + train_packet_data
    train_labels = train_flow_labels + train_packet_labels
else:
    train_data = train_flow_data
    train_labels = train_flow_labels

  return _VF.stft(input, n_fft, hop_length, win_length, window,  # type: ignore[attr-defined]


In [9]:
# Train KMeans on the frequency-feature training data; the third argument is
# the `val_K` entry of the whisper configuration.
kmeans_val_k = whisper_config["val_K"]
train_kmeans(train_data, kmeans_save_path, kmeans_val_k)



## Train Frequency features + AutoEncoder

In [63]:
# Train the autoencoder that consumes the frequency features (`train_data`).
criterion = nn.MSELoss()
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

model_aew = AutoEncoder(aew_input_dim)
model_aew = model_aew.to(device)
optimizer = torch.optim.Adam(model_aew.parameters(), weight_decay=1e-5, lr=0.01)

aew_train_inputs = torch.tensor(train_data)
aew_train_labels = torch.tensor(train_labels)
train_ae(
    aew_train_inputs,
    aew_train_labels,
    aew_save_dir,
    model_aew,
    criterion,
    optimizer,
    device,
    num_epochs=50,
)

Epoch 1/50, Loss: 98.1166
Epoch 2/50, Loss: 23.9330
Epoch 3/50, Loss: 5.0799
Epoch 4/50, Loss: 1.9488
Epoch 5/50, Loss: 0.7033
Epoch 6/50, Loss: 2.9570
Epoch 7/50, Loss: 0.3255
Epoch 8/50, Loss: 2.7003
Epoch 9/50, Loss: 0.8728
Epoch 10/50, Loss: 0.5623
Epoch 11/50, Loss: 0.9623
Epoch 12/50, Loss: 0.4802
Epoch 13/50, Loss: 0.3182
Epoch 14/50, Loss: 0.6932
Epoch 15/50, Loss: 0.7875
Epoch 16/50, Loss: 0.2908
Epoch 17/50, Loss: 0.7281
Epoch 18/50, Loss: 0.2246
Epoch 19/50, Loss: 0.5675
Epoch 20/50, Loss: 0.5423
Epoch 21/50, Loss: 0.2764
Epoch 22/50, Loss: 1.2582
Epoch 23/50, Loss: 0.6044
Epoch 24/50, Loss: 0.3076
Epoch 25/50, Loss: 0.3734
Epoch 26/50, Loss: 0.3808
Epoch 27/50, Loss: 0.6041
Epoch 28/50, Loss: 0.3547
Epoch 29/50, Loss: 0.6665
Epoch 30/50, Loss: 0.6599
Epoch 31/50, Loss: 0.6247
Epoch 32/50, Loss: 0.2840
Epoch 33/50, Loss: 1.9269
Epoch 34/50, Loss: 2.5784
Epoch 35/50, Loss: 1.2231
Epoch 36/50, Loss: 0.4141
Epoch 37/50, Loss: 1.5400
Epoch 38/50, Loss: 1.2441
Epoch 39/50, Loss: 

## Train Time features + AutoEncoder

In [64]:
# Re-read the benign training capture (every row labelled 1) and extract the
# "encoding" feature view used by the time-feature autoencoder.
train_df = pd.read_csv(train_benign_filename).assign(label=1)

(train_packet_data_, train_packet_labels_,
 train_flow_data_, train_flow_labels_) = transform(
    get_flows(train_df), feature_type="encoding")

# With USE_SHORT_FLOW the packet-level samples are appended to the flow-level ones.
if USE_SHORT_FLOW:
    train_data_ = train_flow_data_ + train_packet_data_
    train_labels_ = train_flow_labels_ + train_packet_labels_
else:
    train_data_ = train_flow_data_
    train_labels_ = train_flow_labels_

In [77]:
# Train the autoencoder that consumes the "encoding" features (`train_data_`);
# this model is built and trained with decoder_sigmoid=True.
criterion = nn.MSELoss()
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

model_aec = AutoEncoder(aec_input_dim, decoder_sigmoid=True)
model_aec = model_aec.to(device)
optimizer = torch.optim.Adam(model_aec.parameters(), weight_decay=1e-5, lr=0.01)

aec_train_inputs = torch.tensor(train_data_)
aec_train_labels = torch.tensor(train_labels_)
train_ae(
    aec_train_inputs,
    aec_train_labels,
    aec_save_dir,
    model_aec,
    criterion,
    optimizer,
    device,
    num_epochs=50,
    decoder_sigmoid=True,
)

Epoch 1/50, Loss: 0.0816
Epoch 2/50, Loss: 0.0461
Epoch 3/50, Loss: 0.0254
Epoch 4/50, Loss: 0.0206
Epoch 5/50, Loss: 0.0150
Epoch 6/50, Loss: 0.0207
Epoch 7/50, Loss: 0.0185
Epoch 8/50, Loss: 0.0141
Epoch 9/50, Loss: 0.0125
Epoch 10/50, Loss: 0.0024
Epoch 11/50, Loss: 0.0128
Epoch 12/50, Loss: 0.0070
Epoch 13/50, Loss: 0.0020
Epoch 14/50, Loss: 0.0052
Epoch 15/50, Loss: 0.0029
Epoch 16/50, Loss: 0.0031
Epoch 17/50, Loss: 0.0041
Epoch 18/50, Loss: 0.0047
Epoch 19/50, Loss: 0.0039
Epoch 20/50, Loss: 0.0037
Epoch 21/50, Loss: 0.0063
Epoch 22/50, Loss: 0.0057
Epoch 23/50, Loss: 0.0041
Epoch 24/50, Loss: 0.0017
Epoch 25/50, Loss: 0.0058
Epoch 26/50, Loss: 0.0083
Epoch 27/50, Loss: 0.0056
Epoch 28/50, Loss: 0.0073
Epoch 29/50, Loss: 0.0030
Epoch 30/50, Loss: 0.0017
Epoch 31/50, Loss: 0.0021
Epoch 32/50, Loss: 0.0029
Epoch 33/50, Loss: 0.0051
Epoch 34/50, Loss: 0.0014
Epoch 35/50, Loss: 0.0028
Epoch 36/50, Loss: 0.0044
Epoch 37/50, Loss: 0.0027
Epoch 38/50, Loss: 0.0003
Epoch 39/50, Loss: 0.

# Test ensemble

In [6]:
vote_method = "positive"

metrics_save_dir = os.path.join("result", "ensemble",
                    os.path.basename(train_benign_filename))

# Maps an attack file's base name (without extension) to its attacker IPs.
with open("attacker-ips.json", "r") as f:
    attack_ips_dict = json.load(f)

# The benign background traffic is identical for every attack file, so it is
# read once instead of once per loop iteration.
benign_df = pd.read_csv(benign_filenames[0])

for filename in attack_filenames:
    attack_df = pd.read_csv(filename)
    test_df = pd.concat([benign_df, attack_df], ignore_index=True)

    file_key = os.path.basename(filename).split(".")[0]
    cur_attack_ips = attack_ips_dict.get(file_key, [])

    # A packet is labelled -1 when either endpoint is a known attacker IP and
    # 1 otherwise. Vectorised with isin(): the previous iterrows()/.loc loop
    # wrote one cell per packet, which is very slow on captures with ~100k rows.
    is_attack = (test_df["src_ip"].isin(cur_attack_ips)
                 | test_df["dst_ip"].isin(cur_attack_ips))
    test_df["label"] = np.where(is_attack, -1, 1)

    metrics = get_ensemble_result(test_df, USE_DATA_AUG, USE_SHORT_FLOW, 
            kmeans_save_path, aec_input_dim, aec_save_dir, aew_input_dim, 
            aew_save_dir, vote_method=vote_method)

    print(f"metrics of {filename}: {metrics}")
    accuracy_dict[filename] = metrics

# Persist the per-file metrics; the file name reflects USE_SHORT_FLOW.
accuracy_base_name = "flow-metrics.json" if not USE_SHORT_FLOW else "all-metrics.json"
accuracy_save_path = os.path.join(metrics_save_dir, accuracy_base_name)
os.makedirs(os.path.dirname(accuracy_save_path), exist_ok=True)
with open(accuracy_save_path, "w") as f:
    json.dump(accuracy_dict, f, indent=4)

  return _VF.stft(input, n_fft, hop_length, win_length, window,  # type: ignore[attr-defined]
  return torch._C._cuda_getDeviceCount() > 0
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


metrics of attack_set/LDoS_small.csv: {'kmeans': (0.22762264838518653, 0.0, 0.0, 0.0, nan, nan, nan, nan), 'aec': (0.9780440030975265, 0.0, 0.0, 0.0, nan, nan, nan, nan), 'aew': (0.22734933722042547, 0.0, 0.0, 0.0, nan, nan, nan, nan), 'preds_majority': (0.22771375210677355, 0.0, 0.0, 0.0, nan, nan, nan, nan), 'preds_positive': (0.22589167767503301, 0.0, 0.0, 0.0, nan, nan, nan, nan), 'preds_weighted': (0.22652940372614222, 0.0, 0.0, 0.0, nan, nan, nan, nan)}


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


metrics of attack_set/osscan.csv: {'kmeans': (0.9967385138967668, 0.0, 0.0, 0.0, nan, nan, nan, nan), 'aec': (0.712138400453772, 0.0, 0.0, 0.0, nan, nan, nan, nan), 'aew': (0.9958876914350538, 0.0, 0.0, 0.0, nan, nan, nan, nan), 'preds_majority': (0.9970221213840046, 0.0, 0.0, 0.0, nan, nan, nan, nan), 'preds_positive': (0.7077424844015882, 0.0, 0.0, 0.0, nan, nan, nan, nan), 'preds_weighted': (0.7097277368122519, 0.0, 0.0, 0.0, nan, nan, nan, nan)}


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


metrics of attack_set/infiltration.csv: {'kmeans': (0.99780526735834, 0.0, 0.0, 0.0, nan, nan, nan, nan), 'aec': (0.9940143655227454, 0.0, 0.0, 0.0, nan, nan, nan, nan), 'aew': (0.996608140462889, 0.0, 0.0, 0.0, nan, nan, nan, nan), 'preds_majority': (0.9982043096568236, 0.0, 0.0, 0.0, nan, nan, nan, nan), 'preds_positive': (0.9902234636871509, 0.0, 0.0, 0.0, nan, nan, nan, nan), 'preds_weighted': (0.9930167597765364, 0.0, 0.0, 0.0, nan, nan, nan, nan)}
metrics of attack_set/HOIC_small.csv: {'kmeans': (0.6016549408378637, 0.5026702997275204, 0.9166252608566035, 0.6492802590363566, nan, nan, 0.31159614326555274, 0.8741374137413741), 'aec': (0.265270227054685, 0.0, 0.0, 0.0, nan, nan, 0.5022502250225023, 0.9954995499549955), 'aew': (0.6788055644387592, 0.6163487738419619, 0.9191385615603413, 0.7378894144511499, nan, nan, 0.2664580763253437, 0.8507350735073508), 'preds_majority': (0.6017348896706108, 0.5026702997275204, 0.9168074744061226, 0.6493259652951322, nan, nan, 0.31144612826405255

## Check positive sample distribution

In [21]:
# Maps an attack file's base name (without extension) to its attacker IPs.
with open("attacker-ips.json", "r") as f:
    attack_ips_dict = json.load(f)

# For each attack capture, report how many packets involve a listed attacker IP.
for filename in attack_filenames:
    attack_df = pd.read_csv(filename)
    file_key = os.path.basename(filename).split(".")[0]
    cur_attack_ips = attack_ips_dict.get(file_key, [])

    # Vectorised membership test instead of a Python-level iterrows() loop.
    is_attack = (attack_df["src_ip"].isin(cur_attack_ips)
                 | attack_df["dst_ip"].isin(cur_attack_ips))
    attack_packet_count = int(is_attack.sum())
    print(f"{filename}: total= {len(attack_df)}, attack= {attack_packet_count}, attack_ips= {cur_attack_ips}")

attack_set/LDoS_small.csv: total= 97750, attack= 97750, attack_ips= ['10.0.0.1', '10.0.0.4']
attack_set/osscan.csv: total= 2337, attack= 2337, attack_ips= ['10.0.0.1', '10.0.0.4']
attack_set/infiltration.csv: total= 3758, attack= 3758, attack_ips= ['13.58.225.34']
attack_set/HOIC_small.csv: total= 100000, attack= 91700, attack_ips= ['18.218.115.60', '18.219.9.1', '18.219.32.43', '18.218.55.126', '52.14.136.135', '18.219.5.43', '18.216.200.189', '18.216.229.235', '18.218.11.51', '18.216.24.42']
attack_set/BruteForce-Web.csv: total= 18252, attack= 18252, attack_ips= ['18.218.115.60']
attack_set/LOIC_UDP_small.csv: total= 100000, attack= 100000, attack_ips= ['18.218.115.60', '18.219.9.1', '18.219.32.43', '18.218.55.126', '52.14.136.135', '18.219.5.43', '18.216.200.189', '18.216.229.235', '18.218.11.51', '18.216.24.42']
attack_set/SQL_Injection.csv: total= 178, attack= 178, attack_ips= ['18.218.115.60']
attack_set/ssldosA.csv: total= 100987, attack= 100987, attack_ips= ['10.0.0.1', '10.0.0

In [17]:
# Peek at the first rows of the infiltration capture.
infiltration_df = pd.read_csv("attack_set/infiltration.csv")
infiltration_preview = infiltration_df.head()
display(infiltration_preview)

# Disabled debugging aid: prints both endpoints of every packet and whether
# either of them is listed as an attacker IP for "infiltration".
# for row in infiltration_df.iterrows():
#     print(row[1]["src_ip"], row[1]["dst_ip"])
#     print (row[1]["src_ip"] in attack_ips_dict["infiltration"] or row[1]["dst_ip"] in attack_ips_dict["infiltration"])

Unnamed: 0,src_ip,dst_ip,src_port,dst_port,protocol,proto_code,pkt_length,timestamp,tos,id,ttl,chksum,flags,tcp_window,tcp_dataoffset,udp_length
0,172.31.69.13,13.58.225.34,50887,31337,6,1,66,1519913000.0,2,16350,128,0,16384,8192,8,
1,13.58.225.34,172.31.69.13,31337,50887,6,1,66,1519913000.0,0,0,63,23611,16384,26883,8,
2,172.31.69.13,13.58.225.34,50887,31337,6,1000,54,1519913000.0,0,16351,128,0,16384,256,5,
3,13.58.225.34,172.31.69.13,31337,50887,6,1000,58,1519913000.0,0,41463,63,47691,16384,211,5,
4,13.58.225.34,172.31.69.13,31337,50887,6,1000,1514,1519913000.0,0,41464,63,46234,16384,211,5,
