# Test flow level ensemble on cic-ids2017 datasets

In [1]:
import pandas as pd
import numpy as np
import os
import json
import torch
import torch.nn as nn
from config import whisper_config
from sklearn.metrics import accuracy_score
from sklearn.cluster import KMeans
from model import AutoEncoder, Dataset, train_kmeans, test_kmeans, train_ae, test_ae, get_flows, transform
from model import get_metrics, test_ensemble, get_ensemble_result
from plot import plot_cdf, plot_line

MAX_LEN = whisper_config["n_fft"] * 2

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
benign_filenames = [os.path.join("train_set", "benign" + str(i) + ".csv") 
                    for i in range(1, 3)]
attack_filenames = [os.path.join("cic-ids", x) for x in 
                    os.listdir("cic-ids") if x.endswith(".csv")]

In [3]:
USE_DATA_AUG = True
USE_SHORT_FLOW = True

accuracy_dict = {}
suffix = "-all" if USE_SHORT_FLOW else "-long"
train_benign_filename = "cic-ids-benign/benign.csv"

aec_input_dim = MAX_LEN
aew_input_dim = whisper_config["n_fft"] // 2 + 1
kmeans_save_path = os.path.join("model-cic", "whisper", "kmeans"+suffix, 
                    os.path.basename(train_benign_filename), "kmeans.json")
aec_save_dir = os.path.join("model-cic", "autoencoding"+suffix, 
                        os.path.basename(train_benign_filename))
aew_save_dir = os.path.join("model-cic", "whisper", "autoencoder"+suffix, 
                        os.path.basename(train_benign_filename))

## Label benign traffic

In [12]:
with open("cic-attacker-ips.json", "r") as f:
    attack_ips_dict = json.load(f)
attack_ips_set = set()
for key, value in attack_ips_dict.items():
    attack_ips_set.update(value)

train_df = pd.read_csv("cic-ids-benign/benign.csv")
for row in train_df.iterrows():
    if row[1]["src_ip"] in attack_ips_set or row[1]["dst_ip"] in attack_ips_set:
        train_df.drop(row[0], inplace=True)
train_df.to_csv(train_benign_filename, index=False)

## Train Whisper

In [14]:
train_df = pd.read_csv(train_benign_filename)
train_df["label"] = 1
train_packet_data, train_packet_labels, train_flow_data, train_flow_labels \
= transform(get_flows(train_df))

train_data = train_flow_data if not USE_SHORT_FLOW else train_flow_data + train_packet_data
train_labels = train_flow_labels if not USE_SHORT_FLOW else train_flow_labels + train_packet_labels

In [15]:
train_kmeans(train_data, kmeans_save_path, whisper_config["val_K"])



## Train FAE

In [16]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_aew = AutoEncoder(aew_input_dim).to(device)
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model_aew.parameters(), lr=0.01, weight_decay=1e-5)
train_ae(torch.tensor(train_data), torch.tensor(train_labels), aew_save_dir,
         model_aew, criterion, optimizer, device, num_epochs=50)

Epoch 1/50, Loss: 293.7304
Epoch 2/50, Loss: 121.1765
Epoch 3/50, Loss: 48.1123
Epoch 4/50, Loss: 23.1398
Epoch 5/50, Loss: 19.3566
Epoch 6/50, Loss: 22.2214
Epoch 7/50, Loss: 31.8477
Epoch 8/50, Loss: 17.7351
Epoch 9/50, Loss: 18.4880
Epoch 10/50, Loss: 22.1270
Epoch 11/50, Loss: 19.1999
Epoch 12/50, Loss: 18.6823
Epoch 13/50, Loss: 14.5888
Epoch 14/50, Loss: 23.1581
Epoch 15/50, Loss: 11.6993
Epoch 16/50, Loss: 24.8537
Epoch 17/50, Loss: 22.4674
Epoch 18/50, Loss: 14.5400
Epoch 19/50, Loss: 10.7778
Epoch 20/50, Loss: 13.9572
Epoch 21/50, Loss: 18.8821
Epoch 22/50, Loss: 22.3701
Epoch 23/50, Loss: 27.8160
Epoch 24/50, Loss: 14.1748
Epoch 25/50, Loss: 22.6573
Epoch 26/50, Loss: 21.7075
Epoch 27/50, Loss: 17.6834
Epoch 28/50, Loss: 13.3385
Epoch 29/50, Loss: 17.1879
Epoch 30/50, Loss: 24.5757
Epoch 31/50, Loss: 14.6766
Epoch 32/50, Loss: 17.1005
Epoch 33/50, Loss: 15.8159
Epoch 34/50, Loss: 18.0359
Epoch 35/50, Loss: 19.7387
Epoch 36/50, Loss: 18.8387
Epoch 37/50, Loss: 18.0954
Epoch 38

## Train TAE

In [17]:
train_df = pd.read_csv(train_benign_filename)
train_df["label"] = 1
train_packet_data_, train_packet_labels_, train_flow_data_, train_flow_labels_ \
= transform(get_flows(train_df), feature_type="encoding")

train_data_ = train_flow_data_ if not USE_SHORT_FLOW else train_flow_data_ + train_packet_data_
train_labels_ = train_flow_labels_ if not USE_SHORT_FLOW else train_flow_labels_ + train_packet_labels_

In [18]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_aec = AutoEncoder(aec_input_dim, decoder_sigmoid=True).to(device)
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model_aec.parameters(), lr=0.01, weight_decay=1e-5)
train_ae(torch.tensor(train_data_), torch.tensor(train_labels_), aec_save_dir,
         model_aec, criterion, optimizer, device, num_epochs=50, decoder_sigmoid=True)

Epoch 1/50, Loss: 0.0224
Epoch 2/50, Loss: 0.0039
Epoch 3/50, Loss: 0.0033
Epoch 4/50, Loss: 0.0015
Epoch 5/50, Loss: 0.0046
Epoch 6/50, Loss: 0.0046
Epoch 7/50, Loss: 0.0026
Epoch 8/50, Loss: 0.0045
Epoch 9/50, Loss: 0.0032
Epoch 10/50, Loss: 0.0016
Epoch 11/50, Loss: 0.0013
Epoch 12/50, Loss: 0.0011
Epoch 13/50, Loss: 0.0021
Epoch 14/50, Loss: 0.0012
Epoch 15/50, Loss: 0.0014
Epoch 16/50, Loss: 0.0008
Epoch 17/50, Loss: 0.0016
Epoch 18/50, Loss: 0.0013
Epoch 19/50, Loss: 0.0012
Epoch 20/50, Loss: 0.0035
Epoch 21/50, Loss: 0.0029
Epoch 22/50, Loss: 0.0008
Epoch 23/50, Loss: 0.0022
Epoch 24/50, Loss: 0.0019
Epoch 25/50, Loss: 0.0005
Epoch 26/50, Loss: 0.0012
Epoch 27/50, Loss: 0.0011
Epoch 28/50, Loss: 0.0015
Epoch 29/50, Loss: 0.0011
Epoch 30/50, Loss: 0.0036
Epoch 31/50, Loss: 0.0022
Epoch 32/50, Loss: 0.0007
Epoch 33/50, Loss: 0.0015
Epoch 34/50, Loss: 0.0016
Epoch 35/50, Loss: 0.0037
Epoch 36/50, Loss: 0.0011
Epoch 37/50, Loss: 0.0006
Epoch 38/50, Loss: 0.0043
Epoch 39/50, Loss: 0.

## Test Ensemble

In [5]:
vote_method = "positive"

metrics_save_dir = os.path.join("result", "ensemble-cic",
                    os.path.basename(train_benign_filename))

with open("cic-attacker-ips.json", "r") as f:
    attack_ips_dict = json.load(f)

kmeans_scale = 1.5
aec_scale = 1.5
aew_scale = 1.5

for filename in attack_filenames:
    # benign_df = pd.read_csv(benign_filenames[0])
    # attack_df = pd.read_csv(filename)
    # test_df = pd.concat([benign_df, attack_df], ignore_index=True)
    test_df = pd.read_csv(filename)
    
    file_key = os.path.basename(filename).split(".")[0]
    cur_attack_ips = attack_ips_dict.get(file_key, [])
    test_df["label"] = 0
    for row in test_df.iterrows():
        if row[1]["src_ip"] in cur_attack_ips or row[1]["dst_ip"] in cur_attack_ips:
            test_df.loc[row[0], "label"] = -1
        else:
            test_df.loc[row[0], "label"] = 1

    metrics = get_ensemble_result(test_df, USE_DATA_AUG, USE_SHORT_FLOW, 
            kmeans_save_path, aec_input_dim, aec_save_dir, aew_input_dim, 
            aew_save_dir, vote_method=vote_method,
            kmeans_scale=kmeans_scale, aec_scale=aec_scale, aew_scale=aew_scale)

    print(f"metrics of {filename}: {metrics}")
    accuracy_dict[filename] = metrics

accuracy_base_name = "flow-metrics.json" if not USE_SHORT_FLOW else "all-metrics.json"
accuracy_save_path = os.path.join(metrics_save_dir, accuracy_base_name)
os.makedirs(os.path.dirname(accuracy_save_path), exist_ok=True)
with open(accuracy_save_path, "w") as f:
    json.dump(accuracy_dict, f, indent=4)

metrics of cic-ids/DoS-Slowhttptest.csv: {'kmeans': (0.6459417449483004, 0.6012084592145015, 0.2116645984754476, 0.3130982037498361, nan, nan, 0.37295755323561863, 0.6528764343142612), 'aec': (0.8031357707643442, 0.04179254783484391, 0.07593778591033852, 0.05391360831438779, nan, nan, 0.5185229751653976, 0.9211615018343611), 'aew': (0.8508481448942353, 0.00906344410876133, 0.07003891050583658, 0.016049933125278644, nan, nan, 0.5047961992632369, 0.9813441573647647), 'preds_majority': (0.8074609718186119, 0.04481369587109768, 0.08549471661863593, 0.058804096465147004, nan, nan, 0.51474872149697, 0.9256888611349622), 'preds_positive': (0.6366831114415085, 0.6042296072507553, 0.20725388601036268, 0.30864197530864196, nan, nan, 0.37702812042426714, 0.6417141519007104), 'preds_weighted': (0.65952557950936, 0.5926485397784491, 0.2177210506844247, 0.31845238095238093, nan, nan, 0.3687291997852739, 0.6698930606510031)}
metrics of cic-ids/Infiltration-Dropbox-2.csv: {'kmeans': (0.731069998092695

## Check train traffic

In [13]:
with open("cic-attacker-ips.json", "r") as f:
    attack_ips_dict = json.load(f)
attack_ips_set = set()
for key, value in attack_ips_dict.items():
    attack_ips_set.update(value)

train_df = pd.read_csv("cic-ids-benign/benign.csv")
attack_count = 0
for row in train_df.iterrows():
    if row[1]["src_ip"] in attack_ips_set or row[1]["dst_ip"] in attack_ips_set:
        attack_count += 1
print(f"Total packets: {len(train_df)}, attack packets: {attack_count}, attack ratio: {attack_count / len(train_df)}")

Total packets: 32802, attack packets: 0, attack ratio: 0.0


## Check attack packets distribution

In [5]:
with open("cic-attacker-ips.json", "r") as f:
    attack_ips_dict = json.load(f)

for filename in attack_filenames:
    attack_df = pd.read_csv(filename)
    file_key = os.path.basename(filename).split(".")[0]
    cur_attack_ips = attack_ips_dict.get(file_key, [])
    attack_packet_count = 0
    for row in attack_df.iterrows():
        if row[1]["src_ip"] in cur_attack_ips or row[1]["dst_ip"] in cur_attack_ips:
            attack_packet_count += 1
    print(f"{filename}: total= {len(attack_df)}, attack= {attack_packet_count}, attack_ips= {cur_attack_ips}")

cic-ids/DoS-Slowhttptest.csv: total= 146240, attack= 16974, attack_ips= ['205.174.165.73', '205.174.165.80', '172.16.0.1', '192.168.10.50']
cic-ids/Infiltration-Dropbox-2.csv: total= 69211, attack= 15061, attack_ips= ['205.174.165.73']
cic-ids/DDoS-LOIT.csv: total= 274493, attack= 0, attack_ips= [' 205.174.165.69', ' 205.174.165.70', ' 205.174.165.71']
cic-ids/Infiltration-Dropbox-3.csv: total= 199738, attack= 63232, attack_ips= ['205.174.165.73', '192.168.10.8']
cic-ids/Web-BruteForce.csv: total= 248360, attack= 4129, attack_ips= ['205.174.165.73', '205.174.165.80', '172.16.0.1 ', '192.168.10.50']
cic-ids/DoS-GoldenEye.csv: total= 159133, attack= 61928, attack_ips= ['205.174.165.73', '205.174.165.80', '172.16.0.1', '192.168.10.50']
cic-ids/BruteForce-SSH.csv: total= 131250, attack= 2862, attack_ips= ['205.174.165.73', '205.174.165.80', '172.16.0.1', '192.168.10.50']
cic-ids/BruteForce-FTP.csv: total= 543693, attack= 4556, attack_ips= ['205.174.165.73', '205.174.165.80', '172.16.0.1', 