# Combine supervised and unsupervised learning

In [9]:
import pandas as pd
from model import get_flows, transform, get_metrics, train_ae, train_kmeans, test_ae, test_kmeans
from model import AutoEncoder, Dataset
import numpy as np
import os
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.svm import SVC
import json
import skops.io as sio
import torch.nn as nn
import torch
from config import whisper_config

MAX_LEN = whisper_config["n_fft"] * 2

## Mix benign and attack traffic

In [2]:
def get_train_test_df(benign_path: str, attack_path: str):
    df_benign = pd.read_csv(benign_path)
    df_benign["label"] = 1
    df_attack = pd.read_csv(attack_path)
    df_attack["label"] = -1
    df_mix = pd.concat([df_benign, df_attack], ignore_index=True, axis=0)
    df_group = df_mix.groupby(["src_ip", "dst_ip", "src_port", "dst_port", "protocol"])
    df_mix["group"] = df_group.ngroup()
    df_mix["group"] = df_mix["group"] % 10
    df_mix["group"] = df_mix["group"].apply(lambda x: "train" if x < 6 else "test")
    # df_mix["group"].apply(lambda x: "train" if np.random.rand() < train_test_ratio else "test")
    df_train = df_mix[df_mix["group"] == "train"]
    df_test = df_mix[df_mix["group"] == "test"]
    return df_train, df_test

In [3]:
def get_mix_df(benign_path: str = None, 
                attack_path: str = None, 
                attack_ratio: int = None):
    assert benign_path is not None or attack_path is not None, \
        "benign_path and attack_path cannot be None at the same time"
    if benign_path is not None:
        df_benign = pd.read_csv(benign_path)
        df_benign["label"] = 1
    else:
        df_benign = None
    if attack_path is not None:
        df_attack = pd.read_csv(attack_path)
        df_attack["label"] = -1
        if attack_ratio is not None:
            df_group = df_attack.groupby(["src_ip", "dst_ip", "src_port", "dst_port", "protocol"])
            df_attack["group"] = df_group.ngroup()
            df_attack["group"] = df_attack["group"] % 500
            df_attack["group"] = df_attack["group"].apply(lambda x: "train" if x < attack_ratio else "test")
            df_attack = df_attack[df_attack["group"] == "train"]
    else:
        df_attack = None
    df_mix = pd.concat([df_benign, df_attack], ignore_index=True, axis=0)
    return df_mix

In [4]:

benign_filenames = [os.path.join("train_set", "benign" + str(i) + ".csv") 
                    for i in range(1, 3)]
attack_filenames = [os.path.join("attack_set", x) for x in 
                    os.listdir("attack_set") if x.endswith(".csv")]

## Train a single supervised model

In [15]:
def train_supervise(data, labels, save_path, model="dt"):
    if model == "dt":
        clf = DecisionTreeClassifier(random_state=0)
    elif model == "svm":
        clf = SVC()
    else:
        clf = GaussianNB()
        # clf = MultinomialNB()
    clf.fit(data, labels)
    if not os.path.exists(os.path.dirname(save_path)):
        os.makedirs(os.path.dirname(save_path))
    sio.dump(clf, save_path)


def test_supervise(data, labels, load_path, test_data_aug=False):
    clf = sio.load(load_path, True)
    if not test_data_aug:
        preds = clf.predict(data)
    else:
        preds = []
        for val in data:
            pred = clf.predict(val)
            preds.append(1 if sum(pred) > -1*len(pred) else -1)
    return get_metrics(labels, preds)

In [13]:
USE_DATA_AUG = True

model = "dt"
if model == "dt":
    save_model_name = "decision_tree"
elif model == "svm":
    save_model_name = "svm"
else:
    save_model_name = "naive_bayes"

In [26]:
train_benign_filename = "dataset/benign_small.csv"
df_train = get_mix_df(benign_path=train_benign_filename, 
                    attack_path=attack_filenames[1],
                    attack_ratio=1)
train_packet_data, train_packet_labels, train_flow_data, train_flow_labels \
    = transform(get_flows(df_train), data_type="train")
train_data = train_flow_data + train_packet_data
train_labels = train_flow_labels + train_packet_labels

print(np.array(train_data).shape)
print(np.array(train_labels).shape)

(4784, 26)
(4784,)


In [17]:
train_benign_filename = "dataset/benign_small.csv"
for train_attack_filename in attack_filenames[:3]:
    df_train = get_mix_df(benign_path=train_benign_filename, 
                    attack_path=train_attack_filename,
                    attack_ratio=1)
    train_packet_data, train_packet_labels, train_flow_data, train_flow_labels \
        = transform(get_flows(df_train), data_type="train")

    save_path = os.path.join("model", "whisper", save_model_name, 
                os.path.basename(train_benign_filename) + "_" + \
                os.path.basename(train_attack_filename) + ".skops")
    train_data = train_flow_data + train_packet_data
    train_labels = train_flow_labels + train_packet_labels
    train_supervise(train_data, train_labels, save_path, model=model)


In [16]:
train_benign_filename = "dataset/benign_small.csv"
with open("attacker-ips.json", "r") as f:
    attack_ips_dict = json.load(f)

total_acc_dict = {}
for train_attack_filename in attack_filenames[:2]:
    save_path = os.path.join("model", "whisper", save_model_name, 
                os.path.basename(train_benign_filename) + "_" + \
                os.path.basename(train_attack_filename) + ".skops")

    acc_dict = {}
    for attack_filename in attack_filenames:
        df_benign = pd.read_csv(train_benign_filename)
        df_attack = pd.read_csv(attack_filename)
        df_test = pd.concat([df_benign, df_attack], ignore_index=True)

        file_key = os.path.basename(attack_filename).split(".")[0]
        cur_attack_ips = attack_ips_dict.get(file_key, [])
        df_test["label"] = 0
        for row in df_test.iterrows():
            if row[1]["src_ip"] in cur_attack_ips or row[1]["dst_ip"] in cur_attack_ips:
                df_test.loc[row[0], "label"] = -1
            else:
                df_test.loc[row[0], "label"] = 1
        
        test_packet_data, test_packet_labels, test_flow_data, test_flow_labels \
        = transform(get_flows(df_test), data_type="test", test_data_aug=USE_DATA_AUG)
        test_data =  test_flow_data + test_packet_data
        test_labels = test_flow_labels + test_packet_labels
        acc = test_supervise(test_data, test_labels, save_path, test_data_aug=USE_DATA_AUG)
        acc_dict[attack_filename] = acc
        print(attack_filename, acc)
    total_acc_dict[train_attack_filename] = acc_dict

metrics_save_path = os.path.join("result", "whisper", save_model_name, 
                    "all-metrics.json")
os.makedirs(os.path.dirname(metrics_save_path), exist_ok=True)
with open(metrics_save_path, "w") as f:
    json.dump(total_acc_dict, f, indent=4)

attack_set/LDoS_small.csv (0.999950149551346, 0.9999409925060483, 1.0, 0.9999704953825274, nan, nan, 2.9503746975865936e-05, 1.0)
attack_set/osscan.csv (0.6057375460360535, 0.005865102639296188, 1.0, 0.011661807580174927, nan, nan, 0.4970674486803519, 1.0)
attack_set/infiltration.csv (0.9987175376723308, 0.3333333333333333, 1.0, 0.5, nan, nan, 0.3333333333333333, 1.0)
attack_set/HOIC_small.csv (0.9151494183280716, 0.9807629427792915, 0.9179332857288585, 0.9483085678153652, nan, nan, 0.17817080181379022, 0.662895453593128)
attack_set/BruteForce-Web.csv (0.9790374963094184, 0.7408759124087592, 1.0, 0.8511530398322852, nan, nan, 0.12956204379562045, 1.0)


  _warn_prf(average, modifier, msg_start, len(result))


attack_set/LOIC_UDP_small.csv (0.998716714789862, 0.0, 0.0, 0.0, nan, nan, 0.5, 0.0)
attack_set/SQL_Injection.csv (0.9993644741023197, 0.9411764705882353, 1.0, 0.9696969696969697, nan, nan, 0.029411764705882353, 1.0)
attack_set/ssldosA.csv (0.9923639834552975, 0.2, 1.0, 0.33333333333333337, nan, nan, 0.4, 1.0)
attack_set/fuzzscan.csv (0.4791857973676156, 0.005261619409529378, 1.0, 0.010468159348647864, nan, nan, 0.4973691902952353, 1.0)
attack_set/BruteForce-XSS.csv (0.9771803688652704, 0.1511627906976744, 1.0, 0.2626262626262626, nan, nan, 0.42441860465116277, 1.0)


  _warn_prf(average, modifier, msg_start, len(result))


attack_set/LDoS_small.csv (0.15518444666001993, 0.0, 0.0, 0.0, nan, nan, 0.5, 0.0)
attack_set/osscan.csv (0.9912773793370808, 0.9780058651026393, 1.0, 0.9888806523350631, nan, nan, 0.010997067448680353, 1.0)


  _warn_prf(average, modifier, msg_start, len(result))


attack_set/infiltration.csv (0.9980763065084963, 0.0, 0.0, 0.0, nan, nan, 0.5, 0.0)


  _warn_prf(average, modifier, msg_start, len(result))


attack_set/HOIC_small.csv (0.20641785235479826, 0.0, 0.0, 0.0, nan, nan, 0.5, 0.0)


  _warn_prf(average, modifier, msg_start, len(result))


attack_set/BruteForce-Web.csv (0.9191024505462061, 0.0, 0.0, 0.0, nan, nan, 0.5, 0.0)


  _warn_prf(average, modifier, msg_start, len(result))


attack_set/LOIC_UDP_small.csv (0.998716714789862, 0.0, 0.0, 0.0, nan, nan, 0.5, 0.0)


  _warn_prf(average, modifier, msg_start, len(result))


attack_set/SQL_Injection.csv (0.9891960597394344, 0.0, 0.0, 0.0, nan, nan, 0.5, 0.0)


  _warn_prf(average, modifier, msg_start, len(result))


attack_set/ssldosA.csv (0.9904549793191219, 0.0, 0.0, 0.0, nan, nan, 0.5, 0.0)
attack_set/fuzzscan.csv (0.9801040710131619, 0.9619994153756212, 1.0, 0.980631704410012, nan, nan, 0.01900029231218942, 1.0)
attack_set/BruteForce-XSS.csv (0.9731165989371678, 0.0, 0.0, 0.0, nan, nan, 0.5, 0.0)


  _warn_prf(average, modifier, msg_start, len(result))


In [19]:
metrics_save_path = os.path.join("result", "whisper", save_model_name, 
                    "all-metrics.json")
os.makedirs(os.path.dirname(metrics_save_path), exist_ok=True)
with open(metrics_save_path, "w") as f:
    json.dump(total_acc_dict, f, indent=4)

In [87]:
detect_type = save_model_name if USE_DATA_AUG else save_model_name + "_no_aug"
accuracy_dict = {}

for test_filename in benign_filenames + attack_filenames:
    if test_filename in benign_filenames:
        df_test = get_mix_df(benign_path=test_filename)
    else:
        df_test = get_mix_df(attack_path=test_filename)
    test_packet_data, test_packet_labels, test_flow_data, test_flow_labels \
    = transform(get_flows(df_test), data_type="test", test_data_aug=USE_DATA_AUG)
    test_data = test_flow_data + test_packet_data
    test_labels = test_flow_labels + test_packet_labels
    acc = test_supervise(test_data, test_labels, save_path, test_data_aug=USE_DATA_AUG)
    print(f"accuracy of {test_filename}: {acc}")
    accuracy_dict[test_filename] = acc

accuracy_base_name = "all-accuracy.json"
accuracy_save_path = os.path.join("result", "whisper", detect_type, 
                    os.path.basename(train_benign_filename)+"-"+os.path.basename(train_attack_filename),
                    accuracy_base_name)
os.makedirs(os.path.dirname(accuracy_save_path), exist_ok=True)
with open(accuracy_save_path, "w") as f:
    json.dump(accuracy_dict, f, indent=4)

accuracy of train_set/benign1.csv: 1.0
accuracy of train_set/benign2.csv: 1.0
accuracy of dataset_lite/mirai-benign.csv: 1.0
accuracy of dataset_lite/BruteForce-Web.csv: 0.0
accuracy of dataset_lite/BruteForce-XSS.csv: 0.0
accuracy of dataset_lite/infiltration.csv: 0.0
accuracy of dataset_lite/osscan.csv: 0.0
accuracy of dataset_lite/SQL_Injection.csv: 0.0
accuracy of dataset_lite/ssldosA10only.csv: 0.21739130434782608
accuracy of dataset_lite/mirai-attack.csv: 0.0


# Train semi-supervised ensemble

In [5]:
def train_dt(train_data, train_labels, dt_save_path):
    clf = DecisionTreeClassifier(random_state=0)
    clf.fit(train_data, train_labels)
    if not os.path.exists(os.path.dirname(dt_save_path)):
        os.makedirs(os.path.dirname(dt_save_path))
    sio.dump(clf, dt_save_path)

In [6]:

def train_ensemble(benign_data, benign_labels, attack_data, attack_labels, 
        kmeans_save_path, n_clusters, dt_save_path, ae_save_dir,
        model, criterion, optimizer, device, batch_size=32, num_epochs=20
        ):
    train_dt(benign_data+attack_data, benign_labels+attack_labels, dt_save_path)
    train_kmeans(benign_data, kmeans_save_path, n_clusters=n_clusters)
    train_ae(benign_data, benign_labels, ae_save_dir, model, criterion, optimizer, 
             device, batch_size=batch_size, num_epochs=num_epochs)

In [7]:
def test_dt(data, dt_load_path):
    clf = sio.load(dt_load_path, True)
    dt_preds = []
    for val in data:
        pred = clf.predict(val)
        dt_preds.append(1 if sum(pred) > -1*len(pred) else -1)
    return dt_preds

In [40]:
def test_ensemble(dataw, labels, kmeans_load_path,
         aew_input_dim, aew_load_path, dt_load_path, 
         dt_train_data, dt_train_labels, test_attack,
         kmeans_scale=7, aew_scale=3,
         test_data_aug=False):
    
    kmeans_preds, kmeans_ratios, kmeans_loss_list, kmeans_threshold = \
        test_kmeans(dataw, kmeans_load_path, whisper_config, scale=kmeans_scale)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    criterion = nn.MSELoss()
    model_aew = AutoEncoder(aew_input_dim)
    model_aew.load_state_dict(torch.load(os.path.join(aew_load_path, "model.pt")))
    model_aew.to(device)
    with open(os.path.join(aew_load_path, "train_loss.json"), "r") as f:
        loss_list = json.load(f)
    threshold = torch.tensor(loss_list).mean().item()
    aew_preds, aew_ratios, aew_loss_list, aew_threshold = \
        test_ae(dataw, model_aew, device, criterion, threshold, 
                scale=aew_scale, test_data_aug=test_data_aug, decoder_sigmoid=False)
    
    dt_preds = test_dt(dataw, dt_load_path)

    preds_majority, preds_positive = [], []
    for idx in range(len(kmeans_preds)):
        preds_majority.append(np.sign(kmeans_preds[idx] + dt_preds[idx] + aew_preds[idx]))
        if kmeans_preds[idx] == -1 or dt_preds[idx] == -1 or aew_preds[idx] == -1:
            preds_positive.append(-1)
        else:
            preds_positive.append(1)
    
    start_idx = len(kmeans_preds) // 2
    for idx in range(start_idx):
        if kmeans_preds[idx] == -1 and aew_preds[idx] == -1:
            dt_train_data.append(np.array(dataw[idx]).mean(axis=0).tolist())
            dt_train_labels.append(labels[idx])
    
    new_dt_save_path = os.path.join("model", "semi", "dt", test_attack)
    train_dt(dt_train_data, dt_train_labels, new_dt_save_path)
    new_dt_preds = test_dt(dataw, new_dt_save_path)

    # only use the latter half data
    new_dt_preds[:start_idx] = dt_preds[:start_idx]
    new_preds_majority, new_preds_positive = preds_majority[:start_idx], preds_positive[:start_idx]
    for idx in range(start_idx, len(kmeans_preds)):
        new_preds_majority.append(np.sign(kmeans_preds[idx] + new_dt_preds[idx] + aew_preds[idx]))
        if kmeans_preds[idx] == -1 or new_dt_preds[idx] == -1 or aew_preds[idx] == -1:
            new_preds_positive.append(-1)
        else:
            new_preds_positive.append(1)
        
    return {
        "kmeans": get_metrics(labels, kmeans_preds),
        "dt": get_metrics(labels, dt_preds),
        "aew": get_metrics(labels, aew_preds),
        "preds_majority": get_metrics(labels, preds_majority),
        "preds_positive": get_metrics(labels, preds_positive),
        "new_dt": get_metrics(labels, new_dt_preds),
        "new_preds_majority": get_metrics(labels, new_preds_majority),
        "new_preds_positive": get_metrics(labels, new_preds_positive),
    }


In [22]:
USE_DATA_AUG = True
USE_SHORT_FLOW = True

aec_input_dim = MAX_LEN
aew_input_dim = whisper_config["n_fft"] // 2 + 1
kmeans_save_path = os.path.join("model", "whisper", "kmeans-all", 
                    os.path.basename(train_benign_filename), "kmeans.json")
aew_save_dir = os.path.join("model", "whisper", "autoencoder-all", 
                    os.path.basename(train_benign_filename))

In [None]:
# data for train whisper and FAE

# train_df = pd.read_csv(train_benign_filename)
# train_df["label"] = 1
# train_packet_data, train_packet_labels, train_flow_data, train_flow_labels \
# = transform(get_flows(train_df))

# train_data = train_flow_data + train_packet_data
# train_labels = train_flow_labels + train_packet_labels

# train whisper and FAE
# do not train again, just load trained model

In [28]:
train_benign_filename = "dataset/benign_small.csv"
train_attack_filename = "attack_set/osscan.csv"

df_train = get_mix_df(benign_path=train_benign_filename, 
                    attack_path=train_attack_filename,
                    attack_ratio=1)
train_packet_data, train_packet_labels, train_flow_data, train_flow_labels \
    = transform(get_flows(df_train), data_type="train")

train_data = train_flow_data + train_packet_data
train_labels = train_flow_labels + train_packet_labels

dt_save_path = os.path.join("model", "whisper", "decision_tree", 
                os.path.basename(train_benign_filename) + "_" + \
                os.path.basename(train_attack_filename) + ".skops")

## Test semi-ensemble

In [41]:
with open("attacker-ips.json", "r") as f:
    attack_ips_dict = json.load(f)
acc_dict = {}

for attack_filename in attack_filenames:
    df_benign = pd.read_csv(train_benign_filename)
    df_attack = pd.read_csv(attack_filename)
    df_test = pd.concat([df_benign, df_attack], ignore_index=True)

    file_key = os.path.basename(attack_filename).split(".")[0]
    cur_attack_ips = attack_ips_dict.get(file_key, [])
    df_test["label"] = 0
    for row in df_test.iterrows():
        if row[1]["src_ip"] in cur_attack_ips or row[1]["dst_ip"] in cur_attack_ips:
            df_test.loc[row[0], "label"] = -1
        else:
            df_test.loc[row[0], "label"] = 1
    
    test_packet_data, test_packet_labels, test_flow_data, test_flow_labels \
    = transform(get_flows(df_test), data_type="test", test_data_aug=USE_DATA_AUG)
    test_data =  test_flow_data + test_packet_data
    test_labels = test_flow_labels + test_packet_labels
    
    acc = test_ensemble(test_data, test_labels, 
                        kmeans_save_path, aew_input_dim, aew_save_dir, 
                        dt_save_path, train_data, train_labels, 
                        os.path.basename(attack_filename),
                        test_data_aug=USE_DATA_AUG)
    
    acc_dict[attack_filename] = acc
    print(attack_filename, acc)
    
metrics_save_path = os.path.join("result", "semi-ensemble", train_attack_filename, 
                    "all-metrics.json")
os.makedirs(os.path.dirname(metrics_save_path), exist_ok=True)
with open(metrics_save_path, "w") as f:
    json.dump(acc_dict, f, indent=4)

  _warn_prf(average, modifier, msg_start, len(result))


attack_set/LDoS_small.csv {'kmeans': (0.9998504486540379, 1.0, 0.9998230088495575, 0.9999114965926188, nan, nan, 0.00048185030517183813, 0.9990362993896563), 'dt': (0.15518444666001993, 0.0, 0.0, 0.0, nan, nan, 0.5, 0.0), 'aew': (0.999950149551346, 1.0, 0.9999409959877271, 0.9999704971234695, nan, nan, 0.00016061676839063121, 0.9996787664632187), 'preds_majority': (0.999950149551346, 1.0, 0.9999409959877271, 0.9999704971234695, nan, nan, 0.00016061676839063121, 0.9996787664632187), 'preds_positive': (0.9998504486540379, 1.0, 0.9998230088495575, 0.9999114965926188, nan, nan, 0.00048185030517183813, 0.9990362993896563), 'new_dt': (0.65518444666002, 0.5918451643358706, 1.0, 0.7435963969307188, nan, nan, 0.20407741783206468, 1.0), 'new_preds_majority': (0.999950149551346, 1.0, 0.9999409959877271, 0.9999704971234695, nan, nan, 0.00016061676839063121, 0.9996787664632187), 'new_preds_positive': (0.9998504486540379, 1.0, 0.9998230088495575, 0.9999114965926188, nan, nan, 0.00048185030517183813,

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


attack_set/infiltration.csv {'kmeans': (0.997755690926579, 0.3333333333333333, 0.4, 0.3636363636363636, nan, nan, 0.33381518363850515, 0.9990362993896563), 'dt': (0.9980763065084963, 0.0, 0.0, 0.0, nan, nan, 0.5, 0.0), 'aew': (0.9983969220904136, 0.3333333333333333, 0.6666666666666666, 0.4444444444444444, nan, nan, 0.33349395010172395, 0.9996787664632187), 'preds_majority': (0.9983969220904136, 0.3333333333333333, 0.6666666666666666, 0.4444444444444444, nan, nan, 0.33349395010172395, 0.9996787664632187), 'preds_positive': (0.997755690926579, 0.3333333333333333, 0.4, 0.3636363636363636, nan, nan, 0.33381518363850515, 0.9990362993896563), 'new_dt': (0.9980763065084963, 0.0, 0.0, 0.0, nan, nan, 0.5, 0.0), 'new_preds_majority': (0.9983969220904136, 0.3333333333333333, 0.6666666666666666, 0.4444444444444444, nan, nan, 0.33349395010172395, 0.9996787664632187), 'new_preds_positive': (0.997755690926579, 0.3333333333333333, 0.4, 0.3636363636363636, nan, nan, 0.33381518363850515, 0.9990362993896

  _warn_prf(average, modifier, msg_start, len(result))


attack_set/HOIC_small.csv {'kmeans': (0.5693032910954461, 0.5026702997275204, 0.9171721189221438, 0.649417397120428, nan, nan, 0.3359265304211759, 0.8254766394301278), 'dt': (0.20641785235479826, 0.0, 0.0, 0.0, nan, nan, 0.5, 0.0), 'aew': (0.6531159451628249, 0.6163487738419619, 0.9201855015865267, 0.7382265591854051, nan, nan, 0.29459116933294743, 0.7944688874921433), 'preds_majority': (0.5693897850624919, 0.5026702997275204, 0.9173545499751368, 0.6494631226896672, nan, nan, 0.3357170185837571, 0.8258956631049654), 'preds_positive': (0.653029451195779, 0.6163487738419619, 0.9200357927275685, 0.7381783767907841, nan, nan, 0.29480068117036623, 0.7940498638173057), 'new_dt': (0.550880076114691, 0.48463215258855585, 0.9055086040118114, 0.6313584892265095, nan, nan, 0.3548974162680518, 0.8055730148753405), 'new_preds_majority': (0.6117285819314103, 0.5641961852861035, 0.9134462678665961, 0.6975475003368818, nan, nan, 0.3206674636108766, 0.7944688874921433), 'new_preds_positive': (0.7676339

  _warn_prf(average, modifier, msg_start, len(result))


attack_set/BruteForce-Web.csv {'kmeans': (0.9813994685562445, 0.781021897810219, 0.9861751152073732, 0.8716904276985743, nan, nan, 0.10997090140006235, 0.9990362993896563), 'dt': (0.9191024505462061, 0.0, 0.0, 0.0, nan, nan, 0.5, 0.0), 'aew': (0.9790374963094184, 0.7445255474452555, 0.9951219512195122, 0.8517745302713987, nan, nan, 0.1278978430457629, 0.9996787664632187), 'preds_majority': (0.9790374963094184, 0.7445255474452555, 0.9951219512195122, 0.8517745302713987, nan, nan, 0.1278978430457629, 0.9996787664632187), 'preds_positive': (0.9813994685562445, 0.781021897810219, 0.9861751152073732, 0.8716904276985743, nan, nan, 0.10997090140006235, 0.9990362993896563), 'new_dt': (0.9586654856805432, 0.48905109489051096, 1.0, 0.6568627450980392, nan, nan, 0.25547445255474455, 1.0), 'new_preds_majority': (0.9790374963094184, 0.7445255474452555, 0.9951219512195122, 0.8517745302713987, nan, nan, 0.1278978430457629, 0.9996787664632187), 'new_preds_positive': (0.9813994685562445, 0.781021897810

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


attack_set/LOIC_UDP_small.csv {'kmeans': (0.998716714789862, 0.75, 0.5, 0.6, nan, nan, 0.12548185030517184, 0.9990362993896563), 'dt': (0.998716714789862, 0.0, 0.0, 0.0, nan, nan, 0.5, 0.0), 'aew': (0.9993583573949311, 0.75, 0.75, 0.75, nan, nan, 0.12516061676839063, 0.9996787664632187), 'preds_majority': (0.9993583573949311, 0.75, 0.75, 0.75, nan, nan, 0.12516061676839063, 0.9996787664632187), 'preds_positive': (0.998716714789862, 0.75, 0.5, 0.6, nan, nan, 0.12548185030517184, 0.9990362993896563), 'new_dt': (0.998716714789862, 0.0, 0.0, 0.0, nan, nan, 0.5, 0.0), 'new_preds_majority': (0.9993583573949311, 0.75, 0.75, 0.75, nan, nan, 0.12516061676839063, 0.9996787664632187), 'new_preds_positive': (0.998716714789862, 0.75, 0.5, 0.6, nan, nan, 0.12548185030517184, 0.9990362993896563)}


  _warn_prf(average, modifier, msg_start, len(result))


attack_set/SQL_Injection.csv {'kmeans': (0.9984111852557992, 0.9411764705882353, 0.9142857142857143, 0.9275362318840579, nan, nan, 0.02989361501105419, 0.9990362993896563), 'dt': (0.9891960597394344, 0.0, 0.0, 0.0, nan, nan, 0.5, 0.0), 'aew': (0.9990467111534795, 0.9411764705882353, 0.9696969696969697, 0.955223880597015, nan, nan, 0.029572381474272984, 0.9996787664632187), 'preds_majority': (0.9990467111534795, 0.9411764705882353, 0.9696969696969697, 0.955223880597015, nan, nan, 0.029572381474272984, 0.9996787664632187), 'preds_positive': (0.9984111852557992, 0.9411764705882353, 0.9142857142857143, 0.9275362318840579, nan, nan, 0.02989361501105419, 0.9990362993896563), 'new_dt': (0.9993644741023197, 0.9411764705882353, 1.0, 0.9696969696969697, nan, nan, 0.029411764705882353, 1.0), 'new_preds_majority': (0.9990467111534795, 0.9411764705882353, 0.9696969696969697, 0.955223880597015, nan, nan, 0.029572381474272984, 0.9996787664632187), 'new_preds_positive': (0.9984111852557992, 0.94117647

  _warn_prf(average, modifier, msg_start, len(result))


attack_set/ssldosA.csv {'kmeans': (0.9914094813872096, 0.2, 0.6666666666666666, 0.30769230769230765, nan, nan, 0.40048185030517186, 0.9990362993896563), 'dt': (0.9904549793191219, 0.0, 0.0, 0.0, nan, nan, 0.5, 0.0), 'aew': (0.9923639834552975, 0.23333333333333334, 0.875, 0.3684210526315789, nan, nan, 0.383493950101724, 0.9996787664632187), 'preds_majority': (0.9920458160992682, 0.2, 0.8571428571428571, 0.32432432432432434, nan, nan, 0.40016061676839065, 0.9996787664632187), 'preds_positive': (0.9917276487432389, 0.23333333333333334, 0.7, 0.35, nan, nan, 0.3838151836385052, 0.9990362993896563), 'new_dt': (0.9942729875914731, 0.4, 1.0, 0.5714285714285715, nan, nan, 0.3, 1.0), 'new_preds_majority': (0.9920458160992682, 0.2, 0.8571428571428571, 0.32432432432432434, nan, nan, 0.40016061676839065, 0.9996787664632187), 'new_preds_positive': (0.9936366528794146, 0.43333333333333335, 0.8125, 0.5652173913043479, nan, nan, 0.28381518363850516, 0.9990362993896563)}
attack_set/fuzzscan.csv {'kmeans

  _warn_prf(average, modifier, msg_start, len(result))


attack_set/BruteForce-XSS.csv {'kmeans': (0.9887464832760238, 0.6162790697674418, 0.9464285714285714, 0.7464788732394365, nan, nan, 0.19234231542145092, 0.9990362993896563), 'dt': (0.9731165989371678, 0.0, 0.0, 0.0, nan, nan, 0.5, 0.0), 'aew': (0.9884338855892466, 0.5813953488372093, 0.9803921568627451, 0.7299270072992701, nan, nan, 0.209462942349786, 0.9996787664632187), 'preds_majority': (0.9884338855892466, 0.5813953488372093, 0.9803921568627451, 0.7299270072992701, nan, nan, 0.209462942349786, 0.9996787664632187), 'preds_positive': (0.9887464832760238, 0.6162790697674418, 0.9464285714285714, 0.7464788732394365, nan, nan, 0.19234231542145092, 0.9990362993896563), 'new_dt': (0.9753047827446076, 0.08139534883720931, 1.0, 0.15053763440860218, nan, nan, 0.45930232558139533, 1.0), 'new_preds_majority': (0.9884338855892466, 0.5813953488372093, 0.9803921568627451, 0.7299270072992701, nan, nan, 0.209462942349786, 0.9996787664632187), 'new_preds_positive': (0.9887464832760238, 0.616279069767

In [19]:
USE_DATA_AUG = True
detect_type = "ensemble" if USE_DATA_AUG else "ensemble-no-aug"
accuracy_dict = {}

for test_benign_filename in benign_filenames:
    test_df = pd.read_csv(test_benign_filename)
    test_df["label"] = 1
    test_packet_data, test_packet_lables, test_flow_data, test_flow_labels \
    = transform(get_flows(test_df), data_type="test", test_data_aug=USE_DATA_AUG)
    test_data = test_flow_data if not USE_SHORT_FLOW else test_flow_data + test_packet_data
    test_labels = test_flow_labels if not USE_SHORT_FLOW else test_flow_labels + test_packet_lables
    acc = test_ensemble(test_data, test_labels, kmeans_save_path, dt_save_path,
            model, device, criterion, threshold, whisper_config, test_data_aug=USE_DATA_AUG)
    print(f"accuracy of {test_benign_filename}: {acc}")
    accuracy_dict[test_benign_filename] = acc

for test_attack_filename in attack_filenames:
    test_df = pd.read_csv(test_attack_filename)
    test_df["label"] = -1
    test_packet_data, test_packet_lables, test_flow_data, test_flow_labels \
    = transform(get_flows(test_df), data_type="test", test_data_aug=USE_DATA_AUG)
    test_data = test_flow_data if not USE_SHORT_FLOW else test_flow_data + test_packet_data
    test_labels = test_flow_labels if not USE_SHORT_FLOW else test_flow_labels + test_packet_lables
    acc = test_ensemble(test_data, test_labels, kmeans_save_path, dt_save_path,
            model, device, criterion, threshold, whisper_config, test_data_aug=USE_DATA_AUG)
    print(f"accuracy of {test_attack_filename}: {acc}")
    accuracy_dict[test_attack_filename] = acc

accuracy_base_name = "flow-accuracy.json" if not USE_SHORT_FLOW else "all-accuracy.json"
accuracy_save_path = os.path.join("result", "whisper", detect_type, os.path.basename(train_benign_filename), accuracy_base_name)
os.makedirs(os.path.dirname(accuracy_save_path), exist_ok=True)
with open(accuracy_save_path, "w") as f:
    json.dump(accuracy_dict, f, indent=4)

accuracy of train_set/benign1.csv: 0.9630443467838594
accuracy of train_set/benign2.csv: 0.9627244340359095
accuracy of dataset_lite/mirai-benign.csv: 0.964148620265083
accuracy of dataset_lite/BruteForce-Web.csv: 0.9854014598540146
accuracy of dataset_lite/BruteForce-XSS.csv: 0.9767441860465116
accuracy of dataset_lite/infiltration.csv: 0.8333333333333334
accuracy of dataset_lite/osscan.csv: 0.007331378299120235
accuracy of dataset_lite/SQL_Injection.csv: 0.9411764705882353
accuracy of dataset_lite/ssldosA10only.csv: 0.8260869565217391
accuracy of dataset_lite/mirai-attack.csv: 0.03410001602820965
