# Extract flow level features by Whisper

In [1]:
import pandas as pd
import os
import json
import torch
import torch.nn as nn
from utils import Packet, Flow
from config import whisper_config
from sklearn.metrics import accuracy_score
from sklearn.cluster import KMeans

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def get_flows(df: pd.DataFrame, key_type: str = "default") -> dict:
    mp = dict()
    for idx in range(len(df)): # simulate the process of packet processing
        row = df.iloc[idx]
        pkt = Packet(
            src_ip=row["src_ip"],
            dst_ip=row["dst_ip"],
            src_port=row["src_port"],
            dst_port=row["dst_port"],
            protocol=row["protocol"],
            proto_code=row["proto_code"],
            pkt_length=row["pkt_length"],
            timestamp=row["timestamp"],
            ttl=row["ttl"],
            tcp_window=row["tcp_window"],
            tcp_dataoffset=row["tcp_dataoffset"],
            udp_length=row["udp_length"],
            label=row["label"],
        )
        key = pkt.key(type=key_type)
        if key not in mp:
            mp[key] = Flow()
        mp[key].add_packet(pkt)
    return mp

In [3]:
def transform(mp: dict, feature_type: str = "whisper", 
              data_type: str = "train", test_data_aug: bool = True):
    packet_data, flow_data = [], []
    packet_labels, flow_labels = [], []
    for key, flow in mp.items():
        vec = flow.vector(feature_type=feature_type)
        if feature_type == "whisper":
            if len(vec) <= (whisper_config["n_fft"] // 2):
                # packet level features
                # vec = flow.packet_vector(agg_type="mean") + flow.packet_vector(agg_type="std") \
                #     + flow.packet_vector(agg_type="max") + flow.packet_vector(agg_type="min")
                # packet_data.append(vec)
                # packet_labels.append(flow.label)

                # implement fft on short flows
                ten = torch.tensor(vec)
                ten_fft = torch.fft.fft(ten, n=(whisper_config["n_fft"] // 2)+1)
                ten_power = torch.pow(ten_fft.real, 2) + torch.pow(ten_fft.imag, 2)
                ten_res = (ten_power.squeeze()+1).log2()
                ten_res = torch.where(torch.isnan(ten_res), torch.zeros_like(ten_res), ten_res)
                ten_res = torch.where(torch.isinf(ten_res), torch.zeros_like(ten_res), ten_res)
                if data_type == "test" and test_data_aug:
                    # data shape for test data augmentation: (n_flow, n_sample, floor(n_fft/2)+1)
                    packet_data.append([ten_res.tolist()])
                else:
                    # data shape for no data augmentation: (n_flow, floor(n_fft/2)+1)
                    packet_data.append(ten_res.tolist())
                packet_labels.append(flow.label)
                
            else:
                # flow level featrues
                ten = torch.tensor(vec)
                # stft requirement: input_size > (n_fft // 2)
                # default return shape: (floor(n_fft/2)+1, n_frame, 2)
                ten_fft = torch.stft(ten, whisper_config["n_fft"])
                ten_power = torch.pow(ten_fft[:,:,0], 2) + torch.pow(ten_fft[:,:,1], 2)
                ten_res = ((ten_power.squeeze()+1).log2()).permute(1,0)
                ten_res = torch.where(torch.isnan(ten_res), torch.zeros_like(ten_res), ten_res)
                ten_res = torch.where(torch.isinf(ten_res), torch.zeros_like(ten_res), ten_res)
                # ten_res shape: (n_frame, floor(n_fft/2)+1)
                if data_type == "train":
                    if (ten_res.size(0) > whisper_config["mean_win_train"]):
                        for _ in range(whisper_config["num_train_sample"]):
                            start_idx = torch.randint(0, ten_res.size(0)
                                        - whisper_config["mean_win_train"], (1,)).item()
                            ten_tmp = ten_res[start_idx:start_idx+whisper_config["mean_win_train"],:].mean(dim=0)
                            flow_data.append(ten_tmp.tolist())
                    else:
                        flow_data.append(ten_res.mean(dim=0).tolist())
                else: # for test
                    if test_data_aug:
                        tmp_data = []
                        if (ten_res.size(0) > whisper_config["mean_win_test"]):
                            # data augmentation for kmeans on flows with length > mean_win_test
                            for idx in range(0, ten_res.size(0) - whisper_config["mean_win_test"], 
                                            whisper_config["mean_win_test"]):
                                ten_tmp = ten_res[idx:idx+whisper_config["mean_win_test"],:].mean(dim=0)
                                tmp_data.append(ten_tmp.tolist())
                        else:
                            # no data augmentation for kmeans on flows with length < mean_win_test
                            tmp_data.append(ten_res.mean(dim=0).tolist())
                        flow_data.append(tmp_data)
                        # data shape for augmentation: (n_flow, n_sample, floor(n_fft/2)+1)
                    else: # for other detection methods
                        flow_data.append(ten_res.mean(dim=0).tolist())
                        # data shape for no augmentation: (n_flow, floor(n_fft/2)+1)
                flow_labels.append(flow.label)
        elif feature_type == "encoding":
            # directly use the whisper encoding vector
            pass
        else: # for other feature types
            pass
    return packet_data, packet_labels, flow_data, flow_labels

# Train & test with unsupervised learning

In [5]:
def train_kmeans(train_data, save_path, n_clusters):
    train_data = torch.tensor(train_data)
    kmeans = KMeans(n_clusters=n_clusters, random_state=0)
    kmeans.fit(train_data.cpu().numpy())

    centroids = torch.tensor(kmeans.cluster_centers_)
    train_loss = torch.cdist(train_data, centroids, p=2).min(dim=1).values.mean()

    if not os.path.exists(os.path.dirname(save_path)):
        os.makedirs(os.path.dirname(save_path))
    with open(save_path, "w") as f:
        json.dump({
            "centroids": centroids.tolist(),
            "train_loss": train_loss.item(),
        }, f)

In [4]:
def test_kmeans(test_data, test_labels, load_path, scale=5):
    with open(load_path, "r") as f:
        model_param = json.load(f)
    centroids = torch.tensor(model_param["centroids"])
    train_loss = model_param["train_loss"]
    
    pred = []
    for val in test_data:
        val = torch.tensor(val)
        if (val.size(0) > whisper_config["mean_win_test"]):
            max_dist = 0
            for idx in range(0, val.size(0) - whisper_config["mean_win_test"], 
                             whisper_config["mean_win_test"]):
                ten_tmp = val[idx:idx+whisper_config["mean_win_test"],:].mean(dim=0)
                dist = torch.norm(ten_tmp - centroids, dim=1).min()
                max_dist = max(max_dist, dist)
            min_dist = max_dist
        else:
            min_dist = torch.norm(val.mean(dim=0) - centroids, dim=1).min()
        pred.append(-1 if min_dist > scale * train_loss else 1)
    return accuracy_score(test_labels, pred)
    

In [5]:
benign_filenames = [os.path.join("train_set", "benign" + str(i) + ".csv") 
                    for i in range(1, 3)]
attack_filenames = [os.path.join("attack_set", x) for x in 
                    os.listdir("attack_set") if x.endswith(".csv")]

In [11]:
USE_SHORT_FLOW = True
scale = 7

# train_benign_filename = benign_filenames[0]
# train_benign_filename = os.path.join("dataset_lite", "mirai-benign.csv")
train_benign_filename = "dataset/benign_small.csv"
base_name = "flow-kmeans.json" if not USE_SHORT_FLOW else "all-kmeans.json"
save_path = os.path.join("model", "whisper", "kmeans", os.path.basename(train_benign_filename), base_name)

In [79]:
train_df = pd.read_csv(train_benign_filename)
train_df["label"] = "unknown"
train_packet_data, train_packet_labels, train_flow_data, train_flow_labels = transform(get_flows(train_df))

train_data = train_flow_data if not USE_SHORT_FLOW else train_flow_data + train_packet_data
train_kmeans(train_data, save_path, whisper_config["val_K"])



In [12]:
USE_DATA_AUG = True
detect_type = "kmeans" if USE_DATA_AUG else "kmeans-no-aug"

accuracy_dict = {}

for test_benign_filename in benign_filenames:
    test_df = pd.read_csv(test_benign_filename)
    test_df["label"] = 1
    test_packet_data, test_packet_lables, test_flow_data, test_flow_labels \
    = transform(get_flows(test_df), data_type="test", test_data_aug=USE_DATA_AUG)
    test_data = test_flow_data if not USE_SHORT_FLOW else test_flow_data + test_packet_data
    test_labels = test_flow_labels if not USE_SHORT_FLOW else test_flow_labels + test_packet_lables
    acc = test_kmeans(test_data, test_labels, save_path, scale=scale)
    print(f"accuracy of {test_benign_filename}: {acc}")
    accuracy_dict[test_benign_filename] = acc

for test_attack_filename in attack_filenames:
    test_df = pd.read_csv(test_attack_filename)
    test_df["label"] = -1
    test_packet_data, test_packet_lables, test_flow_data, test_flow_labels \
    = transform(get_flows(test_df), data_type="test", test_data_aug=USE_DATA_AUG)
    test_data = test_flow_data if not USE_SHORT_FLOW else test_flow_data + test_packet_data
    test_labels = test_flow_labels if not USE_SHORT_FLOW else test_flow_labels + test_packet_lables
    acc = test_kmeans(test_data, test_labels, save_path, scale=scale)
    print(f"accuracy of {test_attack_filename}: {acc}")
    accuracy_dict[test_attack_filename] = acc

accuracy_base_name = "flow-accuracy.json" if not USE_SHORT_FLOW else "all-accuracy.json"
accuracy_save_path = os.path.join("result", "whisper", detect_type, 
                    os.path.basename(train_benign_filename), str(scale)+"-"+accuracy_base_name)
os.makedirs(os.path.dirname(accuracy_save_path), exist_ok=True)
with open(accuracy_save_path, "w") as f:
    json.dump(accuracy_dict, f, indent=4)

accuracy of train_set/benign1.csv: 0.9982021574111066
accuracy of train_set/benign2.csv: 0.9984387197501952
accuracy of attack_set/LDoS_small.csv: 0.0
accuracy of attack_set/osscan.csv: 0.006842619745845552
accuracy of attack_set/infiltration.csv: 0.3333333333333333
accuracy of attack_set/HOIC_small.csv: 0.5058723531911702
accuracy of attack_set/BruteForce-Web.csv: 0.781021897810219
accuracy of attack_set/LOIC_UDP_small.csv: 0.88
accuracy of attack_set/SQL_Injection.csv: 0.9411764705882353
accuracy of attack_set/ssldosA.csv: 0.23333333333333334
accuracy of attack_set/fuzzscan.csv: 0.006430868167202572
accuracy of attack_set/BruteForce-XSS.csv: 0.6395348837209303


## Mix benign and attack traffic during testing

In [85]:
def get_mix_df(benign_path: str = None, attack_path: str = None):
    assert benign_path is not None or attack_path is not None, \
        "benign_path and attack_path cannot be None at the same time"
    if benign_path is not None:
        df_benign = pd.read_csv(benign_path)
        df_benign["label"] = 1
    else:
        df_benign = None
    if attack_path is not None:
        df_attack = pd.read_csv(attack_path)
        df_attack["label"] = -1
    else:
        df_attack = None
    df_mix = pd.concat([df_benign, df_attack], ignore_index=True, axis=0)
    return df_mix

In [87]:
USE_DATA_AUG = True
detect_type = "kmeans" if USE_DATA_AUG else "kmeans-no-aug"

accuracy_dict = {}
test_benign_filename = benign_filenames[0]

for test_attack_filename in attack_filenames:
    test_df = get_mix_df(benign_path=test_benign_filename, 
                         attack_path=test_attack_filename)
    test_packet_data, test_packet_lables, test_flow_data, test_flow_labels \
    = transform(get_flows(test_df), data_type="test", test_data_aug=USE_DATA_AUG)
    test_data = test_flow_data if not USE_SHORT_FLOW else test_flow_data + test_packet_data
    test_labels = test_flow_labels if not USE_SHORT_FLOW else test_flow_labels + test_packet_lables
    acc = test_kmeans(test_data, test_labels, save_path, scale=10)
    print(f"accuracy of {test_attack_filename}: {acc}")
    accuracy_dict[test_benign_filename+"-"+test_attack_filename] = acc

accuracy_base_name = "flow-mix-accuracy.json" if not USE_SHORT_FLOW else "all-mix-accuracy.json"
accuracy_save_path = os.path.join("result", "whisper", detect_type, 
                    os.path.basename(train_benign_filename), accuracy_base_name)
os.makedirs(os.path.dirname(accuracy_save_path), exist_ok=True)
with open(accuracy_save_path, "w") as f:
    json.dump(accuracy_dict, f, indent=4)

accuracy of dataset_lite/BruteForce-Web.csv: 0.9740530303030303
accuracy of dataset_lite/BruteForce-XSS.csv: 0.9844854673998429
accuracy of dataset_lite/infiltration.csv: 0.9990023942537909
accuracy of dataset_lite/osscan.csv: 0.7107203630175837
accuracy of dataset_lite/SQL_Injection.csv: 0.9968253968253968
accuracy of dataset_lite/ssldosA10only.csv: 0.9954265261483396
accuracy of dataset_lite/mirai-attack.csv: 0.1684800747613644


# Train and test LOF

In [53]:
from sklearn.neighbors import LocalOutlierFactor
import skops.io as sio
from sklearn.metrics import accuracy_score

def train_lof(data, save_path):
    clf = LocalOutlierFactor(novelty=True)
    clf.fit(data)
    if not os.path.exists(os.path.dirname(save_path)):
        os.makedirs(os.path.dirname(save_path))
    sio.dump(clf, save_path)

def test_lof(data, labels, load_path, test_data_aug=False):
    clf = sio.load(load_path)
    if not test_data_aug:
        preds = clf.predict(data)
    else:
        preds = []
        for val in data:
            pred = clf.predict(val)
            preds.append(1 if sum(pred) > -1*len(pred) else -1)
    return accuracy_score(labels, preds)

In [54]:
import pandas as pd

USE_SHORT_FLOW = True

# train_benign_filename = benign_filenames[0]
# train_benign_filename = os.path.join("dataset_lite", "mirai-benign.csv")
train_benign_filename = "dataset/benign_small.csv"
train_df = pd.read_csv(train_benign_filename)
train_df["label"] = "unknown"
train_packet_data, train_packet_labels, train_flow_data, train_flow_labels = transform(get_flows(train_df))

base_name = "flow-lof.skops" if not USE_SHORT_FLOW else "all-lof.skops"
save_path = os.path.join("model", "whisper", "lof", os.path.basename(train_benign_filename), base_name)
train_data = train_flow_data if not USE_SHORT_FLOW else train_flow_data + train_packet_data
train_lof(train_data, save_path)

In [71]:
import numpy as np

USE_DATA_AUG = True
detect_type = "lof" if USE_DATA_AUG else "lof-no-aug"

accuracy_dict = {}

for test_benign_filename in benign_filenames[:2]:
    test_df = pd.read_csv(test_benign_filename)
    test_df["label"] = 1
    test_packet_data, test_packet_lables, test_flow_data, test_flow_labels \
        = transform(get_flows(test_df), data_type="test", test_data_aug=USE_DATA_AUG)
    test_data = test_flow_data if not USE_SHORT_FLOW else test_flow_data + test_packet_data
    test_labels = test_flow_labels if not USE_SHORT_FLOW else test_flow_labels + test_packet_lables
    acc = test_lof(test_data, test_labels, save_path, test_data_aug=USE_DATA_AUG)
    print(f"accuracy of {test_benign_filename}: {acc}")
    accuracy_dict[test_benign_filename] = acc

for test_attack_filename in attack_filenames:
    test_df = pd.read_csv(test_attack_filename)
    test_df["label"] = -1
    test_packet_data, test_packet_lables, test_flow_data, test_flow_labels \
        = transform(get_flows(test_df), data_type="test", test_data_aug=USE_DATA_AUG)
    test_data = test_flow_data if not USE_SHORT_FLOW else test_flow_data + test_packet_data
    test_labels = test_flow_labels if not USE_SHORT_FLOW else test_flow_labels + test_packet_lables
    if len(test_data) > 0:
        acc = test_lof(test_data, test_labels, save_path, test_data_aug=USE_DATA_AUG)
    else:
        acc = np.nan
    print(f"accuracy of {test_attack_filename}: {acc}")
    accuracy_dict[test_attack_filename] = acc

accuracy_base_name = "flow-accuracy.json" if not USE_SHORT_FLOW else "all-accuracy.json"
accuracy_save_path = os.path.join("result", "whisper", detect_type, os.path.basename(train_benign_filename), accuracy_base_name)
os.makedirs(os.path.dirname(accuracy_save_path), exist_ok=True)
with open(accuracy_save_path, "w") as f:
    json.dump(accuracy_dict, f, indent=4)

accuracy of train_set/benign1.csv: 0.18318018377946466
accuracy of train_set/benign2.csv: 0.23341139734582358
accuracy of dataset_lite/BruteForce-Web.csv: 1.0
accuracy of dataset_lite/BruteForce-XSS.csv: 1.0
accuracy of dataset_lite/infiltration.csv: 1.0
accuracy of dataset_lite/osscan.csv: 0.9965786901270772
accuracy of dataset_lite/SQL_Injection.csv: 1.0
accuracy of dataset_lite/ssldosA10only.csv: 0.30434782608695654
accuracy of dataset_lite/mirai-attack.csv: 0.880629908639205


# Train and test SVM

In [72]:
from sklearn.svm import OneClassSVM
import skops.io as sio
from sklearn.metrics import accuracy_score

def train_svm(data, save_path):
    clf = OneClassSVM(kernel="rbf", nu=0.1)
    clf.fit(data)
    if not os.path.exists(os.path.dirname(save_path)):
        os.makedirs(os.path.dirname(save_path))
    sio.dump(clf, save_path)

def test_svm(data, labels, load_path, test_data_aug=False):
    clf = sio.load(load_path)
    if not test_data_aug:
        preds = clf.predict(data)
    else:
        preds = []
        for val in data:
            pred = clf.predict(val)
            preds.append(1 if sum(pred) > -1*len(pred) else -1)
    return accuracy_score(labels, preds)

In [76]:
import pandas as pd

USE_SHORT_FLOW = True

# train_benign_filename = benign_filenames[0]
# train_benign_filename = os.path.join("dataset_lite", "mirai-benign.csv")
train_benign_filename = "dataset/benign_small.csv"
train_df = pd.read_csv(train_benign_filename)
train_df["label"] = "unknown"
train_packet_data, train_packet_labels, train_flow_data, train_flow_labels = transform(get_flows(train_df))

base_name = "flow-svm.skops" if not USE_SHORT_FLOW else "all-svm.skops"
save_path = os.path.join("model", "whisper", "svm", os.path.basename(train_benign_filename), base_name)
train_data = train_flow_data if not USE_SHORT_FLOW else train_flow_data + train_packet_data
train_lof(train_data, save_path)

In [78]:
import numpy as np

USE_DATA_AUG = True
detect_type = "svm" if USE_DATA_AUG else "svm-no-aug"

accuracy_dict = {}

for test_benign_filename in benign_filenames[:2]:
    test_df = pd.read_csv(test_benign_filename)
    test_df["label"] = 1
    test_packet_data, test_packet_lables, test_flow_data, test_flow_labels \
        = transform(get_flows(test_df), data_type="test", test_data_aug=USE_DATA_AUG)
    test_data = test_flow_data if not USE_SHORT_FLOW else test_flow_data + test_packet_data
    test_labels = test_flow_labels if not USE_SHORT_FLOW else test_flow_labels + test_packet_lables
    acc = test_svm(test_data, test_labels, save_path, test_data_aug=USE_DATA_AUG)
    print(f"accuracy of {test_benign_filename}: {acc}")
    accuracy_dict[test_benign_filename] = acc

for test_attack_filename in attack_filenames:
    test_df = pd.read_csv(test_attack_filename)
    test_df["label"] = -1
    test_packet_data, test_packet_lables, test_flow_data, test_flow_labels \
        = transform(get_flows(test_df), data_type="test", test_data_aug=USE_DATA_AUG)
    test_data = test_flow_data if not USE_SHORT_FLOW else test_flow_data + test_packet_data
    test_labels = test_flow_labels if not USE_SHORT_FLOW else test_flow_labels + test_packet_lables
    if len(test_data) > 0:
        acc = test_svm(test_data, test_labels, save_path, test_data_aug=USE_DATA_AUG)
    else:
        acc = np.nan
    print(f"accuracy of {test_attack_filename}: {acc}")
    accuracy_dict[test_attack_filename] = acc

accuracy_base_name = "flow-accuracy.json" if not USE_SHORT_FLOW else "all-accuracy.json"
accuracy_save_path = os.path.join("result", "whisper", detect_type, os.path.basename(train_benign_filename), accuracy_base_name)
os.makedirs(os.path.dirname(accuracy_save_path), exist_ok=True)
with open(accuracy_save_path, "w") as f:
    json.dump(accuracy_dict, f, indent=4)

accuracy of train_set/benign1.csv: 0.18457850579304835
accuracy of train_set/benign2.csv: 0.23341139734582358
accuracy of dataset_lite/BruteForce-Web.csv: 1.0
accuracy of dataset_lite/BruteForce-XSS.csv: 1.0
accuracy of dataset_lite/infiltration.csv: 1.0
accuracy of dataset_lite/osscan.csv: 0.9965786901270772
accuracy of dataset_lite/SQL_Injection.csv: 1.0
accuracy of dataset_lite/ssldosA10only.csv: 0.30434782608695654
accuracy of dataset_lite/mirai-attack.csv: 0.8807501202115724


# Train and test ensemble detectors

In [None]:
from sklearn.cluster import KMeans
import os
import json
import torch

def train_kmeans(train_data, save_path, n_clusters):
    train_data = torch.tensor(train_data)
    kmeans = KMeans(n_clusters=n_clusters, random_state=0)
    kmeans.fit(train_data.cpu().numpy())

    centroids = torch.tensor(kmeans.cluster_centers_)
    train_loss = torch.cdist(train_data, centroids, p=2).min(dim=1).values.mean()

    if not os.path.exists(os.path.dirname(save_path)):
        os.makedirs(os.path.dirname(save_path))
    with open(save_path, "w") as f:
        json.dump({
            "centroids": centroids.tolist(),
            "train_loss": train_loss.item(),
        }, f)