Load training data

In [1]:
import pandas as pd
from utils import Packet, Flow

def get_flows(df: pd.DataFrame, key_type: str = "default") -> dict:
    mp = dict()
    for idx in range(len(df)): # simulate the process of packet processing
        row = df.iloc[idx]
        pkt = Packet(
            src_addr=row["src_addr"],
            dst_addr=row["dst_addr"],
            src_ip=row["src_ip"],
            dst_ip=row["dst_ip"],
            src_port=row["src_port"],
            dst_port=row["dst_port"],
            protocol=row["protocol"],
            proto_code=row["proto_code"],
            pkt_length=row["pkt_length"],
            timestamp=row["timestamp"],
            ttl=row["ttl"],
            tcp_window=row["tcp_window"],
            tcp_dataoffset=row["tcp_dataoffset"],
            udp_length=row["udp_length"],
        )
        key = pkt.key(type=key_type)
        if key not in mp:
            mp[key] = Flow()
        mp[key].add_packet(pkt)
    return mp

In [2]:
import torch
from config import whisper_config

def transform(mp: dict, feature_type: str = "whisper", data_type: str = "train"):
    train_packet_data, train_flow_data = [], []
    test_packet_data, test_flow_data = {}, {}
    for key, flow in mp.items():
        vec = flow.vector(feature_type=feature_type)
        if feature_type == "whisper":
            if len(vec) <= (whisper_config["n_fft"] // 2):
                # packet level features
                vec = flow.packet_vector(agg_type="mean") + flow.packet_vector(agg_type="std") \
                    + flow.packet_vector(agg_type="max") + flow.packet_vector(agg_type="min")
                if data_type == "train":
                    train_packet_data.append(vec)
                else: # for test
                    test_packet_data[key] = vec
            else:
                # flow level featrues
                ten = torch.tensor(vec)
                # stft requirement: input_size > (n_fft // 2)
                # default return shape: (floor(n_fft/2)+1, n_frame, 2)
                ten_fft = torch.stft(ten, whisper_config["n_fft"])
                ten_power = torch.pow(ten_fft[:,:,0], 2) + torch.pow(ten_fft[:,:,1], 2)
                ten_res = ((ten_power.squeeze()+1).log2()).permute(1,0)
                ten_res = torch.where(torch.isnan(ten_res), torch.zeros_like(ten_res), ten_res)
                ten_res = torch.where(torch.isinf(ten_res), torch.zeros_like(ten_res), ten_res)
                if data_type == "train":
                    if (ten_res.size(0) > whisper_config["mean_win_train"]):
                        for _ in range(whisper_config["num_train_sample"]):
                            start_idx = torch.randint(0, ten_res.size(0)
                                        - whisper_config["mean_win_train"], (1,)).item()
                            ten_tmp = ten_res[start_idx:start_idx+whisper_config["mean_win_train"],:].mean(dim=0)
                            train_flow_data.append(ten_tmp.tolist())
                    else:
                        train_flow_data.append(ten_res.mean(dim=0).tolist())
                else: # for test
                    tmp_data = []
                    if (ten_res.size(0) > whisper_config["mean_win_test"]):
                        for idx in range(0, ten_res.size(0) - whisper_config["mean_win_test"], 
                                        whisper_config["mean_win_test"]):
                            ten_tmp = ten_res[idx:idx+whisper_config["mean_win_test"],:].mean(dim=0)
                            tmp_data.append(ten_tmp.tolist())
                    else:
                        tmp_data.append(ten_res.mean(dim=0).tolist())
                    test_flow_data[key] = tmp_data
        else: # for other feature types
            pass
    if data_type == "train":
        return train_packet_data, train_flow_data
    else:
        return test_packet_data, test_flow_data

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
import torch

a = torch.tensor([[1,2,3], [4,5,6], [7,8,9], [3,2,6]]).float()
c = torch.tensor([[2,1,4], [7,8,5]]).float()
dist = torch.cdist(a, c, p=2)
print(dist)
print(dist.min(dim=1).values)
print(dist.min(dim=1).values.mean())

tensor([[1.7321, 8.7178],
        [4.8990, 4.3589],
        [9.9499, 4.0000],
        [2.4495, 7.2801]])
tensor([1.7321, 4.3589, 4.0000, 2.4495])
tensor(3.1351)


In [4]:
from sklearn.cluster import KMeans
import os
import json

def train(train_data, save_path, n_clusters):
    train_data = torch.tensor(train_data)
    kmeans = KMeans(n_clusters=n_clusters, random_state=0)
    kmeans.fit(train_data.cpu().numpy())

    centroids = torch.tensor(kmeans.cluster_centers_)
    train_loss = torch.cdist(train_data, centroids, p=2).min(dim=1).values.mean()

    if not os.path.exists(os.path.dirname(save_path)):
        os.makedirs(os.path.dirname(save_path))
    with open(save_path, "w") as f:
        json.dump({
            "centroids": centroids.tolist(),
            "train_loss": train_loss.item(),
        }, f)

start training

In [5]:
import os
import pandas as pd

train_file = os.path.join("train_set", "benign1.csv")
train_df = pd.read_csv(train_file)

In [6]:
# train_flow_dict = get_flows(train_df, key_type="whisper") # merge flow by source ip
train_flow_dict = get_flows(train_df) # merge flow by 5-tuple

KeyboardInterrupt: 

In [None]:
# train_data = transform(train_flow_dict, feature_type="whisper", data_type="train")
train_packet_data, train_flow_data = transform(train_flow_dict, feature_type="whisper", data_type="train")

  return _VF.stft(input, n_fft, hop_length, win_length, window,  # type: ignore[attr-defined]


In [None]:
save_path = os.path.join("model", "whisper", "train_packet.json")
train(train_packet_data, save_path, n_clusters=whisper_config["val_K"])

save_path = os.path.join("model", "whisper", "train_flow.json")
train(train_flow_data, save_path, n_clusters=whisper_config["val_K"])



start testing

In [None]:
import os
import json

def test(test_data, load_path, save_path):
    with open(load_path, "r") as f:
        centroids = json.load(f)["centroids"]
    centroids = torch.tensor(centroids)
    
    test_res = []
    for key, val in test_data.items():
        val = torch.tensor(val)
        if (val.size(0) > whisper_config["mean_win_test"]):
            max_dist = 0
            for idx in range(0, val.size(0) - whisper_config["mean_win_test"], 
                             whisper_config["mean_win_test"]):
                ten_tmp = val[idx:idx+whisper_config["mean_win_test"],:].mean(dim=0)
                dist = torch.norm(ten_tmp - centroids, dim=1).min()
                max_dist = max(max_dist, dist)
            min_dist = max_dist
        else:
            min_dist = torch.norm(val.mean(dim=0) - centroids, dim=1).min()
        test_res.append({"key": key, "loss": min_dist.item()})

    if not os.path.exists(os.path.dirname(save_path)):
        os.makedirs(os.path.dirname(save_path))
    with open(save_path, "w") as f:
        json.dump(test_res, f)

In [None]:
import os
import pandas as pd

# test_file = os.path.join("dataset_lite", "ssldosA10only.csv")
test_file = os.path.join("train_set", "benign2.csv")
test_df = pd.read_csv(test_file)

FileNotFoundError: [Errno 2] No such file or directory: 'train_set/benign_test.csv'

In [None]:
# test_flow_dict = get_flows(test_df, key_type="whisper")
test_flow_dict = get_flows(test_df)

In [None]:
test_packet_data, test_flow_data = transform(test_flow_dict, feature_type="whisper", data_type="test")

In [None]:
load_path = os.path.join("model", "whisper", "train_packet.json")
save_path = os.path.join("result", "whisper", test_file.split(os.sep)[-1].split(".")[0] + "-packet.json")
test(test_packet_data, load_path, save_path)

load_path = os.path.join("model", "whisper", "train_flow.json")
save_path = os.path.join("result", "whisper", test_file.split(os.sep)[-1].split(".")[0] + "-flow.json")
test(test_flow_data, load_path, save_path)

# Mix benign and attack traffic

In [None]:
import os
import pandas as pd

df_benign = pd.read_csv(os.path.join("train_set", "benign3.csv"))
df_benign["label"] = "BENIGN"
df_attack = pd.read_csv(os.path.join("dataset_lite", "ssldosA10only.csv"))
df_attack["label"] = "ATTACK"
df_mix = pd.concat([df_benign, df_attack], axis=0)
print(df_mix.head())

     src_addr    dst_addr          src_ip          dst_ip  src_port  dst_port  \
0  1902274041  2516971618  113.98.101.249    150.5.240.98   51631.0      80.0   
1  2648454693  3699660524   157.220.54.37  220.132.86.236    4500.0    4500.0   
2  1902274041  2516971618  113.98.101.249    150.5.240.98   51631.0      80.0   
3  1902274041  2516971618  113.98.101.249    150.5.240.98   51631.0      80.0   
4  1902274041  2516971618  113.98.101.249    150.5.240.98   51631.0      80.0   

   protocol  proto_code  pkt_length     timestamp  tos     id  ttl  chksum  \
0         6        1000          54  1.591765e+09    0  56155  122   51120   
1        17           3          42  1.591765e+09    0  23255   56    6996   
2         6        1000          54  1.591765e+09    0  56155  122   51120   
3         6        1000          54  1.591765e+09    0  56156  122   51119   
4         6        1000          54  1.591765e+09    0  56156  122   51119   

   flags  tcp_window  tcp_dataoffset  udp_le

In [None]:
df_group = df_mix.groupby(["src_ip", "src_port", "dst_ip", "dst_port", "protocol"])

TRAIN_RATIO = 0.8
df_mix["group"] = df_group.ngroup()
df_mix["group"] = df_mix["group"] % 10
df_mix["group"] = df_mix["group"].apply(lambda x: "train" if x < 8 else "test")

df_train = df_mix[df_mix["group"] == "train"]
df_test = df_mix[df_mix["group"] == "test"]

In [None]:
import pandas as pd
from utils import Packet, Flow

def get_flows(df: pd.DataFrame, key_type: str = "default") -> dict:
    mp = dict()
    for idx in range(len(df)): # simulate the process of packet processing
        row = df.iloc[idx]
        pkt = Packet(
            src_addr=row["src_addr"],
            dst_addr=row["dst_addr"],
            src_ip=row["src_ip"],
            dst_ip=row["dst_ip"],
            src_port=row["src_port"],
            dst_port=row["dst_port"],
            protocol=row["protocol"],
            proto_code=row["proto_code"],
            pkt_length=row["pkt_length"],
            timestamp=row["timestamp"],
            ttl=row["ttl"],
            tcp_window=row["tcp_window"],
            tcp_dataoffset=row["tcp_dataoffset"],
            udp_length=row["udp_length"],
            label=row["label"],
        )
        key = pkt.key(type=key_type)
        if key not in mp:
            mp[key] = Flow()
        mp[key].add_packet(pkt)
    return mp

In [None]:
import torch
from config import whisper_config

def transform(mp: dict, feature_type: str = "whisper", data_type: str = "train"):
    packet_data = []
    for key, flow in mp.items():
        vec = flow.vector(feature_type=feature_type)
        if feature_type == "whisper":
            if len(vec) <= (whisper_config["n_fft"] // 2):
                # packet level features
                data = flow.packet_vector_simple()
                packet_data += data
            else: # flow level features
                pass
        else: # for other feature types
            pass
    return packet_data

## Train

In [None]:
from sklearn.tree import DecisionTreeClassifier
import os
import json
import skops.io as sio

def train(data, labels, save_path):
    clf = DecisionTreeClassifier(random_state=0)
    clf.fit(data, labels)
    if not os.path.exists(os.path.dirname(save_path)):
        os.makedirs(os.path.dirname(save_path))
    sio.dump(clf, save_path)

In [None]:
train_flow_dict = get_flows(df_train)
train_packet_data = transform(train_flow_dict, feature_type="whisper")

In [None]:
import numpy as np
from sklearn.tree import DecisionTreeClassifier

save_path = os.path.join("model", "dt", "model.skops")
train_data = np.array(train_packet_data)[:,:-1]
train_labels = np.array(train_packet_data)[:,-1]
train(data, labels, save_path)

## Test

In [None]:
test_flow_dict = get_flows(df_test)
test_packet_data = transform(test_flow_dict, feature_type="whisper")

In [None]:
clf = sio.load(save_path, True)
test_data = np.array(test_packet_data)[:,:-1]
test_labels = np.array(test_packet_data)[:,-1]

pred = clf.predict(test_data)

In [None]:
from sklearn.metrics import accuracy_score

print(accuracy_score(test_labels, pred))

0.9994699178372648


In [None]:
from sklearn.metrics import accuracy_score

def test(data, labels, load_path):
    clf = sio.load(load_path, True)
    pred = clf.predict(data)
    return accuracy_score(labels, pred)

## Test short flows with more datasets

In [None]:
import os

benign_filenames = [os.path.join("train_set", "benign" + str(i) + ".csv") for i in range(4, 6)]
attack_filenames = [os.path.join("dataset_lite", x) for x in os.listdir("dataset_lite") if x.endswith(".csv")] + ["mirai.csv"]

In [None]:
import pandas as pd
import skops.io as sio

load_path = os.path.join("model", "dt", "model.skops")

for filename in benign_filenames:
    test_df = pd.read_csv(filename)
    test_df["label"] = "BENIGN"
    test_flow_dict = get_flows(test_df)
    test_packet_data = transform(test_flow_dict, feature_type="whisper")
    test_data = np.array(test_packet_data)[:,:-1]
    test_labels = np.array(test_packet_data)[:,-1]
    acc = test(test_data, test_labels, load_path)
    print(f"benign {filename} accuracy: {acc}")

benign train_set/benign4.csv accuracy: 0.9993577017400443
benign train_set/benign5.csv accuracy: 0.9995105836290223


In [None]:
for filename in attack_filenames:
    test_df = pd.read_csv(filename)
    test_df["label"] = "ATTACK"
    test_flow_dict = get_flows(test_df)
    test_packet_data = np.array(transform(test_flow_dict, feature_type="whisper"))
    if test_packet_data.shape[0] == 0:
        continue
    test_data = test_packet_data[:,:-1]
    test_labels = test_packet_data[:,-1]
    acc = test(test_data, test_labels, load_path)
    print(f"attack {filename} accuracy: {acc}")

attack dataset_lite/osscan.csv accuracy: 0.0928540864356012
attack dataset_lite/ssldosA10only.csv accuracy: 1.0
attack dataset_lite/BruteForce-Web.csv accuracy: 0.39919354838709675
attack dataset_lite/SQL_Injection.csv accuracy: 0.38202247191011235
attack dataset_lite/BruteForce-XSS.csv accuracy: 0.56
attack mirai.csv accuracy: 0.8087250657720737
