## compare time intervals of different attack types

In [1]:
import os
import pandas as pd
import numpy as np

benign_filenames = [os.path.join("train_set", "benign" + str(i) + ".csv") 
                    for i in range(1, 3)]
attack_filenames = [os.path.join("attack_set", x) for x in 
                    os.listdir("attack_set") if x.endswith(".csv")]
for filename in attack_filenames + benign_filenames:
    df = pd.read_csv(filename)
    df_group = df.groupby(["src_ip", "dst_ip", "src_port", "dst_port", "protocol"])
    total_time_interval = []
    for name, group in df_group:
        time_interval = [0] + list(np.diff(group["timestamp"].values))
        total_time_interval += time_interval
    print(f"{filename} time interval mean: {np.mean(total_time_interval)}")
    print(f"{filename} time interval std: {np.std(total_time_interval)}")
    print(f"{filename} time interval max: {np.max(total_time_interval)}")
    print(f"{filename} time interval min: {np.min(total_time_interval)}")

attack_set/LDoS_small.csv time interval mean: 0.3006698918581009
attack_set/LDoS_small.csv time interval std: 0.11745484057654854
attack_set/LDoS_small.csv time interval max: 0.37209105491638184
attack_set/LDoS_small.csv time interval min: 0.0
attack_set/osscan.csv time interval mean: 0.11535976605431844
attack_set/osscan.csv time interval std: 0.5495557577423145
attack_set/osscan.csv time interval max: 3.77128005027771
attack_set/osscan.csv time interval min: 0.0
attack_set/infiltration.csv time interval mean: 10.637729452860489
attack_set/infiltration.csv time interval std: 22.320994515466953
attack_set/infiltration.csv time interval max: 84.10361289978027
attack_set/infiltration.csv time interval min: 0.0
attack_set/HOIC_small.csv time interval mean: 0.0064822229959964756
attack_set/HOIC_small.csv time interval std: 0.008688119431468546
attack_set/HOIC_small.csv time interval max: 0.051918983459472656
attack_set/HOIC_small.csv time interval min: 0.0
attack_set/BruteForce-Web.csv tim

# Novelty detection with One Class SVM

In [2]:
import pandas as pd
from utils import Packet, Flow

def get_flows(df: pd.DataFrame, key_type: str = "default") -> dict:
    mp = dict()
    for idx in range(len(df)): # simulate the process of packet processing
        row = df.iloc[idx]
        pkt = Packet(
            src_ip=row["src_ip"],
            dst_ip=row["dst_ip"],
            src_port=row["src_port"],
            dst_port=row["dst_port"],
            protocol=row["protocol"],
            proto_code=row["proto_code"],
            pkt_length=row["pkt_length"],
            timestamp=row["timestamp"],
            ttl=row["ttl"],
            tcp_window=row["tcp_window"],
            tcp_dataoffset=row["tcp_dataoffset"],
            udp_length=row["udp_length"],
        )
        key = pkt.key(type=key_type)
        if key not in mp:
            mp[key] = Flow()
        mp[key].add_packet(pkt)
    return mp

In [3]:
from config import whisper_config

def transform(mp: dict, all_flows: bool = False):
    packet_data = []
    for key, flow in mp.items():
        data = flow.packet_vector()
        if all_flows: # short & long flow features
            packet_data.append(data)
        elif len(data) <= (whisper_config["n_fft"] // 2): # short flows
            packet_data.append(data)
    return packet_data

## Load data

In [5]:
import os

all_flows = False
train_benign_filename = os.path.join("dataset", "benign_small.csv")
# train_benign_filename = os.path.join("train_set", "benign1.csv")

# df_benign = pd.read_csv(os.path.join("train_set", "benign1.csv"))
df_benign = pd.read_csv(train_benign_filename)
train_flow_dict = get_flows(df_benign)
train_packet_data = transform(train_flow_dict, all_flows=all_flows)

In [6]:
from sklearn.svm import OneClassSVM
from sklearn.neighbors import LocalOutlierFactor

# clf = OneClassSVM(kernel="rbf", nu=0.1)
clf = LocalOutlierFactor(novelty=True)
clf.fit(train_packet_data)

In [15]:
from sklearn.metrics import accuracy_score
import json
import os
import torch
from sklearn.cluster import KMeans

In [9]:

accuracy_dict = {}

for filename in benign_filenames:
    df_benign = pd.read_csv(filename)
    train_flow_dict = get_flows(df_benign)
    train_packet_data = transform(train_flow_dict, all_flows=all_flows)
    y_pred = clf.predict(train_packet_data)
    y_true = [1] * len(train_packet_data)
    acc = accuracy_score(y_true, y_pred)
    print("Accuracy of {}: {:.2f}%".format(filename, acc * 100))
    accuracy_dict[filename] = acc

for filename in attack_filenames:
    df_attack = pd.read_csv(filename)
    attack_flow_dict = get_flows(df_attack)
    attack_packet_data = transform(attack_flow_dict, all_flows=all_flows)
    y_pred = clf.predict(attack_packet_data)
    y_true = [-1] * len(attack_packet_data)
    acc = accuracy_score(y_true, y_pred)
    print("Accuracy of {}: {:.2f}%".format(filename, acc * 100))
    accuracy_dict[filename] = acc

accuracy_base_name = "short-accuracy.json" if not all_flows else "all-accuracy.json"
accuracy_save_path = os.path.join("result", "packet", "lof", os.path.basename(train_benign_filename), accuracy_base_name)
os.makedirs(os.path.dirname(accuracy_save_path), exist_ok=True)
with open(accuracy_save_path, "w") as f:
    json.dump(accuracy_dict, f)

Accuracy of train_set/benign1.csv: 36.16%
Accuracy of train_set/benign2.csv: 35.93%
Accuracy of attack_set/LDoS_small.csv: 100.00%
Accuracy of attack_set/osscan.csv: 99.22%
Accuracy of attack_set/infiltration.csv: 100.00%


KeyboardInterrupt: 

# Detect with kmeans

In [16]:
def train_kmeans(train_data, save_path, n_clusters):
    train_data = torch.tensor(train_data)
    kmeans = KMeans(n_clusters=n_clusters, random_state=0)
    kmeans.fit(train_data.cpu().numpy())

    centroids = torch.tensor(kmeans.cluster_centers_)
    train_loss = torch.cdist(train_data, centroids, p=2).min(dim=1).values.mean()

    if not os.path.exists(os.path.dirname(save_path)):
        os.makedirs(os.path.dirname(save_path))
    with open(save_path, "w") as f:
        json.dump({
            "centroids": centroids.tolist(),
            "train_loss": train_loss.item(),
        }, f)

In [23]:
def test_kmeans(test_data, test_labels, load_path, scale=5):
    with open(load_path, "r") as f:
        model_param = json.load(f)
    centroids = torch.tensor(model_param["centroids"])
    train_loss = model_param["train_loss"]
    
    pred = []
    for val in test_data:
        val = torch.tensor(val)
        dist = torch.norm(val - centroids, dim=1).min().item()
        pred.append(-1 if dist > scale * train_loss else 1)
    return accuracy_score(test_labels, pred)

In [19]:
kmeans_save_path = os.path.join("model", "packet", "kmeans", 
                    os.path.basename(train_benign_filename), "model.json")
train_kmeans(train_packet_data, kmeans_save_path, 10)



In [25]:

accuracy_dict = {}

for filename in benign_filenames:
    df_benign = pd.read_csv(filename)
    train_flow_dict = get_flows(df_benign)
    train_packet_data = transform(train_flow_dict, all_flows=all_flows)
    y_true = [1] * len(train_packet_data)
    acc = test_kmeans(train_packet_data, y_true, kmeans_save_path, 5)
    print("Accuracy of {}: {:.2f}%".format(filename, acc * 100))
    accuracy_dict[filename] = acc

for filename in attack_filenames:
    df_attack = pd.read_csv(filename)
    attack_flow_dict = get_flows(df_attack)
    attack_packet_data = transform(attack_flow_dict, all_flows=all_flows)
    y_true = [-1] * len(attack_packet_data)
    acc = test_kmeans(attack_packet_data, y_true, kmeans_save_path, 5)
    print("Accuracy of {}: {:.2f}%".format(filename, acc * 100))
    accuracy_dict[filename] = acc

accuracy_base_name = "short-accuracy.json" if not all_flows else "all-accuracy.json"
accuracy_save_path = os.path.join("result", "packet", "kmeans", 
                    os.path.basename(train_benign_filename), accuracy_base_name)
os.makedirs(os.path.dirname(accuracy_save_path), exist_ok=True)
with open(accuracy_save_path, "w") as f:
    json.dump(accuracy_dict, f)

Accuracy of train_set/benign1.csv: 99.30%
Accuracy of train_set/benign2.csv: 99.32%
Accuracy of attack_set/LDoS_small.csv: 0.00%
Accuracy of attack_set/osscan.csv: 0.00%
Accuracy of attack_set/infiltration.csv: 33.33%
Accuracy of attack_set/HOIC_small.csv: 0.00%
Accuracy of attack_set/BruteForce-Web.csv: 25.18%
Accuracy of attack_set/LOIC_UDP_small.csv: 0.00%
Accuracy of attack_set/SQL_Injection.csv: 2.94%
Accuracy of attack_set/ssldosA.csv: 3.33%
Accuracy of attack_set/fuzzscan.csv: 0.00%
Accuracy of attack_set/BruteForce-XSS.csv: 46.51%
