## Compare per-flow time intervals of different attack types and benign traffic

In [8]:
import os
import pandas as pd
import numpy as np

# Every CSV capture found in dataset_lite (in os.listdir order) ...
attack_filenames = [
    os.path.join("dataset_lite", entry)
    for entry in os.listdir("dataset_lite")
    if entry.endswith(".csv")
]
# ... followed by the five benign training captures benign1..benign5.
benign_filenames = [os.path.join("train_set", f"benign{idx}.csv") for idx in range(1, 6)]

for filename in attack_filenames + benign_filenames:
    df = pd.read_csv(filename)
    # A flow is identified by its 5-tuple.
    df_group = df.groupby(["src_ip", "dst_ip", "src_port", "dst_port", "protocol"])

    # Pool the intervals of all flows of this capture into one list.
    total_time_interval = []
    for _, group in df_group:
        # The first packet of each flow contributes an interval of 0, so these
        # zeros are part of the mean/std/min reported below.
        total_time_interval.append(0)
        # Differences between consecutive timestamps, in the row order of the group.
        total_time_interval.extend(np.diff(group["timestamp"].values))

    print(f"{filename} time interval mean: {np.mean(total_time_interval)}")
    print(f"{filename} time interval std: {np.std(total_time_interval)}")
    print(f"{filename} time interval max: {np.max(total_time_interval)}")
    print(f"{filename} time interval min: {np.min(total_time_interval)}")

dataset_lite/osscan.csv time interval mean: 0.11535976605431844
dataset_lite/osscan.csv time interval std: 0.5495557577423145
dataset_lite/osscan.csv time interval max: 3.77128005027771
dataset_lite/osscan.csv time interval min: 0.0
dataset_lite/ssldosA10only.csv time interval mean: 0.026381990544893846
dataset_lite/ssldosA10only.csv time interval std: 1.8928329878415313
dataset_lite/ssldosA10only.csv time interval max: 173.13449597358704
dataset_lite/ssldosA10only.csv time interval min: 0.0
dataset_lite/infiltration.csv time interval mean: 10.637729452860489
dataset_lite/infiltration.csv time interval std: 22.320994515466953
dataset_lite/infiltration.csv time interval max: 84.10361289978027
dataset_lite/infiltration.csv time interval min: 0.0
dataset_lite/BruteForce-Web.csv time interval mean: 0.4611680051644539
dataset_lite/BruteForce-Web.csv time interval std: 0.6562899844458695
dataset_lite/BruteForce-Web.csv time interval max: 5.889467000961304
dataset_lite/BruteForce-Web.csv time

# Novelty detection with One-Class SVM / Local Outlier Factor

In [1]:
import pandas as pd
from utils import Packet, Flow

def get_flows(df: pd.DataFrame, key_type: str = "default") -> dict:
    """Group the packets of a capture into flows.

    Rows are visited one at a time, in positional order, to simulate online
    packet processing. Each row is turned into a ``Packet`` and added to the
    ``Flow`` stored under ``pkt.key(type=key_type)``; a new ``Flow`` is created
    the first time a key is seen.

    Args:
        df: Packet table; must provide every column listed in
            ``packet_fields`` below.
        key_type: Forwarded to ``Packet.key`` as its ``type`` argument.

    Returns:
        dict mapping each flow key to its ``Flow``, in first-seen order.
    """
    # Columns copied verbatim from a row into the Packet constructor.
    packet_fields = (
        "src_ip",
        "dst_ip",
        "src_port",
        "dst_port",
        "protocol",
        "proto_code",
        "pkt_length",
        "timestamp",
        "ttl",
        "tcp_window",
        "tcp_dataoffset",
        "udp_length",
    )

    flows = {}
    for position in range(len(df)):
        row = df.iloc[position]
        pkt = Packet(**{field: row[field] for field in packet_fields})
        flow_key = pkt.key(type=key_type)
        try:
            flow = flows[flow_key]
        except KeyError:
            # First packet of this flow: start a new Flow for it.
            flow = flows[flow_key] = Flow()
        flow.add_packet(pkt)
    return flows

In [2]:
from config import whisper_config

def transform(mp: dict, all_flows: bool = False):
    """Collect the packet vectors of the flows in ``mp``.

    Args:
        mp: Mapping of flow key to flow object; only the values are used and
            each must provide ``packet_vector()``.
        all_flows: When True, the vector of every flow is kept. When False
            (default), only short flows are kept, i.e. those whose vector
            length is at most ``whisper_config["n_fft"] // 2``.

    Returns:
        list of the selected packet vectors, in the iteration order of ``mp``.
    """
    return [
        vector
        for vector in (flow.packet_vector() for flow in mp.values())
        # whisper_config is consulted only when the short-flow filter is active.
        if all_flows or len(vector) <= (whisper_config["n_fft"] // 2)
    ]

## Load data

In [18]:
import os

# Notebook-wide switch forwarded to transform(): False keeps only short flows,
# True keeps every flow. The evaluation cell also uses it to choose between
# "short-accuracy.json" and "all-accuracy.json" when saving results.
all_flows = False
# Benign capture the detector is fitted on; its base name is reused as the
# result directory name by the evaluation cell.
train_benign_filename = os.path.join("dataset_lite", "mirai-benign.csv")
# Alternative training capture:
# train_benign_filename = os.path.join("train_set", "benign1.csv")

# Alternative: df_benign = pd.read_csv(os.path.join("train_set", "benign1.csv"))
df_benign = pd.read_csv(train_benign_filename)
# Group the packets into flows, then turn the selected flows into packet vectors.
train_flow_dict = get_flows(df_benign)
train_packet_data = transform(train_flow_dict, all_flows=all_flows)

In [19]:
from sklearn.svm import OneClassSVM
from sklearn.neighbors import LocalOutlierFactor

# One-Class SVM alternative, kept for comparison:
# clf = OneClassSVM(kernel="rbf", nu=0.1)
# novelty=True puts LocalOutlierFactor in novelty-detection mode, which is what
# allows clf.predict() to be called on unseen captures in the evaluation cell.
clf = LocalOutlierFactor(novelty=True)
# Fit on the benign training vectors only.
clf.fit(train_packet_data)

In [20]:
from sklearn.metrics import accuracy_score

# Held-out benign captures (read from train_set/<name>.csv during evaluation).
benign_traffic_list = [
    "benign2",
    "benign3",
]
# Attack captures (read from dataset_lite/<name>.csv during evaluation).
attack_traffic_list = [
    "BruteForce-Web",
    "BruteForce-XSS",
    "mirai",
    "infiltration",
    "SQL_Injection",
    "osscan",
    "ssldosA10only",
    "mirai-attack",
]

In [22]:
import json
import os

# Accuracy per capture, keyed by capture name; written to JSON at the end.
accuracy_dict = {}

# Held-out benign captures: every selected flow is expected to be predicted +1.
# The intermediates deliberately use their own `eval_*` names. Reusing
# `df_benign`, `train_flow_dict` and `train_packet_data` here would overwrite the
# training data from the load-data cell, so re-running the fit cell after this
# one would silently train the detector on the last evaluation capture.
for benign_traffic in benign_traffic_list:
    df_eval_benign = pd.read_csv(os.path.join("train_set", benign_traffic + ".csv"))
    eval_benign_flow_dict = get_flows(df_eval_benign)
    eval_benign_packet_data = transform(eval_benign_flow_dict, all_flows=all_flows)
    y_pred = clf.predict(eval_benign_packet_data)
    y_true = [1] * len(eval_benign_packet_data)
    acc = accuracy_score(y_true, y_pred)
    print("Accuracy of {}: {:.2f}%".format(benign_traffic, acc * 100))
    accuracy_dict[benign_traffic] = acc

# Attack captures: every selected flow is expected to be predicted -1.
for attack_traffic in attack_traffic_list:
    df_attack = pd.read_csv(os.path.join("dataset_lite", attack_traffic + ".csv"))
    attack_flow_dict = get_flows(df_attack)
    attack_packet_data = transform(attack_flow_dict, all_flows=all_flows)
    y_pred = clf.predict(attack_packet_data)
    y_true = [-1] * len(attack_packet_data)
    acc = accuracy_score(y_true, y_pred)
    print("Accuracy of {}: {:.2f}%".format(attack_traffic, acc * 100))
    accuracy_dict[attack_traffic] = acc

# Results go to result/dt/<training file base name>/<short|all>-accuracy.json,
# so runs with a different training capture or flow selection do not overwrite
# each other.
accuracy_base_name = "short-accuracy.json" if not all_flows else "all-accuracy.json"
accuracy_save_path = os.path.join("result", "dt", os.path.basename(train_benign_filename), accuracy_base_name)
os.makedirs(os.path.dirname(accuracy_save_path), exist_ok=True)
with open(accuracy_save_path, "w") as f:
    json.dump(accuracy_dict, f)

Accuracy of benign2: 0.18%
Accuracy of benign3: 0.10%
Accuracy of BruteForce-Web: 100.00%
Accuracy of BruteForce-XSS: 100.00%
Accuracy of mirai: 3.54%
Accuracy of infiltration: 100.00%
Accuracy of SQL_Injection: 100.00%
Accuracy of osscan: 100.00%
Accuracy of ssldosA10only: 100.00%
Accuracy of mirai-attack: 1.31%
