# Detect by packet-level features

In [1]:
import pandas as pd
from utils import Packet, Flow

def get_flows(df: pd.DataFrame, key_type: str = "default") -> dict:
    """Group the packets in ``df`` into flows.

    Rows are visited one at a time in positional order to mimic packets
    arriving at a packet processor. Each row becomes a ``Packet``; packets
    that share the same ``Packet.key(type=key_type)`` are added to the same
    ``Flow``.

    Args:
        df: One packet per row. Must contain every column named in
            ``packet_fields`` below.
        key_type: Forwarded to ``Packet.key`` as its ``type`` argument.

    Returns:
        Dict mapping each flow key to its ``Flow``, in order of first
        appearance of the key.
    """
    # Columns copied verbatim from the row into the Packet constructor.
    packet_fields = (
        "src_ip",
        "dst_ip",
        "src_port",
        "dst_port",
        "protocol",
        "proto_code",
        "pkt_length",
        "timestamp",
        "ttl",
        "tcp_window",
        "tcp_dataoffset",
        "udp_length",
        "label",
    )
    flows = {}
    for position in range(len(df)):
        row = df.iloc[position]
        pkt = Packet(**{field: row[field] for field in packet_fields})
        flow_key = pkt.key(type=key_type)
        if flow_key not in flows:
            flows[flow_key] = Flow()
        flows[flow_key].add_packet(pkt)
    return flows

In [2]:
from config import whisper_config

def transform(mp: dict, all_flows: bool = False):
    """Flatten per-flow packet vectors into a single list.

    Args:
        mp: Dict of flow key -> flow object exposing
            ``packet_vector_simple()``.
        all_flows: When True, keep the vectors of every flow. When False,
            keep only flows whose vector list has at most
            ``whisper_config["n_fft"] // 2`` entries (the "short" flows).

    Returns:
        A list with the selected flows' packet vectors concatenated in dict
        iteration order.
    """
    collected = []
    for flow in mp.values():
        vectors = flow.packet_vector_simple()
        # Short-circuit: the length threshold is only consulted when
        # filtering down to short flows.
        if all_flows or len(vectors) <= (whisper_config["n_fft"] // 2):
            collected.extend(vectors)
    return collected

## Train & test with supervised learning

In [3]:
from sklearn.tree import DecisionTreeClassifier
import os
import json
import skops.io as sio

def train(data, labels, save_path):
    """Fit a decision tree on ``data``/``labels`` and save it with skops.

    Args:
        data: Feature matrix passed to ``DecisionTreeClassifier.fit``.
        labels: Target values aligned with ``data``.
        save_path: Destination file for the serialized model. Missing parent
            directories are created.
    """
    clf = DecisionTreeClassifier(random_state=0)
    clf.fit(data, labels)
    parent_dir = os.path.dirname(save_path)
    # A bare filename has an empty dirname and os.makedirs("") raises, so only
    # create the directory when there is one. exist_ok avoids the
    # check-then-create race of a separate os.path.exists test.
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    sio.dump(clf, save_path)

from sklearn.metrics import accuracy_score

def test(data, labels, load_path):
    """Load the model saved at ``load_path`` and score it on ``data``.

    Args:
        data: Feature matrix passed to the model's ``predict``.
        labels: Ground-truth labels aligned with ``data``.
        load_path: Path of a model file written with ``skops.io.dump``.

    Returns:
        The ``accuracy_score`` of the predictions against ``labels``.
    """
    # NOTE(review): trusted=True is acceptable only because the file is one
    # this notebook wrote itself; do not point this at untrusted model files.
    model = sio.load(load_path, trusted=True)
    return accuracy_score(labels, model.predict(data))

## Mix benign and attack traffic

In [4]:
import numpy as np

def get_train_test_df(benign_path: str, attack_path: str):
    """Load benign and attack CSVs, label them, and split them by flow.

    Benign rows get ``label`` 0 and attack rows get ``label`` 1. Rows are
    grouped by the 5-tuple (src_ip, dst_ip, src_port, dst_port, protocol) so
    that all packets of a flow land on the same side of the split. The split
    is deterministic: the group number modulo 10 selects buckets 0-5 for
    training and 6-9 for testing.

    Args:
        benign_path: CSV file of benign packets.
        attack_path: CSV file of attack packets.

    Returns:
        ``(df_train, df_test)``; both carry the added ``label`` column and a
        ``group`` column holding "train" or "test".
    """
    benign = pd.read_csv(benign_path)
    benign["label"] = 0
    attack = pd.read_csv(attack_path)
    attack["label"] = 1
    df_mix = pd.concat([benign, attack], ignore_index=True, axis=0)

    five_tuple = ["src_ip", "dst_ip", "src_port", "dst_port", "protocol"]
    bucket = df_mix.groupby(five_tuple).ngroup() % 10
    in_train = bucket < 6
    df_mix["group"] = in_train.map({True: "train", False: "test"})
    return df_mix[in_train], df_mix[~in_train]

In [5]:
def get_data_labels(df: pd.DataFrame, all_flows: bool = False):
    """Turn a packet DataFrame into a feature matrix and a label vector.

    The packets are grouped into flows with ``get_flows`` and flattened with
    ``transform``; the resulting rows are split so that the last column is
    returned as the labels and all preceding columns as the features.

    Args:
        df: One packet per row, as expected by ``get_flows``.
        all_flows: Forwarded to ``transform``.

    Returns:
        ``(features, labels)`` as NumPy arrays, or ``(None, None)`` when no
        packet vectors were produced.
    """
    vectors = transform(get_flows(df), all_flows=all_flows)
    matrix = np.array(vectors)
    if matrix.shape[0] == 0:
        return None, None
    features = matrix[:, :-1]
    labels = matrix[:, -1]
    return features, labels

In [6]:
import os

# Benign captures: train_set/benign1.csv ... train_set/benign5.csv.
benign_filenames = [os.path.join("train_set", f"benign{i}.csv") for i in range(1, 6)]
# Attack captures: every .csv directly inside dataset_lite/, in os.listdir order.
attack_filenames = [
    os.path.join("dataset_lite", name)
    for name in os.listdir("dataset_lite")
    if name.endswith(".csv")
]

In [15]:
# Sanity check: show the first attack and the first benign capture path.
print(attack_filenames[0], benign_filenames[0], sep="\n")

dataset_lite/mirai-attack.csv
train_set/benign1.csv


## Train a new model for each dataset

In [7]:
# Train and evaluate one decision tree per attack capture, each mixed with the
# first benign capture, then store the per-capture accuracies as JSON.
acc_dict = dict()
all_flows = True

benign_filename = benign_filenames[0]
for attack_filename in attack_filenames:
    df_train, df_test = get_train_test_df(benign_filename, attack_filename)
    print(f"train df size: {len(df_train)}, test df size: {len(df_test)}")
    train_data, train_labels = get_data_labels(df_train, all_flows=all_flows)
    if train_data is None:
        continue
    # attack_filename keeps its directory part, so the model is written to a
    # matching sub-directory under model/dt/ (train() creates it).
    save_path = os.path.join("model", "dt", attack_filename + ".skops")
    train(train_data, train_labels, save_path)
    test_data, test_labels = get_data_labels(df_test, all_flows=all_flows)
    if test_data is None:
        # get_data_labels returns (None, None) when the split yields no
        # packet vectors; there is nothing to score, so skip this capture
        # instead of crashing inside test().
        print(f"no test data for {attack_filename}, skipping evaluation")
        continue
    acc = test(test_data, test_labels, save_path)
    print(f"accuracy of {attack_filename}: {acc}")
    acc_dict[attack_filename] = acc

import json
import os

suffix = "all_flows" if all_flows else "short_flows"
save_path = os.path.join("result", "dt", f"{suffix}.json")
# exist_ok avoids the check-then-create race of a separate exists() test.
os.makedirs(os.path.dirname(save_path), exist_ok=True)
with open(save_path, "w") as f:
    json.dump(acc_dict, f)

train df size: 118482, test df size: 74738
accuracy of dataset_lite/osscan.csv: 0.999812678958495
train df size: 112649, test df size: 128522
accuracy of dataset_lite/ssldosA10only.csv: 0.9961407385505984
train df size: 116958, test df size: 77683
accuracy of dataset_lite/infiltration.csv: 0.9996524336083827
train df size: 121224, test df size: 87911
accuracy of dataset_lite/BruteForce-Web.csv: 1.0
train df size: 110373, test df size: 80688
accuracy of dataset_lite/SQL_Injection.csv: 1.0
train df size: 190640, test df size: 197944
accuracy of dataset_lite/mirai.csv: 0.9997474033059855
train df size: 117974, test df size: 84608
accuracy of dataset_lite/BruteForce-XSS.csv: 0.9999054462934948
train df size: 190640, test df size: 197944
accuracy of mirai.csv: 0.9997474033059855


## Train only one model for all datasets

In [7]:
def get_mix_df(benign_path: str = None, attack_path: str = None):
    """Load a benign and/or an attack CSV into one labelled DataFrame.

    Args:
        benign_path: Optional CSV of benign packets; its rows get ``label`` 0.
        attack_path: Optional CSV of attack packets; its rows get ``label`` 1.

    Returns:
        The rows of the supplied files stacked benign-first, with a fresh
        0..n-1 index.

    Raises:
        AssertionError: If both paths are None.
    """
    assert benign_path is not None or attack_path is not None, \
        "benign_path and attack_path cannot be None at the same time"
    frames = []
    for path, label in ((benign_path, 0), (attack_path, 1)):
        if path is None:
            continue
        frame = pd.read_csv(path)
        frame["label"] = label
        frames.append(frame)
    return pd.concat(frames, ignore_index=True, axis=0)

In [12]:
# Train a single decision tree on the first benign capture mixed with the
# first attack capture; the cells below reuse `save_path` to evaluate it.
all_flows = True

train_benign_filename = benign_filenames[0]
# Alternative benign source: "dataset_lite/mirai-benign.csv"
train_attack_filename = attack_filenames[0]

df_train = get_mix_df(
    benign_path=train_benign_filename,
    attack_path=train_attack_filename,
)
train_data, train_labels = get_data_labels(df_train, all_flows=all_flows)
if train_data is None:
    print("no data")
    exit(0)
# Model file name: "<benign file>_<attack file>.skops" under model/dt/.
save_path = os.path.join(
    "model",
    "dt",
    "_".join(
        [
            os.path.basename(train_benign_filename),
            os.path.basename(train_attack_filename),
        ]
    )
    + ".skops",
)
train(train_data, train_labels, save_path)

In [13]:
def _report_accuracy(df_test, filename):
    """Score the model at ``save_path`` on ``df_test`` and print the result.

    Skips the capture (with a message) when ``get_data_labels`` yields no
    packet vectors, since ``test()`` cannot score ``None`` data.
    """
    test_data, test_labels = get_data_labels(df_test, all_flows=all_flows)
    if test_data is None:
        print(f"no data in {filename}, skipping evaluation")
        return
    acc = test(test_data, test_labels, save_path)
    print(f"accuracy of {filename}: {acc}")


# Each test set below is built from a single file, so it contains only one
# label: accuracy is the fraction of packets classified as that label.
for test_benign_filename in benign_filenames[1:2]:
    _report_accuracy(get_mix_df(benign_path=test_benign_filename), test_benign_filename)

for test_attack_filename in attack_filenames:
    _report_accuracy(get_mix_df(attack_path=test_attack_filename), test_attack_filename)

accuracy of train_set/benign2.csv: 1.0
accuracy of dataset_lite/mirai-attack.csv: 1.0
accuracy of dataset_lite/osscan.csv: 0.553701326486949
accuracy of dataset_lite/ssldosA10only.csv: 1.0
accuracy of dataset_lite/infiltration.csv: 0.6415646620542842
accuracy of dataset_lite/BruteForce-Web.csv: 0.7888998465921543
accuracy of dataset_lite/SQL_Injection.csv: 0.6123595505617978
accuracy of dataset_lite/mirai.csv: 1.0
accuracy of dataset_lite/mirai-benign.csv: 1.0
accuracy of dataset_lite/BruteForce-XSS.csv: 0.6674074707239935
