# Extract flow level features by FlowLens

In [1]:
import pandas as pd
from utils import Packet, Flow

def get_flows(df: pd.DataFrame, key_type: str = "default") -> dict:
    mp = dict()
    for idx in range(len(df)): # simulate the process of packet processing
        row = df.iloc[idx]
        pkt = Packet(
            src_ip=row["src_ip"],
            dst_ip=row["dst_ip"],
            src_port=row["src_port"],
            dst_port=row["dst_port"],
            protocol=row["protocol"],
            proto_code=row["proto_code"],
            pkt_length=row["pkt_length"],
            timestamp=row["timestamp"],
            ttl=row["ttl"],
            tcp_window=row["tcp_window"],
            tcp_dataoffset=row["tcp_dataoffset"],
            udp_length=row["udp_length"],
            label=row["label"],
        )
        key = pkt.key(type=key_type)
        if key not in mp:
            mp[key] = Flow()
        mp[key].add_packet(pkt)
    return mp

In [2]:

def transform(mp: dict, feature_type: str = "whisper"):
    flow_data, flow_labels = [], []
    for key, flow in mp.items():
        if feature_type == "whisper":
            pass
        elif feature_type == "flowlens":
            vec_size, vec_time, label = flow.vector(feature_type=feature_type)
            flow_data.append(vec_size+vec_time)
            flow_labels.append(label)
    return flow_data, flow_labels

# 1. Train & test with supervised learning

In [3]:
from sklearn.tree import DecisionTreeClassifier
import os
import json
import skops.io as sio

def train(data, labels, save_path):
    clf = DecisionTreeClassifier(random_state=0)
    clf.fit(data, labels)
    if not os.path.exists(os.path.dirname(save_path)):
        os.makedirs(os.path.dirname(save_path))
    sio.dump(clf, save_path)

from sklearn.metrics import accuracy_score

def test(data, labels, load_path):
    clf = sio.load(load_path, True)
    pred = clf.predict(data)
    return accuracy_score(labels, pred)

## Mix benign and attack traffic

In [4]:
import numpy as np

def get_train_test_df(benign_path: str, attack_path: str):
    df_benign = pd.read_csv(benign_path)
    df_benign["label"] = 0
    df_attack = pd.read_csv(attack_path)
    df_attack["label"] = 1
    df_mix = pd.concat([df_benign, df_attack], ignore_index=True, axis=0)
    df_group = df_mix.groupby(["src_ip", "dst_ip", "src_port", "dst_port", "protocol"])
    df_mix["group"] = df_group.ngroup()
    df_mix["group"] = df_mix["group"] % 10
    df_mix["group"] = df_mix["group"].apply(lambda x: "train" if x < 6 else "test")
    # df_mix["group"].apply(lambda x: "train" if np.random.rand() < train_test_ratio else "test")
    df_train = df_mix[df_mix["group"] == "train"]
    df_test = df_mix[df_mix["group"] == "test"]
    return df_train, df_test

In [7]:
import os

benign_filenames = [os.path.join("train_set", "benign" + str(i) + ".csv") for i in range(1, 6)]
attack_filenames = [os.path.join("dataset_lite", x) for x in os.listdir("dataset_lite") if x.endswith(".csv")]

## Train a new model for each dataset

In [6]:
acc_dict = dict()
all_flows = True

benign_filename = benign_filenames[0]
for attack_filename in attack_filenames:
    df_train, df_test = get_train_test_df(benign_filename, attack_filename)
    print(f"train df size: {len(df_train)}, test df size: {len(df_test)}")
    train_data, train_labels = transform(get_flows(df_train), feature_type="flowlens")
    if train_data is None:
        continue
    save_path = os.path.join("model", "flowlens", attack_filename + ".skops")
    train(train_data, train_labels, save_path)
    test_data, test_labels = transform(get_flows(df_test), feature_type="flowlens")
    acc = test(test_data, test_labels, save_path)
    print(f"accuracy of {attack_filename}: {acc}")
    acc_dict[attack_filename] = acc

import json, os
suffix = "all_flows" if all_flows else "short_flows"
save_path = os.path.join("result", "flowlens", f"{suffix}.json")
if not os.path.exists(os.path.dirname(save_path)):
    os.makedirs(os.path.dirname(save_path))
with open(os.path.join(save_path), "w") as f:
    json.dump(acc_dict, f)

train df size: 117179, test df size: 76630
accuracy of dataset_lite/osscan.csv: 0.9910931174089069
train df size: 178411, test df size: 63349
accuracy of dataset_lite/ssldosA10only.csv: 0.9993089149965446
train df size: 122020, test df size: 73210
accuracy of dataset_lite/infiltration.csv: 1.0
train df size: 133066, test df size: 76658
accuracy of dataset_lite/BruteForce-Web.csv: 1.0
train df size: 122227, test df size: 69423
accuracy of dataset_lite/SQL_Injection.csv: 0.9996550534667127
train df size: 234029, test df size: 155144
accuracy of dataset_lite/mirai.csv: 0.9962952408093474
train df size: 123785, test df size: 79386
accuracy of dataset_lite/BruteForce-XSS.csv: 0.999657651489216


## Train only one model for all datasets

In [3]:
def get_mix_df(benign_path: str = None, attack_path: str = None):
    assert benign_path is not None or attack_path is not None, \
        "benign_path and attack_path cannot be None at the same time"
    if benign_path is not None:
        df_benign = pd.read_csv(benign_path)
        df_benign["label"] = 0
    else:
        df_benign = None
    if attack_path is not None:
        df_attack = pd.read_csv(attack_path)
        df_attack["label"] = 1
    else:
        df_attack = None
    df_mix = pd.concat([df_benign, df_attack], ignore_index=True, axis=0)
    return df_mix

In [7]:
import os

# train_benign_filename = benign_filenames[0]
# train_attack_filename = attack_filenames[0]
train_benign_filename = "dataset_lite/mirai-benign.csv"
train_attack_filename = "dataset_lite/mirai-attack.csv"

df_train = get_mix_df(benign_path=train_benign_filename, attack_path=train_attack_filename)
train_data, train_labels = transform(get_flows(df_train), feature_type="flowlens")
save_path = os.path.join("model", "flowlens", 
            os.path.basename(train_benign_filename) + "_" + \
            os.path.basename(train_attack_filename) + ".skops")
train(train_data, train_labels, save_path)

In [10]:
for test_benign_filename in benign_filenames[1:]:
    df_test = get_mix_df(benign_path=test_benign_filename)
    test_data, test_labels = transform(get_flows(df_test), feature_type="flowlens")
    acc = test(test_data, test_labels, save_path)
    print(f"accuracy of {test_benign_filename}: {acc}")

for test_attack_filename in attack_filenames[1:]:
    df_test = get_mix_df(attack_path=test_attack_filename)
    test_data, test_labels = transform(get_flows(df_test), feature_type="flowlens")
    acc = test(test_data, test_labels, save_path)
    print(f"accuracy of {test_attack_filename}: {acc}")

accuracy of train_set/benign2.csv: 0.054839968774395
accuracy of train_set/benign3.csv: 0.04937845303867403
accuracy of train_set/benign4.csv: 0.05924520345772718
accuracy of train_set/benign5.csv: 0.057473684210526316
accuracy of dataset_lite/osscan.csv: 0.9912023460410557
accuracy of dataset_lite/ssldosA10only.csv: 0.6956521739130435
accuracy of dataset_lite/infiltration.csv: 0.6666666666666666
accuracy of dataset_lite/BruteForce-Web.csv: 1.0
accuracy of dataset_lite/SQL_Injection.csv: 1.0
accuracy of dataset_lite/mirai.csv: 0.9354547937721934
accuracy of dataset_lite/mirai-benign.csv: 0.9916708915767365
accuracy of dataset_lite/BruteForce-XSS.csv: 1.0


# 2. Train & test with unsupervised learning

## Train in a zero-positive way

In [4]:
from sklearn.cluster import KMeans
import os
import json
import torch

def train(train_data, save_path, n_clusters):
    train_data = torch.tensor(train_data).float()
    kmeans = KMeans(n_clusters=n_clusters, random_state=0)
    kmeans.fit(train_data.cpu().numpy())

    centroids = torch.tensor(kmeans.cluster_centers_)
    train_loss = torch.cdist(train_data, centroids, p=2).min(dim=1).values.mean()

    if not os.path.exists(os.path.dirname(save_path)):
        os.makedirs(os.path.dirname(save_path))
    with open(save_path, "w") as f:
        json.dump({
            "centroids": centroids.tolist(),
            "train_loss": train_loss.item(),
        }, f)

from sklearn.metrics import accuracy_score
def test(test_data, test_labels, load_path, scale=5):
    with open(load_path, "r") as f:
        model_param = json.load(f)
    centroids = torch.tensor(model_param["centroids"])
    train_loss = model_param["train_loss"]

    pred = []
    for vec in test_data:
        vec = torch.tensor(vec).float()
        dist = torch.cdist(vec.unsqueeze(0), centroids, p=2).min(dim=1).values
        pred.append(1 if dist > scale * train_loss else 0)
    return accuracy_score(test_labels, pred)


  from .autonotebook import tqdm as notebook_tqdm


In [5]:
import pandas as pd
from sklearn.cluster import KMeans

# train_benign_filename = benign_filenames[0]
train_benign_filename = os.path.join("dataset_lite", "mirai-benign.csv")
train_df = pd.read_csv(train_benign_filename)
train_df["label"] = "unknown"
train_data, _ = transform(get_flows(train_df), feature_type="flowlens")

from config import whisper_config
save_path = os.path.join("model", "flowlens", "kmeans.json")
train(train_data, save_path, whisper_config["val_K"])



In [8]:
accuracy_dict = {}

for test_benign_filename in benign_filenames[1:]:
    test_df = pd.read_csv(test_benign_filename)
    test_df["label"] = 0
    test_data, test_lables = transform(get_flows(test_df), feature_type="flowlens")
    acc = test(test_data, test_lables, save_path)
    print(f"accuracy of {test_benign_filename}: {acc}")
    accuracy_dict[test_benign_filename] = acc

for test_attack_filename in attack_filenames:
    test_df = pd.read_csv(test_attack_filename)
    test_df["label"] = 1
    test_data, test_lables = transform(get_flows(test_df), feature_type="flowlens")
    acc = test(test_data, test_lables, save_path)
    print(f"accuracy of {test_attack_filename}: {acc}")
    accuracy_dict[test_attack_filename] = acc

accuracy_base_name =  "all-accuracy.json"
accuracy_save_path = os.path.join("result", "flowlens", os.path.basename(train_benign_filename), accuracy_base_name)
os.makedirs(os.path.dirname(accuracy_save_path), exist_ok=True)
with open(accuracy_save_path, "w") as f:
    json.dump(accuracy_dict, f)

accuracy of train_set/benign2.csv: 0.7950819672131147
accuracy of train_set/benign3.csv: 0.805939226519337
accuracy of train_set/benign4.csv: 0.7853679106051022
accuracy of train_set/benign5.csv: 0.7991578947368421
accuracy of dataset_lite/mirai-attack.csv: 0.005329379708286584
accuracy of dataset_lite/osscan.csv: 0.005376344086021506
accuracy of dataset_lite/ssldosA10only.csv: 0.043478260869565216
accuracy of dataset_lite/infiltration.csv: 1.0
accuracy of dataset_lite/BruteForce-Web.csv: 0.5036496350364964
accuracy of dataset_lite/SQL_Injection.csv: 0.0
accuracy of dataset_lite/mirai.csv: 0.004561595192570336
accuracy of dataset_lite/mirai-benign.csv: 0.002462519012095314
accuracy of dataset_lite/BruteForce-XSS.csv: 0.9302325581395349


FileNotFoundError: [Errno 2] No such file or directory: 'result/flowlens/mirai-benign.csv/all-accuracy.json'