# Predict whether a flow is short or long based on the first packet

In [36]:
import pandas as pd

LENGTH_THRESHOLD = 26

def get_data_labels(df):
    df_group = df.groupby(["src_ip", "dst_ip", "src_port", "dst_port", "protocol"])
    total_data, total_labels = [], []
    for name, group in train_df_group:
        fp = group.iloc[0]
        # data = [fp["pkt_length"], fp["ttl"], fp["tcp_window"], fp["tcp_dataoffset"], 
        #         fp["udp_length"], fp["flags"], fp["tos"], fp["protocol"]]
        data = [fp["pkt_length"], fp["ttl"], fp["tcp_window"], fp["tcp_dataoffset"], 
                fp["udp_length"]]
        data = list(map(lambda x: 0 if pd.isnull(x) else x, data))
        label = "short" if len(group) < LENGTH_THRESHOLD else "long"
        total_data.append(data)
        total_labels.append(label)
    return total_data, total_labels

In [37]:
import os

train_filename = os.path.join("train_set", "benign1.csv")
train_df = pd.read_csv(train_filename)
train_data, train_labels = get_data_labels(train_df)

In [38]:
from sklearn.tree import DecisionTreeClassifier

clf = DecisionTreeClassifier()
clf.fit(train_data, train_labels)

In [39]:
from sklearn.metrics import accuracy_score

test_filenames = [os.path.join("dataset_lite", x) for x in os.listdir("dataset_lite") if x.endswith(".csv")] \
                + [os.path.join("train_set", f"benign{x}.csv") for x in range(2, 6)]
for test_filename in test_filenames:
    test_df = pd.read_csv(test_filename)
    test_data, test_labels = get_data_labels(test_df)
    pred_labels = clf.predict(test_data)
    print(f"{test_filename}: {accuracy_score(test_labels, pred_labels)}")

dataset_lite/osscan.csv: 0.9546302050963331
dataset_lite/ssldosA10only.csv: 0.9546302050963331
dataset_lite/infiltration.csv: 0.9546302050963331
dataset_lite/BruteForce-Web.csv: 0.9546302050963331
dataset_lite/SQL_Injection.csv: 0.9546302050963331
dataset_lite/mirai.csv: 0.9546302050963331
dataset_lite/BruteForce-XSS.csv: 0.9546302050963331
train_set/benign2.csv: 0.9546302050963331
train_set/benign3.csv: 0.9546302050963331
train_set/benign4.csv: 0.9546302050963331
train_set/benign5.csv: 0.9546302050963331
