# Implement AutoEncoder based on raw flow features

In [1]:
import pandas as pd
import os
import json
import torch
import torch.nn as nn
from utils import Packet, Flow
from config import whisper_config
from sklearn.metrics import accuracy_score

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def get_flows(df: pd.DataFrame, key_type: str = "default") -> dict:
    """Group the packets described by ``df`` into flows.

    Rows are consumed one at a time, in positional order, to simulate
    packet-by-packet processing. Each row is turned into a ``Packet``; the
    value of ``pkt.key(type=key_type)`` selects the ``Flow`` the packet is
    added to, and a new ``Flow`` is created the first time a key is seen.

    Args:
        df: Packet table; it must provide every column named in
            ``packet_fields`` below.
        key_type: Forwarded to ``Packet.key`` as its ``type`` argument.

    Returns:
        Dict mapping each flow key to the ``Flow`` holding its packets.
    """
    # Column names double as the Packet keyword-argument names.
    packet_fields = (
        "src_ip",
        "dst_ip",
        "src_port",
        "dst_port",
        "protocol",
        "proto_code",
        "pkt_length",
        "timestamp",
        "ttl",
        "tcp_window",
        "tcp_dataoffset",
        "udp_length",
        "label",
    )
    flows = {}
    for position in range(len(df)):
        row = df.iloc[position]
        pkt = Packet(**{name: row[name] for name in packet_fields})
        flow_key = pkt.key(type=key_type)
        if flow_key not in flows:
            flows[flow_key] = Flow()
        flows[flow_key].add_packet(pkt)
    return flows

In [3]:
# Fixed length of every feature vector handed to the model.
MAX_LEN = whisper_config["n_fft"] * 2


def transform(mp: dict, feature_type: str = "whisper",
              data_type: str = "train", test_data_aug: bool = True):
    """Turn a flow dictionary into fixed-length feature vectors and labels.

    Each flow's ``vector(feature_type=...)`` is cut to ``MAX_LEN`` entries
    when it is longer, or right-padded with zeros when it is shorter.

    Args:
        mp: Dict of flows as produced by ``get_flows``; only the values are used.
        feature_type: Forwarded to ``Flow.vector``.
        data_type: Accepted for the callers' sake; not read by this function.
        test_data_aug: Accepted for the callers' sake; not read by this function.

    Returns:
        ``(data, labels)``: two parallel lists, one vector and one
        ``flow.label`` per flow, in the dict's iteration order.
    """
    data, labels = [], []
    for flow in mp.values():
        features = flow.vector(feature_type=feature_type)
        missing = MAX_LEN - len(features)
        if missing > 0:
            # Too short: pad on the right with zeros.
            features = features + [0] * missing
        else:
            # Long enough: keep only the first MAX_LEN entries.
            features = features[:MAX_LEN]
        data.append(features)
        labels.append(flow.label)
    return data, labels

In [4]:
class AutoEncoder(nn.Module):
    """Autoencoder with one hidden layer and sigmoid activations.

    The encoder maps ``input_dim`` features to ``hidden_dim`` units and the
    decoder maps them back to ``input_dim``. Both stages are a ``Linear``
    layer followed by a ``Sigmoid``.
    """

    def __init__(self, input_dim, hidden_dim):
        super().__init__()
        # The encoder's Linear layer is created before the decoder's so that
        # parameter initialisation order is stable.
        encode_layers = [nn.Linear(input_dim, hidden_dim), nn.Sigmoid()]
        decode_layers = [nn.Linear(hidden_dim, input_dim), nn.Sigmoid()]
        self.encoder = nn.Sequential(*encode_layers)
        self.decoder = nn.Sequential(*decode_layers)

    def forward(self, x):
        """Encode ``x`` and return its reconstruction."""
        return self.decoder(self.encoder(x))

In [5]:
class Dataset(torch.utils.data.Dataset):
    """Pair each sample with its label for use with a ``DataLoader``."""

    def __init__(self, data, labels):
        super().__init__()
        self.data, self.labels = data, labels

    def __len__(self):
        """Return the number of samples (taken from ``data`` only)."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the ``(sample, label)`` pair stored at ``idx``."""
        sample = self.data[idx]
        label = self.labels[idx]
        return sample, label

## Training

In [9]:
# Benign capture used for training; model artifacts are stored in a directory
# named after this file.
train_benign_filename = "dataset/benign_small.csv"
save_dir = os.path.join(
    "model",
    "autoencoding",
    os.path.basename(train_benign_filename),
)

In [56]:
# Load the benign capture, tag every packet with label 1, then build the
# per-flow feature vectors used for training.
train_df = pd.read_csv(train_benign_filename)
train_df["label"] = 1
train_flows = get_flows(train_df)
train_data, train_labels = transform(train_flows)

In [57]:
def train_ae(train_data, train_labels, save_dir,
             model, criterion, optimizer, device,
             batch_size=32, num_epochs=200):
    """Train ``model`` to reconstruct its input and save the results.

    Every batch is squashed with a sigmoid before it is fed to the model, and
    the loss is ``criterion(model(batch), batch)``. The labels travel with the
    batches but are not used in the loss.

    Args:
        train_data: Training samples, wrapped in ``Dataset``.
        train_labels: Labels paired with ``train_data``.
        save_dir: Directory (created if missing) that receives ``model.pt``
            (the state dict) and ``train_loss.json`` (every per-batch loss).
        model: Network to train; moved to ``device`` and put in train mode.
        criterion: Reconstruction loss.
        optimizer: Optimizer already bound to ``model``'s parameters.
        device: Device the model and batches are moved to.
        batch_size: Mini-batch size for the shuffled ``DataLoader``.
        num_epochs: Number of passes over the data.
    """
    loader = torch.utils.data.DataLoader(
        Dataset(train_data, train_labels),
        batch_size=batch_size,
        shuffle=True,
    )
    model.to(device)
    model.train()
    loss_list = []  # one entry per batch, across all epochs
    for epoch in range(num_epochs):
        for batch, _ in loader:
            batch = torch.sigmoid(batch.to(device)).float()
            optimizer.zero_grad()
            reconstruction = model(batch)
            loss = criterion(reconstruction, batch)
            loss_list.append(loss.item())
            loss.backward()
            optimizer.step()
        # The value reported is the loss of the epoch's final batch.
        print(f"Epoch {epoch+1}/{num_epochs}, Loss: {loss.item():.4f}")

    os.makedirs(save_dir, exist_ok=True)
    torch.save(model.state_dict(), os.path.join(save_dir, "model.pt"))
    with open(os.path.join(save_dir, "train_loss.json"), "w") as f:
        json.dump(loss_list, f)

In [58]:
# Network geometry: one padded flow vector in, a hidden layer 75% of that size.
input_dim = MAX_LEN
hidden_dim = int(input_dim * 0.75)

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = AutoEncoder(input_dim=input_dim, hidden_dim=hidden_dim).to(device)

criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=1e-5)

train_ae(
    train_data=torch.tensor(train_data),
    train_labels=torch.tensor(train_labels),
    save_dir=save_dir,
    model=model,
    criterion=criterion,
    optimizer=optimizer,
    device=device,
    num_epochs=50,
)

Epoch 1/50, Loss: 0.0052
Epoch 2/50, Loss: 0.0006
Epoch 3/50, Loss: 0.0012
Epoch 4/50, Loss: 0.0012
Epoch 5/50, Loss: 0.0007
Epoch 6/50, Loss: 0.0027
Epoch 7/50, Loss: 0.0003
Epoch 8/50, Loss: 0.0033
Epoch 9/50, Loss: 0.0012
Epoch 10/50, Loss: 0.0014
Epoch 11/50, Loss: 0.0010
Epoch 12/50, Loss: 0.0014
Epoch 13/50, Loss: 0.0014
Epoch 14/50, Loss: 0.0003
Epoch 15/50, Loss: 0.0014
Epoch 16/50, Loss: 0.0008
Epoch 17/50, Loss: 0.0001
Epoch 18/50, Loss: 0.0010
Epoch 19/50, Loss: 0.0006
Epoch 20/50, Loss: 0.0000
Epoch 21/50, Loss: 0.0001
Epoch 22/50, Loss: 0.0023
Epoch 23/50, Loss: 0.0008
Epoch 24/50, Loss: 0.0015
Epoch 25/50, Loss: 0.0008
Epoch 26/50, Loss: 0.0011
Epoch 27/50, Loss: 0.0018
Epoch 28/50, Loss: 0.0005
Epoch 29/50, Loss: 0.0016
Epoch 30/50, Loss: 0.0010
Epoch 31/50, Loss: 0.0019
Epoch 32/50, Loss: 0.0024
Epoch 33/50, Loss: 0.0011
Epoch 34/50, Loss: 0.0001
Epoch 35/50, Loss: 0.0010
Epoch 36/50, Loss: 0.0020
Epoch 37/50, Loss: 0.0022
Epoch 38/50, Loss: 0.0008
Epoch 39/50, Loss: 0.

## Testing

In [15]:
def test_ae(test_data, test_labels, model, device, criterion,
            threshold, scale=5, test_data_aug=False):
    """Classify each sample by reconstruction loss and return the accuracy.

    A sample is passed through a sigmoid, reconstructed by ``model`` and
    scored with ``criterion``. It is predicted ``-1`` when the score exceeds
    ``threshold * scale`` and ``1`` otherwise.

    Args:
        test_data: Iterable of feature vectors, evaluated one at a time.
        test_labels: Ground-truth labels aligned with ``test_data``.
        model: Trained network; switched to eval mode.
        device: Device each sample is moved to.
        criterion: Reconstruction loss.
        threshold: Base loss threshold.
        scale: Multiplier applied to ``threshold``.
        test_data_aug: When true the score is ``loss.max()`` instead of the
            loss itself; for a criterion that returns a scalar the two agree.

    Returns:
        ``accuracy_score(test_labels, preds)``.
    """
    model.eval()
    cutoff = threshold * scale
    preds = []
    with torch.no_grad():
        for vec in test_data:
            sample = torch.sigmoid(torch.tensor(vec).to(device)).float()
            loss = criterion(model(sample), sample)
            score = loss.max().item() if test_data_aug else loss.item()
            preds.append(-1 if score > cutoff else 1)
    return accuracy_score(test_labels, preds)

In [16]:
# Evaluation inputs: benign1.csv and benign2.csv from train_set/, plus every
# CSV file found in attack_set/.
benign_filenames = [os.path.join("train_set", f"benign{i}.csv")
                    for i in range(1, 3)]
# os.listdir returns entries in arbitrary order; sorting keeps the evaluation
# order (and the key order of the accuracy dict built from it) reproducible.
attack_filenames = [os.path.join("attack_set", name)
                    for name in sorted(os.listdir("attack_set"))
                    if name.endswith(".csv")]

In [17]:
# --- Evaluation configuration ---
USE_DATA_AUG = True
USE_SHORT_FLOW = True
detect_type = "autoencoder" if USE_DATA_AUG else "autoencoder-no-aug"
accuracy_dict = {}

# Rebuild the network with the same geometry used for training and restore
# its weights from save_dir.
input_dim = MAX_LEN
hidden_dim = int(input_dim * 0.75)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = AutoEncoder(input_dim, hidden_dim)
# map_location remaps the stored tensors onto the device available here, so a
# checkpoint written on a CUDA machine still loads on a CPU-only one.
model.load_state_dict(
    torch.load(os.path.join(save_dir, "model.pt"), map_location=device))
criterion = nn.MSELoss()
model.to(device)

# The detection threshold is the mean of all per-batch training losses.
with open(os.path.join(save_dir, "train_loss.json"), "r") as f:
    loss_list = json.load(f)
threshold = torch.tensor(loss_list).mean().item()

scale = 10

# NOTE(review): USE_DATA_AUG is forwarded to transform() but not to test_ae(),
# so test_ae() runs with its default test_data_aug -- confirm this is intended.

# Benign captures: every flow is expected to be predicted as 1.
for test_benign_filename in benign_filenames:
    test_df = pd.read_csv(test_benign_filename)
    test_df["label"] = 1
    test_data, test_labels \
    = transform(get_flows(test_df), data_type="test", test_data_aug=USE_DATA_AUG)
    acc = test_ae(test_data, test_labels, model, device, criterion, threshold, scale=scale)
    print(f"accuracy of {test_benign_filename}: {acc}")
    accuracy_dict[test_benign_filename] = acc

# Attack captures: every flow is expected to be predicted as -1.
for test_attack_filename in attack_filenames:
    test_df = pd.read_csv(test_attack_filename)
    test_df["label"] = -1
    test_data, test_labels \
    = transform(get_flows(test_df), data_type="test", test_data_aug=USE_DATA_AUG)
    acc = test_ae(test_data, test_labels, model, device, criterion, threshold, scale=scale)
    print(f"accuracy of {test_attack_filename}: {acc}")
    accuracy_dict[test_attack_filename] = acc

# Persist the per-file accuracies; the file name encodes the scale used.
accuracy_base_name = "flow-accuracy.json" if not USE_SHORT_FLOW else "all-accuracy.json"
accuracy_save_path = os.path.join("result", "autoencoding", detect_type,
                    os.path.basename(train_benign_filename), str(scale)+"-"+accuracy_base_name)
os.makedirs(os.path.dirname(accuracy_save_path), exist_ok=True)
with open(accuracy_save_path, "w") as f:
    json.dump(accuracy_dict, f, indent=4)

Threshold: 0.0017496360233053565, Scale: 10
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, -1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, -1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, -1, 1, 1, 1, 1, 1, 1, 1, 1, -1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, -1, 1, -1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, -1, 1, -1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

KeyboardInterrupt: 