In [None]:
import random
import time
from collections import Counter

import numpy as np
import pandas as pd
import torch
from sklearn.metrics import precision_recall_fscore_support, roc_auc_score
from sklearn.model_selection import train_test_split
from torch import nn
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
# ---- Run configuration (everything a reader might want to tune) ----
# Input CSV; load_df() reads its subject_uuid, sequence and label columns.
CSV_PATH = "./cleaned_sample_parsed_cadets_tagged.csv"
# Upper bound on tokens per sequence: collate_fn caps the batch width at this value
# and the Transformer sizes its positional-embedding table with it.
MAX_LEN = 512
# Batch size used by the train/valid/test DataLoaders.
BATCH_SIZE = 128
# Embedding width for both models (also reused as the LSTM hidden size).
D_MODEL = 64
# Transformer encoder hyper-parameters: attention heads, stacked layers, feed-forward width.
NHEAD = 4
NUM_LAYERS = 2
FFN_DIM = 128
# Dropout probability passed to both classifiers.
DROPOUT = 0.1
# Adam learning rate and number of training epochs, shared by both training loops.
LR = 1e-3
EPOCHS = 3
# Seed for set_seed() and for the stratified train/valid/test splits.
SEED = 5231

# Reserved vocabulary entries; build_vocab() assigns them ids 0 and 1.
PAD_TOKEN = "<PAD>"
UNK_TOKEN = "<UNK>"

def set_seed(seed=42):
    """Seed Python's `random` module and PyTorch (CPU, plus every CUDA device if present).

    Args:
        seed: Integer seed handed to each generator. Defaults to 42.
    """
    for seed_fn in (random.seed, torch.manual_seed):
        seed_fn(seed)
    if not torch.cuda.is_available():
        return
    # Only reached on machines with CUDA.
    torch.cuda.manual_seed_all(seed)

# Seed the RNGs once, up front, before any model or sampler is created.
set_seed(SEED)
# Use the GPU when one is visible, otherwise the CPU; later cells move the models
# and every batch onto `device`.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [None]:
def load_df(csv_path):
    """Read the tagged-sequence CSV and return the rows usable for training.

    Keeps only the `subject_uuid`, `sequence` and `label` columns, drops rows with a
    missing value in any of them, discards rows whose label is neither "normal" nor
    "attack", and adds an integer target column `y` (normal -> 0, attack -> 1).

    Args:
        csv_path: Path (or anything `pd.read_csv` accepts) of the CSV file.

    Returns:
        A new DataFrame with columns subject_uuid, sequence, label, y.
    """
    label2id = {"normal": 0, "attack": 1}

    raw = pd.read_csv(csv_path)
    complete = raw.loc[:, ["subject_uuid", "sequence", "label"]].dropna()

    is_known_label = complete["label"].isin(list(label2id))
    labelled = complete.loc[is_known_label].copy()
    labelled["y"] = labelled["label"].map(label2id)
    return labelled

def build_vocab(df):
    """Build the token <-> id mappings from the whitespace-split `sequence` column.

    PAD_TOKEN and UNK_TOKEN are reserved as ids 0 and 1. Every other distinct token
    gets the next free id in order of first appearance, so ids are always the
    contiguous range 0 .. vocab_size - 1.

    Args:
        df: DataFrame with a `sequence` column; each value is converted with `str()`
            and split on whitespace.

    Returns:
        Tuple `(token2id, id2token, pad_id, unk_id, vocab_size)`.
    """
    counter = Counter()
    for seq in df["sequence"]:
        counter.update(str(seq).split())

    token2id = {PAD_TOKEN: 0, UNK_TOKEN: 1}
    for tok in counter:
        # setdefault keeps the reserved ids intact if the literal text "<PAD>" or
        # "<UNK>" ever shows up in the data; a plain assignment would overwrite them
        # and leave vocab_size smaller than the largest id.
        token2id.setdefault(tok, len(token2id))

    id2token = {idx: tok for tok, idx in token2id.items()}
    pad_id = token2id[PAD_TOKEN]
    unk_id = token2id[UNK_TOKEN]
    vocab_size = len(token2id)
    return token2id, id2token, pad_id, unk_id, vocab_size

class SeqDataset(Dataset):
    """In-memory dataset of token-id sequences and their integer labels.

    Each row's `sequence` string is split on whitespace and mapped through
    `token2id`; tokens missing from the mapping become the UNK_TOKEN id. Sequences
    are truncated to `max_len` ids when they are encoded (`max_len=None` keeps them
    whole). Padding is left to the collate function.

    Args:
        df: DataFrame with a `sequence` column and an integer-like `y` column.
        token2id: Mapping from token string to id; must contain PAD_TOKEN and UNK_TOKEN.
        max_len: Maximum number of ids kept per sequence.
    """

    def __init__(self, df, token2id, max_len):
        self.max_len = max_len
        self.token2id = token2id
        self.pad_id = token2id[PAD_TOKEN]
        unk_id = token2id[UNK_TOKEN]  # looked up once instead of once per token

        self.seqs = []
        self.labels = []

        # Iterating the two columns directly avoids the per-row Series that
        # DataFrame.iterrows() builds.
        for seq, label in zip(df["sequence"], df["y"]):
            ids = [token2id.get(tok, unk_id) for tok in str(seq).split()]
            self.seqs.append(ids[:max_len])
            self.labels.append(int(label))

    def __len__(self):
        """Number of sequences held by the dataset."""
        return len(self.seqs)

    def __getitem__(self, idx):
        """Return `(token_id_list, label)` for the sample at position `idx`."""
        return self.seqs[idx], self.labels[idx]

def collate_fn(batch, pad_token_id=None, max_seq_len=None):
    """Pad a list of `(token_ids, label)` samples into batch tensors.

    The batch width is the longest sequence in the batch, capped at `max_seq_len`
    and never smaller than 1, so a batch of only empty sequences still yields
    non-empty tensors. Longer sequences are truncated from the right.

    Args:
        batch: Iterable of `(list_of_token_ids, label)` pairs.
        pad_token_id: Id used to fill padded positions. When omitted, the notebook
            global `pad_id` is used, which keeps `DataLoader(collate_fn=collate_fn)`
            working unchanged.
        max_seq_len: Cap on the batch width. When omitted, the notebook global
            `MAX_LEN` is used.

    Returns:
        Tuple `(input_ids, attention_mask, labels)`: two long tensors of shape
        (batch, width), with the mask set to 1 on real tokens and 0 on padding, and
        a float32 tensor of shape (batch,).
    """
    # Resolve defaults lazily so explicit arguments (e.g. via functools.partial)
    # remove the dependency on globals that are defined in a later cell.
    if pad_token_id is None:
        pad_token_id = pad_id
    if max_seq_len is None:
        max_seq_len = MAX_LEN

    sequences, labels = zip(*batch)
    longest = max(len(seq) for seq in sequences)
    width = max(1, min(longest, max_seq_len))

    batch_size = len(sequences)
    input_ids = torch.full((batch_size, width), pad_token_id, dtype=torch.long)
    attention_mask = torch.zeros((batch_size, width), dtype=torch.long)

    for row, seq in enumerate(sequences):
        kept = seq[:width]
        if len(kept) > 0:
            input_ids[row, :len(kept)] = torch.tensor(kept, dtype=torch.long)
            attention_mask[row, :len(kept)] = 1

    labels = torch.tensor(labels, dtype=torch.float32)
    return input_ids, attention_mask, labels

def evaluate(model, data_loader, loss_fn):
    """Score `model` on `data_loader` and return `(mean_loss, accuracy)`.

    The model is switched to eval mode and run without gradient tracking. Each batch
    loss is weighted by its batch size before averaging; a prediction counts as
    positive when sigmoid(logit) > 0.5. Batches are moved to the notebook-level
    `device`. An empty loader yields `(0.0, 0.0)`.

    Args:
        model: Callable module taking `(input_ids, attention_mask)` and returning logits.
        data_loader: Iterable of `(input_ids, attention_mask, labels)` batches.
        loss_fn: Callable `loss_fn(logits, labels)` returning a scalar tensor.
    """
    model.eval()
    loss_sum, n_correct, n_seen = 0.0, 0, 0

    with torch.no_grad():
        for batch in data_loader:
            input_ids, attention_mask, labels = (t.to(device) for t in batch)
            logits = model(input_ids, attention_mask)

            n_batch = labels.size(0)
            loss_sum += loss_fn(logits, labels).item() * n_batch

            predicted = (torch.sigmoid(logits) > 0.5).long()
            n_correct += (predicted == labels.long()).sum().item()
            n_seen += n_batch

    denom = max(1, n_seen)  # guards the empty-loader case
    return loss_sum / denom, n_correct / denom

def evaluate_metrics(model, data_loader, loss_fn):
    """Evaluate `model` on `data_loader` and return a dict of classification metrics.

    The model is put in eval mode and run without gradients; batches are moved to the
    notebook-level `device`. Predictions use a 0.5 threshold on sigmoid(logit).

    Args:
        model: Callable module taking `(input_ids, attention_mask)` and returning logits.
        data_loader: Iterable of `(input_ids, attention_mask, labels)` batches.
        loss_fn: Callable `loss_fn(logits, labels)` returning a scalar tensor.

    Returns:
        Dict with float values for "loss" (sample-weighted mean), "acc", "precision",
        "recall", "f1" (binary averaging, zero_division=0) and "auc". "auc" is NaN
        when `roc_auc_score` rejects the inputs with a ValueError.
    """
    model.eval()
    total_loss = 0.0
    y_true = []
    y_prob = []
    with torch.no_grad():
        for input_ids, attention_mask, labels in data_loader:
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            labels = labels.to(device)

            logits = model(input_ids, attention_mask)
            loss = loss_fn(logits, labels)

            total_loss += loss.item() * labels.size(0)
            y_true.extend(labels.long().tolist())
            y_prob.extend(torch.sigmoid(logits).cpu().tolist())

    # Explicit dtypes keep the arrays well-typed even when the loader was empty.
    y_true = np.asarray(y_true, dtype=int)
    y_prob = np.asarray(y_prob, dtype=float)
    y_pred = (y_prob > 0.5).astype(int)

    avg_loss = total_loss / max(1, len(y_true))
    acc = (y_pred == y_true).mean() if len(y_true) > 0 else 0.0
    precision, recall, f1, _ = precision_recall_fscore_support(
        y_true, y_pred, average="binary", zero_division=0
    )
    try:
        auc = roc_auc_score(y_true, y_prob)
    except ValueError:
        # Only the "AUC is undefined for these inputs" case is tolerated; any other
        # exception is a real bug and is allowed to propagate.
        auc = float("nan")

    return {
        "loss": float(avg_loss),
        "acc": float(acc),
        "precision": float(precision),
        "recall": float(recall),
        "f1": float(f1),
        "auc": float(auc),
    }

def predict_sequence(model, seq_str, token2id, pad_id, max_len, device):
    """Return the model's attack probability for a single whitespace-tokenised sequence.

    The sequence is encoded with `token2id` (unknown tokens map to the UNK_TOKEN id),
    truncated to `max_len` ids and right-padded with `pad_id` to exactly `max_len`
    positions. The model is switched to eval mode and run without gradients.

    Args:
        model: Callable module taking `(input_ids, attention_mask)` and returning logits.
        seq_str: Sequence text; converted with `str()` and split on whitespace.
        token2id: Mapping from token string to id; must contain UNK_TOKEN.
        pad_id: Id used for padded positions.
        max_len: Fixed length of the encoded input.
        device: Device the input tensors are moved to.

    Returns:
        sigmoid(logit) of the single sample as a Python float.
    """
    encoded = [token2id.get(tok, token2id[UNK_TOKEN]) for tok in str(seq_str).split()][:max_len]
    n_real = len(encoded)
    n_pad = max_len - n_real

    # One row of shape (1, max_len): real ids first, padding after.
    input_ids = torch.tensor([encoded + [pad_id] * n_pad], dtype=torch.long).to(device)
    attention_mask = torch.tensor([[1] * n_real + [0] * n_pad], dtype=torch.long).to(device)

    model.eval()
    with torch.no_grad():
        probabilities = torch.sigmoid(model(input_ids, attention_mask))
    return probabilities[0].item()


56 21010 y
0    20936
1       74
Name: count, dtype: int64
{'<PAD>': 0, '<UNK>': 1, 'aue_openat_rwtc': 2, 'aue_open_rwtc': 3, 'aue_read': 4, 'aue_close': 5, 'aue_exit': 6, 'aue_pread': 7, 'aue_mmap': 8, 'aue_execve': 9, 'aue_mprotect': 10, 'aue_connect': 11, 'aue_fcntl': 12, 'aue_chdir': 13, 'aue_lseek': 14, 'aue_write': 15, 'aue_setuid': 16, 'aue_setgid': 17, 'aue_sendto': 18, 'aue_pipe': 19, 'aue_accept': 20, 'aue_recvfrom': 21, 'aue_umask': 22, 'aue_seteuid': 23, 'aue_setegid': 24, 'aue_setlogin': 25, 'aue_chmod': 26, 'aue_ftruncate': 27, 'aue_fork': 28, 'aue_utimes': 29, 'aue_unlink': 30, 'aue_rename': 31, 'aue_fchmod': 32, 'aue_futimes': 33, 'aue_link': 34, 'aue_vfork': 35, 'aue_kill': 36, 'aue_mkdir': 37, 'aue_fchdir': 38, 'aue_rmdir': 39, 'aue_socketpair': 40, 'aue_chown': 41, 'aue_posix_openpt': 42, 'aue_writev': 43, 'aue_sendmsg': 44, 'aue_setresuid': 45, 'aue_setresgid': 46, 'aue_recvmsg': 47, 'aue_closefrom': 48, 'aue_futimesat': 49, 'aue_fchmodat': 50, 'aue_pwrite': 51, 'au

In [51]:
class TransformerClassifier(nn.Module):
    """Transformer-encoder sequence classifier producing one logit per sequence.

    Token embeddings are added to learned position embeddings, run through an
    `nn.TransformerEncoder`, averaged over the non-padded positions and projected to
    a single logit.

    Args:
        vocab_size: Number of rows in the token embedding table.
        d_model: Embedding / encoder width.
        nhead: Attention heads per encoder layer.
        num_layers: Number of stacked encoder layers.
        dim_feedforward: Hidden width of each layer's feed-forward block.
        dropout: Dropout probability (encoder layers and the pooled vector).
        max_len: Number of rows in the position embedding table.
        pad_id: Token id used as `padding_idx` of the token embedding.
    """

    def __init__(self, vocab_size, d_model, nhead, num_layers,
                 dim_feedforward, dropout, max_len, pad_id):
        super().__init__()
        self.pad_id = pad_id
        self.max_len = max_len

        # Discrete events -> vectors, plus one learned vector per position index.
        self.token_emb = nn.Embedding(vocab_size, d_model, padding_idx=pad_id)
        self.pos_emb = nn.Embedding(max_len, d_model)

        # The layer is built with batch_first=False, so forward() feeds it (L, B, D).
        self.encoder = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(
                d_model=d_model,
                nhead=nhead,
                dim_feedforward=dim_feedforward,
                dropout=dropout,
                batch_first=False,
            ),
            num_layers=num_layers,
        )
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(d_model, 1)

    def forward(self, input_ids, attention_mask):
        """Compute logits of shape (B,).

        Args:
            input_ids: (B, L) tensor of token ids.
            attention_mask: (B, L) tensor, 1 on real tokens and 0 on padding.
        """
        batch, seq_len = input_ids.shape
        pos_ids = torch.arange(seq_len, device=input_ids.device)[None, :].expand(batch, seq_len)

        embedded = self.token_emb(input_ids) + self.pos_emb(pos_ids)  # (B, L, D)

        # Key-padding mask is True exactly where the attention mask is 0.
        hidden = self.encoder(
            embedded.permute(1, 0, 2),                      # (L, B, D)
            src_key_padding_mask=attention_mask.eq(0),
        ).permute(1, 0, 2)                                  # (B, L, D)

        # Mean over real tokens only; clamp avoids dividing by zero on all-pad rows.
        keep = attention_mask[..., None]                    # (B, L, 1)
        n_real = keep.sum(dim=1).clamp(min=1)
        pooled = (hidden * keep).sum(dim=1) / n_real

        return self.fc(self.dropout(pooled)).squeeze(-1)

In [52]:
class LSTMClassifier(nn.Module):
    """Embedding + unidirectional LSTM with masked mean pooling; one logit per sequence.

    Args:
        vocab_size: Number of rows in the token embedding table.
        d_model: Embedding width (the LSTM input size).
        hidden_size: LSTM hidden size (the width of the pooled vector).
        num_layers: Number of stacked LSTM layers.
        dropout: Dropout probability applied to the pooled vector.
        pad_id: Token id used as `padding_idx` of the embedding.
    """

    def __init__(self, vocab_size, d_model, hidden_size, num_layers,
                 dropout, pad_id):
        super().__init__()
        self.pad_id = pad_id

        # This model owns its own embedding table; no weights are shared with another model.
        self.token_emb = nn.Embedding(vocab_size, d_model, padding_idx=pad_id)

        # Batch-first, forward-direction-only recurrent encoder.
        self.lstm = nn.LSTM(
            input_size=d_model,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            bidirectional=False,
        )

        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(hidden_size, 1)

        self._reset_parameters()

    def _reset_parameters(self):
        """Xavier-initialise the output layer's weight and zero its bias (if it has one)."""
        nn.init.xavier_uniform_(self.fc.weight)
        if self.fc.bias is None:
            return
        nn.init.zeros_(self.fc.bias)

    def forward(self, input_ids, attention_mask):
        """Compute logits of shape (B,) from (B, L) token ids and a (B, L) 0/1 mask."""
        embedded = self.token_emb(input_ids)             # (B, L, D)
        hidden_states, _ = self.lstm(embedded)           # (B, L, H); final states unused

        # Average the per-step outputs over real tokens only; clamp avoids a
        # division by zero on rows that are entirely padding.
        keep = attention_mask[..., None]                 # (B, L, 1)
        n_real = keep.sum(dim=1).clamp(min=1)            # (B, 1)
        pooled = (hidden_states * keep).sum(dim=1) / n_real  # (B, H)

        return self.fc(self.dropout(pooled)).squeeze(-1)     # (B,)

In [None]:
# Data loading and split
df = load_df(CSV_PATH)
# NOTE(review): the vocabulary is built from the full frame, i.e. before the split, so
# tokens that occur only in validation/test rows still get their own id rather than
# <UNK>. Confirm this is intended before reporting held-out numbers.
token2id, id2token, pad_id, unk_id, vocab_size = build_vocab(df)

# Stratified 80% / 10% / 10% split: carve off 20%, then halve it into valid and test.
train_df, temp_df = train_test_split(
    df, test_size=0.2, stratify=df["y"], random_state=SEED
)
valid_df, test_df = train_test_split(
    temp_df, test_size=0.5, stratify=temp_df["y"], random_state=SEED
)

train_counts = train_df["y"].value_counts().to_dict()
valid_counts = valid_df["y"].value_counts().to_dict()
test_counts = test_df["y"].value_counts().to_dict()
num_pos = float(train_counts.get(1, 0))
num_neg = float(train_counts.get(0, 0))
# Negative-to-positive ratio of the training split, used as the loss pos_weight.
POS_WEIGHT = num_neg / max(1.0, num_pos)

# Inverse-class-frequency sampling weights: every class carries the same total
# weight, so the sampler draws both classes about equally often.
# NOTE(review): the training loops use this sampler AND POS_WEIGHT in the loss, so
# class imbalance is compensated twice. Confirm that is deliberate.
sample_weight_values = train_df["y"].map(lambda y: 1.0 / train_counts[y]).to_numpy()
sample_weights = torch.as_tensor(sample_weight_values, dtype=torch.double)
sampler = torch.utils.data.WeightedRandomSampler(
    weights=sample_weights,
    num_samples=len(sample_weights),
    replacement=True,
)

train_dataset = SeqDataset(train_df, token2id, MAX_LEN)
valid_dataset = SeqDataset(valid_df, token2id, MAX_LEN)
test_dataset = SeqDataset(test_df, token2id, MAX_LEN)

# Only the training loader is resampled; valid/test keep their natural class mix and order.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, sampler=sampler, collate_fn=collate_fn)
valid_loader = DataLoader(valid_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)

print(len(df), df["y"].value_counts())
print(f"Vocabulary size: {vocab_size}")



In [53]:
# Training configuration display
# Summarise the run configuration and split statistics before training starts.
config_report = [
    f"\n{'='*60}",
    "Training Configuration:",
    f"{'='*60}",
    f"Device: {device}",
    f"Batch Size: {BATCH_SIZE}",
    f"Max Sequence Length: {MAX_LEN}",
    f"Dataset Size (total): {len(df)}",
    f" - Train: {len(train_dataset)}  (batches: {len(train_loader)})",
    f" - Valid: {len(valid_dataset)}  (batches: {len(valid_loader)})",
    f" - Test : {len(test_dataset)}   (batches: {len(test_loader)})",
    f"Vocabulary Size: {vocab_size}",
    f"Class distribution (train): {train_counts}",
    f"Class distribution (valid): {valid_counts}",
    f"Class distribution (test) : {test_counts}",
    f"Loss pos_weight: {POS_WEIGHT:.2f}",
    f"{'='*60}\n",
]
print("\n".join(config_report))


Training Configuration:
Device: cuda
Batch Size: 128
Max Sequence Length: 512
Dataset Size (total): 21010
 - Train: 16808  (batches: 132)
 - Valid: 2101  (batches: 17)
 - Test : 2101   (batches: 17)
Vocabulary Size: 56
Class distribution (train): {0: 16749, 1: 59}
Class distribution (valid): {0: 2093, 1: 8}
Class distribution (test) : {0: 2094, 1: 7}
Loss pos_weight: 283.88



In [None]:
# Build the Transformer classifier from the config-cell hyper-parameters.
model = TransformerClassifier(
    vocab_size=vocab_size,
    d_model=D_MODEL,
    nhead=NHEAD,
    num_layers=NUM_LAYERS,
    dim_feedforward=FFN_DIM,
    dropout=DROPOUT,
    max_len=MAX_LEN,
    pad_id=pad_id,
).to(device)

# pos_weight scales the loss contribution of positive ("attack") samples.
loss_fn = nn.BCEWithLogitsLoss(pos_weight=torch.tensor([POS_WEIGHT], device=device))
optimizer = torch.optim.Adam(model.parameters(), lr=LR)

for epoch in range(1, EPOCHS + 1):
    print(f"\n{'='*60}")
    print(f"Epoch {epoch}/{EPOCHS}")
    print(f"{'='*60}")
    model.train()
    total_loss = 0.0
    total_correct = 0
    total_samples = 0
    # perf_counter is monotonic, so the elapsed time cannot be skewed by clock adjustments.
    epoch_start = time.perf_counter()

    # Progress bar over batches; the postfix shows running loss/accuracy.
    pbar = tqdm(train_loader, desc="Training", unit="batch")
    for input_ids, attention_mask, labels in pbar:
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()
        logits = model(input_ids, attention_mask)
        loss = loss_fn(logits, labels)
        loss.backward()
        optimizer.step()

        # Weight the batch-mean loss by batch size so the epoch average is per-sample.
        total_loss += loss.item() * labels.size(0)
        with torch.no_grad():
            preds = (torch.sigmoid(logits) > 0.5).long()
            total_correct += (preds == labels.long()).sum().item()
            total_samples += labels.size(0)

        # Update progress bar with the running (epoch-to-date) metrics.
        current_loss = total_loss / max(1, total_samples)
        current_acc = total_correct / max(1, total_samples)
        pbar.set_postfix({
            'loss': f'{current_loss:.4f}',
            'acc': f'{current_acc:.4f}'
        })

    train_loss = total_loss / max(1, total_samples)
    train_acc = total_correct / max(1, total_samples)

    # Validation at end of epoch (included in the reported epoch time).
    val_loss, val_acc = evaluate(model, valid_loader, loss_fn)

    epoch_time = time.perf_counter() - epoch_start
    print(f"\n Epoch {epoch} Summary: TrainLoss={train_loss:.4f}, TrainAcc={train_acc:.4f}, ValLoss={val_loss:.4f}, ValAcc={val_acc:.4f}, Time={epoch_time:.1f}s")






Epoch 1/3


Training: 100%|██████████| 132/132 [00:16<00:00,  8.16batch/s, loss=2.0363, acc=0.8351]



 Epoch 1 Summary: TrainLoss=2.0363, TrainAcc=0.8351, ValLoss=0.0099, ValAcc=1.0000, Time=16.7s

Epoch 2/3


Training: 100%|██████████| 132/132 [00:16<00:00,  8.16batch/s, loss=0.1358, acc=0.9992]



 Epoch 2 Summary: TrainLoss=0.1358, TrainAcc=0.9992, ValLoss=0.0023, ValAcc=1.0000, Time=16.7s

Epoch 3/3


Training: 100%|██████████| 132/132 [00:15<00:00,  8.37batch/s, loss=0.0564, acc=0.9990]



 Epoch 3 Summary: TrainLoss=0.0564, TrainAcc=0.9990, ValLoss=0.0009, ValAcc=1.0000, Time=16.3s

🧪 Transformer Test Results: Loss=0.0009, Accuracy=1.0000
Transformer Test Metrics:
 - loss: 0.0009
 - acc: 1.0000
 - precision: 1.0000
 - recall: 1.0000
 - f1: 1.0000
 - auc: 1.0000


In [None]:
# Train the LSTM baseline on the same loaders, loss weighting, learning rate and
# epoch count as the Transformer cell above.
print(f"\n{'='*60}")
print("Training LSTM classifier")
print(f"{'='*60}")

# Single-layer LSTM whose hidden size matches the embedding width.
LSTM_NUM_LAYERS = 1
LSTM_HIDDEN_SIZE = D_MODEL

lstm_model = LSTMClassifier(
    vocab_size=vocab_size,
    d_model=D_MODEL,
    hidden_size=LSTM_HIDDEN_SIZE,
    num_layers=LSTM_NUM_LAYERS,
    dropout=DROPOUT,
    pad_id=pad_id,
).to(device)

# pos_weight scales the loss contribution of positive ("attack") samples.
lstm_loss_fn = nn.BCEWithLogitsLoss(pos_weight=torch.tensor([POS_WEIGHT], device=device))
lstm_optimizer = torch.optim.Adam(lstm_model.parameters(), lr=LR)

for epoch in range(1, EPOCHS + 1):
    print(f"\n{'-'*60}")
    print(f"LSTM Epoch {epoch}/{EPOCHS}")
    print(f"{'-'*60}")
    lstm_model.train()
    # Running totals for this epoch; reset every epoch.
    total_loss = 0.0
    total_correct = 0
    total_samples = 0

    pbar = tqdm(train_loader, desc="LSTM Training", unit="batch")
    for input_ids, attention_mask, labels in pbar:
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        labels = labels.to(device)

        # Standard optimisation step: clear grads, forward, backward, update.
        lstm_optimizer.zero_grad()
        logits = lstm_model(input_ids, attention_mask)
        loss = lstm_loss_fn(logits, labels)
        loss.backward()
        lstm_optimizer.step()

        # Weight the batch-mean loss by batch size so the epoch average is per-sample.
        total_loss += loss.item() * labels.size(0)
        with torch.no_grad():
            # A sample is predicted positive when sigmoid(logit) > 0.5.
            preds = (torch.sigmoid(logits) > 0.5).long()
            total_correct += (preds == labels.long()).sum().item()
            total_samples += labels.size(0)

        # Running (epoch-to-date) metrics shown on the progress bar.
        current_loss = total_loss / max(1, total_samples)
        current_acc = total_correct / max(1, total_samples)
        pbar.set_postfix({'loss': f'{current_loss:.4f}', 'acc': f'{current_acc:.4f}'})

    train_loss = total_loss / max(1, total_samples)
    train_acc = total_correct / max(1, total_samples)
    # Validation at the end of each epoch.
    val_loss, val_acc = evaluate(lstm_model, valid_loader, lstm_loss_fn)
    print(f" LSTM Epoch {epoch} Summary: TrainLoss={train_loss:.4f}, TrainAcc={train_acc:.4f}, ValLoss={val_loss:.4f}, ValAcc={val_acc:.4f}")





Training LSTM classifier

------------------------------------------------------------
LSTM Epoch 1/3
------------------------------------------------------------


LSTM Training: 100%|██████████| 132/132 [00:01<00:00, 75.95batch/s, loss=2.8236, acc=0.6839]


 LSTM Epoch 1 Summary: TrainLoss=2.8236, TrainAcc=0.6839, ValLoss=0.6430, ValAcc=0.8087

------------------------------------------------------------
LSTM Epoch 2/3
------------------------------------------------------------


LSTM Training: 100%|██████████| 132/132 [00:01<00:00, 75.95batch/s, loss=0.2012, acc=0.9490]


 LSTM Epoch 2 Summary: TrainLoss=0.2012, TrainAcc=0.9490, ValLoss=0.1270, ValAcc=0.9838

------------------------------------------------------------
LSTM Epoch 3/3
------------------------------------------------------------


LSTM Training: 100%|██████████| 132/132 [00:01<00:00, 76.22batch/s, loss=0.0619, acc=0.9956]


 LSTM Epoch 3 Summary: TrainLoss=0.0619, TrainAcc=0.9956, ValLoss=0.0458, ValAcc=0.9990

LSTM Test Metrics:
 - loss: 0.0493
 - acc: 0.9986
 - precision: 0.7000
 - recall: 1.0000
 - f1: 0.8235
 - auc: 1.0000


In [None]:
# Held-out evaluation of both trained models, followed by a two-sequence spot check.
print(f"\n{'='*60}")
print("Final Evaluation")
print(f"{'='*60}")

# Both metric dicts are computed before anything is printed.
transformer_test_metrics = evaluate_metrics(model, test_loader, loss_fn)
lstm_test_metrics = evaluate_metrics(lstm_model, test_loader, lstm_loss_fn)

for heading, metrics in (
    ("Transformer Test Metrics:", transformer_test_metrics),
    ("\nLSTM Test Metrics:", lstm_test_metrics),
):
    print(heading)
    for k, v in metrics.items():
        print(f" - {k}: {v:.4f}")

# First sequence of each class in the full frame, used as inference examples.
example_seq_normal = df[df["label"] == "normal"].iloc[0]["sequence"]
example_seq_attack = df[df["label"] == "attack"].iloc[0]["sequence"]

for heading, classifier in (
    ("\nTransformer inference (normal, attack):", model),
    ("\nLSTM inference (normal, attack):", lstm_model),
):
    print(heading)
    for example_seq in (example_seq_normal, example_seq_attack):
        print(predict_sequence(classifier, example_seq, token2id, pad_id, MAX_LEN, device))


In [57]:
# Spot-check both trained models on one normal and one attack sequence.
# The examples are selected by label: taking row 0 as the "normal" example only
# works while the first row happens to be normal.
example_seq_normal = df.loc[df["label"] == "normal", "sequence"].iloc[0]
example_seq_attack = df.loc[df["label"] == "attack", "sequence"].iloc[0]

# predict_sequence requires the vocabulary, pad id, max length and device; calling it
# with only (model, sequence) raises TypeError.
print('transformer inference:')
print(predict_sequence(model, example_seq_normal, token2id, pad_id, MAX_LEN, device))
print(predict_sequence(model, example_seq_attack, token2id, pad_id, MAX_LEN, device))
print('LSTM inference:')
print(predict_sequence(lstm_model, example_seq_normal, token2id, pad_id, MAX_LEN, device))
print(predict_sequence(lstm_model, example_seq_attack, token2id, pad_id, MAX_LEN, device))

transformer inference:
0.0006897716084495187
0.9998300075531006
LSTM inference:
0.06681326031684875
0.9998456239700317
