In [17]:
import random
from collections import Counter

import pandas as pd
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
from sklearn.metrics import precision_recall_fscore_support, roc_auc_score
CSV_PATH = "./cleaned_sample_parsed_cadets_tagged_chunked.csv"
MAX_LEN = 1024
BATCH_SIZE = 64
D_MODEL = 64
NHEAD = 4
NUM_LAYERS = 2
FFN_DIM = 128
DROPOUT = 0.3

LR = 1e-4
EPOCHS = 10
SEED = 5231

PAD_TOKEN = "<PAD>"
UNK_TOKEN = "<UNK>"

def set_seed(seed=42):
    random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

set_seed(SEED)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [18]:
def load_df(csv_path):
    df = pd.read_csv(csv_path)
    df = df[["subject_uuid", "sequence", "label"]].dropna()

    label2id = {"normal": 0, "attack": 1}
    df = df[df["label"].isin(label2id.keys())].copy()
    df["y"] = df["label"].map(label2id)
    return df

def build_vocab(df):
    counter = Counter()
    for seq in df["sequence"]:
        counter.update(str(seq).split())

    token2id = {PAD_TOKEN: 0, UNK_TOKEN: 1}
    for i, tok in enumerate(counter.keys(), start=2):
        token2id[tok] = i

    id2token = {v: k for k, v in token2id.items()}
    pad_id = token2id[PAD_TOKEN]
    unk_id = token2id[UNK_TOKEN]
    vocab_size = len(token2id)
    return token2id, id2token, pad_id, unk_id, vocab_size

class SeqDataset(Dataset):
    def __init__(self, df, token2id, max_len):
        self.max_len = max_len
        self.token2id = token2id
        self.pad_id = token2id[PAD_TOKEN]

        self.seqs = []
        self.labels = []

        for _, row in df.iterrows():
            tokens = str(row["sequence"]).split()
            ids = [self.token2id.get(t, self.token2id[UNK_TOKEN]) for t in tokens]
            self.seqs.append(ids)
            self.labels.append(int(row["y"]))

    def __len__(self):
        return len(self.seqs)

    def __getitem__(self, idx):
        return self.seqs[idx], self.labels[idx]

def collate_fn(batch):
    sequences, labels = zip(*batch)
    lengths = [len(s) for s in sequences]
    max_len = min(max(lengths), MAX_LEN)

    batch_size = len(sequences)
    input_ids = torch.full((batch_size, max_len), pad_id, dtype=torch.long)
    attention_mask = torch.zeros((batch_size, max_len), dtype=torch.long)

    for i, seq in enumerate(sequences):
        seq = seq[:max_len]
        input_ids[i, :len(seq)] = torch.tensor(seq, dtype=torch.long)
        attention_mask[i, :len(seq)] = 1

    labels = torch.tensor(labels, dtype=torch.float32)
    return input_ids, attention_mask, labels

def evaluate(model, data_loader, loss_fn):
    model.eval()
    total_loss = 0.0
    total_correct = 0
    total_samples = 0
    with torch.no_grad():
        for input_ids, attention_mask, labels in data_loader:
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            labels = labels.to(device)

            logits = model(input_ids, attention_mask)
            loss = loss_fn(logits, labels)

            total_loss += loss.item() * labels.size(0)
            preds = (torch.sigmoid(logits) > 0.5).long()
            total_correct += (preds == labels.long()).sum().item()
            total_samples += labels.size(0)
    avg_loss = total_loss / max(1, total_samples)
    acc = total_correct / max(1, total_samples)
    return avg_loss, acc

def evaluate_metrics(model, data_loader, loss_fn):
    model.eval()
    total_loss = 0.0
    y_true = []
    y_prob = []
    with torch.no_grad():
        for input_ids, attention_mask, labels in data_loader:
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            labels = labels.to(device)

            logits = model(input_ids, attention_mask)
            probs = torch.sigmoid(logits)
            loss = loss_fn(logits, labels)

            total_loss += loss.item() * labels.size(0)
            y_true.extend(labels.long().tolist())
            y_prob.extend(probs.detach().cpu().tolist())
    import numpy as np
    y_true = np.array(y_true)
    y_prob = np.array(y_prob)
    y_pred = (y_prob > 0.5).astype(int)
    avg_loss = total_loss / max(1, len(y_true))
    acc = (y_pred == y_true).mean() if len(y_true) > 0 else 0.0
    precision, recall, f1, _ = precision_recall_fscore_support(y_true, y_pred, average="binary", zero_division=0)
    try:
        auc = roc_auc_score(y_true, y_prob)
    except Exception:
        auc = float('nan')
    return {
        "loss": float(avg_loss),
        "acc": float(acc),
        "precision": float(precision),
        "recall": float(recall),
        "f1": float(f1),
        "auc": float(auc),
    }

def predict_sequence(model, seq_str, token2id, pad_id, max_len, device):
    tokens = str(seq_str).split()
    ids = [token2id.get(t, token2id[UNK_TOKEN]) for t in tokens]
    ids = ids[:max_len]

    input_ids = torch.full((1, max_len), pad_id, dtype=torch.long)
    attention_mask = torch.zeros((1, max_len), dtype=torch.long)
    input_ids[0, :len(ids)] = torch.tensor(ids, dtype=torch.long)
    attention_mask[0, :len(ids)] = 1

    input_ids = input_ids.to(device)
    attention_mask = attention_mask.to(device)

    model.eval()
    with torch.no_grad():
        logits = model(input_ids, attention_mask)
        prob_attack = torch.sigmoid(logits)[0].item()
    return prob_attack


In [19]:
class TransformerClassifier(nn.Module):
    def __init__(self, vocab_size, d_model, nhead, num_layers,
                 dim_feedforward, dropout, max_len, pad_id):
        super().__init__()
        self.pad_id = pad_id
        self.max_len = max_len

        # Token + positional embeddings to encode discrete events with order
        self.token_emb = nn.Embedding(vocab_size, d_model, padding_idx=pad_id)
        self.pos_emb = nn.Embedding(max_len, d_model)

        # Transformer encoder stack for contextual sequence modeling
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=d_model,
            nhead=nhead,
            dim_feedforward=dim_feedforward,
            dropout=dropout,
            batch_first=False,
        )
        self.encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(d_model, 1)

    def forward(self, input_ids, attention_mask):
        # input_ids: (B, L), attention_mask: (B, L)
        B, L = input_ids.size()
        positions = torch.arange(L, device=input_ids.device).unsqueeze(0).expand(B, L)

        # Embed tokens + positions, switch to (L, B, D) for torch Transformer
        x = self.token_emb(input_ids) + self.pos_emb(positions)
        x = x.transpose(0, 1)

        # True where token is padding so encoder can ignore it
        src_key_padding_mask = (attention_mask == 0)

        encoded = self.encoder(x, src_key_padding_mask=src_key_padding_mask)
        encoded = encoded.transpose(0, 1)  # back to (B, L, D)

        # Masked mean pooling keeps only real tokens when averaging
        mask = attention_mask.unsqueeze(-1)
        masked_encoded = encoded * mask
        summed = masked_encoded.sum(dim=1)
        lengths = mask.sum(dim=1).clamp(min=1)
        pooled = summed / lengths

        pooled = self.dropout(pooled)
        logits = self.fc(pooled).squeeze(-1)
        return logits

In [20]:
class LSTMClassifier(nn.Module):
    """Embedding + LSTM encoder with masked mean pooling for classification."""

    def __init__(self, vocab_size, d_model, hidden_size, num_layers,
                 dropout, pad_id):
        super().__init__()
        self.pad_id = pad_id

        # Token embedding shared with Transformer for fair comparison
        self.token_emb = nn.Embedding(vocab_size, d_model, padding_idx=pad_id)

        # Unidirectional LSTM to capture sequential dependencies
        self.lstm = nn.LSTM(
            input_size=d_model,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            bidirectional=False,
        )

        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(hidden_size, 1)

        self._reset_parameters()

    def _reset_parameters(self):
        nn.init.xavier_uniform_(self.fc.weight)
        if self.fc.bias is not None:
            nn.init.zeros_(self.fc.bias)

    def forward(self, input_ids, attention_mask):
        """Return logits of shape (B,) for the provided batch inputs."""
        x = self.token_emb(input_ids)  # (B, L, D)

        outputs, (h_n, c_n) = self.lstm(x)      # outputs: (B, L, H)

        # Masked mean pooling to ignore padded tokens
        mask = attention_mask.unsqueeze(-1)  # (B, L, 1)
        masked_outputs = outputs * mask
        summed = masked_outputs.sum(dim=1)  # (B, H)
        lengths = mask.sum(dim=1).clamp(min=1)  # (B, 1)
        pooled = summed / lengths  # (B, H)

        pooled = self.dropout(pooled)
        logits = self.fc(pooled).squeeze(-1)    # (B,)
        # logits = self.fc(h_n[-1]).squeeze(-1)
        return logits

In [21]:
from sklearn.model_selection import train_test_split

# Data loading and split
df = load_df(CSV_PATH)
token2id, id2token, pad_id, unk_id, vocab_size = build_vocab(df)
# Sequence length statistics

seq_lengths = df["sequence"].astype(str).apply(lambda x: len(x.split()))
print(f"\nSequence Length Statistics:")
print(seq_lengths.describe())
print(f"90th percentile: {seq_lengths.quantile(0.9)}")
print(f"95th percentile: {seq_lengths.quantile(0.95)}")
print(f"99th percentile: {seq_lengths.quantile(0.99)}")
print(f"Max length setting: {MAX_LEN}")
print(f"Sequences longer than MAX_LEN: {(seq_lengths > MAX_LEN).sum()} ({(seq_lengths > MAX_LEN).mean()*100:.2f}%)")

train_df, temp_df = train_test_split(
    df, test_size=0.2, stratify=df["y"], random_state=SEED
)
valid_df, test_df = train_test_split(
    temp_df, test_size=0.5, stratify=temp_df["y"], random_state=SEED
)

train_counts = train_df["y"].value_counts().to_dict()
valid_counts = valid_df["y"].value_counts().to_dict()
test_counts = test_df["y"].value_counts().to_dict()
num_pos = float(train_counts.get(1, 0))
num_neg = float(train_counts.get(0, 0))
POS_WEIGHT = num_neg / max(1.0, num_pos)

sample_weight_values = train_df["y"].map(lambda y: 1.0 / train_counts[y]).values
sample_weights = torch.DoubleTensor(sample_weight_values)
sampler = torch.utils.data.WeightedRandomSampler(
    weights=sample_weights,
    num_samples=len(sample_weights),
    replacement=True,
)

train_dataset = SeqDataset(train_df, token2id, MAX_LEN)
valid_dataset = SeqDataset(valid_df, token2id, MAX_LEN)
test_dataset = SeqDataset(test_df, token2id, MAX_LEN)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, sampler=sampler, collate_fn=collate_fn)
valid_loader = DataLoader(valid_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)

print(len(df), df["y"].value_counts())
print(f"Vocabulary size: {vocab_size}")




Sequence Length Statistics:
count    68729.000000
mean       218.757366
std        672.485184
min          3.000000
25%         22.000000
50%         29.000000
75%        124.000000
max       5000.000000
Name: sequence, dtype: float64
90th percentile: 632.0
95th percentile: 661.0
99th percentile: 5000.0
Max length setting: 1024
Sequences longer than MAX_LEN: 1862 (2.71%)
68729 y
0    66617
1     2112
Name: count, dtype: int64
Vocabulary size: 56


In [22]:
# Training configuration display
print(f"\n{'='*60}")
print(f"Training Configuration:")
print(f"{'='*60}")
print(f"Device: {device}")
print(f"Batch Size: {BATCH_SIZE}")
print(f"Max Sequence Length: {MAX_LEN}")
print(f"Dataset Size (total): {len(df)}")
print(f" - Train: {len(train_dataset)}  (batches: {len(train_loader)})")
print(f" - Valid: {len(valid_dataset)}  (batches: {len(valid_loader)})")
print(f" - Test : {len(test_dataset)}   (batches: {len(test_loader)})")
print(f"Vocabulary Size: {vocab_size}")
print(f"Class distribution (train): {train_counts}")
print(f"Class distribution (valid): {valid_counts}")
print(f"Class distribution (test) : {test_counts}")
print(f"Loss pos_weight: {POS_WEIGHT:.2f}")
print(f"{'='*60}\n")


Training Configuration:
Device: cuda
Batch Size: 64
Max Sequence Length: 1024
Dataset Size (total): 68729
 - Train: 54983  (batches: 860)
 - Valid: 6873  (batches: 108)
 - Test : 6873   (batches: 108)
Vocabulary Size: 56
Class distribution (train): {0: 53293, 1: 1690}
Class distribution (valid): {0: 6662, 1: 211}
Class distribution (test) : {0: 6662, 1: 211}
Loss pos_weight: 31.53



In [23]:
from transformers import get_cosine_schedule_with_warmup
import time

model = TransformerClassifier(
    vocab_size=vocab_size,
    d_model=D_MODEL,
    nhead=NHEAD,
    num_layers=NUM_LAYERS,
    dim_feedforward=FFN_DIM,
    dropout=DROPOUT,
    max_len=MAX_LEN,
    pad_id=pad_id,
).to(device)

loss_fn = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=LR)

total_epochs = EPOCHS
num_training_steps = len(train_loader) * total_epochs
num_warmup_steps = int(0.05 * num_training_steps)

print(f"num_warmup_steps: {num_warmup_steps}")
print(f"num_training_steps: {num_training_steps}")

scheduler = get_cosine_schedule_with_warmup(
    optimizer,
    num_warmup_steps=num_warmup_steps,
    num_training_steps=num_training_steps,
)


for epoch in range(1, EPOCHS + 1):
    print(f"\n{'='*60}")
    print(f"Epoch {epoch}/{EPOCHS}")
    print(f"{'='*60}")
    model.train()
    total_loss = 0.0
    total_correct = 0
    total_samples = 0
    epoch_start = time.time()

    # Add progress bar for batches
    pbar = tqdm(train_loader, desc=f"Training", unit="batch")
    for input_ids, attention_mask, labels in pbar:
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()
        logits = model(input_ids, attention_mask)
        loss = loss_fn(logits, labels)
        loss.backward()
        optimizer.step()
        scheduler.step()

        total_loss += loss.item() * labels.size(0)
        with torch.no_grad():
            preds = (torch.sigmoid(logits) > 0.5).long()
            total_correct += (preds == labels.long()).sum().item()
            total_samples += labels.size(0)
        
        # Update progress bar with current metrics
        current_loss = total_loss / max(1, total_samples)
        current_acc = total_correct / max(1, total_samples)
        pbar.set_postfix({
            'loss': f'{current_loss:.4f}',
            'acc': f'{current_acc:.4f}'
        })

    train_loss = total_loss / max(1, total_samples)
    train_acc = total_correct / max(1, total_samples)

    # Validation at end of epoch
    val_loss, val_acc = evaluate(model, valid_loader, loss_fn)

    epoch_time = time.time() - epoch_start
    print(f"\n Epoch {epoch} Summary: TrainLoss={train_loss:.4f}, TrainAcc={train_acc:.4f}, ValLoss={val_loss:.4f}, ValAcc={val_acc:.4f}, Time={epoch_time:.1f}s")





num_warmup_steps: 430
num_training_steps: 8600

Epoch 1/10


Training: 100%|██████████| 860/860 [02:52<00:00,  4.97batch/s, loss=2.2494, acc=0.5005]



 Epoch 1 Summary: TrainLoss=2.2494, TrainAcc=0.5005, ValLoss=2.8562, ValAcc=0.0607, Time=177.5s

Epoch 2/10


Training: 100%|██████████| 860/860 [02:48<00:00,  5.10batch/s, loss=1.7760, acc=0.5423]



 Epoch 2 Summary: TrainLoss=1.7760, TrainAcc=0.5423, ValLoss=2.7682, ValAcc=0.1803, Time=173.3s

Epoch 3/10


Training: 100%|██████████| 860/860 [02:49<00:00,  5.07batch/s, loss=1.5926, acc=0.5975]



 Epoch 3 Summary: TrainLoss=1.5926, TrainAcc=0.5975, ValLoss=2.7493, ValAcc=0.2636, Time=174.3s

Epoch 4/10


Training: 100%|██████████| 860/860 [02:48<00:00,  5.09batch/s, loss=1.5297, acc=0.6189]



 Epoch 4 Summary: TrainLoss=1.5297, TrainAcc=0.6189, ValLoss=2.5596, ValAcc=0.2760, Time=173.6s

Epoch 5/10


Training: 100%|██████████| 860/860 [02:48<00:00,  5.11batch/s, loss=1.4814, acc=0.6317]



 Epoch 5 Summary: TrainLoss=1.4814, TrainAcc=0.6317, ValLoss=2.6413, ValAcc=0.3087, Time=172.9s

Epoch 6/10


Training: 100%|██████████| 860/860 [02:48<00:00,  5.10batch/s, loss=1.4310, acc=0.6437]



 Epoch 6 Summary: TrainLoss=1.4310, TrainAcc=0.6437, ValLoss=2.9518, ValAcc=0.3246, Time=173.3s

Epoch 7/10


Training: 100%|██████████| 860/860 [02:51<00:00,  5.02batch/s, loss=1.3233, acc=0.6723]



 Epoch 7 Summary: TrainLoss=1.3233, TrainAcc=0.6723, ValLoss=3.1747, ValAcc=0.3706, Time=175.8s

Epoch 8/10


Training: 100%|██████████| 860/860 [02:48<00:00,  5.11batch/s, loss=1.2482, acc=0.6908]



 Epoch 8 Summary: TrainLoss=1.2482, TrainAcc=0.6908, ValLoss=3.5443, ValAcc=0.4067, Time=172.9s

Epoch 9/10


Training: 100%|██████████| 860/860 [02:44<00:00,  5.23batch/s, loss=1.1833, acc=0.7076]



 Epoch 9 Summary: TrainLoss=1.1833, TrainAcc=0.7076, ValLoss=3.6107, ValAcc=0.4230, Time=169.0s

Epoch 10/10


Training: 100%|██████████| 860/860 [02:51<00:00,  5.02batch/s, loss=1.1494, acc=0.7169]



 Epoch 10 Summary: TrainLoss=1.1494, TrainAcc=0.7169, ValLoss=3.6568, ValAcc=0.4340, Time=175.7s


In [24]:
print(f"\n{'='*60}")
print("Training LSTM classifier")
print(f"{'='*60}")

LSTM_NUM_LAYERS = 1
LSTM_HIDDEN_SIZE = D_MODEL

lstm_model = LSTMClassifier(
    vocab_size=vocab_size,
    d_model=D_MODEL,
    hidden_size=LSTM_HIDDEN_SIZE,
    num_layers=LSTM_NUM_LAYERS,
    dropout=DROPOUT,
    pad_id=pad_id,
).to(device)

lstm_loss_fn = nn.BCEWithLogitsLoss()
lstm_optimizer = torch.optim.Adam(lstm_model.parameters(), lr=LR)

for epoch in range(1, EPOCHS + 1):
    print(f"\n{'-'*60}")
    print(f"LSTM Epoch {epoch}/{EPOCHS}")
    print(f"{'-'*60}")
    lstm_model.train()
    total_loss = 0.0
    total_correct = 0
    total_samples = 0

    pbar = tqdm(train_loader, desc="LSTM Training", unit="batch")
    for input_ids, attention_mask, labels in pbar:
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        labels = labels.to(device)

        lstm_optimizer.zero_grad()
        logits = lstm_model(input_ids, attention_mask)
        loss = lstm_loss_fn(logits, labels)
        loss.backward()
        lstm_optimizer.step()

        total_loss += loss.item() * labels.size(0)
        with torch.no_grad():
            preds = (torch.sigmoid(logits) > 0.5).long()
            total_correct += (preds == labels.long()).sum().item()
            total_samples += labels.size(0)

        current_loss = total_loss / max(1, total_samples)
        current_acc = total_correct / max(1, total_samples)
        pbar.set_postfix({'loss': f'{current_loss:.4f}', 'acc': f'{current_acc:.4f}'})

    train_loss = total_loss / max(1, total_samples)
    train_acc = total_correct / max(1, total_samples)
    val_loss, val_acc = evaluate(lstm_model, valid_loader, lstm_loss_fn)
    print(f" LSTM Epoch {epoch} Summary: TrainLoss={train_loss:.4f}, TrainAcc={train_acc:.4f}, ValLoss={val_loss:.4f}, ValAcc={val_acc:.4f}")





Training LSTM classifier

------------------------------------------------------------
LSTM Epoch 1/10
------------------------------------------------------------


LSTM Training: 100%|██████████| 860/860 [00:10<00:00, 81.97batch/s, loss=2.0727, acc=0.4977]


 LSTM Epoch 1 Summary: TrainLoss=2.0727, TrainAcc=0.4977, ValLoss=3.0677, ValAcc=0.0326

------------------------------------------------------------
LSTM Epoch 2/10
------------------------------------------------------------


LSTM Training: 100%|██████████| 860/860 [00:10<00:00, 81.59batch/s, loss=1.9922, acc=0.5046]


 LSTM Epoch 2 Summary: TrainLoss=1.9922, TrainAcc=0.5046, ValLoss=3.1077, ValAcc=0.0397

------------------------------------------------------------
LSTM Epoch 3/10
------------------------------------------------------------


LSTM Training: 100%|██████████| 860/860 [00:10<00:00, 81.55batch/s, loss=1.9607, acc=0.5062]


 LSTM Epoch 3 Summary: TrainLoss=1.9607, TrainAcc=0.5062, ValLoss=2.8694, ValAcc=0.0393

------------------------------------------------------------
LSTM Epoch 4/10
------------------------------------------------------------


LSTM Training: 100%|██████████| 860/860 [00:10<00:00, 80.21batch/s, loss=1.9091, acc=0.5065]


 LSTM Epoch 4 Summary: TrainLoss=1.9091, TrainAcc=0.5065, ValLoss=2.8111, ValAcc=0.0332

------------------------------------------------------------
LSTM Epoch 5/10
------------------------------------------------------------


LSTM Training: 100%|██████████| 860/860 [00:10<00:00, 80.29batch/s, loss=1.9087, acc=0.5059]


 LSTM Epoch 5 Summary: TrainLoss=1.9087, TrainAcc=0.5059, ValLoss=2.7949, ValAcc=0.0581

------------------------------------------------------------
LSTM Epoch 6/10
------------------------------------------------------------


LSTM Training: 100%|██████████| 860/860 [00:10<00:00, 80.02batch/s, loss=1.7857, acc=0.5271]


 LSTM Epoch 6 Summary: TrainLoss=1.7857, TrainAcc=0.5271, ValLoss=2.6599, ValAcc=0.1096

------------------------------------------------------------
LSTM Epoch 7/10
------------------------------------------------------------


LSTM Training: 100%|██████████| 860/860 [00:10<00:00, 82.09batch/s, loss=1.7270, acc=0.5427]


 LSTM Epoch 7 Summary: TrainLoss=1.7270, TrainAcc=0.5427, ValLoss=2.2894, ValAcc=0.1689

------------------------------------------------------------
LSTM Epoch 8/10
------------------------------------------------------------


LSTM Training: 100%|██████████| 860/860 [00:10<00:00, 82.83batch/s, loss=1.6716, acc=0.5636]


 LSTM Epoch 8 Summary: TrainLoss=1.6716, TrainAcc=0.5636, ValLoss=2.6468, ValAcc=0.1490

------------------------------------------------------------
LSTM Epoch 9/10
------------------------------------------------------------


LSTM Training: 100%|██████████| 860/860 [00:09<00:00, 86.14batch/s, loss=1.6047, acc=0.5790]


 LSTM Epoch 9 Summary: TrainLoss=1.6047, TrainAcc=0.5790, ValLoss=2.5389, ValAcc=0.2060

------------------------------------------------------------
LSTM Epoch 10/10
------------------------------------------------------------


LSTM Training: 100%|██████████| 860/860 [00:10<00:00, 85.96batch/s, loss=1.5275, acc=0.5996]


 LSTM Epoch 10 Summary: TrainLoss=1.5275, TrainAcc=0.5996, ValLoss=2.6753, ValAcc=0.2328


In [25]:
print(f"\n{'='*60}")
print("Final Evaluation")
print(f"{'='*60}")
transformer_test_metrics = evaluate_metrics(model, test_loader, loss_fn)
lstm_test_metrics = evaluate_metrics(lstm_model, test_loader, lstm_loss_fn)

print("Transformer Test Metrics:")
for k, v in transformer_test_metrics.items():
    print(f" - {k}: {v:.4f}")

print("\nLSTM Test Metrics:")
for k, v in lstm_test_metrics.items():
    print(f" - {k}: {v:.4f}")

example_seq_normal = df[df["label"] == "normal"].iloc[0]["sequence"]
example_seq_attack = df[df["label"] == "attack"].iloc[0]["sequence"]

def get_label(prob):
    return 'attack' if prob > 0.5 else 'normal'

print("\nTransformer inference (normal, attack):")
p_normal = predict_sequence(model, example_seq_normal, token2id, pad_id, MAX_LEN, device)
p_attack = predict_sequence(model, example_seq_attack, token2id, pad_id, MAX_LEN, device)
print(f"Normal sample -> Prob: {p_normal:.4f}, Prediction: {get_label(p_normal)}")
print(f"Attack sample -> Prob: {p_attack:.4f}, Prediction: {get_label(p_attack)}")

print("\nLSTM inference (normal, attack):")
p_normal_lstm = predict_sequence(lstm_model, example_seq_normal, token2id, pad_id, MAX_LEN, device)
p_attack_lstm = predict_sequence(lstm_model, example_seq_attack, token2id, pad_id, MAX_LEN, device)
print(f"Normal sample -> Prob: {p_normal_lstm:.4f}, Prediction: {get_label(p_normal_lstm)}")
print(f"Attack sample -> Prob: {p_attack_lstm:.4f}, Prediction: {get_label(p_attack_lstm)}")



Final Evaluation
Transformer Test Metrics:
 - loss: 3.9294
 - acc: 0.4391
 - precision: 0.0373
 - recall: 0.6967
 - f1: 0.0709
 - auc: 0.6785

LSTM Test Metrics:
 - loss: 2.5378
 - acc: 0.2376
 - precision: 0.0344
 - recall: 0.8815
 - f1: 0.0663
 - auc: 0.6803

Transformer inference (normal, attack):
2.838618762268652e-08
0.047563761472702026

LSTM inference (normal, attack):
0.019240975379943848
0.0042176139540970325
