In [15]:
import random
from collections import Counter

import pandas as pd
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
# Input data: the CSV read by load_df (needs subject_uuid, sequence and label columns).
CSV_PATH = "./cleaned_sample_parsed_cadets_tagged.csv"
# Sequences are cut to this many tokens; it is also the size of the positional embedding table.
MAX_LEN = 512
# Number of sequences per DataLoader batch.
BATCH_SIZE = 128
# Transformer encoder hyperparameters, passed straight to TransformerClassifier.
D_MODEL = 64
NHEAD = 4
NUM_LAYERS = 2
FFN_DIM = 128
DROPOUT = 0.1
# Adam learning rate and number of passes over the training split.
LR = 1e-3
EPOCHS = 3
# Seed used for the Python/torch RNGs and for the train/valid/test split.
SEED = 5231

# Reserved vocabulary entries: padding filler and the fallback for out-of-vocabulary tokens.
PAD_TOKEN = "<PAD>"
UNK_TOKEN = "<UNK>"

def set_seed(seed=42):
    """Seed Python's `random` module and torch (CPU, plus every CUDA device if present).

    Args:
        seed: Integer seed applied to each generator.
    """
    for seed_fn in (random.seed, torch.manual_seed):
        seed_fn(seed)
    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed_all(seed)

# Seed the RNGs before any stochastic step (parameter init, DataLoader shuffling) runs.
set_seed(SEED)
# Use the GPU when torch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [16]:
def load_df(csv_path):
    """Read the tagged-sequence CSV and attach a numeric target column.

    Only the `subject_uuid`, `sequence` and `label` columns are kept. Rows with
    a missing value in any of them, or with a label other than "normal" /
    "attack", are dropped.

    Args:
        csv_path: Anything `pandas.read_csv` accepts (path or file-like object).

    Returns:
        A DataFrame with the three kept columns plus `y` (0 = normal, 1 = attack).
    """
    label2id = {"normal": 0, "attack": 1}

    raw = pd.read_csv(csv_path)
    complete = raw[["subject_uuid", "sequence", "label"]].dropna()

    has_known_label = complete["label"].isin(list(label2id))
    labelled = complete[has_known_label].copy()
    labelled["y"] = labelled["label"].map(label2id)
    return labelled
def build_vocab(df):
    """Build token <-> id mappings from the whitespace-split `sequence` column.

    Ids 0 and 1 are reserved for PAD_TOKEN and UNK_TOKEN. Every other distinct
    token receives the next consecutive id in order of first appearance, so the
    ids always form the contiguous range ``0 .. vocab_size - 1``.

    A data token that happens to be spelled like a reserved token keeps the
    reserved id instead of overwriting it (overwriting would leave a hole in
    the id range and make the largest id exceed ``vocab_size - 1``).

    Args:
        df: DataFrame with a `sequence` column; each value is converted with
            `str()` and split on whitespace.

    Returns:
        Tuple ``(token2id, id2token, pad_id, unk_id, vocab_size)``.
    """
    token2id = {PAD_TOKEN: 0, UNK_TOKEN: 1}
    for seq in df["sequence"]:
        for tok in str(seq).split():
            if tok not in token2id:
                # len() is always the next free id because ids are contiguous.
                token2id[tok] = len(token2id)

    id2token = {idx: tok for tok, idx in token2id.items()}
    pad_id = token2id[PAD_TOKEN]
    unk_id = token2id[UNK_TOKEN]
    vocab_size = len(token2id)
    return token2id, id2token, pad_id, unk_id, vocab_size
# Load the labelled sequences and build the vocabulary from every row.
# NOTE(review): the vocabulary is built from the full frame before the later
# train/valid/test split, so no training token ever maps to <UNK> -- confirm
# this is intended.
df = load_df(CSV_PATH)
token2id, id2token, pad_id, unk_id, vocab_size = build_vocab(df)


In [17]:
# Sanity check: vocabulary size, row count and class balance, then both mappings in full.
print(vocab_size, len(df), df["y"].value_counts())
print(token2id)
print(id2token)

56 21010 y
0    20936
1       74
Name: count, dtype: int64
{'<PAD>': 0, '<UNK>': 1, 'aue_openat_rwtc': 2, 'aue_open_rwtc': 3, 'aue_read': 4, 'aue_close': 5, 'aue_exit': 6, 'aue_pread': 7, 'aue_mmap': 8, 'aue_execve': 9, 'aue_mprotect': 10, 'aue_connect': 11, 'aue_fcntl': 12, 'aue_chdir': 13, 'aue_lseek': 14, 'aue_write': 15, 'aue_setuid': 16, 'aue_setgid': 17, 'aue_sendto': 18, 'aue_pipe': 19, 'aue_accept': 20, 'aue_recvfrom': 21, 'aue_umask': 22, 'aue_seteuid': 23, 'aue_setegid': 24, 'aue_setlogin': 25, 'aue_chmod': 26, 'aue_ftruncate': 27, 'aue_fork': 28, 'aue_utimes': 29, 'aue_unlink': 30, 'aue_rename': 31, 'aue_fchmod': 32, 'aue_futimes': 33, 'aue_link': 34, 'aue_vfork': 35, 'aue_kill': 36, 'aue_mkdir': 37, 'aue_fchdir': 38, 'aue_rmdir': 39, 'aue_socketpair': 40, 'aue_chown': 41, 'aue_posix_openpt': 42, 'aue_writev': 43, 'aue_sendmsg': 44, 'aue_setresuid': 45, 'aue_setresgid': 46, 'aue_recvmsg': 47, 'aue_closefrom': 48, 'aue_futimesat': 49, 'aue_fchmodat': 50, 'aue_pwrite': 51, 'au

In [18]:
from sklearn.model_selection import train_test_split

class SeqDataset(Dataset):
    """Dataset of token-id sequences with their integer labels.

    Each row's `sequence` is converted with `str()`, split on whitespace and
    mapped through `token2id` (unknown tokens fall back to the UNK_TOKEN id).
    Sequences are truncated to `max_len` tokens when the dataset is built, so
    over-long rows are not kept in memory in full.

    Args:
        df: DataFrame with `sequence` and `y` columns.
        token2id: Mapping from token string to integer id; must contain
            PAD_TOKEN and UNK_TOKEN.
        max_len: Maximum number of tokens kept per sequence (`None` keeps all).
    """

    def __init__(self, df, token2id, max_len):
        self.max_len = max_len
        self.token2id = token2id
        self.pad_id = token2id[PAD_TOKEN]
        unk_id = token2id[UNK_TOKEN]

        self.seqs = []
        self.labels = []

        # Iterate the two columns directly; iterrows() would build a Series per row.
        for sequence, label in zip(df["sequence"], df["y"]):
            tokens = str(sequence).split()[:max_len]
            self.seqs.append([token2id.get(tok, unk_id) for tok in tokens])
            self.labels.append(int(label))

    def __len__(self):
        """Number of sequences in the dataset."""
        return len(self.seqs)

    def __getitem__(self, idx):
        """Return ``(token_ids, label)`` for the sample at position `idx`."""
        return self.seqs[idx], self.labels[idx]

def collate_fn(batch):
    """Pad a list of ``(token_ids, label)`` samples into batch tensors.

    The batch width is the longest sequence in the batch, capped at MAX_LEN;
    longer sequences are truncated and shorter ones are right-padded with
    `pad_id`.

    Returns:
        ``(input_ids, attention_mask, labels)`` where `input_ids` and
        `attention_mask` are long tensors of shape (batch, width) with the mask
        set to 1 on real tokens and 0 on padding, and `labels` is a float32
        tensor of shape (batch,).
    """
    sequences, labels = zip(*batch)
    width = min(max(len(seq) for seq in sequences), MAX_LEN)
    rows = len(sequences)

    input_ids = torch.full((rows, width), pad_id, dtype=torch.long)
    for row, seq in enumerate(sequences):
        kept = seq[:width]
        input_ids[row, :len(kept)] = torch.tensor(kept, dtype=torch.long)

    # A position is real when its column index is below the kept length of its row.
    kept_lengths = torch.tensor([min(len(seq), width) for seq in sequences], dtype=torch.long)
    columns = torch.arange(width).unsqueeze(0)
    attention_mask = (columns < kept_lengths.unsqueeze(1)).long()

    return input_ids, attention_mask, torch.tensor(labels, dtype=torch.float32)

# Stratified split: 20% is held out first, then halved into validation and test.
train_df, temp_df = train_test_split(
    df,
    test_size=0.2,
    stratify=df["y"],
    random_state=SEED,
)
valid_df, test_df = train_test_split(
    temp_df,
    test_size=0.5,
    stratify=temp_df["y"],
    random_state=SEED,
)

# One dataset per split, all sharing the same vocabulary and length cap.
train_dataset, valid_dataset, test_dataset = (
    SeqDataset(split_df, token2id, MAX_LEN)
    for split_df in (train_df, valid_df, test_df)
)

# Only the training loader shuffles; validation and test keep a fixed order.
train_loader, valid_loader, test_loader = (
    DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=is_train, collate_fn=collate_fn)
    for dataset, is_train in (
        (train_dataset, True),
        (valid_dataset, False),
        (test_dataset, False),
    )
)


In [19]:
class TransformerClassifier(nn.Module):
    """Transformer-encoder binary classifier over token-id sequences.

    Token and learned positional embeddings are summed, passed through a stack
    of `nn.TransformerEncoderLayer`s, mean-pooled over the non-padding
    positions and projected to a single logit per sequence.

    Args:
        vocab_size: Number of rows in the token embedding table.
        d_model: Embedding / encoder width.
        nhead: Attention heads per encoder layer.
        num_layers: Number of stacked encoder layers.
        dim_feedforward: Hidden width of each layer's feed-forward block.
        dropout: Dropout probability (encoder layers and pooled output).
        max_len: Number of positions in the positional embedding table; inputs
            longer than this are rejected.
        pad_id: Token id used for padding (its embedding is kept at zero).
    """

    def __init__(self, vocab_size, d_model, nhead, num_layers,
                 dim_feedforward, dropout, max_len, pad_id):
        super().__init__()
        self.pad_id = pad_id
        self.max_len = max_len

        self.token_emb = nn.Embedding(vocab_size, d_model, padding_idx=pad_id)
        self.pos_emb = nn.Embedding(max_len, d_model)

        encoder_layer = nn.TransformerEncoderLayer(
            d_model=d_model,
            nhead=nhead,
            dim_feedforward=dim_feedforward,
            dropout=dropout,
            batch_first=False,
        )
        self.encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(d_model, 1)

    def forward(self, input_ids, attention_mask):
        """Compute one logit per sequence.

        Args:
            input_ids: Long tensor of token ids, shape (B, L).
            attention_mask: Tensor of shape (B, L); 1 marks a real token and
                0 marks padding.

        Returns:
            Float tensor of shape (B,) holding the raw (pre-sigmoid) logits.

        Raises:
            ValueError: If L exceeds the `max_len` the model was built with.
        """
        B, L = input_ids.size()
        if L > self.max_len:
            # Fail early with a readable message instead of an out-of-range
            # lookup in the positional embedding table.
            raise ValueError(
                f"sequence length {L} exceeds the model's max_len {self.max_len}"
            )
        positions = torch.arange(L, device=input_ids.device).unsqueeze(0).expand(B, L)

        x = self.token_emb(input_ids) + self.pos_emb(positions)  # (B, L, D)
        x = x.transpose(0, 1)  # (L, B, D): layout the encoder expects with batch_first=False

        src_key_padding_mask = (attention_mask == 0)  # (B, L), True marks padding

        encoded = self.encoder(x, src_key_padding_mask=src_key_padding_mask)  # (L, B, D)
        encoded = encoded.transpose(0, 1)  # (B, L, D)

        # Masked mean pooling: average only over the real-token positions.
        mask = attention_mask.unsqueeze(-1)  # (B, L, 1)
        masked_encoded = encoded * mask
        summed = masked_encoded.sum(dim=1)  # (B, D)
        lengths = mask.sum(dim=1).clamp(min=1)  # (B, 1); clamp avoids division by zero
        pooled = summed / lengths

        pooled = self.dropout(pooled)
        logits = self.fc(pooled).squeeze(-1)  # (B,)
        return logits

In [20]:
# Print a summary of the run configuration and the split sizes before training starts.
print(f"\n{'='*60}")
print(f"Training Configuration:")
print(f"{'='*60}")
print(f"Device: {device}")
print(f"Batch Size: {BATCH_SIZE}")
print(f"Max Sequence Length: {MAX_LEN}")
print(f"Dataset Size (total): {len(df)}")
print(f" - Train: {len(train_dataset)}  (batches: {len(train_loader)})")
print(f" - Valid: {len(valid_dataset)}  (batches: {len(valid_loader)})")
print(f" - Test : {len(test_dataset)}   (batches: {len(test_loader)})")
print(f"Vocabulary Size: {vocab_size}")
print(f"{'='*60}\n")

# Build the classifier from the config constants and move it to the selected device.
model = TransformerClassifier(
    vocab_size=vocab_size,
    d_model=D_MODEL,
    nhead=NHEAD,
    num_layers=NUM_LAYERS,
    dim_feedforward=FFN_DIM,
    dropout=DROPOUT,
    max_len=MAX_LEN,
    pad_id=pad_id,
).to(device)

# Binary cross-entropy applied to the raw logits the model returns.
# NOTE(review): the class counts printed earlier (20936 normal vs 74 attack) are
# heavily imbalanced and this loss is unweighted -- consider `pos_weight` and a
# metric other than accuracy; confirm intent.
loss_fn = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=LR)

import time

def evaluate(model, data_loader):
    """Compute the mean loss and accuracy of `model` over `data_loader`.

    The model is switched to eval mode and gradients are disabled. A sample is
    predicted positive when the sigmoid of its logit exceeds 0.5.

    Args:
        model: Callable taking ``(input_ids, attention_mask)`` and returning logits.
        data_loader: Iterable of ``(input_ids, attention_mask, labels)`` batches.

    Returns:
        ``(avg_loss, acc)``, both averaged per sample; ``(0.0, 0.0)`` when the
        loader yields no samples.
    """
    model.eval()
    loss_sum, hits, seen = 0.0, 0, 0

    with torch.no_grad():
        for batch in data_loader:
            input_ids, attention_mask, labels = (part.to(device) for part in batch)
            batch_size = labels.size(0)

            logits = model(input_ids, attention_mask)
            # loss_fn returns the batch mean, so weight it by the batch size.
            loss_sum += loss_fn(logits, labels).item() * batch_size

            predicted = (torch.sigmoid(logits) > 0.5).long()
            hits += (predicted == labels.long()).sum().item()
            seen += batch_size

    denominator = max(1, seen)
    return loss_sum / denominator, hits / denominator

# Train for EPOCHS passes over the training split, validating after each pass.
for epoch in range(1, EPOCHS + 1):
    print(f"\n{'='*60}")
    print(f"Epoch {epoch}/{EPOCHS}")
    print(f"{'='*60}")
    # evaluate() leaves the model in eval mode, so re-enable training mode every epoch.
    model.train()
    # Running totals for this epoch (sample-weighted loss, correct predictions, samples seen).
    total_loss = 0.0
    total_correct = 0
    total_samples = 0
    epoch_start = time.time()

    # Wrap the loader in a progress bar that advances once per batch.
    pbar = tqdm(train_loader, desc=f"Training", unit="batch")
    for input_ids, attention_mask, labels in pbar:
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        labels = labels.to(device)

        # Standard optimisation step: clear old grads, forward, backward, update.
        optimizer.zero_grad()
        logits = model(input_ids, attention_mask)
        loss = loss_fn(logits, labels)
        loss.backward()
        optimizer.step()

        # loss is the batch mean; weight by batch size so the epoch average is per sample.
        total_loss += loss.item() * labels.size(0)
        with torch.no_grad():
            preds = (torch.sigmoid(logits) > 0.5).long()
            total_correct += (preds == labels.long()).sum().item()
            total_samples += labels.size(0)
        
        # Show the running epoch averages (not the current batch's values) on the bar.
        current_loss = total_loss / max(1, total_samples)
        current_acc = total_correct / max(1, total_samples)
        pbar.set_postfix({
            'loss': f'{current_loss:.4f}',
            'acc': f'{current_acc:.4f}'
        })

    train_loss = total_loss / max(1, total_samples)
    train_acc = total_correct / max(1, total_samples)

    # Validate at the end of each epoch.
    val_loss, val_acc = evaluate(model, valid_loader)

    # The reported time covers both the training pass and the validation pass.
    epoch_time = time.time() - epoch_start
    print(f"\nüìä Epoch {epoch} Summary: TrainLoss={train_loss:.4f}, TrainAcc={train_acc:.4f}, ValLoss={val_loss:.4f}, ValAcc={val_acc:.4f}, Time={epoch_time:.1f}s")

# Final evaluation on the held-out test split, run once after all epochs.
test_loss, test_acc = evaluate(model, test_loader)
print(f"\n{'='*60}")
print(f"üß™ Test Results: Loss={test_loss:.4f}, Accuracy={test_acc:.4f}")





Training Configuration:
Device: cuda
Batch Size: 128
Max Sequence Length: 512
Dataset Size (total): 21010
 - Train: 16808  (batches: 132)
 - Valid: 2101  (batches: 17)
 - Test : 2101   (batches: 17)
Vocabulary Size: 56


Epoch 1/3


Training:  24%|██▍       | 32/132 [00:04<00:12,  7.86batch/s, loss=0.1351, acc=0.9363]


KeyboardInterrupt: 

In [None]:
def predict_sequence(model, seq_str):
    """Score a single whitespace-separated event sequence.

    The string is tokenised, mapped through `token2id` (unknown tokens become
    `unk_id`), truncated to MAX_LEN and right-padded with `pad_id` to exactly
    MAX_LEN before being fed to the model in eval mode.

    Args:
        model: Callable taking ``(input_ids, attention_mask)`` and returning logits.
        seq_str: Sequence text; converted with `str()` before splitting.

    Returns:
        The sigmoid of the model's logit as a Python float (probability of the
        positive class, i.e. label id 1).
    """
    encoded = [token2id.get(tok, unk_id) for tok in str(seq_str).split()][:MAX_LEN]
    n_tokens = len(encoded)

    # Fixed-width single-row batch: real tokens first, padding after.
    input_ids = torch.full((1, MAX_LEN), pad_id, dtype=torch.long)
    input_ids[0, :n_tokens] = torch.tensor(encoded, dtype=torch.long)
    attention_mask = torch.zeros((1, MAX_LEN), dtype=torch.long)
    attention_mask[0, :n_tokens] = 1

    model.eval()
    with torch.no_grad():
        logits = model(input_ids.to(device), attention_mask.to(device))
        return torch.sigmoid(logits)[0].item()

In [None]:
def _first_sequence_with_label(frame, label):
    """Return the first `sequence` value in `frame` whose `label` equals `label`.

    Raises:
        ValueError: If no row carries that label.
    """
    matches = frame.loc[frame["label"] == label, "sequence"]
    if matches.empty:
        raise ValueError(f"no row labelled {label!r} to use as an example")
    return matches.iloc[0]


# Pick each example by its label rather than assuming row 0 is "normal", and fail
# with a clear error (instead of a NameError further down) when a class is absent.
# NOTE(review): both rows come from `df`, which also feeds the training split, so
# they may have been seen during training -- this is a smoke test, not a held-out check.
example_seq_normal = _first_sequence_with_label(df, "normal")
example_seq_attack = _first_sequence_with_label(df, "attack")
print(predict_sequence(model,example_seq_normal))
print(predict_sequence(model,example_seq_attack))

0.00039947463665157557
0.9855538010597229
