In [None]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import (roc_auc_score, average_precision_score,
                             precision_recall_curve, f1_score, classification_report,
                             confusion_matrix, ConfusionMatrixDisplay)
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm

import os
import joblib
import matplotlib.pyplot as plt

In [None]:
# Mount Google Drive so that files under /content/drive (the dataset CSVs and
# the saved model artifacts referenced later in this notebook) are reachable.
# The google.colab module is only importable inside a Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Experiment configuration.
# "dataset" selects which per-dataset block ("A" or "B") the run cell passes
# to main_training_flow; each block holds the file path, column roles, model
# size, batch size, loss choice and stratified-sampling settings for one dataset.
CONFIG = {
    "dataset": "B",   # "A" or "B"

    # Dataset A
    "A": {
        "file": "/content/drive/MyDrive/wustl_iiot_2021.csv",       # your file path
        "cat_cols": ["SrcAddr", "DstAddr"],
        "drop_cols": ["StartTime", "LastTime", "Traffic"],
        "target_col": "Target",
        "d_model": 128,
        "num_layers": 3,
        "n_heads": 8,
        "batch_size": 256,
        "use_focal": False,
        "sample_frac": 0.1,
        "sample_n": None,        # or take exact number of rows
        "dataset_name": "network_attack"
    },

    # Dataset B
    "B": {
        "file": "/content/drive/MyDrive/creditcard.csv",      # credit card fraud dataset path
        "cat_cols": [],                # no categorical
        "drop_cols": [],
        "target_col": "Class",
        "d_model": 64,                 # smaller model (extreme imbalance)
        "num_layers": 2,
        "n_heads": 4,
        "batch_size": 128,
        "use_focal": True,              # focal loss works well here
        "sample_frac": 0.5,
        "sample_n": None,       # or pick exact number of rows
        "dataset_name": "fraud_transaction"
    }
}

In [None]:
# -----------------
# Helper: stratified sampling
# -----------------
def stratified_sample(df, target_col, frac=None, n=None, random_state=42):
    """Return a shuffled sample of ``df`` that keeps the class ratio of ``target_col``.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame; it is not modified.
    target_col : str
        Column holding the class label used for stratification.
    frac : float, optional
        Fraction of every class to keep. Takes precedence over ``n``.
    n : int, optional
        Approximate total number of rows to keep, split across classes in
        proportion to their frequency. Every class keeps at least one row
        (when ``n > 0``) and never more rows than it has.
    random_state : int
        Seed used for the per-class sampling and for the final shuffle.

    Returns
    -------
    pandas.DataFrame
        The sampled rows (original index labels preserved) in shuffled order.
        When neither ``frac`` nor ``n`` is given, all rows are returned shuffled.
    """
    if frac is not None:
        # Sample class by class with an explicit loop rather than
        # groupby(...).apply(...): applying over the grouping column is
        # deprecated in recent pandas, and the target column must stay
        # in the result.
        parts = [
            group.sample(frac=frac, random_state=random_state)
            for _, group in df.groupby(target_col)
        ]
        df_sampled = pd.concat(parts) if parts else df.iloc[0:0]
    elif n is not None:
        # Allocate rows proportionally to the class distribution.
        class_counts = df[target_col].value_counts()
        total = class_counts.sum()
        parts = []
        for label, count in class_counts.items():
            take = int(n * (count / total))
            if n > 0:
                # Truncation would otherwise drop rare classes entirely,
                # which defeats stratification on heavily imbalanced data.
                take = max(1, take)
            # Never request more rows than the class holds (sample would raise).
            take = int(min(take, count))
            parts.append(df[df[target_col] == label].sample(n=take, random_state=random_state))
        df_sampled = pd.concat(parts) if parts else df.iloc[0:0]
    else:
        df_sampled = df
    return df_sampled.sample(frac=1.0, random_state=random_state)  # shuffle

In [None]:
# -----------------
# Dataset + preprocessing
# -----------------
def prepare_dataframes(df, cat_cols, drop_cols, target_col, test_size=0.15, val_size=0.15, random_state=42):
    """Split a dataframe into stratified train/validation/test feature and label sets.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame containing the features and ``target_col``. A copy is
        made, so the caller's frame is left untouched.
    cat_cols : list of str
        Categorical column names. Not used by this function; accepted so the
        call signature mirrors the rest of the preprocessing pipeline.
    drop_cols : list of str or None
        Columns removed before splitting; names that do not exist are ignored.
    target_col : str
        Label column; it is cast to ``int``.
    test_size, val_size : float
        Fractions of the full dataset assigned to the test and validation
        splits. The validation fraction is rescaled internally because it is
        carved out of what remains after the test split.
    random_state : int
        Seed passed to both ``train_test_split`` calls.

    Returns
    -------
    tuple
        ``(X_train, y_train), (X_val, y_val), (X_test, y_test)`` where each
        ``X_*`` is a DataFrame and each ``y_*`` is an integer numpy array.
    """

    df = df.copy()

    # Drop unwanted columns if provided
    if drop_cols:
        df = df.drop(columns=drop_cols, errors="ignore")

    # Handle missing values. ``fillna(method='ffill')`` is deprecated in
    # recent pandas, so use the dedicated method. The trailing ``bfill``
    # covers NaNs in the first rows, which a forward fill cannot reach.
    df = df.ffill().bfill()

    X = df.drop(columns=[target_col])
    y = df[target_col].astype(int).values
    X_trainval, X_test, y_trainval, y_test = train_test_split(
        X, y, test_size=test_size, stratify=y, random_state=random_state
    )
    # val_size is expressed relative to the full dataset; convert it to a
    # fraction of the remaining train+val portion.
    val_relative = val_size / (1.0 - test_size)
    X_train, X_val, y_train, y_val = train_test_split(
        X_trainval, y_trainval, test_size=val_relative, stratify=y_trainval, random_state=random_state
    )
    return (X_train, y_train), (X_val, y_val), (X_test, y_test)

def fit_transform_preprocessors(X_train, X_val, X_test, cat_cols):
    """Scale numeric columns and integer-encode categorical columns for all three splits.

    Numeric columns are every column of ``X_train`` not listed in ``cat_cols``;
    the ``StandardScaler`` is fitted on the training split only. Each
    categorical column gets its own ``LabelEncoder`` fitted on the string form
    of the values from all three splits together, so every value seen in
    validation/test has a code.

    Returns
    -------
    tuple
        ``(num_cols, scaler, encoders, (train_num, val_num, test_num),
        (train_cat, val_cat, test_cat))``. The categorical arrays are ``None``
        when ``cat_cols`` is empty.
    """
    parts = (X_train, X_val, X_test)
    num_cols = [col for col in X_train.columns if col not in cat_cols]

    scaler = StandardScaler()
    X_train_num = scaler.fit_transform(X_train[num_cols])
    X_val_num = scaler.transform(X_val[num_cols])
    X_test_num = scaler.transform(X_test[num_cols])

    encoders = {}
    if not cat_cols:
        X_train_cat = X_val_cat = X_test_cat = None
    else:
        # One integer matrix per split, one column per categorical feature.
        encoded = [np.zeros((len(part), len(cat_cols)), dtype=int) for part in parts]
        for col_idx, col in enumerate(cat_cols):
            encoder = LabelEncoder()
            encoder.fit(pd.concat([part[col] for part in parts], axis=0).astype(str))
            encoders[col] = encoder
            for matrix, part in zip(encoded, parts):
                matrix[:, col_idx] = encoder.transform(part[col].astype(str))
        X_train_cat, X_val_cat, X_test_cat = encoded

    numeric_out = (X_train_num, X_val_num, X_test_num)
    categorical_out = (X_train_cat, X_val_cat, X_test_cat)
    return num_cols, scaler, encoders, numeric_out, categorical_out


class TabularDataset(Dataset):
    """Torch dataset yielding ``(numeric_row, categorical_row, label)`` tensors.

    ``X_num`` is stored as float32, ``X_cat`` as int64 (or ``None`` when no
    categorical matrix is supplied) and ``y`` as float32.
    """

    def __init__(self, X_num, X_cat, y):
        self.X_num = X_num.astype(np.float32)
        self.X_cat = None if X_cat is None else X_cat.astype(np.int64)
        self.y = y.astype(np.float32)

    def __len__(self):
        return len(self.y)

    def __getitem__(self, idx):
        if self.X_cat is None:
            # No categorical matrix: hand back an empty long tensor.
            cat_row = torch.zeros(0, dtype=torch.long)
        else:
            cat_row = torch.from_numpy(self.X_cat[idx])
        num_row = torch.from_numpy(self.X_num[idx])
        label = torch.tensor(self.y[idx], dtype=torch.float32)
        return num_row, cat_row, label

In [None]:
# -----------------
# Transformer model
# -----------------
class VanillaTransformer(nn.Module):
    """Transformer encoder over per-feature tokens with a CLS-pooled binary head.

    Each input feature owns one token, initialised from a learned per-feature
    embedding. Numeric values are projected from a scalar to ``d_model`` and
    added to their token; categorical values add a looked-up embedding. A
    learned CLS token is prepended, a learned positional embedding is added,
    and the encoder output at the CLS position feeds a small MLP that emits
    one logit per row.

    Parameters
    ----------
    num_num_features : int
        Number of numeric features (accepted but not used by the constructor).
    cat_cardinalities : list of int
        Vocabulary size of each categorical feature, in ``x_cat`` column order.
    feat_order : list
        Feature names; only its length (the token count) is used.
    num_feature_indices, cat_feature_indices : list of int
        Token position of each numeric / categorical input column.
    d_model, n_heads, num_layers, d_ff, dropout
        Encoder hyper-parameters; ``d_ff`` defaults to ``4 * d_model``.
    """

    def __init__(self, num_num_features, cat_cardinalities, feat_order,
                 num_feature_indices, cat_feature_indices,
                 d_model=128, n_heads=8, num_layers=3, d_ff=None, dropout=0.1):
        super().__init__()
        n_tokens = len(feat_order)
        if not d_ff:
            d_ff = d_model * 4

        # Learned parameters (small-variance normal init).
        self.feature_embedding = nn.Parameter(torch.randn(n_tokens, d_model) * 0.02)
        self.value_proj = nn.Linear(1, d_model)
        self.cls_token = nn.Parameter(torch.randn(1, d_model) * 0.02)
        self.pos_embedding = nn.Parameter(torch.randn(n_tokens + 1, d_model) * 0.02)
        self.cat_embs = nn.ModuleList(
            [nn.Embedding(cardinality, d_model) for cardinality in cat_cardinalities]
        )

        # Encoder stack; inputs are fed as (seq, batch, d_model).
        layer = nn.TransformerEncoderLayer(
            d_model=d_model, nhead=n_heads, dim_feedforward=d_ff, dropout=dropout
        )
        self.transformer_encoder = nn.TransformerEncoder(layer, num_layers)
        self.dropout = nn.Dropout(dropout)

        # Classification head applied to the CLS output.
        head_width = max(d_model // 2, 32)
        self.classifier = nn.Sequential(
            nn.LayerNorm(d_model),
            nn.Linear(d_model, head_width),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(head_width, 1),
        )

        self.num_pos = num_feature_indices
        self.cat_pos = cat_feature_indices

    @staticmethod
    def _show(title, value):
        """Print a section title followed by the value (used by forward_verbose)."""
        print(title)
        print(value)

    def forward(self, x_num, x_cat):
        """Return one raw logit per row for numeric input ``x_num`` and categorical codes ``x_cat``."""
        n_rows = x_num.shape[0]

        # Start every row from the shared per-feature embeddings (cloned so
        # the in-place additions below do not touch the parameter itself).
        tokens = self.feature_embedding.unsqueeze(0).expand(n_rows, -1, -1).clone()

        projected = self.value_proj(x_num.unsqueeze(-1))
        for col, slot in enumerate(self.num_pos):
            tokens[:, slot, :] += projected[:, col, :]
        for col, slot in enumerate(self.cat_pos):
            tokens[:, slot, :] += self.cat_embs[col](x_cat[:, col])

        cls = self.cls_token.unsqueeze(0).expand(n_rows, -1, -1)
        sequence = torch.cat([cls, tokens], dim=1) + self.pos_embedding.unsqueeze(0)
        sequence = self.dropout(sequence).permute(1, 0, 2)

        encoded = self.transformer_encoder(sequence)
        return self.classifier(encoded[0]).squeeze(-1)

    def forward_verbose(self, x_num, x_cat):
        """Run the same pipeline as ``forward`` while printing every intermediate tensor.

        Unlike ``forward`` this returns sigmoid probabilities, not logits, and
        it applies the encoder layers one at a time.
        """
        n_rows = x_num.shape[0]
        has_categoricals = x_cat.numel() > 0

        self._show("\n=== Raw numeric input ===", x_num)
        if has_categoricals:
            self._show("\n=== Raw categorical input ===", x_cat)

        # --- Feature embeddings ---
        tokens = self.feature_embedding.unsqueeze(0).expand(n_rows, -1, -1).clone()
        self._show("\n=== Initial feature embeddings ===", tokens)

        # --- Numeric projection ---
        projected = self.value_proj(x_num.unsqueeze(-1))
        self._show("\n=== Numeric projections ===", projected)
        for col, slot in enumerate(self.num_pos):
            tokens[:, slot, :] += projected[:, col, :]
        self._show("\n=== After adding numeric projections ===", tokens)

        # --- Categorical embeddings ---
        if has_categoricals:
            for col, slot in enumerate(self.cat_pos):
                tokens[:, slot, :] += self.cat_embs[col](x_cat[:, col])
        self._show("\n=== After adding categorical embeddings ===", tokens)

        # --- CLS token + positional embeddings ---
        cls = self.cls_token.unsqueeze(0).expand(n_rows, -1, -1)
        tokens = torch.cat([cls, tokens], dim=1)
        tokens = tokens + self.pos_embedding.unsqueeze(0)
        tokens = self.dropout(tokens)
        self._show("\n=== Tokens after CLS + positional embeddings ===", tokens)

        # --- Transformer layers (step-by-step) ---
        hidden = tokens.permute(1, 0, 2)  # (seq_len+1, batch, d_model)
        for layer_no, layer in enumerate(self.transformer_encoder.layers):
            hidden = layer(hidden)
            # Printed batch-first: (batch, seq_len+1, d_model)
            self._show(f"\n=== After Transformer layer {layer_no} ===", hidden.permute(1, 0, 2))

        # --- CLS pooling + classifier ---
        cls_out = hidden[0]
        logits = self.classifier(cls_out).squeeze(-1)
        probs = torch.sigmoid(logits)

        self._show("\n=== CLS pooled representation ===", cls_out)
        self._show("\n=== Logits ===", logits)
        self._show("\n=== Probabilities ===", probs)

        return probs

In [None]:
# -----------------
# Losses
# -----------------
class FocalLoss(nn.Module):
    """Binary focal loss computed on raw logits.

    Parameters
    ----------
    alpha : float
        Weight given to the positive class; negatives are weighted by
        ``1 - alpha``. Applying ``alpha`` to both classes alike would only
        rescale the loss and give no class balancing.
    gamma : float
        Focusing exponent; larger values down-weight well-classified examples.

    ``forward(logits, targets)`` expects float targets in {0, 1} with the same
    shape as ``logits`` and returns the mean loss as a scalar tensor.
    """

    def __init__(self, alpha=0.9, gamma=2):
        super().__init__()
        self.alpha = alpha
        self.gamma = gamma

    def forward(self, logits, targets):
        bce = nn.functional.binary_cross_entropy_with_logits(logits, targets, reduction="none")
        # exp(-BCE) is the probability the model assigns to the true class.
        pt = torch.exp(-bce)
        # Class-dependent balancing factor: alpha for positives, 1 - alpha for negatives.
        alpha_t = self.alpha * targets + (1.0 - self.alpha) * (1.0 - targets)
        focal = alpha_t * (1 - pt) ** self.gamma * bce
        return focal.mean()

In [None]:
# -----------------
# Training utils
# -----------------
def train_epoch(model, dataloader, optimizer, criterion, device):
    """Run one optimisation pass over ``dataloader`` and return the mean per-sample loss.

    Each batch is moved to ``device``, gradients are clipped to a global norm
    of 1.0 before the optimizer step, and batch losses are weighted by batch
    size so the result is an average over ``len(dataloader.dataset)`` samples.
    """
    model.train()
    running_loss = 0.0
    for batch in dataloader:
        x_num, x_cat, y = (tensor.to(device) for tensor in batch)

        optimizer.zero_grad()
        loss = criterion(model(x_num, x_cat), y)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()

        running_loss += loss.item() * x_num.size(0)
    return running_loss / len(dataloader.dataset)

@torch.no_grad()
def evaluate(model, dataloader, device):
    """Score every batch of ``dataloader`` and compute ranking metrics.

    The model is switched to eval mode and run without gradient tracking.

    Returns
    -------
    dict
        ``'y'`` (labels), ``'probs'`` (sigmoid outputs), ``'roc_auc'`` and
        ``'pr_auc'`` (average precision), with the arrays concatenated across
        batches in loader order.
    """
    model.eval()
    label_chunks = []
    prob_chunks = []
    for x_num, x_cat, y in dataloader:
        logits = model(x_num.to(device), x_cat.to(device))
        prob_chunks.append(torch.sigmoid(logits).cpu().numpy())
        label_chunks.append(y.numpy())

    probs = np.concatenate(prob_chunks)
    ys = np.concatenate(label_chunks)
    roc_auc = roc_auc_score(ys, probs)
    pr_auc = average_precision_score(ys, probs)
    return {'y': ys, 'probs': probs, 'roc_auc': roc_auc, 'pr_auc': pr_auc}

def select_best_threshold(y_val, probs_val):
    """Return ``(threshold, f1)`` for the point on the precision-recall curve with the highest F1.

    F1 is computed for every curve point with a small epsilon in the
    denominator so that points where precision and recall are both zero do
    not divide by zero.
    """
    precision, recall, thresholds = precision_recall_curve(y_val, probs_val)
    f1_scores = (2 * precision * recall) / (precision + recall + 1e-8)
    winner = np.argmax(f1_scores)
    return thresholds[winner], f1_scores[winner]

In [None]:
# -----------------
# Helper: anomaly ratio check
# -----------------
def check_anomaly_ratio(loader, name=""):
    """Count label-1 samples in ``loader``, print a summary and return ``(anomalies, total, ratio)``.

    ``loader`` must yield 3-tuples whose last element is the label batch.
    The ratio is 0 when the loader yields no samples.
    """
    anomalies = 0
    total = 0
    for _x_num, _x_cat, labels in loader:
        anomalies += (labels == 1).sum().item()
        total += len(labels)

    if total > 0:
        ratio = anomalies / total
    else:
        ratio = 0
    print(f"{name}: {anomalies}/{total} anomalies ({ratio:.4%})")
    return anomalies, total, ratio

In [None]:
# -----------------
# Main training flow
# -----------------
def main_training_flow(cfg, device='cuda'):
    """Train the transformer on one dataset config, tune the threshold and report test metrics.

    NOTE: a later cell in this notebook defines ``main_training_flow`` again
    with extra steps (ratio checks, confusion matrix, artifact saving). When
    the notebook is run top to bottom, that later definition replaces this one.

    Parameters
    ----------
    cfg : dict
        One per-dataset block of ``CONFIG``. Keys read here: ``file``,
        ``target_col``, ``cat_cols``, ``drop_cols``, ``sample_frac``,
        ``sample_n``, ``batch_size``, ``d_model``, ``n_heads``,
        ``num_layers`` and ``use_focal``.
    device : str
        Torch device for the model and the batches.

    Returns
    -------
    tuple
        ``(model, scaler, encoders, best_thresh, splits)`` where ``splits`` is
        the 9-tuple of numeric arrays, categorical arrays and labels for the
        train/val/test splits.
    """
    df = pd.read_csv(cfg["file"])

    # stratified sampling
    if cfg.get("sample_frac") or cfg.get("sample_n"):
        df = stratified_sample(df, cfg["target_col"],
                               frac=cfg.get("sample_frac"),
                               n=cfg.get("sample_n"))
        print(f"Sampled dataset shape: {df.shape}, anomaly ratio={df[cfg['target_col']].mean():.4%}")

    cat_cols = cfg["cat_cols"]
    target_col = cfg["target_col"]
    drop_cols = cfg.get("drop_cols")

    (X_train, y_train), (X_val, y_val), (X_test, y_test) = prepare_dataframes(df, cat_cols, drop_cols, target_col)
    num_cols, scaler, encoders, (X_train_num, X_val_num, X_test_num), (X_train_cat, X_val_cat, X_test_cat) = \
        fit_transform_preprocessors(X_train, X_val, X_test, cat_cols)

    # Without categorical columns the dataset still gets a (rows, 0) matrix so
    # every batch carries an (empty) categorical tensor.
    train_ds = TabularDataset(X_train_num, X_train_cat if cat_cols else np.zeros((len(X_train),0)), y_train)
    val_ds   = TabularDataset(X_val_num,   X_val_cat if cat_cols else np.zeros((len(X_val),0)),   y_val)
    test_ds  = TabularDataset(X_test_num,  X_test_cat if cat_cols else np.zeros((len(X_test),0)), y_test)

    train_loader = DataLoader(train_ds, batch_size=cfg["batch_size"], shuffle=True)
    val_loader = DataLoader(val_ds, batch_size=cfg["batch_size"], shuffle=False)
    test_loader = DataLoader(test_ds, batch_size=cfg["batch_size"], shuffle=False)

    # Token position of every input column inside the model's feature sequence.
    feat_order = list(X_train.columns)
    num_feature_indices = [feat_order.index(c) for c in num_cols]
    cat_feature_indices = [feat_order.index(c) for c in cat_cols]
    cat_cardinalities = [len(encoders[c].classes_) for c in cat_cols]

    model = VanillaTransformer(
        num_num_features=len(num_cols),
        cat_cardinalities=cat_cardinalities,
        feat_order=feat_order,
        num_feature_indices=num_feature_indices,
        cat_feature_indices=cat_feature_indices,
        d_model=cfg["d_model"], n_heads=cfg["n_heads"], num_layers=cfg["num_layers"]
    ).to(device)

    if cfg["use_focal"]:
        criterion = FocalLoss(alpha=0.9, gamma=2)
    else:
        # Weight positives by the negative/positive ratio of the training split.
        n_pos, n_neg = (y_train==1).sum(), (y_train==0).sum()
        pos_weight = torch.tensor([n_neg/max(1,n_pos)], dtype=torch.float32).to(device)
        criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight)

    optimizer = optim.AdamW(model.parameters(), lr=1e-4, weight_decay=1e-2)

    # Early stopping on validation PR-AUC.
    best_state, best_val_pr, patience, wait = None, -1.0, 5, 0
    for epoch in range(20):
        train_loss = train_epoch(model, train_loader, optimizer, criterion, device)
        val_res = evaluate(model, val_loader, device)
        print(f"Epoch {epoch:02d} Loss={train_loss:.4f} Val PR-AUC={val_res['pr_auc']:.4f} ROC-AUC={val_res['roc_auc']:.4f}")
        if val_res['pr_auc'] > best_val_pr:
            best_val_pr = val_res['pr_auc']
            # clone() is required: on a CPU model .cpu() returns the very same
            # tensors, so without a copy the snapshot would keep changing with
            # later epochs and the "best" weights would be the last ones.
            best_state = {k: v.detach().cpu().clone() for k, v in model.state_dict().items()}
            wait = 0
        else:
            wait += 1
            if wait >= patience:
                print("Early stopping")
                break

    # best_state stays None only if no epoch produced a comparable PR-AUC.
    if best_state is not None:
        model.load_state_dict(best_state)
    val_res = evaluate(model, val_loader, device)
    best_thresh, best_f1 = select_best_threshold(val_res['y'], val_res['probs'])
    print("Best threshold:", best_thresh, "Val F1:", best_f1)

    test_res = evaluate(model, test_loader, device)
    preds = (test_res['probs'] >= best_thresh).astype(int)
    print("Test PR-AUC:", test_res['pr_auc'], "ROC-AUC:", test_res['roc_auc'])
    print(classification_report(test_res['y'], preds, digits=4))
    return model, scaler, encoders, best_thresh, (X_train_num, X_val_num, X_test_num,
                                               X_train_cat, X_val_cat, X_test_cat,
                                               y_train, y_val, y_test)

In [None]:
# -----------------
# Main training flow
# -----------------
def main_training_flow(cfg, device='cuda'):
    """Train, threshold-tune, evaluate and persist the transformer for one dataset config.

    Parameters
    ----------
    cfg : dict
        One per-dataset block of ``CONFIG``. Required keys: ``file``,
        ``target_col``, ``cat_cols``, ``batch_size``, ``d_model``,
        ``n_heads``, ``num_layers``, ``use_focal`` and ``dataset_name``.
        Optional keys: ``drop_cols``, ``sample_frac``, ``sample_n``,
        ``max_epochs`` (default 30) and ``output_dir`` (default
        ``/content/drive/MyDrive/anomaly_models``).
    device : str
        Torch device for the model and the batches.

    Returns
    -------
    tuple
        ``(model, scaler, encoders, best_thresh, splits, loaders, model_info)``
        where ``splits`` is the 9-tuple of numeric arrays, categorical arrays
        and labels, ``loaders`` is ``(train_loader, val_loader, test_loader)``
        and ``model_info`` is ``(num_cols, feat_order, num_feature_indices,
        cat_feature_indices)``.

    Side effects: prints progress and metrics, shows a confusion-matrix plot
    and writes the model weights, scaler, encoders, threshold and split arrays
    into the output folder.
    """
    # --- 1. Load dataset ---
    df = pd.read_csv(cfg["file"])

    # optional stratified sampling
    if cfg.get("sample_frac") or cfg.get("sample_n"):
        df = stratified_sample(df, cfg["target_col"],
                               frac=cfg.get("sample_frac"),
                               n=cfg.get("sample_n"))
        print(f"Sampled dataset shape: {df.shape}, anomaly ratio={df[cfg['target_col']].mean():.4%}")

    # --- 2. Preprocess / split ---
    cat_cols = cfg["cat_cols"]
    target_col = cfg["target_col"]
    drop_cols  = cfg.get("drop_cols")

    (X_train, y_train), (X_val, y_val), (X_test, y_test) = prepare_dataframes(
        df, cat_cols, drop_cols, target_col
    )

    num_cols, scaler, encoders, \
    (X_train_num, X_val_num, X_test_num), \
    (X_train_cat, X_val_cat, X_test_cat) = fit_transform_preprocessors(
        X_train, X_val, X_test, cat_cols
    )

    # --- 3. Build datasets/loaders ---
    # Without categorical columns the dataset still gets a (rows, 0) matrix so
    # every batch carries an (empty) categorical tensor.
    train_ds = TabularDataset(X_train_num, X_train_cat if cat_cols else np.zeros((len(X_train),0)), y_train)
    val_ds   = TabularDataset(X_val_num,   X_val_cat if cat_cols else np.zeros((len(X_val),0)),   y_val)
    test_ds  = TabularDataset(X_test_num,  X_test_cat if cat_cols else np.zeros((len(X_test),0)), y_test)

    train_loader = DataLoader(train_ds, batch_size=cfg["batch_size"], shuffle=True)
    val_loader   = DataLoader(val_ds,   batch_size=cfg["batch_size"], shuffle=False)
    test_loader  = DataLoader(test_ds,  batch_size=cfg["batch_size"], shuffle=False)

    # check anomaly ratios
    check_anomaly_ratio(train_loader, "Train")
    check_anomaly_ratio(val_loader, "Validation")
    check_anomaly_ratio(test_loader, "Test")

    # --- 4. Model setup ---
    # Token position of every input column inside the model's feature sequence.
    feat_order = list(X_train.columns)
    num_feature_indices = [feat_order.index(c) for c in num_cols]
    cat_feature_indices = [feat_order.index(c) for c in cat_cols]
    cat_cardinalities   = [len(encoders[c].classes_) for c in cat_cols]

    model = VanillaTransformer(
        num_num_features=len(num_cols),
        cat_cardinalities=cat_cardinalities,
        feat_order=feat_order,
        num_feature_indices=num_feature_indices,
        cat_feature_indices=cat_feature_indices,
        d_model=cfg["d_model"], n_heads=cfg["n_heads"], num_layers=cfg["num_layers"]
    ).to(device)

    # --- 5. Loss function ---
    if cfg["use_focal"]:
        criterion = FocalLoss(alpha=0.9, gamma=2)
    else:
        # Weight positives by the negative/positive ratio of the training split.
        n_pos, n_neg = (y_train==1).sum(), (y_train==0).sum()
        pos_weight = torch.tensor([n_neg/max(1,n_pos)], dtype=torch.float32).to(device)
        criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight)

    optimizer = optim.AdamW(model.parameters(), lr=1e-4, weight_decay=1e-2)

    # --- 6. Training loop with early stopping (on validation PR-AUC) ---
    best_state, best_val_pr, patience, wait = None, -1.0, 5, 0
    for epoch in range(cfg.get("max_epochs", 30)):
        train_loss = train_epoch(model, train_loader, optimizer, criterion, device)
        val_res = evaluate(model, val_loader, device)
        print(f"Epoch {epoch:02d} Loss={train_loss:.4f} Val PR-AUC={val_res['pr_auc']:.4f} ROC-AUC={val_res['roc_auc']:.4f}")
        if val_res['pr_auc'] > best_val_pr:
            best_val_pr = val_res['pr_auc']
            # clone() is required: on a CPU model .cpu() returns the very same
            # tensors, so without a copy the snapshot would keep changing with
            # later epochs and the "best" weights would be the last ones.
            best_state = {k: v.detach().cpu().clone() for k, v in model.state_dict().items()}
            wait = 0
        else:
            wait += 1
            if wait >= patience:
                print("Early stopping")
                break

    # --- 7. Load best state & threshold selection ---
    # best_state stays None only if no epoch produced a comparable PR-AUC.
    if best_state is not None:
        model.load_state_dict(best_state)
    val_res = evaluate(model, val_loader, device)
    best_thresh, best_f1 = select_best_threshold(val_res['y'], val_res['probs'])
    print("Best threshold:", best_thresh, "Val F1:", best_f1)

    # --- 8. Final test evaluation ---
    test_res = evaluate(model, test_loader, device)
    preds = (test_res['probs'] >= best_thresh).astype(int)
    print("Test PR-AUC:", test_res['pr_auc'], "ROC-AUC:", test_res['roc_auc'])
    print(classification_report(test_res['y'], preds, digits=4))

    cm = confusion_matrix(test_res['y'], preds)

    # Print raw numbers
    print("Confusion Matrix:\n", cm)

    # Plot nicely
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=[0, 1])
    disp.plot(cmap="Blues", values_format='d')
    plt.title("Confusion Matrix")
    plt.show()

    # --- 9. Save model + preprocessors + loaders metadata ---
    # The destination can be overridden through cfg["output_dir"]; the default
    # keeps the original Google Drive location.
    drive_folder = cfg.get("output_dir", '/content/drive/MyDrive/anomaly_models')
    os.makedirs(drive_folder, exist_ok=True)

    # Save model
    model_path = os.path.join(drive_folder, f"transformer_{cfg['dataset_name']}.pt")
    torch.save(model.state_dict(), model_path)
    print(f"✅ Model saved to {model_path}")

    # Save scalers/encoders
    joblib.dump(scaler,   os.path.join(drive_folder, f"scaler_{cfg['dataset_name']}.pkl"))
    joblib.dump(encoders, os.path.join(drive_folder, f"encoders_{cfg['dataset_name']}.pkl"))

    # Save threshold
    with open(os.path.join(drive_folder, f"threshold_{cfg['dataset_name']}.txt"), "w") as f:
        f.write(str(best_thresh))

    # Save dataset splits. A missing categorical matrix (None) would be stored
    # as a pickled object array that np.load refuses to read by default, so it
    # is written as an empty (rows, 0) integer matrix instead.
    def _cat_or_empty(cat_arr, n_rows):
        if cat_arr is not None:
            return cat_arr
        return np.zeros((n_rows, 0), dtype=np.int64)

    np.savez_compressed(os.path.join(drive_folder, f"splits_{cfg['dataset_name']}.npz"),
                        X_train_num=X_train_num, X_val_num=X_val_num, X_test_num=X_test_num,
                        X_train_cat=_cat_or_empty(X_train_cat, len(X_train_num)),
                        X_val_cat=_cat_or_empty(X_val_cat, len(X_val_num)),
                        X_test_cat=_cat_or_empty(X_test_cat, len(X_test_num)),
                        y_train=y_train, y_val=y_val, y_test=y_test)

    # --- 10. Return model, preprocessors, threshold, and splits for inspection ---
    return model, scaler, encoders, best_thresh, \
        (X_train_num, X_val_num, X_test_num,
          X_train_cat, X_val_cat, X_test_cat,
          y_train, y_val, y_test), \
        (train_loader, val_loader, test_loader), \
        (num_cols, feat_order, num_feature_indices, cat_feature_indices)


In [None]:
# -----------------
# RUN
# -----------------
if __name__ == "__main__":
    # Pick the per-dataset settings selected by CONFIG["dataset"].
    cfg = CONFIG[CONFIG["dataset"]]
    # Use the GPU when one is visible, otherwise fall back to the CPU.
    device = "cpu" if not torch.cuda.is_available() else "cuda"
    # Train and keep every returned piece for the inspection cells below.
    (model, scaler, encoders, thresh,
     splits, loaders, model_info) = main_training_flow(cfg, device=device)

  .apply(lambda x: x.sample(frac=frac, random_state=random_state))
  df.fillna(method='ffill', inplace=True)


Sampled dataset shape: (142404, 31), anomaly ratio=0.1727%
Train: 172/99682 anomalies (0.1725%)
Validation: 37/21361 anomalies (0.1732%)
Test: 37/21361 anomalies (0.1732%)




Epoch 00 Loss=0.0047 Val PR-AUC=0.5400 ROC-AUC=0.9260
Epoch 01 Loss=0.0019 Val PR-AUC=0.5509 ROC-AUC=0.9404
Epoch 02 Loss=0.0018 Val PR-AUC=0.5803 ROC-AUC=0.9636
Epoch 03 Loss=0.0017 Val PR-AUC=0.5935 ROC-AUC=0.9459
Epoch 04 Loss=0.0017 Val PR-AUC=0.6025 ROC-AUC=0.9723
Epoch 05 Loss=0.0017 Val PR-AUC=0.6290 ROC-AUC=0.9738
Epoch 06 Loss=0.0016 Val PR-AUC=0.6455 ROC-AUC=0.9713
Epoch 07 Loss=0.0016 Val PR-AUC=0.6510 ROC-AUC=0.9772
Epoch 08 Loss=0.0016 Val PR-AUC=0.7706 ROC-AUC=0.9784
Epoch 09 Loss=0.0013 Val PR-AUC=0.8017 ROC-AUC=0.9850
Epoch 10 Loss=0.0012 Val PR-AUC=0.8118 ROC-AUC=0.9839
Epoch 11 Loss=0.0012 Val PR-AUC=0.7863 ROC-AUC=0.9841
Epoch 12 Loss=0.0012 Val PR-AUC=0.7965 ROC-AUC=0.9861
Epoch 13 Loss=0.0011 Val PR-AUC=0.8143 ROC-AUC=0.9883
Epoch 14 Loss=0.0011 Val PR-AUC=0.8264 ROC-AUC=0.9888
Epoch 15 Loss=0.0011 Val PR-AUC=0.8131 ROC-AUC=0.9898
Epoch 16 Loss=0.0010 Val PR-AUC=0.8316 ROC-AUC=0.9902
Epoch 17 Loss=0.0010 Val PR-AUC=0.8407 ROC-AUC=0.9893
Epoch 18 Loss=0.0010 Val PR-

In [None]:
# Unpack the arrays and data loaders returned by main_training_flow so the
# inspection cells below can refer to them by name.
X_train_num, X_val_num, X_test_num, X_train_cat, X_val_cat, X_test_cat, y_train, y_val, y_test = splits
train_loader, val_loader, test_loader = loaders

In [None]:
import torch
import pandas as pd

# Score the whole test set batch by batch and collect, per sample, the true
# label, the predicted probability and the thresholded prediction.
all_true, all_probs, all_preds = [], [], []

# Make sure dropout is off regardless of the mode the model was left in.
model.eval()

# Loop through entire test set
with torch.no_grad():
    for x_num_batch, x_cat_batch, y_true_batch in test_loader:
        x_num_batch = x_num_batch.to(device)
        x_cat_batch = x_cat_batch.to(device)
        y_true_batch = y_true_batch.cpu().numpy()   # convert tensor to numpy

        logits = model(x_num_batch, x_cat_batch)
        probs = torch.sigmoid(logits).cpu().numpy()
        # Use the threshold returned by training rather than a value copied
        # from the printed log: the copy goes stale after every retrain and
        # can flip samples sitting exactly on the decision boundary.
        preds = (probs >= thresh).astype(int)

        # ✅ extend with arrays/lists instead of tensor
        all_true.extend(y_true_batch.tolist())
        all_probs.extend(probs.flatten().tolist())
        all_preds.extend(preds.flatten().tolist())

# Combine into one DataFrame
inspect_df = pd.DataFrame({
    "True_Label": all_true,
    "Pred_Prob": all_probs,
    "Pred_Label": all_preds
})

print(inspect_df)

       True_Label  Pred_Prob  Pred_Label
0             0.0   0.021429           0
1             0.0   0.019984           0
2             0.0   0.020034           0
3             0.0   0.046974           0
4             0.0   0.021561           0
...           ...        ...         ...
21356         0.0   0.020313           0
21357         0.0   0.021065           0
21358         0.0   0.020663           0
21359         0.0   0.022824           0
21360         0.0   0.020495           0

[21361 rows x 3 columns]


In [None]:
# Restrict the inspection table to rows whose ground-truth label is the positive class.
anomalies_df = inspect_df.loc[inspect_df["True_Label"].eq(1)]

print(anomalies_df)

       True_Label  Pred_Prob  Pred_Label
1034          1.0   0.668111           1
1235          1.0   0.602052           0
1539          1.0   0.653075           1
2656          1.0   0.664248           1
3434          1.0   0.655021           1
3896          1.0   0.666337           1
4238          1.0   0.666703           1
5861          1.0   0.656140           1
6275          1.0   0.660363           1
6436          1.0   0.021706           0
6783          1.0   0.622731           1
7006          1.0   0.656908           1
7462          1.0   0.631387           1
9364          1.0   0.271176           0
9472          1.0   0.654865           1
10546         1.0   0.640312           1
10548         1.0   0.616355           1
12429         1.0   0.657639           1
12442         1.0   0.606685           1
12920         1.0   0.639051           1
13188         1.0   0.664591           1
13752         1.0   0.653599           1
13923         1.0   0.025707           0
14141         1.

In [None]:
# Missed anomalies: ground truth is 1 but the thresholded prediction is 0.
false_negatives = inspect_df.loc[
    inspect_df["True_Label"].eq(1) & inspect_df["Pred_Label"].eq(0)
]

print(false_negatives)

       True_Label  Pred_Prob  Pred_Label
1235          1.0   0.602052           0
6436          1.0   0.021706           0
9364          1.0   0.271176           0
13923         1.0   0.025707           0
14162         1.0   0.244709           0
14229         1.0   0.155214           0
14341         1.0   0.093615           0
16552         1.0   0.581492           0
18157         1.0   0.050947           0
19958         1.0   0.024642           0
20701         1.0   0.031337           0


In [None]:
# False positives: True_Label=0 but Pred_Label=1 (normal rows flagged as anomalies).
false_positivies = inspect_df.loc[
    inspect_df["True_Label"].eq(0) & inspect_df["Pred_Label"].eq(1)
]

print(false_positivies)

       True_Label  Pred_Prob  Pred_Label
5765          0.0   0.656085           1
6637          0.0   0.661539           1
9544          0.0   0.662388           1
13704         0.0   0.641890           1
14126         0.0   0.661345           1
16031         0.0   0.641890           1


In [None]:
# Pick first normal and first anomaly in test set
normal_idx = np.where(y_test == 0)[0][0]
anomaly_idx = np.where(y_test == 1)[0][0]

x_num_sample = torch.tensor(np.vstack([X_test_num[normal_idx], X_test_num[anomaly_idx]]), dtype=torch.float32)

# Build the matching categorical input. Datasets without categorical columns
# have X_test_cat set to None, so they get an empty (2, 0) tensor; datasets
# with categorical columns must pass their real codes, otherwise the verbose
# forward pass would silently skip the categorical embeddings.
if cfg["cat_cols"]:
    x_cat_sample = torch.tensor(np.vstack([X_test_cat[normal_idx], X_test_cat[anomaly_idx]]), dtype=torch.long)
else:
    x_cat_sample = torch.zeros((2, 0), dtype=torch.long)

y_sample = y_test[[normal_idx, anomaly_idx]]

print(normal_idx)
print(anomaly_idx)
print("Selected rows for inspection (labels):", y_sample)

0
1034
Selected rows for inspection (labels): [0 1]


In [None]:
# Unpack the feature layout returned by main_training_flow; it is needed to
# rebuild a model with the same token positions for inspection.
num_cols, feat_order, num_feature_indices, cat_feature_indices = model_info

In [None]:
# Rebuild the architecture on the CPU and copy the trained weights into it so
# the verbose forward pass can be run independently of the training device.
model_inspect = VanillaTransformer(
    num_num_features=len(num_cols),
    # Derive the embedding sizes from the fitted encoders; a hard-coded empty
    # list only matches datasets without categorical columns and makes
    # load_state_dict fail for the others.
    cat_cardinalities=[len(encoders[c].classes_) for c in cfg["cat_cols"]],
    feat_order=feat_order,
    num_feature_indices=num_feature_indices,
    cat_feature_indices=cat_feature_indices,
    d_model=cfg["d_model"], n_heads=cfg["n_heads"], num_layers=cfg["num_layers"]
)
model_inspect.load_state_dict(model.state_dict())  # load trained weights
model_inspect.eval()



VanillaTransformer(
  (value_proj): Linear(in_features=1, out_features=64, bias=True)
  (cat_embs): ModuleList()
  (transformer_encoder): TransformerEncoder(
    (layers): ModuleList(
      (0-1): 2 x TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=64, out_features=64, bias=True)
        )
        (linear1): Linear(in_features=64, out_features=256, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
        (linear2): Linear(in_features=256, out_features=64, bias=True)
        (norm1): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.1, inplace=False)
        (dropout2): Dropout(p=0.1, inplace=False)
      )
    )
  )
  (dropout): Dropout(p=0.1, inplace=False)
  (classifier): Sequential(
    (0): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
    (1): Linear(in_features=64, out_feature

In [None]:
# Trace the two selected rows through the inspection model without tracking
# gradients; forward_verbose prints each intermediate tensor and returns the
# sigmoid probabilities.
with torch.no_grad():
    probs = model_inspect.forward_verbose(x_num_sample, x_cat_sample)

print("\nPredicted probabilities (normal vs anomaly):", probs.numpy())


=== Raw numeric input ===
tensor([[-9.4192e-01,  7.6905e-01, -1.3228e-01, -3.4122e-01, -6.1047e-01,
         -8.4984e-02, -6.7677e-01,  3.7271e-02, -3.7220e-01, -1.1789e+00,
          5.9850e-01, -7.3675e-01, -2.0777e-01,  1.5744e+00, -1.9868e-01,
          3.7458e-01,  1.0448e+00,  3.3331e-02, -1.7742e+00,  1.1566e+00,
          2.3076e-01,  2.0342e-01,  6.0159e-01, -4.5218e-01, -6.2231e-01,
          1.8201e+00, -8.0376e-02, -5.4312e-02, -1.4732e-02, -2.8620e-01],
        [-1.1271e+00, -8.4336e+00,  5.1859e+00, -1.2318e+01,  6.7146e+00,
         -1.0124e+01, -2.1404e+00, -1.3834e+01,  6.4185e+00, -7.7469e+00,
         -1.2997e+01,  5.2000e+00, -1.0924e+01,  1.6841e+00, -9.7586e+00,
          3.9569e-01, -1.1347e+01, -2.2858e+01, -1.0022e+01,  3.8056e+00,
         -1.9633e+00,  1.6259e+00, -1.5522e+00, -3.7441e+00,  1.1107e+00,
         -2.7045e+00, -9.6054e-01, -5.0738e+00, -3.3730e+00,  1.0788e+00]])

=== Initial feature embeddings ===
tensor([[[-0.0290, -0.0071, -0.0285,  ...,  0.