In [4]:
# ==========================================
# VAE anomaly detection (creditcard.csv) - thresholds chuẩn hóa [0,1]
# - Splits: TrainN≈20k normal, VAL 250/250, TEST Balanced 200/200, TEST Imbalanced 10k/200
# - Scale: fit StandardScaler trên train-normal
# - Score: reconstruction error (MSE mean per sample)
# - Normalization: min-max theo VAL => mọi threshold hiển thị & dùng đều là 0.xxx
# - Threshold: manual / F1(VAL) / Precision≥target(VAL) trên thang chuẩn hóa
# ==========================================
import os, random
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    confusion_matrix, classification_report, roc_auc_score,
    precision_recall_fscore_support, precision_recall_curve
)

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

# ------------------------------
# 0) Reproducibility & device
# ------------------------------
SEED = 42
random.seed(SEED); np.random.seed(SEED)
torch.manual_seed(SEED)
if torch.cuda.is_available(): torch.cuda.manual_seed_all(SEED)
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)

# ------------------------------
# 1) Load
# ------------------------------
csv_path = "../creditcard.csv"  # chỉnh lại nếu cần
df = pd.read_csv(csv_path)
assert "Class" in df.columns
X_df = df.drop(columns=["Class"])
y_all = df["Class"].astype(int).to_numpy()

# ------------------------------
# 2) Splits (no overlap), giống IF/GAN
# ------------------------------
normal_idx = np.where(y_all == 0)[0]
anom_idx   = np.where(y_all == 1)[0]
rng = np.random.default_rng(SEED)
rng.shuffle(normal_idx); rng.shuffle(anom_idx)

TR_N, VAL_N, VAL_A, TESTB_N, TESTB_A, TESTI_N = 20000, 250, 250, 200, 200, 10000
assert len(anom_idx) >= (VAL_A + TESTB_A), "Không đủ anomaly cho VAL/TESTB."

max_train_normal = len(normal_idx) - (VAL_N + TESTB_N + TESTI_N)
if max_train_normal < 1000:
    TESTI_N = max(2000, len(normal_idx) - (VAL_N + TESTB_N + 1000))
    max_train_normal = len(normal_idx) - (VAL_N + TESTB_N + TESTI_N)
TRAIN_N = max(5000, min(TR_N, max_train_normal))

ptr_n = 0; ptr_a = 0
trn_n  = normal_idx[ptr_n:ptr_n+TRAIN_N]; ptr_n += TRAIN_N
val_n  = normal_idx[ptr_n:ptr_n+VAL_N];   ptr_n += VAL_N
tstb_n = normal_idx[ptr_n:ptr_n+TESTB_N]; ptr_n += TESTB_N
tsti_n = normal_idx[ptr_n:ptr_n+TESTI_N]; ptr_n += TESTI_N

val_a  = anom_idx[ptr_a:ptr_a+VAL_A];   ptr_a += VAL_A
tstb_a = anom_idx[ptr_a:ptr_a+TESTB_A]; ptr_a += TESTB_A
tsti_a = tstb_a  # dùng chung anomaly

def take_numpy(idxs):
    return X_df.iloc[idxs].to_numpy().astype(np.float32), y_all[idxs]

X_tr_n, _ = take_numpy(trn_n)

X_val  = np.vstack([X_df.iloc[val_n].to_numpy(),  X_df.iloc[val_a].to_numpy()]).astype(np.float32)
y_val  = np.hstack([np.zeros(len(val_n), dtype=int), np.ones(len(val_a), dtype=int)])

X_tstb = np.vstack([X_df.iloc[tstb_n].to_numpy(), X_df.iloc[tstb_a].to_numpy()]).astype(np.float32)
y_tstb = np.hstack([np.zeros(len(tstb_n), dtype=int), np.ones(len(tstb_a), dtype=int)])

X_tsti = np.vstack([X_df.iloc[tsti_n].to_numpy(), X_df.iloc[tsti_a].to_numpy()]).astype(np.float32)
y_tsti = np.hstack([np.zeros(len(tsti_n), dtype=int), np.ones(len(tsti_a), dtype=int)])

print(f"TrainN={len(trn_n)}, Val={len(val_n)}/{len(val_a)}, TestB={len(tstb_n)}/{len(tstb_a)}, TestI={len(tsti_n)}/{len(tsti_a)}")

# ------------------------------
# 3) Scale (fit trên train normal)
# ------------------------------
scaler = StandardScaler().fit(X_tr_n)
def z(x): return scaler.transform(x).astype(np.float32)
X_tr_n  = z(X_tr_n)
X_val   = z(X_val)
X_tstb  = z(X_tstb)
X_tsti  = z(X_tsti)

input_dim = X_tr_n.shape[1]

# ------------------------------
# 4) VAE model
# ------------------------------
class VAE(nn.Module):
    def __init__(self, input_dim, latent_dim=16, hidden=128):
        super().__init__()
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, hidden), nn.ReLU(),
            nn.Linear(hidden, hidden), nn.ReLU(),
        )
        self.enc_mu     = nn.Linear(hidden, latent_dim)
        self.enc_logvar = nn.Linear(hidden, latent_dim)
        self.decoder = nn.Sequential(
            nn.Linear(latent_dim, hidden), nn.ReLU(),
            nn.Linear(hidden, hidden), nn.ReLU(),
            nn.Linear(hidden, input_dim)  # reconstruct ở z-score space
        )

    def reparameterize(self, mu, logvar):
        std = torch.exp(0.5 * logvar)
        eps = torch.randn_like(std)
        return mu + eps * std

    def forward(self, x):
        h = self.encoder(x)
        mu, lv = self.enc_mu(h), self.enc_logvar(h)
        z = self.reparameterize(mu, lv)
        xhat = self.decoder(z)
        return xhat, mu, lv, z

def vae_loss(x, xhat, mu, logvar, beta=1.0):
    recon = nn.functional.mse_loss(xhat, x, reduction="mean")
    kl = -0.5 * torch.mean(1 + logvar - mu.pow(2) - logvar.exp())
    return recon + beta * kl, recon, kl

vae = VAE(input_dim=input_dim, latent_dim=16, hidden=128).to(device)
opt = optim.Adam(vae.parameters(), lr=1e-3)

# ------------------------------
# 5) Train trên normal only
# ------------------------------
bs, epochs = 256, 60
loader = DataLoader(TensorDataset(torch.from_numpy(X_tr_n)),
                    batch_size=bs, shuffle=True, drop_last=False)

for ep in range(1, epochs+1):
    vae.train()
    tot=recon_tot=kl_tot=0.0; steps=0
    for (xb,) in loader:
        xb = xb.to(device)
        opt.zero_grad()
        xhat, mu, lv, _ = vae(xb)
        loss, rl, kl = vae_loss(xb, xhat, mu, lv, beta=1.0)
        loss.backward(); opt.step()
        tot += loss.item(); recon_tot += rl.item(); kl_tot += kl.item(); steps += 1
    if ep == 1 or ep % 5 == 0:
        print(f"[VAE] Epoch {ep:3d}/{epochs} | loss={tot/steps:.6f} | recon={recon_tot/steps:.6f} | kl={kl_tot/steps:.6f}")

# ------------------------------
# 6) Scoring = reconstruction error (MSE mean) + chuẩn hóa theo VAL
# ------------------------------
SCORE_MODE = "mse_mean"   # hoặc "mae_mean"

@torch.no_grad()
def recon_error_raw(x_np: np.ndarray, model: VAE):
    model.eval()
    xt = torch.from_numpy(x_np).to(device)
    xhat, _, _, _ = model(xt)
    if SCORE_MODE == "mae_mean":
        err = torch.mean(torch.abs(xhat - xt), dim=1)
    else:  # "mse_mean"
        err = torch.mean((xhat - xt) ** 2, dim=1)
    return err.detach().cpu().numpy().astype(np.float64)

# Tính score raw trên VAL -> fit min-max -> tạo scorer chuẩn hóa
val_scores_raw = recon_error_raw(X_val, vae)
s_min, s_max = float(np.min(val_scores_raw)), float(np.max(val_scores_raw))
eps = 1e-12

def to_norm_score(s_raw: np.ndarray):
    return (s_raw - s_min) / (s_max - s_min + eps)

def recon_error_norm(x_np: np.ndarray, model: VAE):
    s_raw = recon_error_raw(x_np, model)
    return to_norm_score(s_raw)  # luôn 0..1 theo min-max của VAL

# ------------------------------
# 7) Chọn threshold trên thang chuẩn hóa
# ------------------------------
MODE = "f1"        # "manual" | "f1" | "p_at"
THR_MANUAL = float(np.percentile(to_norm_score(val_scores_raw), 95))
TARGET_P   = 0.60

val_scores = recon_error_norm(X_val, vae)  # 0..1

def best_f1_threshold(y_true, scores, percentiles=np.linspace(50, 99.5, 200)):
    ths = np.percentile(scores, percentiles)
    best_f1, best_thr = -1.0, None
    for t in ths:
        yhat = (scores >= t).astype(int)
        _, _, f1, _ = precision_recall_fscore_support(
            y_true, yhat, labels=[0,1], average=None, zero_division=0
        )
        if f1[1] > best_f1:
            best_f1, best_thr = float(f1[1]), float(t)
    return best_thr, best_f1

def threshold_for_precision(y_true, scores, target_p=0.60):
    p, r, thr = precision_recall_curve(y_true, scores)  # thr cùng thang
    idx = np.where(p[:-1] >= target_p)[0]
    if len(idx) == 0:
        return float(np.percentile(scores, 95)), float(p[1] if len(p)>1 else 0.0), float(r[1] if len(r)>1 else 0.0)
    i = idx[0]
    return float(thr[i]), float(p[i]), float(r[i])

if MODE == "manual":
    thr_norm = float(THR_MANUAL)
    print(f"\n[VAL] Manual thr(norm)={thr_norm:.6f}")
elif MODE == "f1":
    thr_norm, f1v = best_f1_threshold(y_val, val_scores)
    print(f"\n[VAL balanced] Best F1(Class 1)={f1v:.3f} at thr(norm)={thr_norm:.6f}")
else:
    thr_norm, p_at, r_at = threshold_for_precision(y_val, val_scores, TARGET_P)
    print(f"\n[VAL balanced] Thr for P≥{TARGET_P:.2f}: thr(norm)={thr_norm:.6f} (P={p_at:.3f}, R={r_at:.3f})")

# ------------------------------
# 8) Evaluate (dùng trực tiếp thr(norm) 0..1)
# ------------------------------
def evaluate(name, X, y, thr_norm):
    s = recon_error_norm(X, vae)          # score đã chuẩn hóa 0..1
    yhat = (s >= thr_norm).astype(int)
    print(f"\n===== {name} @thr(norm)={thr_norm:.6f} =====")
    print("Confusion Matrix:\n", confusion_matrix(y, yhat))
    print("\nClassification Report:\n", classification_report(y, yhat, digits=4))
    print("ROC AUC (scores):", roc_auc_score(y, s))

evaluate("TEST Balanced (200/200)",     X_tstb, y_tstb, thr_norm)
evaluate("TEST Imbalanced (10000/200)", X_tsti, y_tsti, thr_norm)

# ------------------------------
# 9) Quick sweep vài percentile (trên thang chuẩn hóa)
# ------------------------------
cands = [80, 85, 90, 92.5, 95, 97.5, 99]
print("\n>>> Quick sweep on percentile-based thresholds (norm-scale):")
for pctl in cands:
    t = float(np.percentile(val_scores, pctl))  # 0..1
    print(f"\n-- Try thr(norm)={t:.6f} (pctl={pctl}) on TEST Balanced --")
    evaluate("TEST Balanced (200/200)", X_tstb, y_tstb, t)
    print(f"\n-- Try thr(norm)={t:.6f} (pctl={pctl}) on TEST Imbalanced --")
    evaluate("TEST Imbalanced (10000/200)", X_tsti, y_tsti, t)


Device: cpu
TrainN=20000, Val=250/250, TestB=200/200, TestI=10000/200
[VAE] Epoch   1/60 | loss=1.008241 | recon=1.006330 | kl=0.001910
[VAE] Epoch   5/60 | loss=0.949815 | recon=0.923144 | kl=0.026671
[VAE] Epoch  10/60 | loss=0.907886 | recon=0.868432 | kl=0.039454
[VAE] Epoch  15/60 | loss=0.869692 | recon=0.804767 | kl=0.064925
[VAE] Epoch  20/60 | loss=0.850974 | recon=0.783406 | kl=0.067569
[VAE] Epoch  25/60 | loss=0.847837 | recon=0.778315 | kl=0.069521
[VAE] Epoch  30/60 | loss=0.839061 | recon=0.766724 | kl=0.072337
[VAE] Epoch  35/60 | loss=0.834638 | recon=0.761485 | kl=0.073153
[VAE] Epoch  40/60 | loss=0.830584 | recon=0.755448 | kl=0.075137
[VAE] Epoch  45/60 | loss=0.827910 | recon=0.750658 | kl=0.077252
[VAE] Epoch  50/60 | loss=0.821741 | recon=0.740348 | kl=0.081393
[VAE] Epoch  55/60 | loss=0.814558 | recon=0.730467 | kl=0.084091
[VAE] Epoch  60/60 | loss=0.807051 | recon=0.719112 | kl=0.087939

[VAL balanced] Best F1(Class 1)=0.898 at thr(norm)=0.012003

===== TEST