In [2]:
# ==========================================
# GANomaly for Tabular Anomaly Detection (creditcard.csv)
# - Splits: TrainN≈20k normal, VAL 250/250, TEST Balanced 200/200, TEST Imbalanced 10k/200
# - Scale: StandardScaler fit on train-normal
# - Losses:
#     * L_adv (feature matching)  -> stabilizes the GAN; G needs no direct BCE term (D uses a hinge loss)
#     * L_con (reconstruction L1) -> reconstruct the input
#     * L_enc (latent consistency)-> mean squared difference between E(x) and E'(G(x))
# - Anomaly score = α * recon + β * latent (+ γ * feat)  (γ defaults to 0 for tabular data)
# - Threshold: maximize F1 on VAL; plus a percentile sweep (60..99)
# ==========================================
import os, random, math
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    confusion_matrix, classification_report, roc_auc_score,
    precision_recall_fscore_support, precision_recall_curve
)

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

# ------------------------------
# 0) Reproducibility & device
# ------------------------------
SEED = 42

# Seed every RNG the script relies on so repeated runs produce the same splits
# and the same weight initialisation.
for _seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    _seed_fn(SEED)

_has_cuda = torch.cuda.is_available()
if _has_cuda:
    torch.cuda.manual_seed_all(SEED)

# Ask cuDNN for deterministic kernels and disable its autotuner.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

device = torch.device("cuda") if _has_cuda else torch.device("cpu")
print("Device:", device)

# ------------------------------
# 1) Load data
# ------------------------------
CSV = "../creditcard.csv"  # adjust the path if needed
df = pd.read_csv(CSV)

# Validate with an explicit exception: a bare `assert` is removed when Python
# runs with -O and carries no message about what is wrong with the file.
if "Class" not in df.columns:
    raise ValueError(f"Expected a 'Class' label column in {CSV!r}; found columns: {list(df.columns)}")

# Features are every column except the label; labels are 0 (normal) / 1 (anomaly)
# as used by the split logic that follows.
X_df = df.drop(columns=["Class"])
y_all = df["Class"].astype(int).to_numpy()

# ------------------------------
# 2) Splits (như các mô hình trước)
# ------------------------------
# Row positions of each class, shuffled with a seeded generator so the splits
# are reproducible.
normal_idx = np.where(y_all == 0)[0]
anom_idx = np.where(y_all == 1)[0]
rng = np.random.default_rng(SEED)
rng.shuffle(normal_idx)
rng.shuffle(anom_idx)

# Requested split sizes (N = normal rows, A = anomaly rows).
TR_N, VAL_N, VAL_A, TESTB_N, TESTB_A, TESTI_N = 20000, 250, 250, 200, 200, 10000
if len(anom_idx) < (VAL_A + TESTB_A):
    # Explicit raise instead of `assert`, which is stripped under -O.
    raise ValueError("Không đủ anomaly cho VAL/TESTB.")

# Normal rows left for training once VAL / TEST sets are reserved. If fewer than
# 1000 would remain, shrink the imbalanced test set (but not below 2000 rows).
max_train_normal = len(normal_idx) - (VAL_N + TESTB_N + TESTI_N)
if max_train_normal < 1000:
    TESTI_N = max(2000, len(normal_idx) - (VAL_N + TESTB_N + 1000))
    max_train_normal = len(normal_idx) - (VAL_N + TESTB_N + TESTI_N)
TRAIN_N = max(5000, min(TR_N, max_train_normal))

# The 5000-row floor above can exceed what is actually available. Slicing past
# the end would then silently truncate (or empty) the VAL / TEST-balanced normal
# sets, so fail loudly instead. The imbalanced test set is still allowed to be
# shorter than requested, as before.
if TRAIN_N + VAL_N + TESTB_N > len(normal_idx):
    raise ValueError(
        f"Not enough normal rows ({len(normal_idx)}) for train={TRAIN_N}, "
        f"val={VAL_N} and balanced test={TESTB_N}."
    )

# Carve consecutive, non-overlapping slices out of the shuffled index arrays.
ptr_n = 0
ptr_a = 0
trn_n = normal_idx[ptr_n:ptr_n + TRAIN_N]
ptr_n += TRAIN_N
val_n = normal_idx[ptr_n:ptr_n + VAL_N]
ptr_n += VAL_N
tstb_n = normal_idx[ptr_n:ptr_n + TESTB_N]
ptr_n += TESTB_N
tsti_n = normal_idx[ptr_n:ptr_n + TESTI_N]
ptr_n += TESTI_N

val_a = anom_idx[ptr_a:ptr_a + VAL_A]
ptr_a += VAL_A
tstb_a = anom_idx[ptr_a:ptr_a + TESTB_A]
ptr_a += TESTB_A
# NOTE: the imbalanced test set reuses the same anomaly rows as the balanced
# test set; only the normal rows differ between the two.
tsti_a = tstb_a

def Xy(idxs):
    """Return ``(features, labels)`` for the given row positions.

    Features are taken positionally from ``X_df`` and converted to a float32
    array; labels are the matching entries of ``y_all``.
    """
    features = X_df.iloc[idxs].to_numpy().astype(np.float32)
    labels = y_all[idxs]
    return features, labels

def _labelled_split(normal_rows, anomaly_rows):
    """Stack normal rows above anomaly rows; label them 0 and 1 respectively."""
    parts = [X_df.iloc[rows].to_numpy() for rows in (normal_rows, anomaly_rows)]
    features = np.vstack(parts).astype(np.float32)
    labels = np.hstack([
        np.zeros(len(normal_rows), dtype=int),
        np.ones(len(anomaly_rows), dtype=int),
    ])
    return features, labels


# Training uses normal rows only, so its labels are discarded.
X_tr_n, _ = Xy(trn_n)
X_val, y_val = _labelled_split(val_n, val_a)
X_tstb, y_tstb = _labelled_split(tstb_n, tstb_a)
X_tsti, y_tsti = _labelled_split(tsti_n, tsti_a)

print(f"TrainN={len(trn_n)}, Val={len(val_n)}/{len(val_a)}, TestB={len(tstb_n)}/{len(tstb_a)}, TestI={len(tsti_n)}/{len(tsti_a)}")

# ------------------------------
# 3) Scale
# ------------------------------
# Fit the scaler on the normal training rows only, then apply it to every split.
scaler = StandardScaler()
scaler.fit(X_tr_n)


def z(x):
    """Standardise ``x`` with the fitted scaler and return it as float32."""
    scaled = scaler.transform(x)
    return scaled.astype(np.float32)


X_tr_n, X_val, X_tstb, X_tsti = (z(split) for split in (X_tr_n, X_val, X_tstb, X_tsti))
INPUT_DIM = X_tr_n.shape[1]

# ------------------------------
# 4) GANomaly network (MLP cho tabular)
#     G: Enc(x)->z, Dec(z)->x', Enc'(x')->z'
#     D: MLP trả vector đặc trưng (feature matching)
# ------------------------------
def mlp(in_dim, hidden, out_dim, last_act=None):
    """Build a three-layer perceptron as an ``nn.Sequential``.

    Layout: Linear(in_dim, hidden) -> LeakyReLU(0.2) -> Linear(hidden, hidden)
    -> LeakyReLU(0.2) -> Linear(hidden, out_dim), followed by ``Tanh`` only when
    ``last_act == "tanh"``. Any other ``last_act`` value leaves the output linear.
    """
    stack = []
    # Two hidden stages, each a linear layer followed by an in-place LeakyReLU.
    for fan_in, fan_out in ((in_dim, hidden), (hidden, hidden)):
        stack.append(nn.Linear(fan_in, fan_out))
        stack.append(nn.LeakyReLU(0.2, inplace=True))
    stack.append(nn.Linear(hidden, out_dim))
    if last_act == "tanh":
        stack.append(nn.Tanh())
    return nn.Sequential(*stack)

class Encoder(nn.Module):
    """MLP that maps an input vector of size ``in_dim`` to a ``z_dim`` latent code."""

    def __init__(self, in_dim, z_dim=32, hidden=128):
        super().__init__()
        self.net = mlp(in_dim=in_dim, hidden=hidden, out_dim=z_dim)

    def forward(self, x):
        latent = self.net(x)
        return latent

class Decoder(nn.Module):
    """MLP that maps a ``z_dim`` latent code back to a vector of size ``out_dim``.

    ``out_dim`` defaults to the value ``INPUT_DIM`` had when the class was defined.
    """

    def __init__(self, z_dim=32, out_dim=INPUT_DIM, hidden=128):
        super().__init__()
        self.net = mlp(in_dim=z_dim, hidden=hidden, out_dim=out_dim)

    def forward(self, z):
        reconstruction = self.net(z)
        return reconstruction

class Discriminator(nn.Module):
    """Two-layer feature extractor followed by a single-logit head.

    ``forward`` returns ``(logit, features)``: the logit has one column and the
    features have ``feat_dim`` columns (the activations fed to the logit head).
    """

    def __init__(self, in_dim, hidden=128, feat_dim=64):
        super().__init__()
        feature_layers = [
            nn.Linear(in_dim, hidden),
            nn.LeakyReLU(negative_slope=0.2, inplace=True),
            nn.Linear(hidden, feat_dim),
            nn.LeakyReLU(negative_slope=0.2, inplace=True),
        ]
        self.feat = nn.Sequential(*feature_layers)
        self.out = nn.Linear(feat_dim, 1)  # produces the raw logit

    def forward(self, x):
        features = self.feat(x)
        return self.out(features), features

class GANomaly(nn.Module):
    """Encoder -> decoder -> second encoder generator, plus a discriminator.

    ``forward(x)`` returns the tuple
    ``(x_hat, z, z_hat, d_real, d_fake, f_real, f_fake)`` where ``x_hat`` is the
    reconstruction, ``z`` / ``z_hat`` are the latent codes of ``x`` / ``x_hat``,
    and the ``d_*`` / ``f_*`` pairs are the discriminator logits and features
    for ``x`` (real) and ``x_hat`` (fake).
    """

    def __init__(self, in_dim, z_dim=32, hidden=128):
        super().__init__()
        self.enc1 = Encoder(in_dim, z_dim, hidden)
        self.dec = Decoder(z_dim, in_dim, hidden)
        self.enc2 = Encoder(in_dim, z_dim, hidden)  # second encoder, applied to the reconstruction
        self.disc = Discriminator(in_dim, hidden, feat_dim=64)

    def forward(self, x):
        latent = self.enc1(x)
        recon = self.dec(latent)
        latent_of_recon = self.enc2(recon)
        logit_real, feat_real = self.disc(x)
        logit_fake, feat_fake = self.disc(recon)
        return (
            recon,
            latent,
            latent_of_recon,
            logit_real,
            logit_fake,
            feat_real,
            feat_fake,
        )

model = GANomaly(INPUT_DIM, z_dim=32, hidden=128)
model = model.to(device)

# ------------------------------
# 5) Losses & Optimizers
# ------------------------------
LRg = 1e-4
LRd = 1e-4

# The generator optimizer covers both encoders and the decoder; the
# discriminator has its own optimizer.
_gen_params = [
    p
    for part in (model.enc1, model.dec, model.enc2)
    for p in part.parameters()
]
opt_G = optim.Adam(_gen_params, lr=LRg, betas=(0.5, 0.999))
opt_D = optim.Adam(model.disc.parameters(), lr=LRd, betas=(0.5, 0.999))

l1 = nn.L1Loss()
bce = nn.BCEWithLogitsLoss()

# Loss weights
LAMBDA_CON = 50.0   # reconstruction (weighted heavily for tabular data)
LAMBDA_ENC = 1.0    # latent consistency
LAMBDA_FM = 10.0    # feature matching (stabilises the GAN)

# ------------------------------
# 6) Training (normal only) - fixed to avoid double-backward graph issue
# ------------------------------
BS = 256
EPOCHS = 60

# Mini-batches of the scaled normal training rows, reshuffled every epoch.
_train_ds = TensorDataset(torch.from_numpy(X_tr_n))
loader = DataLoader(_train_ds, batch_size=BS, shuffle=True)

def set_requires_grad(module, flag: bool):
    """Enable (``True``) or disable (``False``) gradients for every parameter of ``module``."""
    for param in module.parameters():
        param.requires_grad_(flag)

# Train on normal rows only, alternating one discriminator step and one
# generator step per mini-batch.
#
# The latent code is bound to `z_lat` rather than `z`: this loop runs at module
# scope, so unpacking into `z` would overwrite the scaling helper `z()` and make
# any later `z(X)` call fail.
for ep in range(1, EPOCHS + 1):
    model.train()
    loss_g_tot = loss_d_tot = 0.0
    steps = 0

    for (xb,) in loader:
        xb = xb.to(device)

        # ==== (1) Update D ====
        set_requires_grad(model.disc, True)   # re-enable gradients for D
        opt_D.zero_grad()

        # Build the reconstruction used to train D without tracking gradients,
        # so nothing flows back into the generator (no extra detach is needed).
        with torch.no_grad():
            z_tmp = model.enc1(xb)
            x_hat_tmp = model.dec(z_tmp)

        d_real, _ = model.disc(xb)
        d_fake, _ = model.disc(x_hat_tmp)

        # Hinge loss for D
        loss_d = torch.relu(1.0 - d_real).mean() + torch.relu(1.0 + d_fake).mean()
        loss_d.backward()          # no retain_graph: this graph is not reused
        opt_D.step()

        # ==== (2) Update G ====
        set_requires_grad(model.disc, False)  # freeze D while optimising G
        opt_G.zero_grad()

        # Fresh forward pass AFTER D was updated (an entirely new graph)
        x_hat, z_lat, z_hat, d_real2, d_fake2, f_real2, f_fake2 = model(xb)

        # Feature matching: the real-side features are a fixed target -> detach
        loss_fm = l1(f_real2.detach(), f_fake2)
        loss_con = l1(x_hat, xb)                       # mean L1 reconstruction error
        loss_enc = torch.mean((z_lat - z_hat) ** 2)    # latent consistency

        loss_g = LAMBDA_FM * loss_fm + LAMBDA_CON * loss_con + LAMBDA_ENC * loss_enc
        loss_g.backward()
        opt_G.step()

        # Running totals for the epoch report
        loss_g_tot += loss_g.item()
        loss_d_tot += loss_d.item()
        steps += 1

    if ep == 1 or ep % 5 == 0:
        print(f"[GANomaly] Epoch {ep:3d}/{EPOCHS} | L_G={loss_g_tot/steps:.4f} | L_D={loss_d_tot/steps:.4f}")

# ------------------------------
# 7) Scoring (chuẩn hóa 0..1 theo VAL)
#     score = α*recon + β*latent + γ*feat
# ------------------------------
# Weights of the anomaly score terms; for tabular data the score relies on
# reconstruction and latent error, with the feature term switched off.
ALPHA = 0.7  # reconstruction error
BETA = 0.3   # latent error
GAMMA = 0.0  # discriminator-feature error

@torch.no_grad()
def anomaly_score(x_np: np.ndarray):
    """Score every row of ``x_np`` with the trained model.

    Switches the model to eval mode and runs the whole array in one forward
    pass. Returns ``(score, recon, latent, feat)`` as NumPy arrays with one
    entry per row, where ``score = ALPHA*recon + BETA*latent + GAMMA*feat``
    (cast to float64) and the other three are the individual error terms.
    """
    model.eval()
    xt = torch.from_numpy(x_np).to(device)
    x_hat, z, z_hat, _d_real, _d_fake, f_real, f_fake = model(xt)

    recon = (x_hat - xt).abs().mean(dim=1)       # mean absolute reconstruction error
    latent = (z - z_hat).pow(2).mean(dim=1)      # mean squared latent difference
    feat = (f_real - f_fake).abs().mean(dim=1)   # optional feature-space error
    s = ALPHA*recon + BETA*latent + GAMMA*feat

    def _as_numpy(t):
        return t.cpu().numpy()

    return (
        _as_numpy(s).astype(np.float64),
        _as_numpy(recon),
        _as_numpy(latent),
        _as_numpy(feat),
    )

# Scores trên VAL để min–max
# Raw VAL scores define the min-max range used to rescale every later score.
val_s_raw = anomaly_score(X_val)[0]
smin = float(val_s_raw.min())
smax = float(val_s_raw.max())
eps = 1e-12


def to_norm(s):
    """Min-max rescale ``s`` with the VAL range; ``eps`` guards a zero span."""
    span = smax - smin + eps
    return (s - smin) / span


val_s = to_norm(val_s_raw)

# ------------------------------
# 8) Chọn ngưỡng (trên thang 0..1) & Đánh giá
# ------------------------------
def best_f1_threshold(y_true, scores, percentiles=np.linspace(50,99.5,200)):
    """Pick the score threshold that maximises F1 for class 1.

    Candidate thresholds are the given ``percentiles`` of ``scores``; a row is
    predicted as class 1 when its score is >= the candidate. Returns
    ``(threshold, f1)``; on ties the first candidate reaching the best F1 wins.
    """
    top_f1 = -1.0
    top_thr = None
    for cand in np.percentile(scores, percentiles):
        flagged = (scores >= cand).astype(int)
        per_class_f1 = precision_recall_fscore_support(
            y_true, flagged, labels=[0,1], average=None, zero_division=0
        )[2]
        anomaly_f1 = per_class_f1[1]
        if anomaly_f1 > top_f1:
            top_f1 = float(anomaly_f1)
            top_thr = float(cand)
    return top_thr, top_f1

# Choose the operating threshold on the normalised VAL scores.
thr_norm, f1v = best_f1_threshold(y_true=y_val, scores=val_s)
print(f"\n[VAL balanced] Best F1(Class 1)={f1v:.3f} at thr(norm)={thr_norm:.6f}")

def evaluate(name, X, y, thr):
    """Print confusion matrix, classification report and ROC AUC for one dataset.

    ``X`` is scored, rescaled with ``to_norm`` and thresholded at ``thr``
    (score >= thr -> class 1); ROC AUC is computed on the rescaled scores.
    """
    scores = to_norm(anomaly_score(X)[0])  # scores on the normalised scale
    preds = (scores >= thr).astype(int)
    print(f"\n===== {name} @thr(norm)={thr:.6f} =====")
    print("Confusion Matrix:\n", confusion_matrix(y, preds))
    print("\nClassification Report:\n", classification_report(y, preds, digits=4))
    print("ROC AUC (scores):", roc_auc_score(y, scores))

# Report both test sets at the F1-optimal VAL threshold.
_test_sets = (
    ("TEST Balanced (200/200)", X_tstb, y_tstb),
    ("TEST Imbalanced (10000/200)", X_tsti, y_tsti),
)
for _set_name, _set_X, _set_y in _test_sets:
    evaluate(_set_name, _set_X, _set_y, thr_norm)

# Sweep a few VAL-percentile thresholds to observe the precision/recall trade-off.
print("\n>>> Quick sweep on percentile-based thresholds (norm-scale):")
for pctl in (60, 70, 80, 85, 90, 92.5, 95, 97.5, 99):
    t = float(np.percentile(val_s, pctl))
    print(f"\n-- Try thr(norm)={t:.6f} (pctl={pctl}) on TEST Balanced --")
    evaluate(name="TEST Balanced (200/200)", X=X_tstb, y=y_tstb, thr=t)
    print(f"\n-- Try thr(norm)={t:.6f} (pctl={pctl}) on TEST Imbalanced --")
    evaluate(name="TEST Imbalanced (10000/200)", X=X_tsti, y=y_tsti, thr=t)


Device: cpu
TrainN=20000, Val=250/250, TestB=200/200, TestI=10000/200
[GANomaly] Epoch   1/60 | L_G=34.4831 | L_D=1.7746
[GANomaly] Epoch   5/60 | L_G=22.1299 | L_D=1.4283
[GANomaly] Epoch  10/60 | L_G=13.7023 | L_D=1.3943
[GANomaly] Epoch  15/60 | L_G=10.7106 | L_D=1.4333
[GANomaly] Epoch  20/60 | L_G=9.1880 | L_D=1.5825
[GANomaly] Epoch  25/60 | L_G=8.7818 | L_D=1.4414
[GANomaly] Epoch  30/60 | L_G=8.4787 | L_D=1.3326
[GANomaly] Epoch  35/60 | L_G=7.7632 | L_D=1.6186
[GANomaly] Epoch  40/60 | L_G=7.3751 | L_D=1.6302
[GANomaly] Epoch  45/60 | L_G=6.8306 | L_D=1.6441
[GANomaly] Epoch  50/60 | L_G=6.3804 | L_D=1.6933
[GANomaly] Epoch  55/60 | L_G=6.3387 | L_D=1.6110
[GANomaly] Epoch  60/60 | L_G=6.3350 | L_D=1.5395

[VAL balanced] Best F1(Class 1)=0.876 at thr(norm)=0.027753

===== TEST Balanced (200/200) @thr(norm)=0.027753 =====
Confusion Matrix:
 [[192   8]
 [ 27 173]]

Classification Report:
               precision    recall  f1-score   support

           0     0.8767    0.9600   