In [1]:
# ==========================================
# Credit Card Fraud - Time-split + Balanced VAL
# Models: Isolation Forest (CPU), VAE (DL), GAN (DL)
# - Train: ONLY normal (Class=0)
# - VAL (balanced): pick the threshold by F1 (Class 1) or by precision >= X
# - TEST: test_real (default, imbalanced) + test_balanced (optional)
# ==========================================
import os, random
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    confusion_matrix, classification_report, roc_auc_score,
    precision_recall_fscore_support, precision_recall_curve,
    average_precision_score
)

# ---------- Reproducibility ----------
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

# ---------- Config ----------
CSV_PATH = "creditcard.csv"  # change the path if needed
# Columns that are never fed to the models: the label plus the raw
# Amount/Time columns. Every other column is treated as a feature.
DROP_COLS = ["Class", "Amount", "Time"]

# ---------- Load ----------
df_raw = pd.read_csv(CSV_PATH)
# Explicit raise instead of `assert`, which is stripped under `python -O`.
if not {"Time", "Amount", "Class"}.issubset(df_raw.columns):
    raise ValueError("Thiếu cột Time/Amount/Class trong creditcard.csv")

# ---------- Time-split ----------
# Chronological split: first 60% train, next 20% validation, last 20% test.
# mergesort is stable, so rows sharing a timestamp keep their file order and
# the split boundaries are reproducible from run to run.
df = df_raw.sort_values("Time", kind="mergesort").reset_index(drop=True)
n = len(df)
cut1 = int(0.6 * n)
cut2 = int(0.8 * n)

train_part = df.iloc[:cut1].copy()
val_part = df.iloc[cut1:cut2].copy()
test_real = df.iloc[cut2:].copy()

# ---------- Build VAL balanced ----------
# Keep every fraud row of the validation window and draw the same number of
# normal rows, then shuffle the union.
fraud_val = val_part[val_part.Class == 1]
if len(fraud_val) == 0:
    raise ValueError("VAL không có fraud; hãy đổi tỷ lệ split hoặc chọn vùng thời gian khác.")
normal_val = val_part[val_part.Class == 0].sample(
    n=len(fraud_val), random_state=SEED, replace=False
)
val_balanced = pd.concat([normal_val, fraud_val]).sample(frac=1, random_state=SEED)

# ---------- Train-only-normal (for IF/VAE/GAN) ----------
train_normal = train_part[train_part.Class == 0].copy()
if len(train_normal) == 0:
    raise ValueError("TRAIN không có normal records.")

# ---------- Scaler: fit on TRAIN NORMAL only, to avoid leakage ----------
scaler = StandardScaler()
train_features = train_normal.drop(columns=DROP_COLS)
X_train_norm = scaler.fit_transform(train_features)
train_normal_scaled = pd.DataFrame(X_train_norm, columns=train_features.columns)
train_normal_scaled["Class"] = 0

def scale_dataframe(df_in: pd.DataFrame) -> pd.DataFrame:
    """Scale the feature columns of ``df_in`` with the module-level ``scaler``.

    The columns listed in ``DROP_COLS`` are removed, the remaining columns are
    passed through ``scaler.transform`` (the scaler is not refitted here), and
    the original ``Class`` labels are re-attached.

    Args:
        df_in: Frame containing every column in ``DROP_COLS`` plus the
            feature columns.

    Returns:
        A new frame with a fresh default index, the scaled feature columns in
        their original order, and a trailing ``Class`` column.
    """
    # Derive the output column names from the exact frame that is scaled so
    # the drop list and the column labels cannot drift apart.
    features = df_in.drop(columns=DROP_COLS)
    out = pd.DataFrame(scaler.transform(features), columns=features.columns)
    # to_numpy() drops df_in's index so labels align by position, not label.
    out["Class"] = df_in["Class"].to_numpy()
    return out

# Scale both evaluation splits with the scaler fitted above.
val_bal_scaled = scale_dataframe(val_balanced)
test_real_scaled = scale_dataframe(test_real)

# Optional: build a class-balanced test set for comparison.
fraud_test = test_real[test_real["Class"] == 1]
normal_test = test_real[test_real["Class"] == 0]
# Both classes contribute the size of the smaller one.
take = min(len(fraud_test), len(normal_test))
balanced_parts = [
    subset.sample(n=take, random_state=SEED, replace=False)
    for subset in (normal_test, fraud_test)
]
test_balanced = pd.concat(balanced_parts).sample(frac=1, random_state=SEED)
test_bal_scaled = scale_dataframe(test_balanced)

# ======================================================
# Common utils: thresholding & evaluation
# ======================================================
def eval_at_threshold(y_true, scores, thr):
    """Evaluate a binary anomaly detector at a single decision threshold.

    A sample is predicted positive (class 1) when ``score >= thr``.

    The four confusion counts are computed directly with NumPy, so the result
    is well-defined even when only one class occurs in ``y_true`` and in the
    predictions, and the counts are computed only once per call.

    Args:
        y_true: Array-like of binary labels. ``1`` marks the positive class;
            every other value is counted as negative.
        scores: Array-like of scores aligned with ``y_true``; larger means
            more anomalous.
        thr: Decision threshold (inclusive).

    Returns:
        dict with keys ``threshold``, ``TN``, ``FP``, ``FN``, ``TP``,
        ``precision_1``, ``recall_1``, ``f1_1`` and ``accuracy``. Any ratio
        whose denominator is zero is reported as ``0.0``.
    """
    is_pos = np.asarray(y_true) == 1
    flagged = np.asarray(scores) >= thr

    tp = int(np.count_nonzero(is_pos & flagged))
    fp = int(np.count_nonzero(~is_pos & flagged))
    fn = int(np.count_nonzero(is_pos & ~flagged))
    tn = int(np.count_nonzero(~is_pos & ~flagged))

    def _ratio(num, den):
        # Undefined ratios are reported as 0.
        return float(num / den) if den else 0.0

    return dict(
        threshold=float(thr), TN=tn, FP=fp, FN=fn, TP=tp,
        precision_1=_ratio(tp, tp + fp),
        recall_1=_ratio(tp, tp + fn),
        # F1 = 2PR / (P + R), written in terms of the counts.
        f1_1=_ratio(2 * tp, 2 * tp + fp + fn),
        accuracy=_ratio(tp + tn, tp + tn + fp + fn),
    )

def best_f1_threshold(y_true, scores, percentiles=np.linspace(50, 99.5, 200)):
    """Grid-search the threshold that maximises F1 of class 1.

    Candidate thresholds are the requested percentiles of ``scores``; each is
    evaluated with ``eval_at_threshold``.

    Args:
        y_true: Binary labels aligned with ``scores``.
        scores: Anomaly scores (higher = more anomalous).
        percentiles: Percentile grid (0-100) used to generate candidates.

    Returns:
        ``(threshold, table)`` where ``threshold`` is the first candidate with
        the highest ``f1_1`` and ``table`` holds one row of metrics per
        candidate.
    """
    candidates = np.percentile(scores, percentiles)
    table = pd.DataFrame(
        [eval_at_threshold(y_true, scores, cand) for cand in candidates]
    )
    # The table has a default 0..n-1 index, so idxmax() is also a position.
    best_pos = table["f1_1"].idxmax()
    winner = table["threshold"].iloc[best_pos]
    return float(winner), table

def threshold_for_precision(y_true, scores, target_p=0.50):
    """Find the lowest threshold whose precision reaches ``target_p``.

    ``precision_recall_curve`` returns thresholds in increasing order, with
    ``precision[i]`` / ``recall[i]`` describing predictions ``score >=
    thresholds[i]``. The precision and recall arrays carry one extra trailing
    entry (P=1, R=0) that has no threshold, so it is excluded from the search.

    Args:
        y_true: Binary labels aligned with ``scores``.
        scores: Anomaly scores (higher = more anomalous).
        target_p: Minimum precision required for the positive class.

    Returns:
        ``(threshold, precision, recall)`` for the first (lowest) threshold
        meeting the target, all taken at the same curve index; ``None`` when
        no real threshold reaches ``target_p``.
    """
    precision, recall, thresholds = precision_recall_curve(y_true, scores)
    n_thr = len(thresholds)
    # Only the first n_thr curve points correspond to an actual threshold.
    hits = np.flatnonzero(precision[:n_thr] >= target_p)
    if hits.size == 0:
        return None
    i = int(hits[0])
    # Threshold, precision and recall share index i so the reported P/R are
    # the ones this threshold actually achieves.
    return float(thresholds[i]), float(precision[i]), float(recall[i])

def print_full_report(title, y_true, scores, thr):
    """Print confusion matrix, classification report, ROC AUC and AP.

    Predictions are ``score >= thr``; the two AUC-style metrics are computed
    from the raw scores and therefore do not depend on ``thr``.

    Args:
        title: Heading printed above the report.
        y_true: Binary labels (0 = normal, 1 = fraud).
        scores: NumPy array of anomaly scores aligned with ``y_true``.
        thr: Decision threshold (inclusive).
    """
    y_pred = (scores >= thr).astype(int)
    print(f"\n===== {title} @ thr={thr:.6f} =====")
    # Fixed label order keeps the matrix 2x2 even if a class is never seen.
    print("Confusion Matrix:\n", confusion_matrix(y_true, y_pred, labels=[0, 1]))
    # zero_division=0 reports undefined precision/recall as 0 instead of
    # emitting UndefinedMetricWarning when a class is never predicted.
    print("\nClassification Report:\n",
          classification_report(y_true, y_pred, digits=3, zero_division=0))
    print(f"ROC AUC (scores): {roc_auc_score(y_true, scores):.3f}")
    print(f"PR  AUC (AP):    {average_precision_score(y_true, scores):.3f}")

# ======================================================
# 1) Isolation Forest
# ======================================================
from sklearn.ensemble import IsolationForest

def run_isoforest(train_df_scaled, val_df_scaled, test_df_scaled, param):
    """Fit an Isolation Forest on the training frame and report on the test frame.

    The anomaly score is the negated ``score_samples`` output, so larger
    values are the ones flagged by the ``>=`` thresholding downstream. Two
    thresholds are selected on the validation frame (best F1 and
    precision >= 50%) and each is reported on the test frame.

    Args:
        train_df_scaled: Scaled training frame with a ``Class`` column.
        val_df_scaled: Scaled validation frame with a ``Class`` column.
        test_df_scaled: Scaled test frame with a ``Class`` column.
        param: dict; optional keys ``n_estimators`` (default 300) and
            ``max_samples`` (default ``"auto"``).

    Returns:
        ``(test_scores, thr_f1)``: test-set anomaly scores and the best-F1
        threshold chosen on validation.
    """
    label_col = "Class"
    features = {
        split: frame.drop(columns=[label_col])
        for split, frame in (
            ("train", train_df_scaled),
            ("val", val_df_scaled),
            ("test", test_df_scaled),
        )
    }
    y_val = val_df_scaled[label_col].to_numpy()
    y_test = test_df_scaled[label_col].to_numpy()

    forest = IsolationForest(
        n_estimators=param.get("n_estimators", 300),
        max_samples=param.get("max_samples", "auto"),
        contamination="auto",
        random_state=SEED,
        n_jobs=-1,
    )
    forest.fit(features["train"])

    # Flip the sign so that a higher score means "more anomalous".
    val_scores = -forest.score_samples(features["val"])
    test_scores = -forest.score_samples(features["test"])

    thr_f1, _ = best_f1_threshold(y_val, val_scores)
    print_full_report("IF - TEST (BestF1 on VAL_balanced)", y_test, test_scores, thr_f1)

    res = threshold_for_precision(y_val, val_scores, target_p=0.50)
    if not res:
        return test_scores, thr_f1

    thr_p50, p50, r50 = res
    print(f"[IF][VAL_bal] thr for Precision≥50%: {thr_p50:.6f} (P={p50:.3f}, R={r50:.3f})")
    print_full_report("IF - TEST (Precision≥50% on VAL_balanced)", y_test, test_scores, thr_p50)
    return test_scores, thr_f1

# ======================================================
# 2) VAE (train on normal)
# ======================================================
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

# Seed PyTorch, pick the compute device, and seed every CUDA device when a
# GPU is available.
torch.manual_seed(SEED)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if device.type == "cuda":
    torch.cuda.manual_seed_all(SEED)

class VAE(nn.Module):
    """Fully-connected variational autoencoder.

    The encoder is two ReLU hidden layers followed by two linear heads that
    produce the latent mean and log-variance. The decoder mirrors it with two
    ReLU hidden layers and a linear output of size ``input_dim``.

    Args:
        input_dim: Number of input features.
        latent_dim: Size of the latent vector.
        hidden: Width of every hidden layer.
    """

    def __init__(self, input_dim, latent_dim=8, hidden=128):
        super().__init__()
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, hidden), nn.ReLU(),
            nn.Linear(hidden, hidden), nn.ReLU(),
        )
        self.enc_mu = nn.Linear(hidden, latent_dim)
        self.enc_logvar = nn.Linear(hidden, latent_dim)
        self.decoder = nn.Sequential(
            nn.Linear(latent_dim, hidden), nn.ReLU(),
            nn.Linear(hidden, hidden), nn.ReLU(),
            nn.Linear(hidden, input_dim),
        )

    def reparameterize(self, mu, logvar):
        """Return a latent vector for the given posterior parameters.

        In training mode this samples ``mu + eps * exp(0.5 * logvar)`` with
        ``eps ~ N(0, I)``. In eval mode it returns ``mu`` unchanged so that
        repeated forward passes on the same input give the same output.
        """
        if not self.training:
            return mu
        std = torch.exp(0.5 * logvar)
        eps = torch.randn_like(std)
        return mu + eps * std

    def forward(self, x):
        """Encode ``x``, draw a latent vector, and decode it.

        Returns:
            ``(xhat, mu, logvar, z)``: reconstruction, latent mean, latent
            log-variance and the latent vector that was decoded.
        """
        h = self.encoder(x)
        mu, lv = self.enc_mu(h), self.enc_logvar(h)
        z = self.reparameterize(mu, lv)
        xhat = self.decoder(z)
        return xhat, mu, lv, z

def vae_loss_fn(x, xhat, mu, logvar, recon="mse", beta=1.0):
    """Compute the VAE objective: reconstruction loss plus ``beta`` times KL.

    Args:
        x: Input batch.
        xhat: Reconstruction of ``x``.
        mu: Latent means.
        logvar: Latent log-variances.
        recon: ``"mse"`` selects mean squared error; any other value selects
            mean absolute error.
        beta: Weight of the KL term.

    Returns:
        ``(total, recon_loss, kl)`` as scalar tensors. Both terms are averaged
        over every element of their tensors.
    """
    criterion = nn.functional.mse_loss if recon == "mse" else nn.functional.l1_loss
    recon_loss = criterion(xhat, x, reduction="mean")
    # KL(N(mu, sigma^2) || N(0, 1)) per element, averaged.
    kl_terms = 1 + logvar - mu.pow(2) - logvar.exp()
    kl = -0.5 * torch.mean(kl_terms)
    total = recon_loss + beta * kl
    return total, recon_loss, kl

def run_vae(train_df_scaled, val_df_scaled, test_df_scaled, epochs=30, batch_size=256, latent_dim=8, hidden=128):
    """Train a VAE on the training frame and score rows by reconstruction error.

    The anomaly score of a row is the mean squared reconstruction error.
    Scoring runs in eval mode and decodes from the latent mean rather than a
    random latent sample, so a row's score is deterministic and the selected
    threshold is not affected by sampling noise.

    Two thresholds are chosen on the validation frame (best F1 and
    precision >= 50%) and each is reported on the test frame.

    Args:
        train_df_scaled: Scaled training frame with a ``Class`` column.
        val_df_scaled: Scaled validation frame with a ``Class`` column.
        test_df_scaled: Scaled test frame with a ``Class`` column.
        epochs: Number of passes over the training data.
        batch_size: Mini-batch size.
        latent_dim: Latent size of the VAE.
        hidden: Hidden-layer width of the VAE.

    Returns:
        ``(test_scores, thr_f1)``: test-set reconstruction errors and the
        best-F1 threshold chosen on validation.
    """
    Xtr = train_df_scaled.drop(columns=["Class"]).to_numpy().astype(np.float32)
    Xva = val_df_scaled.drop(columns=["Class"]).to_numpy().astype(np.float32)
    Xte = test_df_scaled.drop(columns=["Class"]).to_numpy().astype(np.float32)
    y_val = val_df_scaled["Class"].to_numpy()
    y_test = test_df_scaled["Class"].to_numpy()

    vae = VAE(input_dim=Xtr.shape[1], latent_dim=latent_dim, hidden=hidden).to(device)
    opt = optim.Adam(vae.parameters(), lr=1e-3)
    loader = DataLoader(TensorDataset(torch.from_numpy(Xtr)), batch_size=batch_size, shuffle=True, drop_last=False)

    for ep in range(1, epochs + 1):
        vae.train()
        tot = recon = kl = 0.0
        steps = 0
        for (xb,) in loader:
            xb = xb.to(device)
            opt.zero_grad()
            xhat, mu, lv, z = vae(xb)
            loss, rl, klv = vae_loss_fn(xb, xhat, mu, lv, recon="mse", beta=1.0)
            loss.backward()
            opt.step()
            tot += loss.item()
            recon += rl.item()
            kl += klv.item()
            steps += 1
        # Log per-batch averages on the first epoch and every fifth epoch.
        if ep == 1 or ep % 5 == 0:
            print(f"[VAE] Epoch {ep:2d}/{epochs} | loss={tot/steps:.5f} | recon={recon/steps:.5f} | kl={kl/steps:.5f}")

    @torch.no_grad()
    def recon_err(X):
        """Per-row mean squared reconstruction error, decoded from the latent mean."""
        xt = torch.from_numpy(X).to(device)
        # Bypass the sampling step: encode to the posterior mean and decode it.
        mu = vae.enc_mu(vae.encoder(xt))
        xhat = vae.decoder(mu)
        return torch.mean((xhat - xt) ** 2, dim=1).cpu().numpy()

    vae.eval()
    val_scores = recon_err(Xva)
    test_scores = recon_err(Xte)

    thr_f1, _ = best_f1_threshold(y_val, val_scores)
    print_full_report("VAE - TEST (BestF1 on VAL_balanced)", y_test, test_scores, thr_f1)

    res = threshold_for_precision(y_val, val_scores, target_p=0.50)
    if res:
        thr_p50, p50, r50 = res
        print(f"[VAE][VAL_bal] thr for Precision≥50%: {thr_p50:.6f} (P={p50:.3f}, R={r50:.3f})")
        print_full_report("VAE - TEST (Precision≥50% on VAL_balanced)", y_test, test_scores, thr_p50)
    return test_scores, thr_f1

# ======================================================
# 3) GAN (Discriminator scoring: 1 - D(x))
# ======================================================
class Gen(nn.Module):
    """Generator MLP mapping latent noise to a feature vector.

    Layer widths are ``latent_dim -> 128 -> 256 -> out_dim`` with ReLU between
    the linear layers and no activation on the output.
    """

    def __init__(self, latent_dim, out_dim):
        super().__init__()
        widths = (latent_dim, 128, 256, out_dim)
        stack = []
        for fan_in, fan_out in zip(widths, widths[1:]):
            stack.append(nn.Linear(fan_in, fan_out))
            stack.append(nn.ReLU())
        stack.pop()  # the output layer stays linear
        self.net = nn.Sequential(*stack)

    def forward(self, z):
        """Map a batch of latent vectors ``z`` to generated samples."""
        return self.net(z)

class Disc(nn.Module):
    """Discriminator MLP that outputs one value in [0, 1] per row.

    Layer widths are ``in_dim -> 256 -> 128 -> 1`` with ReLU between the
    linear layers and a sigmoid on the output.
    """

    def __init__(self, in_dim):
        super().__init__()
        widths = (in_dim, 256, 128, 1)
        stack = []
        for fan_in, fan_out in zip(widths, widths[1:]):
            stack += [nn.Linear(fan_in, fan_out), nn.ReLU()]
        stack[-1] = nn.Sigmoid()  # the last activation is a sigmoid, not ReLU
        self.net = nn.Sequential(*stack)

    def forward(self, x):
        """Return the discriminator output for a batch ``x``."""
        return self.net(x)

def run_gan(train_df_scaled, val_df_scaled, test_df_scaled, epochs=30, batch_size=256, latent_dim=16):
    """Train a GAN on the training frame and score rows with the discriminator.

    The anomaly score of a row is ``1 - D(x)``. Two thresholds are chosen on
    the validation frame (best F1 and precision >= 50%) and each is reported
    on the test frame.

    Args:
        train_df_scaled: Scaled training frame with a ``Class`` column.
        val_df_scaled: Scaled validation frame with a ``Class`` column.
        test_df_scaled: Scaled test frame with a ``Class`` column.
        epochs: Number of passes over the training data.
        batch_size: Mini-batch size.
        latent_dim: Size of the generator's noise input.

    Returns:
        ``(test_scores, thr_f1)``: test-set anomaly scores and the best-F1
        threshold chosen on validation.
    """
    def _features(frame):
        # Feature matrix as float32, label column removed.
        return frame.drop(columns=["Class"]).to_numpy().astype(np.float32)

    train_x = _features(train_df_scaled)
    val_x = _features(val_df_scaled)
    test_x = _features(test_df_scaled)
    y_val = val_df_scaled["Class"].to_numpy()
    y_test = test_df_scaled["Class"].to_numpy()

    n_features = train_x.shape[1]
    G = Gen(latent_dim, n_features).to(device)
    D = Disc(n_features).to(device)
    opt_g = optim.Adam(G.parameters(), lr=1e-3)
    opt_d = optim.Adam(D.parameters(), lr=1e-3)
    bce = nn.BCELoss()

    loader = DataLoader(
        TensorDataset(torch.from_numpy(train_x)),
        batch_size=batch_size,
        shuffle=True,
        drop_last=False,
    )

    for ep in range(1, epochs + 1):
        G.train()
        D.train()
        dsum = gsum = 0.0
        steps = 0
        for (real,) in loader:
            real = real.to(device)
            bsz = real.size(0)

            # Discriminator step: real rows -> 1, detached fakes -> 0.
            opt_d.zero_grad()
            noise = torch.randn(bsz, latent_dim, device=device)
            fake = G(noise).detach()
            real_labels = torch.ones(bsz, 1, device=device)
            fake_labels = torch.zeros(bsz, 1, device=device)
            d_loss = bce(D(real), real_labels) + bce(D(fake), fake_labels)
            d_loss.backward()
            opt_d.step()

            # Generator step: fresh noise, push D's output on fakes towards 1.
            opt_g.zero_grad()
            noise = torch.randn(bsz, latent_dim, device=device)
            g_loss = bce(D(G(noise)), real_labels)
            g_loss.backward()
            opt_g.step()

            dsum += d_loss.item()
            gsum += g_loss.item()
            steps += 1
        # Log per-batch averages on the first epoch and every fifth epoch.
        if ep == 1 or ep % 5 == 0:
            print(f"[GAN] Epoch {ep:2d}/{epochs} | D={dsum/steps:.5f} | G={gsum/steps:.5f}")

    def d_scores(X):
        """Return ``1 - D(x)`` per row; higher means more anomalous."""
        D.eval()
        with torch.no_grad():
            batch = torch.from_numpy(X).to(device)
            prob = D(batch).cpu().numpy().reshape(-1)
        return 1.0 - prob

    val_scores = d_scores(val_x)
    test_scores = d_scores(test_x)

    thr_f1, _ = best_f1_threshold(y_val, val_scores)
    print_full_report("GAN - TEST (BestF1 on VAL_balanced)", y_test, test_scores, thr_f1)

    res = threshold_for_precision(y_val, val_scores, target_p=0.50)
    if not res:
        return test_scores, thr_f1

    thr_p50, p50, r50 = res
    print(f"[GAN][VAL_bal] thr for Precision≥50%: {thr_p50:.6f} (P={p50:.3f}, R={r50:.3f})")
    print_full_report("GAN - TEST (Precision≥50% on VAL_balanced)", y_test, test_scores, thr_p50)
    return test_scores, thr_f1

# ======================================================
# ----------------- RUN EXPERIMENTS --------------------
# Choose the test set: test_real_scaled (imbalanced) or test_bal_scaled (balanced)
TEST_SET_NAME = "test_real"  # change to "test_balanced" if desired
# "test_real" selects the imbalanced split; any other value selects the
# balanced one.
if TEST_SET_NAME == "test_real":
    test_scaled = test_real_scaled
else:
    test_scaled = test_bal_scaled
print(f"\n>>> Using TEST set: {TEST_SET_NAME} | size={len(test_scaled)} | positives={int((test_scaled.Class==1).sum())}")

# Isolation Forest
print("\n================ Isolation Forest ================")
_ifscores, _ifthr = run_isoforest(
    train_normal_scaled,
    val_bal_scaled,
    test_scaled,
    param={"n_estimators": 400, "max_samples": 512},
)

# VAE
print("\n====================== VAE ======================")
_vaescores, _vaethr = run_vae(
    train_normal_scaled,
    val_bal_scaled,
    test_scaled,
    epochs=30,
    batch_size=512,
    latent_dim=8,
    hidden=128,
)

# GAN
print("\n====================== GAN ======================")
_ganscores, _ganthr = run_gan(
    train_normal_scaled,
    val_bal_scaled,
    test_scaled,
    epochs=30,
    batch_size=512,
    latent_dim=16,
)

print("\n>>> DONE. So sánh ROC/PR, F1 theo các ngưỡng ở trên để chọn mô hình.")



>>> Using TEST set: test_real | size=56962 | positives=75


===== IF - TEST (BestF1 on VAL_balanced) @ thr=0.448660 =====
Confusion Matrix:
 [[52346  4541]
 [    9    66]]

Classification Report:
               precision    recall  f1-score   support

           0      1.000     0.920     0.958     56887
           1      0.014     0.880     0.028        75

    accuracy                          0.920     56962
   macro avg      0.507     0.900     0.493     56962
weighted avg      0.999     0.920     0.957     56962

ROC AUC (scores): 0.952
PR  AUC (AP):    0.044
[IF][VAL_bal] thr for Precision≥50%: 0.355978 (P=0.500, R=1.000)

===== IF - TEST (Precision≥50% on VAL_balanced) @ thr=0.355978 =====
Confusion Matrix:
 [[  735 56152]
 [    0    75]]

Classification Report:
               precision    recall  f1-score   support

           0      1.000     0.013     0.026     56887
           1      0.001     1.000     0.003        75

    accuracy                          0.014     56962


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
