In [None]:
# ==========================================
# GAN anomaly detection (PyTorch) with VAL thresholding
# Dataset: ai4i2020.csv
# - Train: ONLY normal
# - VAL (balanced): choose threshold (max F1 or Precision≥target)
# - TEST Balanced & Imbalanced using the same threshold
# Stabilization: BCEWithLogitsLoss, label smoothing, input noise, Adam betas (0.5,0.999)
# ==========================================

import os, random
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    confusion_matrix, classification_report, roc_auc_score,
    precision_recall_fscore_support, precision_recall_curve
)

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

# ------------------------------
# Reproducibility & determinism
# ------------------------------
SEED = 42

# Seed every RNG the script relies on: Python, NumPy, torch (CPU) and all CUDA devices.
for _seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
    _seed_fn(SEED)

# Turn off cuDNN autotuning and request deterministic cuDNN kernels.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
# torch.use_deterministic_algorithms(True)  # enable for strict determinism; may be slower

# Run on the GPU when one is visible, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("Device:", device)

# ------------------------------
# 1) Load & preprocess
# ------------------------------
csv_path = "../../data/ai4i2020.csv"   # adjust if needed
df = pd.read_csv(csv_path)

# Report the dataset size, then the absolute and relative class balance of the target.
_failure_col = df["Machine failure"]
print("Tổng số dòng:", df.shape[0])
print("Phân phối nhãn Machine failure:\n", _failure_col.value_counts())
print("Tỷ lệ (%):\n", _failure_col.value_counts(normalize=True) * 100)

# Columns excluded from the feature matrix: identifiers, 'Type', the target itself,
# and the TWF/HDF/PWF/OSF/RNF columns (presumably per-failure-mode flags — confirm
# against the dataset description).
drop_cols = ['UDI', 'Product ID', 'Type', 'Machine failure', 'TWF', 'HDF', 'PWF', 'OSF', 'RNF']
X_df = df.drop(columns=drop_cols)
y = df['Machine failure'].to_numpy().astype(int)

# ------------------------------
# 2) Build splits (sizes adapt automatically; the dataset has 339 anomalies)
# ------------------------------
normal_idx = np.where(y == 0)[0]
anom_idx   = np.where(y == 1)[0]
n_normal, n_anom = len(normal_idx), len(anom_idx)
print(f"\nNormal: {n_normal} | Anomaly: {n_anom}")

# Shuffle so the splits do not depend on the row order of the file
rng = np.random.default_rng(SEED)
rng.shuffle(normal_idx)
rng.shuffle(anom_idx)

# Train normal: prefer a large training set, but keep 2000 normals in reserve
# for validation/test whenever more than 2000 are available
TRAIN_NORMAL = min(8000, n_normal - 2000) if n_normal > 2000 else max(1000, n_normal // 2)
train_norm_idx = normal_idx[:TRAIN_NORMAL]

# z-score normalisation.
# The scaler is fitted on the TRAIN (normal-only) rows and then applied to every row.
# Fitting on the whole table would let validation/test rows and all anomalies
# influence the mean/std the model is trained with (data leakage).
scaler = StandardScaler()
scaler.fit(X_df.iloc[train_norm_idx])
X_all = scaler.transform(X_df).astype(np.float32)
input_dim = X_all.shape[1]

# Validation set (balanced): as many normals as anomalies.
# Up to 150 anomalies, but never more than half of all anomalies.
VAL_ANOM = min(150, n_anom // 2)
VAL_NORM = VAL_ANOM
_val_norm_end = TRAIN_NORMAL + VAL_NORM          # normals: [TRAIN_NORMAL, _val_norm_end)
val_anom_idx = anom_idx[:VAL_ANOM]
val_norm_idx = normal_idx[TRAIN_NORMAL:_val_norm_end]

# Balanced test set: up to 200 anomalies taken from those left after validation.
TEST_ANOM = min(200, n_anom - VAL_ANOM)
TEST_NORM_BAL = TEST_ANOM
_bal_norm_end = _val_norm_end + TEST_NORM_BAL    # normals: [_val_norm_end, _bal_norm_end)
test_bal_anom_idx = anom_idx[VAL_ANOM:VAL_ANOM + TEST_ANOM]
test_bal_norm_idx = normal_idx[_val_norm_end:_bal_norm_end]

# Imbalanced test set (~4:1): up to 800 further normals that no other split uses.
TEST_IMB_NORM = min(800, n_normal - _bal_norm_end)
test_imb_norm_idx = normal_idx[_bal_norm_end:_bal_norm_end + TEST_IMB_NORM]
# Reuse the balanced-test anomalies so both test sets are directly comparable.
test_imb_anom_idx = test_bal_anom_idx

def take(idx):
    """Return ``(features, labels)`` for the rows selected by ``idx``.

    Both values come from indexing the module-level ``X_all`` and ``y``
    arrays with the same ``idx``.
    """
    features = X_all[idx]
    labels = y[idx]
    return features, labels

def _assemble(norm_rows, anom_rows):
    """Stack the normal rows above the anomaly rows and build matching 0/1 labels."""
    features = np.concatenate((X_all[norm_rows], X_all[anom_rows]), axis=0)
    labels = np.concatenate((
        np.zeros(len(norm_rows), dtype=int),
        np.ones(len(anom_rows), dtype=int),
    ))
    return features, labels


X_train, y_train = take(train_norm_idx)           # labels are all 0 (normal only)
X_val, y_val = _assemble(val_norm_idx, val_anom_idx)
X_test_bal, y_test_bal = _assemble(test_bal_norm_idx, test_bal_anom_idx)
X_test_imb, y_test_imb = _assemble(test_imb_norm_idx, test_imb_anom_idx)

# Report the size of every split.
print("\n--- Split summary (auto) ---")
print(f"Train normal size          : {X_train.shape[0]}")
print(f"VAL balanced (norm/anom)   : {len(val_norm_idx)} / {len(val_anom_idx)}")
print(f"TEST balanced (norm/anom)  : {len(test_bal_norm_idx)} / {len(test_bal_anom_idx)}")
print(f"TEST imbalanced (norm/anom): {len(test_imb_norm_idx)} / {len(test_imb_anom_idx)}")

# ------------------------------
# 3) Define GAN (logits + BCEWithLogits)
# ------------------------------
class Generator(nn.Module):
    """Fully connected generator: latent vector -> synthetic feature vector.

    Layer widths are ``latent_dim -> 128 -> 256 -> output_dim`` with a ReLU
    after each hidden linear layer.
    """

    def __init__(self, latent_dim, output_dim):
        super().__init__()
        widths = (latent_dim, 128, 256)
        layers = []
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            layers.append(nn.Linear(fan_in, fan_out))
            layers.append(nn.ReLU())
        # The output layer stays linear because the features are z-scored.
        layers.append(nn.Linear(widths[-1], output_dim))
        self.net = nn.Sequential(*layers)

    def forward(self, z):
        """Map a batch of latent vectors ``z`` to generated samples."""
        return self.net(z)

class Discriminator(nn.Module):
    """Fully connected discriminator: feature vector -> one raw logit.

    Layer widths are ``input_dim -> 256 -> 128 -> 1`` with a ReLU after each
    hidden linear layer.
    """

    def __init__(self, input_dim):
        super().__init__()
        widths = (input_dim, 256, 128)
        layers = []
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            layers.append(nn.Linear(fan_in, fan_out))
            layers.append(nn.ReLU())
        # Emit a raw logit (no Sigmoid here).
        layers.append(nn.Linear(widths[-1], 1))
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Return one logit per row of ``x``."""
        return self.net(x)

latent_dim = 16

# Build the generator first, then the discriminator, and move both to the target device.
G = Generator(latent_dim=latent_dim, output_dim=input_dim).to(device)
D = Discriminator(input_dim=input_dim).to(device)

# The discriminator outputs logits, so the sigmoid is applied inside the loss.
bce_logits = nn.BCEWithLogitsLoss()

# Both networks are optimised by Adam with identical hyper-parameters.
_adam_cfg = {"lr": 2e-4, "betas": (0.5, 0.999)}
opt_g = optim.Adam(G.parameters(), **_adam_cfg)
opt_d = optim.Adam(D.parameters(), **_adam_cfg)

# ------------------------------
# 4) Train GAN (ONLY normal)
# ------------------------------
# Training hyper-parameters and a shuffled loader over the normal-only training rows.
batch_size = 128
epochs     = 60
train_loader = DataLoader(
    TensorDataset(torch.from_numpy(X_train)),
    batch_size=batch_size, shuffle=True, drop_last=False
)

# Alternating GAN updates: one discriminator step, then one generator step per batch.
for epoch in range(1, epochs+1):
    G.train(); D.train()
    # Running sums of the per-batch losses, used for the epoch averages printed below.
    ep_d = ep_g = 0.0; steps = 0
    for (real_batch,) in train_loader:
        real_batch = real_batch.to(device)
        bsz = real_batch.size(0)

        # Add a little noise to D's real input to reduce overfitting
        real_noisy = real_batch + 0.01 * torch.randn_like(real_batch)

        # ---- Train D ----
        opt_d.zero_grad()
        z = torch.randn(bsz, latent_dim, device=device)
        # detach(): the discriminator loss must not back-propagate into G
        fake_batch = G(z).detach()

        # Label smoothing for the real samples (target 0.9 instead of 1.0)
        real_labels = torch.full((bsz, 1), 0.9, device=device)
        fake_labels = torch.zeros(bsz, 1, device=device)

        d_real_logits = D(real_noisy)
        d_fake_logits = D(fake_batch)

        # D loss = BCE(real -> 0.9) + BCE(fake -> 0)
        d_loss = bce_logits(d_real_logits, real_labels) + bce_logits(d_fake_logits, fake_labels)
        d_loss.backward(); opt_d.step()

        # ---- Train G ----
        opt_g.zero_grad()
        # Fresh latent batch; G is rewarded when D labels its samples as real (target 1)
        z = torch.randn(bsz, latent_dim, device=device)
        gen_batch = G(z)
        g_loss = bce_logits(D(gen_batch), torch.ones(bsz, 1, device=device))
        g_loss.backward(); opt_g.step()

        ep_d += d_loss.item(); ep_g += g_loss.item(); steps += 1

    # Log the mean losses on the first epoch and on every fifth epoch.
    if epoch % 5 == 0 or epoch == 1:
        print(f"Epoch {epoch:3d}/{epochs} | D: {ep_d/steps:.4f} | G: {ep_g/steps:.4f}")

# ------------------------------
# 5) Scoring (1 - sigmoid(logits))
# ------------------------------
@torch.no_grad()
def disc_scores(x_np: np.ndarray) -> np.ndarray:
    """Return one anomaly score per row of ``x_np`` (higher => more anomalous).

    The score is ``1 - sigmoid(D(x))``, i.e. one minus the discriminator's
    probability that the row is real.

    Args:
        x_np: 2-D feature array; it is converted to float32 before being
            passed to the module-level discriminator ``D``.

    Returns:
        1-D float32 array with one score per input row.

    Side effects:
        Switches ``D`` to eval mode.
    """
    D.eval()
    X_t = torch.from_numpy(np.asarray(x_np, dtype=np.float32)).to(device)
    logits = D(X_t).reshape(-1)
    # 1 - sigmoid(l) == sigmoid(-l). Evaluating it in this form avoids the
    # overflow of a hand-written exp(-l) for strongly negative logits and the
    # precision loss of subtracting a probability close to 1 from 1.
    return torch.sigmoid(-logits).cpu().numpy()

# ------------------------------
# 6) Threshold selection on VAL
#    mode = "f1"   -> pick the threshold that maximises F1 (class 1)
#    mode = "p_at" -> pick the threshold that reaches Precision ≥ TARGET_P on VAL
# ------------------------------
MODE = "f1"        # "f1" or "p_at"
TARGET_P = 0.60    # used when MODE="p_at"

# Discriminator-based anomaly scores for the validation set.
val_scores = disc_scores(X_val)

def best_f1_threshold(y_true, scores, percentiles=np.linspace(50, 99.5, 200)):
    """Scan percentile-based thresholds and keep the one with the best class-1 F1.

    Args:
        y_true: Ground-truth 0/1 labels.
        scores: Anomaly scores; a row is predicted as class 1 when its score is
            greater than or equal to the threshold.
        percentiles: Percentiles of ``scores`` used as candidate thresholds.

    Returns:
        ``(threshold, f1)`` of the winning candidate. On ties the first
        candidate in scan order wins. ``(None, -1.0)`` if there are no candidates.
    """
    winner_thr, winner_f1 = None, -1.0
    for candidate in np.percentile(scores, percentiles):
        flagged = (scores >= candidate).astype(int)
        per_class = precision_recall_fscore_support(
            y_true, flagged, labels=[0, 1], average=None, zero_division=0
        )
        anomaly_f1 = per_class[2][1]  # index 2 = F1 array, entry 1 = class 1
        if not anomaly_f1 > winner_f1:
            continue
        winner_thr, winner_f1 = float(candidate), float(anomaly_f1)
    return winner_thr, winner_f1

def threshold_for_precision(y_true, scores, target_p=0.60):
    """Return the lowest threshold whose precision reaches ``target_p``.

    Args:
        y_true: Ground-truth 0/1 labels (1 = anomaly).
        scores: Anomaly scores; a row is flagged when ``score >= threshold``.
        target_p: Minimum precision required.

    Returns:
        ``(threshold, precision, recall)``. If no point of the
        precision-recall curve reaches ``target_p``, the 95th percentile of
        ``scores`` is used as the threshold and the precision and recall
        returned are the ones measured at that threshold.
    """
    p, r, thr = precision_recall_curve(y_true, scores)
    # p and r have one more entry than thr; dropping the last one aligns p[i] with thr[i].
    hits = np.flatnonzero(p[:-1] >= target_p)
    if hits.size:
        i = hits[0]
        return float(thr[i]), float(p[i]), float(r[i])

    # Fallback: 95th percentile. Precision and recall are computed AT this
    # threshold so the reported numbers match the threshold that is returned.
    fallback_thr = float(np.percentile(scores, 95))
    is_anomaly = np.asarray(y_true) == 1
    flagged = np.asarray(scores) >= fallback_thr
    true_pos = int(np.count_nonzero(flagged & is_anomaly))
    n_flagged = int(np.count_nonzero(flagged))
    n_anomaly = int(np.count_nonzero(is_anomaly))
    precision = true_pos / n_flagged if n_flagged else 0.0
    recall = true_pos / n_anomaly if n_anomaly else 0.0
    return fallback_thr, precision, recall

# Choose the decision threshold on the validation set according to MODE.
if MODE == "f1":
    thr, best_f1 = best_f1_threshold(y_val, val_scores)
    print(f"\n[VAL balanced] Best F1(Class 1)={best_f1:.3f} at threshold={thr:.6f}")
elif MODE == "p_at":
    thr, p_at, r_at = threshold_for_precision(y_val, val_scores, TARGET_P)
    print(f"\n[VAL balanced] Threshold for Precision≥{TARGET_P:.2f}: thr={thr:.6f} (P={p_at:.3f}, R={r_at:.3f})")
else:
    # A mistyped MODE must not silently run the precision-target branch.
    raise ValueError(f'Unknown MODE {MODE!r}; expected "f1" or "p_at"')

# ------------------------------
# 7) Evaluate on TEST sets
# ------------------------------
def evaluate(name: str, X_np: np.ndarray, y_true: np.ndarray, thr: float):
    """Score ``X_np``, apply ``thr`` and print the evaluation metrics.

    Rows with ``score >= thr`` are predicted as class 1. Prints a header, the
    confusion matrix, the classification report and the ROC AUC, where the
    AUC is computed from the raw scores rather than the thresholded labels.

    Args:
        name: Label for the printed header.
        X_np: Feature array passed to ``disc_scores``.
        y_true: Ground-truth labels for the rows of ``X_np``.
        thr: Decision threshold on the anomaly score.
    """
    scores = disc_scores(X_np)
    y_pred = np.asarray(scores >= thr, dtype=int)

    print(f"\n===== {name} @thr={thr:.6f} =====")

    cm = confusion_matrix(y_true, y_pred)
    print("Confusion Matrix:\n", cm)

    report = classification_report(y_true, y_pred, digits=4)
    print("\nClassification Report:\n", report)

    auc = roc_auc_score(y_true, scores)
    print("ROC AUC (scores):", auc)

# Evaluate both test sets with the threshold chosen on the validation set.
for _split_name, _split_X, _split_y in (
    ("TEST Balanced", X_test_bal, y_test_bal),
    ("TEST Imbalanced", X_test_imb, y_test_imb),
):
    evaluate(_split_name, _split_X, _split_y, thr)


Device: cpu


TypeError: argument of type 'method' is not iterable