In [1]:
import os, random
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score, precision_recall_fscore_support
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

# =========================
# 0) Reproducibility & device
# =========================
SEED = 42

# Seed every random number generator the script relies on:
# Python's `random`, NumPy's global RNG, and PyTorch's default generator.
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# Seed all CUDA generators as well, but only when a GPU is actually present.
has_cuda = torch.cuda.is_available()
if has_cuda:
    torch.cuda.manual_seed_all(SEED)

# Prefer the GPU when available, otherwise fall back to the CPU.
device = torch.device("cuda") if has_cuda else torch.device("cpu")
print("Device:", device)

# =========================
# 1) Load & preprocess
# =========================
csv_path = "../../data/ai4i2020.csv"   # change to "/mnt/data/ai4i2020.csv" if needed
df = pd.read_csv(csv_path)

# Binary target taken from the 'Machine failure' column, as an int NumPy array.
labels = df['Machine failure'].astype(int).to_numpy()

# Columns excluded from the model inputs: the ID-like columns, 'Type',
# the target itself, and the TWF/HDF/PWF/OSF/RNF columns.
drop_cols = ['UDI', 'Product ID', 'Type', 'Machine failure', 'TWF', 'HDF', 'PWF', 'OSF', 'RNF']
features = df.drop(columns=drop_cols)

# Standardise each remaining column to zero mean / unit variance and store as float32.
# NOTE(review): the scaler is fit on every row of `features`, i.e. before any
# train/test split is made — confirm this is intended.
scaler = StandardScaler()
scaler.fit(features)
X_all = scaler.transform(features).astype(np.float32)

# =========================
# 2) Build sets (same layout as the VAE setup: 200/400 and 200/800)
# =========================
normal_idx = np.where(labels == 0)[0]
anom_idx   = np.where(labels == 1)[0]
n_normal, n_anom = len(normal_idx), len(anom_idx)
print(f"Counts -> Normal: {n_normal} | Anomaly: {n_anom}")

# Split sizes for the normal rows: one training slice followed by two
# disjoint test slices (one per test set).
N_TRAIN = 8000
N_TEST_NORMAL = 200
n_normal_required = N_TRAIN + 2 * N_TEST_NORMAL

# Validate with an explicit exception: an `assert` would be stripped under
# `python -O` and the slices below would then silently come up short.
if n_normal < n_normal_required:
    raise ValueError(
        f"Cần >= {n_normal_required} normal để {N_TRAIN} train + "
        f"{N_TEST_NORMAL} + {N_TEST_NORMAL} test."
    )

# Train: ONLY normal rows (the first N_TRAIN in file order).
X_train = X_all[normal_idx[:N_TRAIN]]

# Two test sets, each with its own 200 normal rows so that neither overlaps
# the training slice or the other test set.
_a_start = N_TRAIN
_b_start = N_TRAIN + N_TEST_NORMAL
testA_normal = X_all[normal_idx[_a_start:_b_start]]                   # 200 normal
testB_normal = X_all[normal_idx[_b_start:_b_start + N_TEST_NORMAL]]   # 200 normal

# Requested number of anomalies per test set, capped by what the data contains.
reqA_anom, reqB_anom = 400, 800
gotA_anom = min(reqA_anom, n_anom)
gotB_anom = min(reqB_anom, n_anom)
if gotA_anom < reqA_anom or gotB_anom < reqB_anom:
    print(f"[WARN] Không đủ anomaly theo yêu cầu. Sẽ dùng tối đa: "
          f"TEST_A={gotA_anom}, TEST_B={gotB_anom}")

# Both test sets draw anomalies from the start of the same index list, so
# TEST_A's anomalies are always a prefix of TEST_B's (and the two are identical
# whenever fewer than `reqA_anom` anomalies exist).
testA_anom = X_all[anom_idx[:gotA_anom]]
testB_anom = X_all[anom_idx[:gotB_anom]]

# Stack normals first, anomalies second; labels follow the same order (0 then 1).
X_testA = np.vstack([testA_normal, testA_anom]).astype(np.float32)
y_testA = np.hstack([np.zeros(testA_normal.shape[0], dtype=int),
                     np.ones(testA_anom.shape[0], dtype=int)])

X_testB = np.vstack([testB_normal, testB_anom]).astype(np.float32)
y_testB = np.hstack([np.zeros(testB_normal.shape[0], dtype=int),
                     np.ones(testB_anom.shape[0], dtype=int)])

print("Train normal size:", X_train.shape[0])
print("TEST_A dist (200/400 mong muốn):", {0: int((y_testA==0).sum()), 1: int((y_testA==1).sum())})
print("TEST_B dist (200/800 mong muốn):", {0: int((y_testB==0).sum()), 1: int((y_testB==1).sum())})

# Tensor views of the NumPy arrays (torch.from_numpy shares memory, no copy).
X_train_t = torch.from_numpy(X_train)
X_testA_t = torch.from_numpy(X_testA)
X_testB_t = torch.from_numpy(X_testB)

# =========================
# 3) Define GAN (MLP)
# =========================
class Generator(nn.Module):
    """MLP generator: maps a latent noise vector to a synthetic feature vector.

    Architecture: ``latent_dim -> 128 -> 256 -> output_dim`` with ReLU after
    each hidden layer. The final layer is linear (no activation), so outputs
    are unbounded, matching standardized (z-score) feature space.

    Args:
        latent_dim: Size of the input noise vector.
        output_dim: Number of features to generate.
    """

    def __init__(self, latent_dim, output_dim):
        super().__init__()
        hidden_widths = (128, 256)

        # Build the stack in order so layer indices (and therefore the
        # state_dict keys and parameter-initialisation order) stay fixed.
        layers = []
        in_width = latent_dim
        for out_width in hidden_widths:
            layers.append(nn.Linear(in_width, out_width))
            layers.append(nn.ReLU())
            in_width = out_width
        layers.append(nn.Linear(in_width, output_dim))  # z-score space

        self.net = nn.Sequential(*layers)

    def forward(self, z):
        """Return generated samples for the noise batch ``z``."""
        generated = self.net(z)
        return generated

class Discriminator(nn.Module):
    """MLP discriminator: maps a feature vector to a single probability.

    Architecture: ``input_dim -> 256 -> 128 -> 1`` with ReLU after each hidden
    layer and a Sigmoid on the output, so every prediction lies in [0, 1].

    Args:
        input_dim: Number of features in each input sample.
    """

    def __init__(self, input_dim):
        super().__init__()
        hidden_widths = (256, 128)

        # Build the stack in order so layer indices (and therefore the
        # state_dict keys and parameter-initialisation order) stay fixed.
        layers = []
        in_width = input_dim
        for out_width in hidden_widths:
            layers.append(nn.Linear(in_width, out_width))
            layers.append(nn.ReLU())
            in_width = out_width
        layers.append(nn.Linear(in_width, 1))
        layers.append(nn.Sigmoid())

        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Return one sigmoid output per row of ``x`` (shape ``(batch, 1)``)."""
        prob = self.net(x)
        return prob

# Noise dimension for the generator, and the feature width read off the
# training matrix (its second axis).
latent_dim = 16
_, input_dim = X_train.shape

# Create G before D so parameter initialisation draws from the RNG in a fixed order.
G = Generator(latent_dim=latent_dim, output_dim=input_dim).to(device)
D = Discriminator(input_dim=input_dim).to(device)

# One Adam optimiser per network (generator first, then discriminator), same learning rate.
opt_g, opt_d = (optim.Adam(net.parameters(), lr=1e-3) for net in (G, D))

# Binary cross-entropy on probabilities (D already ends in a Sigmoid).
bce = nn.BCELoss()

# =========================
# 4) Train GAN (ONLY normal)
# =========================
# Alternating GAN training on X_train_t: for every mini-batch, take one
# discriminator step followed by one generator step.
batch_size = 128
epochs     = 40   # reduce to 10–20 for a faster run when no GPU is available
# Reshuffle every epoch; keep the final, smaller batch (drop_last=False).
train_loader = DataLoader(TensorDataset(X_train_t), batch_size=batch_size, shuffle=True, drop_last=False)

for epoch in range(1, epochs+1):
    G.train(); D.train()
    # Running sums of the per-batch losses, used for the epoch averages printed below.
    ep_d = ep_g = 0.0; steps = 0
    for (real_batch,) in train_loader:
        real_batch = real_batch.to(device)
        bsz = real_batch.size(0)  # actual batch size (the last batch may be smaller)

        # --- Train D ---
        opt_d.zero_grad()
        z = torch.randn(bsz, latent_dim, device=device)
        # detach(): the discriminator step must not backpropagate into G.
        fake_batch = G(z).detach()

        # Targets: 1 for real samples, 0 for generated samples.
        real_labels = torch.ones(bsz, 1, device=device)
        fake_labels = torch.zeros(bsz, 1, device=device)

        d_real = D(real_batch)
        d_fake = D(fake_batch)

        # Discriminator loss = BCE on the real batch + BCE on the fake batch.
        d_loss = bce(d_real, real_labels) + bce(d_fake, fake_labels)
        d_loss.backward()
        opt_d.step()

        # --- Train G ---
        opt_g.zero_grad()
        # Fresh noise for the generator step (not the batch used for D above).
        z = torch.randn(bsz, latent_dim, device=device)
        gen_batch = G(z)
        # G is rewarded when D labels its samples as real.
        g_loss = bce(D(gen_batch), real_labels)  # trick D
        # This backward pass also accumulates gradients in D's parameters; they
        # are cleared by opt_d.zero_grad() at the start of the next iteration.
        g_loss.backward()
        opt_g.step()

        ep_d += d_loss.item(); ep_g += g_loss.item(); steps += 1

    # Log the mean losses on the first epoch and then every 5th epoch.
    if epoch == 1 or epoch % 5 == 0:
        print(f"Epoch {epoch:3d}/{epochs} | D: {ep_d/steps:.4f} | G: {ep_g/steps:.4f}")

# =========================
# 5) Scoring & thresholds
# =========================
@torch.no_grad()
def disc_scores(x_np: np.ndarray) -> np.ndarray:
    """Return an anomaly score for every row of ``x_np``.

    The score is ``1 - D(x)``, so a higher value means "more anomalous"
    (the discriminator considers the row less likely to be real).

    Uses the module-level discriminator ``D`` and ``device``; ``D`` is switched
    to eval mode as a side effect. Gradients are disabled by the decorator.

    Args:
        x_np: Input rows as a NumPy array accepted by ``torch.from_numpy``.

    Returns:
        1-D NumPy array holding one score per input row.
    """
    D.eval()
    batch = torch.from_numpy(x_np).to(device)
    p_real = D(batch).cpu().numpy()
    return 1.0 - p_real.reshape(-1)

# Score each split with the trained discriminator (train first, then TEST_A, then TEST_B).
train_scores, testA_scores, testB_scores = (
    disc_scores(split) for split in (X_train, X_testA, X_testB)
)

# Threshold taken from the training scores only (no test-label leakage):
# their 95th percentile.
thr_train95 = float(np.percentile(train_scores, 95.0))

def best_f1_threshold(y_true, scores, percentiles=np.linspace(80, 99.9, 200)):
    """Pick the score threshold that maximises F1 for the positive class (label 1).

    Candidate thresholds are the given percentiles of ``scores``. A sample is
    predicted positive when ``score >= threshold``. Candidates are tried in the
    order given and the first one reaching the best F1 is kept.

    F1 is computed directly from counts as ``2*TP / (2*TP + FP + FN)``, written
    as ``2*TP / (n_positive + n_flagged)``. It is defined as 0 when that
    denominator is 0 (no true and no predicted positives).

    Args:
        y_true: Ground-truth labels (array-like); entries equal to 1 are positives.
        scores: Anomaly scores (array-like), one per entry of ``y_true``.
        percentiles: Percentile(s) in [0, 100] of ``scores`` to try as
            thresholds; a scalar or any array-like. The default array is only
            read, never modified.

    Returns:
        ``(best_threshold, best_f1)`` as Python floats.

    Raises:
        ValueError: If ``scores`` is empty, its length differs from
            ``y_true``, or ``percentiles`` yields no candidate threshold.
    """
    # Accept lists as well as arrays; keep the scores' dtype so the percentile
    # values are computed in the same precision as the comparisons below.
    y_true = np.asarray(y_true).reshape(-1)
    scores = np.asarray(scores).reshape(-1)
    if scores.size == 0:
        raise ValueError("scores must not be empty")
    if y_true.shape[0] != scores.shape[0]:
        raise ValueError(
            f"y_true and scores must have the same length "
            f"(got {y_true.shape[0]} and {scores.shape[0]})"
        )

    # atleast_1d lets a scalar percentile work like a one-element list.
    thrs = np.atleast_1d(np.percentile(scores, percentiles))
    if thrs.size == 0:
        raise ValueError("percentiles must contain at least one value")

    is_pos = y_true == 1
    n_pos = int(np.count_nonzero(is_pos))

    best_f1, best_thr = -1.0, None
    for t in thrs:
        flagged = scores >= t
        tp = int(np.count_nonzero(flagged & is_pos))
        # (TP + FN) + (TP + FP) == 2*TP + FP + FN
        denom = n_pos + int(np.count_nonzero(flagged))
        f1 = (2.0 * tp / denom) if denom else 0.0
        if f1 > best_f1:  # strict '>' keeps the earliest candidate on ties
            best_f1, best_thr = f1, t
    return float(best_thr), float(best_f1)

# Per-test-set thresholds that maximise F1 for class 1. Each is selected using
# that test set's own labels, so it is an optimistic reference point rather
# than a label-free threshold.
thrA_opt, f1A_opt = best_f1_threshold(y_true=y_testA, scores=testA_scores)
thrB_opt, f1B_opt = best_f1_threshold(y_true=y_testB, scores=testB_scores)

# Summary of the three thresholds used in the evaluation below.
print(f"\n[Train95th] thr={thr_train95:.6f}")
print(f"[TEST_A maxF1] thr={thrA_opt:.6f} | F1_1={f1A_opt:.4f}")
print(f"[TEST_B maxF1] thr={thrB_opt:.6f} | F1_1={f1B_opt:.4f}")

# =========================
# 6) Evaluate
# =========================
def evaluate(name, scores, y_true, thr_use):
    """Print a threshold-based evaluation report for one set of scores.

    A sample is predicted anomalous (1) when ``score >= thr_use``. Prints, in
    order: a header with ``name`` and the threshold, the confusion matrix, the
    classification report (4 decimals), and the ROC AUC computed from the raw
    scores (which does not depend on the threshold).

    Args:
        name: Title shown in the header line.
        scores: NumPy array of anomaly scores, one per sample.
        y_true: Ground-truth labels aligned with ``scores``.
        thr_use: Decision threshold applied to ``scores``.

    Returns:
        None. Output goes to stdout.
    """
    predicted = (scores >= thr_use).astype(int)

    header = f"\n===== {name} @thr={thr_use:.6f} ====="
    print(header)

    # Each metric is computed right before it is printed, so earlier sections
    # still appear if a later metric raises.
    cm = confusion_matrix(y_true, predicted)
    print("Confusion Matrix:\n", cm)

    report = classification_report(y_true, predicted, digits=4)
    print("\nClassification Report:\n", report)

    auc = roc_auc_score(y_true, scores)
    print("ROC AUC (scores):", auc)

# Evaluate each test set at the train-derived threshold and at its own best-F1
# threshold. The title is built from the actual class counts in the labels:
# a hard-coded "400 anomaly"/"800 anomaly" is wrong whenever the dataset holds
# fewer anomalies than requested and the test sets are capped.
for _tag, _scores, _y, _thr_opt in (
    ("TEST_A", testA_scores, y_testA, thrA_opt),
    ("TEST_B", testB_scores, y_testB, thrB_opt),
):
    _n_norm = int((_y == 0).sum())
    _n_anom = int((_y == 1).sum())
    _title = f"{_tag} ({_n_norm} normal + {_n_anom} anomaly)"
    evaluate(f"{_title} - Train95th", _scores, _y, thr_train95)
    evaluate(f"{_title} - BestF1", _scores, _y, _thr_opt)


Device: cpu
Counts -> Normal: 9661 | Anomaly: 339
[WARN] Không đủ anomaly theo yêu cầu. Sẽ dùng tối đa: TEST_A=339, TEST_B=339
Train normal size: 8000
TEST_A dist (200/400 mong muốn): {0: 200, 1: 339}
TEST_B dist (200/800 mong muốn): {0: 200, 1: 339}
Epoch   1/40 | D: 0.8702 | G: 1.5420
Epoch   5/40 | D: 0.5726 | G: 2.1599
Epoch  10/40 | D: 0.8472 | G: 2.6718
Epoch  15/40 | D: 1.1864 | G: 1.0912
Epoch  20/40 | D: 1.2860 | G: 0.8688
Epoch  25/40 | D: 1.3260 | G: 0.7184
Epoch  30/40 | D: 1.2951 | G: 0.7364
Epoch  35/40 | D: 1.3655 | G: 0.7040
Epoch  40/40 | D: 1.3360 | G: 0.7459

[Train95th] thr=0.765946
[TEST_A maxF1] thr=0.630570 | F1_1=0.2736
[TEST_B maxF1] thr=0.714399 | F1_1=0.1806

===== TEST_A (200 normal + 400 anomaly) - Train95th @thr=0.765946 =====
Confusion Matrix:
 [[200   0]
 [313  26]]

Classification Report:
               precision    recall  f1-score   support

           0     0.3899    1.0000    0.5610       200
           1     1.0000    0.0767    0.1425       339

  