In [5]:
import os, random
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score, precision_recall_fscore_support
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

# =========================
# 0) Reproducibility & device
# =========================
SEED = 42
random.seed(SEED); np.random.seed(SEED)
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)

# =========================
# 1) Load & preprocess
# =========================
csv_path = "../../data/ai4i2020.csv"   # đổi thành "/mnt/data/ai4i2020.csv" nếu cần
df = pd.read_csv(csv_path)

drop_cols = ['UDI', 'Product ID', 'Type', 'Machine failure', 'TWF', 'HDF', 'PWF', 'OSF', 'RNF']
features = df.drop(columns=drop_cols)
labels = df['Machine failure'].astype(int).to_numpy()

scaler = StandardScaler()
X_all = scaler.fit_transform(features).astype(np.float32)

# =========================
# 2) Build sets theo yêu cầu
#    - Train: 8000 normal
#    - TEST_A: 200 normal + 400 anomaly (giới hạn nếu thiếu)
#    - TEST_B: 200 normal + 800 anomaly (giới hạn nếu thiếu)
# =========================
normal_idx = np.where(labels == 0)[0]
anom_idx   = np.where(labels == 1)[0]
n_normal = len(normal_idx)
n_anom   = len(anom_idx)
print(f"Counts -> Normal: {n_normal} | Anomaly: {n_anom}")

assert n_normal >= 8400, "Cần >= 8400 normal để lấy 8000 train + 200 + 200 test."

# Train: ONLY normal
X_train = X_all[normal_idx[:8000]]

# Lấy normal cho 2 test set (không trùng nhau để sạch)
testA_normal = X_all[normal_idx[8000:8200]]   # 200 normal
testB_normal = X_all[normal_idx[8200:8400]]   # 200 normal

# Yêu cầu anomaly
reqA_anom = 400
reqB_anom = 800

# Giới hạn theo thực tế
gotA_anom = min(reqA_anom, n_anom)
gotB_anom = min(reqB_anom, n_anom)

if gotA_anom < reqA_anom or gotB_anom < reqB_anom:
    print(f"[WARN] Không đủ anomaly theo yêu cầu. Sẽ dùng tối đa: "
          f"TEST_A={gotA_anom} anomaly, TEST_B={gotB_anom} anomaly.")

# Dùng cùng tập anomaly đầu cho 2 test set (đảm bảo dùng chung phân phối)
testA_anom = X_all[anom_idx[:gotA_anom]]
testB_anom = X_all[anom_idx[:gotB_anom]]

X_testA = np.vstack([testA_normal, testA_anom]).astype(np.float32)
y_testA = np.hstack([np.zeros(testA_normal.shape[0], dtype=int),
                     np.ones(testA_anom.shape[0], dtype=int)])

X_testB = np.vstack([testB_normal, testB_anom]).astype(np.float32)
y_testB = np.hstack([np.zeros(testB_normal.shape[0], dtype=int),
                     np.ones(testB_anom.shape[0], dtype=int)])

print("Train normal size:", X_train.shape[0])
print("TEST_A dist (200/400 mong muốn):", {0: int((y_testA==0).sum()), 1: int((y_testA==1).sum())})
print("TEST_B dist (200/800 mong muốn):", {0: int((y_testB==0).sum()), 1: int((y_testB==1).sum())})

X_train_t = torch.from_numpy(X_train)
X_testA_t = torch.from_numpy(X_testA)
X_testB_t = torch.from_numpy(X_testB)

# =========================
# 3) VAE definition
# =========================
class VAE(nn.Module):
    def __init__(self, input_dim, latent_dim=8, hidden=128):
        super().__init__()
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, hidden), nn.ReLU(),
            nn.Linear(hidden, hidden), nn.ReLU()
        )
        self.enc_mu    = nn.Linear(hidden, latent_dim)
        self.enc_logvar= nn.Linear(hidden, latent_dim)
        self.decoder = nn.Sequential(
            nn.Linear(latent_dim, hidden), nn.ReLU(),
            nn.Linear(hidden, hidden), nn.ReLU(),
            nn.Linear(hidden, input_dim)
        )

    def reparameterize(self, mu, logvar):
        std = torch.exp(0.5 * logvar)
        eps = torch.randn_like(std)
        return mu + eps * std

    def forward(self, x):
        h   = self.encoder(x)
        mu  = self.enc_mu(h)
        lv  = self.enc_logvar(h)
        z   = self.reparameterize(mu, lv)
        xhat= self.decoder(z)
        return xhat, mu, lv, z

def vae_loss_fn(x, xhat, mu, logvar, recon="mse", beta=1.0):
    if recon == "mse":
        recon_loss = nn.functional.mse_loss(xhat, x, reduction="mean")
    else:
        recon_loss = nn.functional.l1_loss(xhat, x, reduction="mean")
    kl = -0.5 * torch.mean(1 + logvar - mu.pow(2) - logvar.exp())
    return recon_loss + beta * kl, recon_loss, kl

# =========================
# 4) Train VAE on normal only
# =========================
input_dim = X_train.shape[1]
vae = VAE(input_dim, latent_dim=8, hidden=128).to(device)
opt = optim.Adam(vae.parameters(), lr=1e-3)

batch_size = 128
epochs = 40  # giảm 10–20 nếu cần chạy nhanh
train_loader = DataLoader(TensorDataset(X_train_t), batch_size=batch_size, shuffle=True, drop_last=False)

for ep in range(1, epochs+1):
    vae.train()
    total = recon_total = kl_total = 0.0
    steps = 0
    for (xb,) in train_loader:
        xb = xb.to(device)
        opt.zero_grad()
        xhat, mu, lv, z = vae(xb)
        loss, rl, kl = vae_loss_fn(xb, xhat, mu, lv, recon="mse", beta=1.0)
        loss.backward()
        opt.step()
        total += loss.item(); recon_total += rl.item(); kl_total += kl.item(); steps += 1
    if ep == 1 or ep % 5 == 0:
        print(f"Epoch {ep:3d}/{epochs} | loss={total/steps:.6f} | recon={recon_total/steps:.6f} | kl={kl_total/steps:.6f}")

# =========================
# 5) Scoring: reconstruction error
# =========================
@torch.no_grad()
def recon_error(x_np: np.ndarray, model: VAE, agg="mse"):
    model.eval()
    xt = torch.from_numpy(x_np).to(device)
    xhat, _, _, _ = model(xt)
    if agg == "mse":
        err = torch.mean((xhat - xt)**2, dim=1)
    else:
        err = torch.mean(torch.abs(xhat - xt), dim=1)
    return err.detach().cpu().numpy()

train_err = recon_error(X_train, vae, agg="mse")
testA_err = recon_error(X_testA, vae, agg="mse")
testB_err = recon_error(X_testB, vae, agg="mse")

# =========================
# 6) Thresholds
# =========================
thr_train95 = float(np.percentile(train_err, 95.0))

def best_f1_threshold(y_true, scores, percentiles=np.linspace(80, 99.9, 200)):
    thrs = np.percentile(scores, percentiles)
    best_f1, best_thr = -1.0, None
    for t in thrs:
        y_pred = (scores >= t).astype(int)
        prec, rec, f1, _ = precision_recall_fscore_support(y_true, y_pred, labels=[0,1], average=None, zero_division=0)
        if f1[1] > best_f1:
            best_f1, best_thr = f1[1], t
    return float(best_thr), float(best_f1)

thrA_opt, f1A_opt = best_f1_threshold(y_testA, testA_err)
thrB_opt, f1B_opt = best_f1_threshold(y_testB, testB_err)

print(f"\n[Train95th] thr={thr_train95:.6f}")
print(f"[TEST_A maxF1] thr={thrA_opt:.6f} | F1_1={f1A_opt:.4f}")
print(f"[TEST_B maxF1] thr={thrB_opt:.6f} | F1_1={f1B_opt:.4f}")

# =========================
# 7) Evaluate
# =========================
def evaluate(name, scores, y_true, thr_use):
    y_pred = (scores >= thr_use).astype(int)
    print(f"\n===== {name} @thr={thr_use:.6f} =====")
    print("Confusion Matrix:\n", confusion_matrix(y_true, y_pred))
    print("\nClassification Report:\n", classification_report(y_true, y_pred, digits=4))
    print("ROC AUC (scores):", roc_auc_score(y_true, scores))

evaluate("TEST_A (200 normal + 400 anomaly) - Train95th", testA_err, y_testA, thr_train95)
evaluate("TEST_A (200 normal + 400 anomaly) - BestF1",    testA_err, y_testA, thrA_opt)

evaluate("TEST_B (200 normal + 800 anomaly) - Train95th", testB_err, y_testB, thr_train95)
evaluate("TEST_B (200 normal + 800 anomaly) - BestF1",    testB_err, y_testB, thrB_opt)


Device: cpu
Counts -> Normal: 9661 | Anomaly: 339
[WARN] Không đủ anomaly theo yêu cầu. Sẽ dùng tối đa: TEST_A=339 anomaly, TEST_B=339 anomaly.
Train normal size: 8000
TEST_A dist (200/400 mong muốn): {0: 200, 1: 339}
TEST_B dist (200/800 mong muốn): {0: 200, 1: 339}
Epoch   1/40 | loss=0.794748 | recon=0.678909 | kl=0.115839
Epoch   5/40 | loss=0.516554 | recon=0.231281 | kl=0.285273
Epoch  10/40 | loss=0.517975 | recon=0.229551 | kl=0.288425
Epoch  15/40 | loss=0.511042 | recon=0.224404 | kl=0.286638
Epoch  20/40 | loss=0.514568 | recon=0.225287 | kl=0.289281
Epoch  25/40 | loss=0.511787 | recon=0.223783 | kl=0.288003
Epoch  30/40 | loss=0.511829 | recon=0.224208 | kl=0.287621
Epoch  35/40 | loss=0.513064 | recon=0.222136 | kl=0.290928
Epoch  40/40 | loss=0.513160 | recon=0.224509 | kl=0.288651

[Train95th] thr=0.554915
[TEST_A maxF1] thr=0.442104 | F1_1=0.3535
[TEST_B maxF1] thr=0.436837 | F1_1=0.4251

===== TEST_A (200 normal + 400 anomaly) - Train95th @thr=0.554915 =====
Confusion