In [8]:
# ==========================================
# GANomaly for ai4i2020 (Predictive Maintenance) - Unsupervised
# Split fix: reuse anomaly for TESTB & TESTI (Cách A)
# ==========================================
import os, random, numpy as np, pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (confusion_matrix, classification_report,
                             roc_auc_score, precision_recall_fscore_support)
import torch, torch.nn as nn, torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

# ---------- Repro ----------
SEED=42
random.seed(SEED); np.random.seed(SEED)
torch.manual_seed(SEED)
if torch.cuda.is_available(): torch.cuda.manual_seed_all(SEED)
torch.backends.cudnn.benchmark=False
torch.backends.cudnn.deterministic=True
device=torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)

# ---------- Load & preprocess ----------
csv_path = "../../../data/ai4i2020.csv"   # đổi thành "/mnt/data/ai4i2020.csv" nếu cần
df = pd.read_csv(csv_path)

# chuẩn hoá tên cột
orig_cols = df.columns.tolist()
df.columns = [c.strip() for c in df.columns]
lower_map = {c.lower(): c for c in df.columns}
def has(c): return c in lower_map
def col(c): return lower_map[c]

# nhãn
label_candidates = ["machine failure","target","class","label"]
label_col = None
for cand in label_candidates:
    if has(cand):
        label_col = col(cand); break
assert label_col is not None, f"Không tìm thấy cột nhãn trong {orig_cols}"

y_all = pd.to_numeric(df[label_col], errors="coerce").fillna(0).astype(int).to_numpy()

# drop cột phụ
drop_lowers = set([label_col.lower(),
                   "udi","product id","failure type",
                   "twf","hdf","pwf","osf","rnf"])
drop_cols = [lower_map[x] for x in drop_lowers if x in lower_map]
X_df = df.drop(columns=drop_cols, errors="ignore")

# one-hot type
if has("type"): X_df = pd.get_dummies(X_df, columns=[col("type")], drop_first=True)
for c in X_df.columns:
    X_df[c] = pd.to_numeric(X_df[c], errors="coerce")
X_df = X_df.fillna(X_df.median(numeric_only=True))

print(f"Data shape after processing: X={X_df.shape}, y={y_all.shape}")

# ---------- Splits ----------
normal_idx = np.where(y_all==0)[0]
anom_idx   = np.where(y_all==1)[0]
rng = np.random.default_rng(SEED)
rng.shuffle(normal_idx); rng.shuffle(anom_idx)

TR_N_MAX=20000; VAL_N=150; VAL_A=150; TESTB_N=189; TESTB_A=189; TESTI_N=800; TESTI_A=189

# lấy anomaly: reuse cho TESTB & TESTI
val_a  = rng.choice(anom_idx, size=min(VAL_A, len(anom_idx)), replace=False)
tstb_a = rng.choice(anom_idx, size=min(TESTB_A, len(anom_idx)), replace=False)
tsti_a = tstb_a.copy()

# lấy normal
val_n  = rng.choice(normal_idx, size=VAL_N, replace=False)
tstb_n = rng.choice(normal_idx, size=TESTB_N, replace=False)
tsti_n = rng.choice(normal_idx, size=TESTI_N, replace=False)
trn_n  = np.setdiff1d(normal_idx, np.concatenate([val_n,tstb_n,tsti_n]))

def Xy_from_idxs(idxs):
    return X_df.iloc[idxs].to_numpy().astype(np.float32), y_all[idxs]

X_tr_n,_ = Xy_from_idxs(trn_n)
X_val, y_val   = Xy_from_idxs(np.concatenate([val_n,val_a]))
X_tstb,y_tstb  = Xy_from_idxs(np.concatenate([tstb_n,tstb_a]))
X_tsti,y_tsti  = Xy_from_idxs(np.concatenate([tsti_n,tsti_a]))

print(f"TrainN={len(trn_n)}, Val={len(val_n)}/{len(val_a)}, "
      f"TestB={len(tstb_n)}/{len(tstb_a)}, TestI={len(tsti_n)}/{len(tsti_a)}")

# ---------- Scale ----------
scaler = StandardScaler().fit(X_tr_n)
def z(x): return scaler.transform(x).astype(np.float32)
X_tr_n=z(X_tr_n); X_val=z(X_val); X_tstb=z(X_tstb); X_tsti=z(X_tsti)
INPUT_DIM = X_tr_n.shape[1]

# ---------- GANomaly (E–D–E + D feat) ----------
def mlp(in_dim, hidden, out_dim):
    return nn.Sequential(
        nn.Linear(in_dim, hidden), nn.LeakyReLU(0.2, inplace=False),
        nn.Linear(hidden, hidden), nn.LeakyReLU(0.2, inplace=False),
        nn.Linear(hidden, out_dim)
    )
class Encoder(nn.Module):
    def __init__(self,in_dim,z_dim=32,h=128):
        super().__init__(); self.net=mlp(in_dim,h,z_dim)
    def forward(self,x): return self.net(x)
class Decoder(nn.Module):
    def __init__(self,z_dim=32,out_dim=INPUT_DIM,h=128):
        super().__init__(); self.net=mlp(z_dim,h,out_dim)
    def forward(self,z): return self.net(z)
class Discriminator(nn.Module):
    def __init__(self,in_dim,h=128,feat_dim=64):
        super().__init__()
        self.feat=nn.Sequential(nn.Linear(in_dim,h),nn.LeakyReLU(0.2,False),
                                nn.Linear(h,feat_dim),nn.LeakyReLU(0.2,False))
        self.out=nn.Linear(feat_dim,1)
    def forward(self,x):
        f=self.feat(x); return self.out(f),f
class GANomaly(nn.Module):
    def __init__(self,in_dim,z_dim=32,h=128):
        super().__init__()
        self.enc1=Encoder(in_dim,z_dim,h); self.dec=Decoder(z_dim,in_dim,h)
        self.enc2=Encoder(in_dim,z_dim,h); self.disc=Discriminator(in_dim,h)
    def forward(self,x):
        z=self.enc1(x); x_=self.dec(z); z_=self.enc2(x_)
        d_r,f_r=self.disc(x); d_f,f_f=self.disc(x_)
        return x_,z,z_,d_r,d_f,f_r,f_f

model=GANomaly(INPUT_DIM,32,128).to(device)

# ---------- Loss & Optim ----------
LR=1e-4
opt_G=optim.Adam(list(model.enc1.parameters())+list(model.dec.parameters())+list(model.enc2.parameters()),
                 lr=LR,betas=(0.5,0.999))
opt_D=optim.Adam(model.disc.parameters(),lr=LR,betas=(0.5,0.999))
l1=nn.L1Loss(); L_CON,L_ENC,L_FM=50.0,1.0,10.0
def set_grad(m,flag):
    for p in m.parameters(): p.requires_grad=flag

# ---------- Train ----------
BS,EPOCHS=256,60
loader=DataLoader(TensorDataset(torch.from_numpy(X_tr_n)),batch_size=BS,shuffle=True)
for ep in range(1,EPOCHS+1):
    model.train(); lg,ld,steps=0,0,0
    for (xb,) in loader:
        xb=xb.to(device)
        # D
        set_grad(model.disc,True); opt_D.zero_grad()
        with torch.no_grad():
            zt=model.enc1(xb); xh=model.dec(zt)
        dr,_=model.disc(xb); df,_=model.disc(xh.detach())
        loss_d=torch.relu(1-dr).mean()+torch.relu(1+df).mean()
        loss_d.backward(); opt_D.step()
        # G
        set_grad(model.disc,False); opt_G.zero_grad()
        xh,z,zh,dr2,df2,fr,ff=model(xb)
        loss_fm=l1(fr.detach(),ff)
        loss_con=l1(xh,xb)
        loss_enc=torch.mean((z-zh)**2)
        loss_g=L_FM*loss_fm+L_CON*loss_con+L_ENC*loss_enc
        loss_g.backward(); opt_G.step()
        lg+=loss_g.item(); ld+=loss_d.item(); steps+=1
    if ep==1 or ep%5==0:
        print(f"[GANomaly] Epoch {ep}/{EPOCHS} | L_G={lg/steps:.4f} | L_D={ld/steps:.4f}")

# ---------- Score ----------
ALPHA,BETA,GAMMA=0.7,0.3,0.0
@torch.no_grad()
def anomaly_score(x):
    model.eval()
    xt=torch.from_numpy(x).to(device)
    xh,z,zh,dr,df,fr,ff=model(xt)
    recon=torch.mean(torch.abs(xh-xt),1)
    latent=torch.mean((z-zh)**2,1)
    feat=torch.mean(torch.abs(fr-ff),1)
    return (ALPHA*recon+ BETA*latent+ GAMMA*feat).cpu().numpy()

val_raw=anomaly_score(X_val)
smin,smax=val_raw.min(),val_raw.max(); eps=1e-12
def to_norm(s): return (s-smin)/(smax-smin+eps)

# ---------- Threshold ----------
def best_f1_threshold(y,s):
    best_thr,best_f1=None,-1
    for p in np.linspace(50,99.5,200):
        t=np.percentile(s,p)
        yhat=(s>=t).astype(int)
        _,_,f1,_=precision_recall_fscore_support(y,yhat,labels=[0,1],average=None,zero_division=0)
        if f1[1]>best_f1: best_f1,best_thr=f1[1],t
    return best_thr,best_f1

thr,f1v=best_f1_threshold(y_val,to_norm(val_raw))
print(f"\n[VAL balanced] Best F1(Class1)={f1v:.3f} @thr={thr:.6f}")

def evaluate(name,X,y,thr):
    s=to_norm(anomaly_score(X)); yhat=(s>=thr).astype(int)
    print(f"\n===== {name} @thr={thr:.6f} =====")
    print("Confusion:\n",confusion_matrix(y,yhat,labels=[0,1]))
    print(classification_report(y,yhat,labels=[0,1],digits=4,zero_division=0))
    if len(np.unique(y))==2:
        print("ROC AUC:",roc_auc_score(y,s))
    else:
        print("ROC AUC: N/A (only one class)")

evaluate("TEST Balanced",X_tstb,y_tstb,thr)
evaluate("TEST Imbalanced",X_tsti,y_tsti,thr)

print("\n>>> Quick sweep:")
for p in [60,70,80,85,90,95]:
    t=np.percentile(to_norm(val_raw),p)
    print(f"\n-- pctl={p}, thr={t:.6f}")
    evaluate("TEST Balanced",X_tstb,y_tstb,t)
    evaluate("TEST Imbalanced",X_tsti,y_tsti,t)


Device: cpu
Data shape after processing: X=(10000, 7), y=(10000,)
TrainN=8565, Val=150/150, TestB=189/189, TestI=800/189
[GANomaly] Epoch 1/60 | L_G=43.1092 | L_D=1.9411
[GANomaly] Epoch 5/60 | L_G=13.9190 | L_D=1.9055
[GANomaly] Epoch 10/60 | L_G=7.1102 | L_D=1.9392
[GANomaly] Epoch 15/60 | L_G=6.6548 | L_D=1.8828
[GANomaly] Epoch 20/60 | L_G=5.8621 | L_D=1.8134
[GANomaly] Epoch 25/60 | L_G=4.0485 | L_D=1.8416
[GANomaly] Epoch 30/60 | L_G=1.9336 | L_D=1.9307
[GANomaly] Epoch 35/60 | L_G=1.0858 | L_D=1.9982
[GANomaly] Epoch 40/60 | L_G=0.9303 | L_D=1.9986
[GANomaly] Epoch 45/60 | L_G=0.8339 | L_D=1.9989
[GANomaly] Epoch 50/60 | L_G=0.7805 | L_D=1.9989
[GANomaly] Epoch 55/60 | L_G=0.7555 | L_D=1.9993
[GANomaly] Epoch 60/60 | L_G=0.7192 | L_D=1.9992

[VAL balanced] Best F1(Class1)=0.669 @thr=0.141909

===== TEST Balanced @thr=0.141909 =====
Confusion:
 [[124  65]
 [ 59 130]]
              precision    recall  f1-score   support

           0     0.6776    0.6561    0.6667       189
     