In [1]:
import os, random
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import IsolationForest
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score, precision_recall_fscore_support

# =========================
# 0) Reproducibility
# =========================
SEED = 42
random.seed(SEED); np.random.seed(SEED)

# =========================
# 1) Load & preprocess
# =========================
csv_path = "../../data/ai4i2020.csv"  # đổi "/mnt/data/ai4i2020.csv" nếu cần
df = pd.read_csv(csv_path)

drop_cols = ['UDI', 'Product ID', 'Type', 'Machine failure', 'TWF', 'HDF', 'PWF', 'OSF', 'RNF']
features = df.drop(columns=drop_cols)
labels = df['Machine failure'].astype(int).to_numpy()

scaler = StandardScaler()
X_all = scaler.fit_transform(features).astype(np.float32)

# =========================
# 2) Split (giống VAE/GAN): Train 8000 normal; TEST_A: 200/400; TEST_B: 200/800
# =========================
normal_idx = np.where(labels == 0)[0]
anom_idx   = np.where(labels == 1)[0]
n_normal, n_anom = len(normal_idx), len(anom_idx)
print(f"Counts -> Normal: {n_normal} | Anomaly: {n_anom}")
assert n_normal >= 8400, "Cần >= 8400 normal để 8000 train + 200 + 200 test."

X_train = X_all[normal_idx[:8000]]

testA_normal = X_all[normal_idx[8000:8200]]  # 200 normal
testB_normal = X_all[normal_idx[8200:8400]]  # 200 normal

reqA_anom, reqB_anom = 400, 800
gotA_anom = min(reqA_anom, n_anom)
gotB_anom = min(reqB_anom, n_anom)
if gotA_anom < reqA_anom or gotB_anom < reqB_anom:
    print(f"[WARN] Không đủ anomaly theo yêu cầu. Sẽ dùng tối đa: TEST_A={gotA_anom}, TEST_B={gotB_anom}")

testA_anom = X_all[anom_idx[:gotA_anom]]
testB_anom = X_all[anom_idx[:gotB_anom]]

X_testA = np.vstack([testA_normal, testA_anom]).astype(np.float32)
y_testA = np.hstack([np.zeros(testA_normal.shape[0], dtype=int),
                     np.ones(testA_anom.shape[0], dtype=int)])

X_testB = np.vstack([testB_normal, testB_anom]).astype(np.float32)
y_testB = np.hstack([np.zeros(testB_normal.shape[0], dtype=int),
                     np.ones(testB_anom.shape[0], dtype=int)])

print("Train normal size:", X_train.shape[0])
print("TEST_A dist (200/400 mong muốn):", {0: int((y_testA==0).sum()), 1: int((y_testA==1).sum())})
print("TEST_B dist (200/800 mong muốn):", {0: int((y_testB==0).sum()), 1: int((y_testB==1).sum())})

# =========================
# 3) Fit Isolation Forest (only normal)
# =========================
# Không dùng predict() để phụ thuộc contamination; tự tính score và tự đặt ngưỡng.
clf = IsolationForest(
    n_estimators=300,
    max_samples='auto',
    contamination='auto',   # không ảnh hưởng vì ta tự threshold
    random_state=SEED,
    n_jobs=-1,
    bootstrap=False
)
clf.fit(X_train)

# =========================
# 4) Anomaly scores (cao hơn => bất thường hơn)
# =========================
# score_samples: higher => more normal, nên đảo dấu
def anomaly_score(model, X):
    return -model.score_samples(X)

train_scores = anomaly_score(clf, X_train)
testA_scores = anomaly_score(clf, X_testA)
testB_scores = anomaly_score(clf, X_testB)

# =========================
# 5) Thresholds
# =========================
thr_train95 = float(np.percentile(train_scores, 95.0))

def best_f1_threshold(y_true, scores, percentiles=np.linspace(80, 99.9, 200)):
    thrs = np.percentile(scores, percentiles)
    best_f1, best_thr = -1.0, None
    for t in thrs:
        y_pred = (scores >= t).astype(int)  # 1 = anomaly
        prec, rec, f1, _ = precision_recall_fscore_support(y_true, y_pred, labels=[0,1], average=None, zero_division=0)
        if f1[1] > best_f1:
            best_f1, best_thr = f1[1], t
    return float(best_thr), float(best_f1)

thrA_opt, f1A_opt = best_f1_threshold(y_testA, testA_scores)
thrB_opt, f1B_opt = best_f1_threshold(y_testB, testB_scores)

print(f"\n[Train95th] thr={thr_train95:.6f}")
print(f"[TEST_A maxF1] thr={thrA_opt:.6f} | F1_1={f1A_opt:.4f}")
print(f"[TEST_B maxF1] thr={thrB_opt:.6f} | F1_1={f1B_opt:.4f}")

# =========================
# 6) Evaluate
# =========================
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

def evaluate(name, scores, y_true, thr_use):
    y_pred = (scores >= thr_use).astype(int)
    print(f"\n===== {name} @thr={thr_use:.6f} =====")
    print("Confusion Matrix:\n", confusion_matrix(y_true, y_pred))
    print("\nClassification Report:\n", classification_report(y_true, y_pred, digits=4))
    print("ROC AUC (scores):", roc_auc_score(y_true, scores))

evaluate("TEST_A (200 normal + 400 anomaly) - Train95th", testA_scores, y_testA, thr_train95)
evaluate("TEST_A (200 normal + 400 anomaly) - BestF1",    testA_scores, y_testA, thrA_opt)

evaluate("TEST_B (200 normal + 800 anomaly) - Train95th", testB_scores, y_testB, thr_train95)
evaluate("TEST_B (200 normal + 800 anomaly) - BestF1",    testB_scores, y_testB, thrB_opt)


Counts -> Normal: 9661 | Anomaly: 339
[WARN] Không đủ anomaly theo yêu cầu. Sẽ dùng tối đa: TEST_A=339, TEST_B=339
Train normal size: 8000
TEST_A dist (200/400 mong muốn): {0: 200, 1: 339}
TEST_B dist (200/800 mong muốn): {0: 200, 1: 339}

[Train95th] thr=0.574772
[TEST_A maxF1] thr=0.566733 | F1_1=0.4653
[TEST_B maxF1] thr=0.566917 | F1_1=0.4653

===== TEST_A (200 normal + 400 anomaly) - Train95th @thr=0.574772 =====
Confusion Matrix:
 [[196   4]
 [244  95]]

Classification Report:
               precision    recall  f1-score   support

           0     0.4455    0.9800    0.6125       200
           1     0.9596    0.2802    0.4338       339

    accuracy                         0.5399       539
   macro avg     0.7025    0.6301    0.5231       539
weighted avg     0.7688    0.5399    0.5001       539

ROC AUC (scores): 0.8741150442477876

===== TEST_A (200 normal + 400 anomaly) - BestF1 @thr=0.566733 =====
Confusion Matrix:
 [[196   4]
 [235 104]]

Classification Report:
           