In [2]:
# ==========================================
# Isolation Forest anomaly detection with manual threshold option
# Dataset: ai4i2020.csv
# - Train: ONLY normal
# - VAL (balanced): choose the threshold (manual / F1 / Precision≥target)
# - TEST Balanced & Imbalanced: evaluated with that same threshold
# ==========================================
import os, random
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    confusion_matrix, classification_report, roc_auc_score,
    precision_recall_fscore_support, precision_recall_curve
)
from sklearn.ensemble import IsolationForest

# ------------------------------
# 0) Reproducibility
# ------------------------------
SEED = 42
# Seed both the stdlib and the legacy NumPy global generators with the same value.
random.seed(SEED)
np.random.seed(SEED)

# ------------------------------
# 1) Load & preprocess
# ------------------------------
csv_path = "../../../data/ai4i2020.csv"   # change to "/mnt/data/ai4i2020.csv" if needed
df = pd.read_csv(csv_path)

print("Tổng số dòng:", len(df))
print("Phân phối nhãn Machine failure:\n", df["Machine failure"].value_counts())
print("Tỷ lệ (%):\n", df["Machine failure"].value_counts(normalize=True) * 100)

# Keep only the feature columns: drop the identifiers, 'Type', the label itself
# and the TWF/HDF/PWF/OSF/RNF columns (presumably per-failure-mode indicators
# that would leak the label -- verify against the dataset description).
drop_cols = ['UDI', 'Product ID', 'Type', 'Machine failure', 'TWF', 'HDF', 'PWF', 'OSF', 'RNF']
X_df = df.drop(columns=drop_cols)
y = df['Machine failure'].to_numpy().astype(int)

# ------------------------------
# 2) Build splits (sizes adapt to the actual counts, n_anom≈339)
# ------------------------------
normal_idx = np.where(y == 0)[0]
anom_idx   = np.where(y == 1)[0]
n_normal, n_anom = len(normal_idx), len(anom_idx)
print(f"\nNormal: {n_normal} | Anomaly: {n_anom}")

# Shuffle the row indices of each class with a dedicated, seeded generator.
rng = np.random.default_rng(SEED)
rng.shuffle(normal_idx); rng.shuffle(anom_idx)

# Train: normal rows only.
TRAIN_NORMAL = min(8000, n_normal - 2000) if n_normal > 2000 else max(1000, n_normal // 2)
train_norm_idx = normal_idx[:TRAIN_NORMAL]

# Fit the scaler on the training normals ONLY, then apply it to every row.
# Fitting on the full dataset would let VAL/TEST rows (and anomalies) shape
# the mean/std used for training, which contradicts the "train on normal
# only" protocol of this experiment.
scaler = StandardScaler()
scaler.fit(X_df.iloc[train_norm_idx])
X_all = scaler.transform(X_df).astype(np.float32)

# Validation (balanced): same number of normal and anomalous rows.
VAL_ANOM = min(150, n_anom // 2)
VAL_NORM = VAL_ANOM
val_anom_idx = anom_idx[:VAL_ANOM]
val_norm_idx = normal_idx[TRAIN_NORMAL : TRAIN_NORMAL + VAL_NORM]

# Test (balanced): anomalies not used for validation, plus as many normals.
TEST_ANOM = min(200, n_anom - VAL_ANOM)
TEST_NORM_BAL = TEST_ANOM
test_bal_anom_idx = anom_idx[VAL_ANOM : VAL_ANOM + TEST_ANOM]
test_bal_norm_idx = normal_idx[TRAIN_NORMAL + VAL_NORM :
                               TRAIN_NORMAL + VAL_NORM + TEST_NORM_BAL]

# Test (imbalanced, ~4:1): up to 800 further normals that appear in no other split.
TEST_IMB_NORM = min(800, n_normal - (TRAIN_NORMAL + VAL_NORM + TEST_NORM_BAL))
test_imb_norm_idx = normal_idx[TRAIN_NORMAL + VAL_NORM + TEST_NORM_BAL :
                               TRAIN_NORMAL + VAL_NORM + TEST_NORM_BAL + TEST_IMB_NORM]
test_imb_anom_idx = test_bal_anom_idx  # reuse the same anomalies for a fair comparison

def take(idx):
    """Return ``(features, labels)`` for the rows selected by ``idx``.

    Reads the module-level ``X_all`` (scaled features) and ``y`` (labels).
    """
    features = X_all[idx]
    labels = y[idx]
    return features, labels

# Training data: normal rows only, so every label in y_train is 0.
X_train, y_train = take(train_norm_idx)

# Each evaluation set stacks its normal rows first, then its anomalous rows;
# the label vector is built to match that order (0s followed by 1s).
X_val = np.concatenate([X_all[val_norm_idx], X_all[val_anom_idx]], axis=0)
y_val = np.concatenate([
    np.zeros(len(val_norm_idx), dtype=int),
    np.ones(len(val_anom_idx), dtype=int),
])

X_test_bal = np.concatenate([X_all[test_bal_norm_idx], X_all[test_bal_anom_idx]], axis=0)
y_test_bal = np.concatenate([
    np.zeros(len(test_bal_norm_idx), dtype=int),
    np.ones(len(test_bal_anom_idx), dtype=int),
])

X_test_imb = np.concatenate([X_all[test_imb_norm_idx], X_all[test_imb_anom_idx]], axis=0)
y_test_imb = np.concatenate([
    np.zeros(len(test_imb_norm_idx), dtype=int),
    np.ones(len(test_imb_anom_idx), dtype=int),
])

print("\n--- Split summary (auto) ---")
print(f"Train normal size          : {X_train.shape[0]}")
print(f"VAL balanced (norm/anom)   : {len(val_norm_idx)} / {len(val_anom_idx)}")
print(f"TEST balanced (norm/anom)  : {len(test_bal_norm_idx)} / {len(test_bal_anom_idx)}")
print(f"TEST imbalanced (norm/anom): {len(test_imb_norm_idx)} / {len(test_imb_anom_idx)}")

# ------------------------------
# 3) Fit Isolation Forest (ONLY normal)
# ------------------------------
# Build the forest and fit it on the normal-only training rows in one step
# (``fit`` returns the estimator itself).
# ``contamination`` stays at "auto": the decision threshold is chosen manually
# further down, so the script does not rely on ``predict()``.
clf = IsolationForest(
    random_state=SEED,
    n_jobs=-1,
    contamination="auto",
    max_samples=512,
    n_estimators=400,
).fit(X_train)

# ------------------------------
# 4) Scoring (higher => more anomalous)
# ------------------------------
def anomaly_score(model, X):
    """Return per-sample anomaly scores where larger means more anomalous.

    ``model.score_samples`` follows the opposite convention (higher means
    more normal), so its output is negated.
    """
    normality = model.score_samples(X)
    return -normality

# ------------------------------
# 5) Threshold selection (manual / f1 / p_at)
# ------------------------------
MODE = "manual"    # "manual" | "f1" | "p_at"
TARGET_P = 0.60    # precision target, only used when MODE="p_at"

# Anomaly scores on the validation split. Test scores are computed inside
# evaluate() so that the ROC AUC is reported consistently for every set.
val_scores = anomaly_score(clf, X_val)

# Suggested manual threshold: a percentile of the VAL scores (tune the
# percentile to the operating point you want).
THR_MANUAL = np.percentile(val_scores, 95)

def best_f1_threshold(y_true, scores, percentiles=None):
    """Pick the score threshold that maximises F1 for the anomaly class.

    Candidate thresholds are percentiles of ``scores``; a sample is flagged
    as anomalous (class 1) when ``score >= threshold``.

    Args:
        y_true: Ground-truth labels; entries equal to 1 are the positives.
        scores: Anomaly scores, one per label (higher => more anomalous).
        percentiles: Percentiles (0-100) of ``scores`` to try as thresholds.
            Defaults to 200 evenly spaced values from 50 to 99.5.

    Returns:
        ``(best_threshold, best_f1)`` as Python floats. On ties the earliest
        candidate in ``percentiles`` order wins.

    Raises:
        ValueError: If ``scores`` is empty, its length differs from
            ``y_true``, or ``percentiles`` is empty.
    """
    # Build the default grid per call instead of sharing one array object
    # created at function-definition time.
    if percentiles is None:
        percentiles = np.linspace(50, 99.5, 200)

    y_true = np.asarray(y_true).ravel()
    scores = np.asarray(scores).ravel()
    if scores.size == 0:
        raise ValueError("scores must not be empty")
    if y_true.size != scores.size:
        raise ValueError("y_true and scores must have the same length")

    thrs = np.atleast_1d(np.percentile(scores, percentiles))
    if thrs.size == 0:
        raise ValueError("percentiles must contain at least one value")

    is_pos = y_true == 1
    n_pos = int(np.count_nonzero(is_pos))

    best_f1, best_thr = -1.0, None
    for t in thrs:
        flagged = scores >= t
        tp = int(np.count_nonzero(flagged & is_pos))
        # F1 = 2TP / (2TP + FP + FN) and 2TP + FP + FN = (#flagged) + (#positives).
        denom = int(np.count_nonzero(flagged)) + n_pos
        f1 = (2.0 * tp / denom) if denom else 0.0  # same convention as zero_division=0
        if f1 > best_f1:
            best_f1, best_thr = float(f1), float(t)
    return best_thr, best_f1

def threshold_for_precision(y_true, scores, target_p=0.60):
    """Find the lowest threshold whose precision reaches ``target_p``.

    Uses ``precision_recall_curve``; its thresholds are returned in
    increasing order, so the first qualifying index is the lowest such
    threshold on the curve.

    Args:
        y_true: Ground-truth labels (1 = anomaly).
        scores: Anomaly scores, one per label (higher => more anomalous).
        target_p: Minimum precision required for the anomaly class.

    Returns:
        ``(threshold, precision, recall)`` as Python floats. If no threshold
        on the curve reaches ``target_p``, the 95th percentile of ``scores``
        is returned instead, with the precision and recall measured at that
        fallback threshold.
    """
    p, r, thr = precision_recall_curve(y_true, scores)
    # thr has length len(p) - 1: the final (precision=1, recall=0) point has
    # no threshold attached, so it is excluded from the search.
    idx = np.flatnonzero(p[:-1] >= target_p)
    if idx.size:
        i = idx[0]
        return float(thr[i]), float(p[i]), float(r[i])

    # Fallback: target precision is unreachable. Report the metrics at the
    # fallback threshold itself rather than at an unrelated curve point.
    fallback_thr = float(np.percentile(scores, 95))
    is_pos = np.asarray(y_true) == 1
    flagged = np.asarray(scores) >= fallback_thr
    tp = int(np.count_nonzero(flagged & is_pos))
    n_flagged = int(np.count_nonzero(flagged))
    n_pos = int(np.count_nonzero(is_pos))
    prec = tp / n_flagged if n_flagged else 0.0
    rec = tp / n_pos if n_pos else 0.0
    return fallback_thr, float(prec), float(rec)

# Resolve the decision threshold ``thr`` according to MODE. Every strategy
# looks only at the validation scores; the test sets are never consulted.
if MODE == "manual":
    thr = float(THR_MANUAL)
    # Share of VAL scores at or below thr, reported as an approximate percentile.
    pct = (val_scores <= thr).mean() * 100.0
    print(f"\n[VAL] Manual threshold selected: thr={thr:.6f} (~percentile {pct:.2f}%)")
elif MODE == "f1":
    thr, best_f1 = best_f1_threshold(y_val, val_scores)
    print(f"\n[VAL balanced] Best F1(Class 1)={best_f1:.3f} at threshold={thr:.6f}")
elif MODE == "p_at":
    thr, p_at, r_at = threshold_for_precision(y_val, val_scores, TARGET_P)
    print(f"\n[VAL balanced] Threshold for Precision≥{TARGET_P:.2f}: thr={thr:.6f} (P={p_at:.3f}, R={r_at:.3f})")
else:
    # Fail loudly on a misspelled mode instead of silently running "p_at".
    raise ValueError(f"Unknown MODE {MODE!r}; expected 'manual', 'f1' or 'p_at'")

# ------------------------------
# 6) Evaluate
# ------------------------------
def evaluate(name: str, X_np: np.ndarray, y_true: np.ndarray, thr: float) -> None:
    """Score ``X_np`` with the fitted module-level ``clf`` and print metrics.

    A sample is predicted anomalous (class 1) when its anomaly score is
    ``>= thr``. Prints the confusion matrix, the classification report and
    the ROC AUC computed from the raw scores, which does not depend on
    ``thr``.

    Args:
        name: Label printed in the section header.
        X_np: Feature matrix to score.
        y_true: Ground-truth labels (0 = normal, 1 = anomaly).
        thr: Decision threshold applied to the anomaly scores.
    """
    scores = anomaly_score(clf, X_np)
    y_pred = (scores >= thr).astype(int)
    print(f"\n===== {name} @thr={thr:.6f} =====")
    # Pin the label order so the matrix is always 2x2, even when a class is
    # absent from both y_true and y_pred.
    print("Confusion Matrix:\n", confusion_matrix(y_true, y_pred, labels=[0, 1]))
    # zero_division=0 reports 0.0 (without warnings) when a class is never predicted.
    print("\nClassification Report:\n",
          classification_report(y_true, y_pred, digits=4, zero_division=0))
    # roc_auc_score raises when y_true holds a single class; report that
    # case instead of aborting the run.
    if np.unique(y_true).size < 2:
        print("ROC AUC (scores): undefined (y_true contains a single class)")
    else:
        print("ROC AUC (scores):", roc_auc_score(y_true, scores))

# Headline results: both test sets at the threshold selected on VAL.
evaluate(name="TEST Balanced", X_np=X_test_bal, y_true=y_test_bal, thr=thr)
evaluate(name="TEST Imbalanced", X_np=X_test_imb, y_true=y_test_imb, thr=thr)

# ------------------------------
# 7) (Optional) Quick threshold sweep for comparison
# ------------------------------
# Candidate thresholds are percentiles of the VAL scores.
candidates = [
    np.percentile(val_scores, q)
    for q in (80, 85, 90, 95, 97.5, 99)
]
print("\n>>> Quick sweep on percentile-based thresholds (from VAL):")
for t in candidates:
    print(f"\n-- Try thr={t:.6f} on TEST Balanced --")
    evaluate(name="TEST Balanced", X_np=X_test_bal, y_true=y_test_bal, thr=t)
    print(f"\n-- Try thr={t:.6f} on TEST Imbalanced --")
    evaluate(name="TEST Imbalanced", X_np=X_test_imb, y_true=y_test_imb, thr=t)


Tổng số dòng: 10000
Phân phối nhãn Machine failure:
 Machine failure
0    9661
1     339
Name: count, dtype: int64
Tỷ lệ (%):
 Machine failure
0    96.61
1     3.39
Name: proportion, dtype: float64

Normal: 9661 | Anomaly: 339

--- Split summary (auto) ---
Train normal size          : 7661
VAL balanced (norm/anom)   : 150 / 150
TEST balanced (norm/anom)  : 189 / 189
TEST imbalanced (norm/anom): 800 / 189

[VAL] Manual threshold selected: thr=0.635004 (~percentile 95.00%)

===== TEST Balanced @thr=0.635004 =====
Confusion Matrix:
 [[187   2]
 [164  25]]

Classification Report:
               precision    recall  f1-score   support

           0     0.5328    0.9894    0.6926       189
           1     0.9259    0.1323    0.2315       189

    accuracy                         0.5608       378
   macro avg     0.7293    0.5608    0.4620       378
weighted avg     0.7293    0.5608    0.4620       378

ROC AUC (scores): 0.8493323255228017

===== TEST Imbalanced @thr=0.635004 =====
Confusion