In [None]:
# === Assumes you already have: train_part, val_part, test_real (time-split).
# The balanced validation set "val_balanced" is built from val_part further down in this cell.
# For the balanced-TEST scenario, set test_df = test_balanced; otherwise use test_real.

import numpy as np
import pandas as pd
from sklearn.ensemble import IsolationForest
from sklearn.metrics import (
    confusion_matrix, classification_report, roc_auc_score,
    precision_recall_fscore_support, precision_recall_curve,
    average_precision_score
)

# Columns removed from every frame before it is handed to the forest.
drop_cols = ["Class", "Amount", "Time"]


def fit_iso_and_score(train_if_df, val_df, test_df, param):
    """Fit an IsolationForest and score the validation and test frames.

    Args:
        train_if_df: DataFrame the forest is fitted on.
        val_df: DataFrame scored as the validation set.
        test_df: DataFrame scored as the test set.
        param: Mapping of hyperparameters. ``n_estimators`` (default 200)
            and ``max_samples`` (default ``"auto"``) are read from it.

    Returns:
        Tuple ``(val_scores, test_scores)`` holding the negated
        ``score_samples`` output, so a higher value means more anomalous.
    """
    settings = {
        "n_estimators": param.get("n_estimators", 200),
        "max_samples": param.get("max_samples", "auto"),
        "contamination": "auto",
        "random_state": 42,
        "n_jobs": -1,
    }
    forest = IsolationForest(**settings)

    # Strip the columns listed in drop_cols from all three frames.
    fit_features, val_features, test_features = (
        frame.drop(columns=drop_cols)
        for frame in (train_if_df, val_df, test_df)
    )

    forest.fit(fit_features)
    # Flip the sign so that a high score means "anomalous".
    return (
        -forest.score_samples(val_features),
        -forest.score_samples(test_features),
    )

def eval_at_threshold(y_true, scores, thr):
    """Binarize ``scores`` at ``thr`` and summarize the result for class 1.

    A sample is predicted positive (1) when its score is ``>= thr``.
    The confusion-matrix cells are counted directly, so the function also
    works when only one class is present in the labels and predictions
    (where unpacking a library confusion matrix into four cells would fail).

    Args:
        y_true: Array-like of binary ground-truth labels (1 = positive,
            anything else is treated as negative).
        scores: Array-like of scores, same length as ``y_true``.
        thr: Decision threshold applied to ``scores``.

    Returns:
        dict with keys ``threshold``, ``TN``, ``FP``, ``FN``, ``TP``,
        ``precision_1``, ``recall_1``, ``f1_1`` and ``accuracy``. Any ratio
        whose denominator is zero is reported as 0.0.
    """
    truth = np.asarray(y_true) == 1
    flagged = np.asarray(scores) >= thr

    tp = int(np.count_nonzero(truth & flagged))
    fp = int(np.count_nonzero(~truth & flagged))
    fn = int(np.count_nonzero(truth & ~flagged))
    tn = int(np.count_nonzero(~truth & ~flagged))

    def ratio(num, den):
        # Undefined ratios (empty denominator) are reported as 0.0.
        return num / den if den else 0.0

    return dict(
        threshold=float(thr), TN=tn, FP=fp, FN=fn, TP=tp,
        precision_1=float(ratio(tp, tp + fp)),
        recall_1=float(ratio(tp, tp + fn)),
        # F1 written in count form: 2TP / (2TP + FP + FN).
        f1_1=float(ratio(2 * tp, 2 * tp + fp + fn)),
        accuracy=float(ratio(tp + tn, tp + tn + fp + fn)),
    )

def best_f1_threshold(y_true, scores, percentiles=np.linspace(50, 99.5, 200)):
    """Scan percentile-based thresholds and pick the one with the best F1.

    Args:
        y_true: Ground-truth labels passed through to ``eval_at_threshold``.
        scores: Scores whose percentiles define the candidate thresholds.
        percentiles: Percentiles of ``scores`` to try as thresholds.

    Returns:
        Tuple ``(threshold, df)``: the threshold of the row with the highest
        ``f1_1`` and the DataFrame holding one metrics row per candidate.
    """
    candidates = np.percentile(scores, percentiles)
    df = pd.DataFrame(
        [eval_at_threshold(y_true, scores, cut) for cut in candidates]
    )
    # The frame has a default 0..n-1 index, so the idxmax label is also
    # the row position.
    top_row = df["f1_1"].idxmax()
    return float(df["threshold"].iloc[top_row]), df

def threshold_for_precision(y_true, scores, target_p=0.50):
    """Find the first PR-curve threshold whose precision reaches ``target_p``.

    Args:
        y_true: Binary ground-truth labels.
        scores: Scores for the positive class (higher = more positive).
        target_p: Minimum precision the selected threshold must achieve.

    Returns:
        Tuple ``(threshold, precision, recall)`` for the first point on the
        precision-recall curve with precision >= ``target_p``, or ``None``
        when no actual threshold reaches the target.
    """
    p, r, thr = precision_recall_curve(y_true, scores)
    # p and r carry one more entry than thr: p[i] / r[i] describe the
    # predictions `scores >= thr[i]`, while the final (p=1, r=0) point is a
    # sentinel with no threshold. Search only the points that have one, so
    # the returned threshold really yields the reported precision/recall.
    hits = np.flatnonzero(p[:-1] >= target_p)
    if hits.size == 0:
        return None
    i = hits[0]
    return float(thr[i]), float(p[i]), float(r[i])

def print_full_report(title, y_true, scores, thr):
    """Print a classification summary for ``scores`` binarized at ``thr``.

    Prints, in order: a header with the title and threshold, the confusion
    matrix, the classification report, the ROC AUC and the average
    precision. The two ranking metrics are computed from the raw scores.

    Args:
        title: Label shown in the header line.
        y_true: Ground-truth labels.
        scores: Array of scores; a sample is predicted 1 when ``>= thr``.
        thr: Decision threshold.
    """
    flagged = (scores >= thr).astype(int)

    header = f"\n===== {title} @ thr={thr:.6f} ====="
    print(header)

    matrix = confusion_matrix(y_true, flagged)
    print("Confusion Matrix:\n", matrix)

    report = classification_report(y_true, flagged, digits=3)
    print("\nClassification Report:\n", report)

    # Threshold-free metrics: computed on the scores, not on the 0/1 labels.
    roc_auc = roc_auc_score(y_true, scores)
    print(f"ROC AUC (scores): {roc_auc:.3f}")

    avg_precision = average_precision_score(y_true, scores)
    print(f"PR  AUC (AP):    {avg_precision:.3f}")

# ---------- Build sets ----------
# Balanced validation set used for threshold selection: every fraud row
# plus an equally sized random sample of normal rows, then shuffled.
fraud_val = val_part.loc[val_part["Class"] == 1]
normal_val = val_part.loc[val_part["Class"] == 0].sample(
    n=fraud_val.shape[0], random_state=42, replace=False
)
val_balanced = pd.concat([normal_val, fraud_val], axis=0).sample(
    frac=1, random_state=42
)

# The IsolationForest is trained on normal (Class == 0) rows only.
train_if = train_part.loc[train_part["Class"] == 0].copy()

# Pick the TEST set for the scenario at hand
# (use test_balanced instead for the balanced scenario).
test_df = test_real

# ---------- Hyperparameter sweep (light) ----------
# (n_estimators, max_samples) pairs; the last one is slower but usually better.
param_grid = [
    {"n_estimators": n_trees, "max_samples": n_samples}
    for n_trees, n_samples in (
        (200, 256),
        (400, 256),
        (200, 512),
        (400, 512),
        (400, 1024),
    )
]

y_val_bal = val_balanced["Class"].to_numpy()
y_test = test_df["Class"].to_numpy()
store = {}   # per-config results keyed by the config's (name, value) pairs
best = None  # (f1_peak, params, thr_f1, val_scores, test_scores) of the winner

for p in param_grid:
    val_scores, test_scores = fit_iso_and_score(
        train_if, val_balanced, test_df, p
    )
    thr_f1, df_val = best_f1_threshold(y_val_bal, val_scores)
    # Peak F1 on the balanced validation set is the model-selection criterion.
    f1_peak = df_val["f1_1"].max()
    store[tuple(p.items())] = (f1_peak, thr_f1, val_scores, test_scores)
    # Keep the incumbent unless this config is strictly better.
    if best is not None and not (f1_peak > best[0]):
        continue
    best = (f1_peak, p, thr_f1, val_scores, test_scores)

print(
    ">> Best params by F1(VAL_bal):", best[1],
    " with F1 =", round(best[0], 3),
    " and thr_f1 =", round(best[2], 6),
)

# ---------- Reports ----------
# Unpack the winning configuration (everything after the leading F1 value).
best_p, thr_f1, val_scores, test_scores = best[1:]

# 1) Article criterion: threshold = max F1 on the balanced validation set.
print_full_report(
    "TEST - Best F1 (chosen on VAL_balanced)", y_test, test_scores, thr_f1
)

# 2) Operational criterion (optional): threshold reaching Precision>=50%
#    (or 70%) on the balanced validation set.
res = threshold_for_precision(y_val_bal, val_scores, target_p=0.50)
if not res:
    print("\nKhông đạt Precision mục tiêu trên đường PR của VAL_balanced.")
else:
    thr_p50, p50, r50 = res
    print(f"\n[VAL_bal] Threshold for Precision≥50%: thr={thr_p50:.6f} (P={p50:.3f}, R={r50:.3f})")
    print_full_report(
        "TEST - Precision≥50% threshold", y_test, test_scores, thr_p50
    )


Train_part distribution: {0: 204667, 1: 393}
Val_part distribution  : {0: 22761, 1: 24}
Test_real distribution : {0: 56887, 1: 75}
val_scores range:  0.3781 ~ 0.6455
test_scores range: 0.3649 ~ 0.7826

== Best threshold by F1 on VAL (balanced) ==
threshold       0.504359
TN             22.000000
FP              2.000000
FN              4.000000
TP             20.000000
precision_1     0.909091
recall_1        0.833333
f1_1            0.869565
accuracy        0.875000
Name: 13, dtype: float64

===== TEST - Percentile Threshold (top 10% on VAL_balanced) =====
Confusion Matrix:
 [[56700   187]
 [   70     5]]

Classification Report:
               precision    recall  f1-score   support

           0      0.999     0.997     0.998     56887
           1      0.026     0.067     0.037        75

    accuracy                          0.995     56962
   macro avg      0.512     0.532     0.518     56962
weighted avg      0.997     0.995     0.996     56962

ROC AUC (using scores): 0.948
PR  