In [3]:
# ---------------------------------------------------------
# ISOLATION FOREST MODEL EVALUATION
# ---------------------------------------------------------

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import IsolationForest
from sklearn.metrics import precision_score, recall_score, f1_score

import joblib

# ---------------------------------------------------------
# 1. Load preprocessed features
# ---------------------------------------------------------

X = pd.read_csv("transaction_features.csv")
print("Features loaded:", X.shape)

# ---------------------------------------------------------
# 2. Load trained Isolation Forest model + imputer
# ---------------------------------------------------------

iso_model = joblib.load("transaction_isolation_forest_model.pkl")
imputer = joblib.load("transaction_imputer.pkl")

# ---------------------------------------------------------
# 3. Impute missing values
# ---------------------------------------------------------

X_imputed = imputer.transform(X)

# ---------------------------------------------------------
# 4. Generate anomaly predictions
# ---------------------------------------------------------

# 1 = normal, -1 = anomaly
pred_labels = iso_model.predict(X_imputed)
anomaly_flag = (pred_labels == -1).astype(int)

# Anomaly scores (decision_function)
scores = iso_model.decision_function(X_imputed)

df_eval = pd.DataFrame(X.copy())
df_eval["is_anomaly"] = anomaly_flag
df_eval["anomaly_score"] = scores

# ---------------------------------------------------------
# 5. Basic statistics
# ---------------------------------------------------------

total = len(df_eval)
anomalies = df_eval["is_anomaly"].sum()

print("\n--- ANOMALY STATISTICS ---")
print(f"Total events       : {total}")
print(f"Detected anomalies : {anomalies}")
print(f"Anomaly %          : {anomalies/total*100:.2f}%")

# ---------------------------------------------------------
# 6. Score distribution
# ---------------------------------------------------------

plt.figure(figsize=(10,5))
sns.histplot(df_eval["anomaly_score"], bins=100, kde=True)
plt.title("Isolation Forest Decision Score Distribution")
plt.xlabel("Anomaly Score (higher = normal, lower = anomalous)")
plt.ylabel("Frequency")
plt.show()

# ---------------------------------------------------------
# 7. Top N anomalies for inspection
# ---------------------------------------------------------

top_n = 20
print(f"\nTop {top_n} most anomalous events:")
print(df_eval.sort_values("anomaly_score").head(top_n))

# ---------------------------------------------------------
# 8. Optional: If ground truth exists
# ---------------------------------------------------------
# Uncomment and use if your dataset has a true label column
# df_eval['true_label'] = ...  # 1 = fraud/anomaly, 0 = normal
# print("\nPrecision / Recall / F1")
# precision = precision_score(df_eval['true_label'], df_eval['is_anomaly'])
# recall = recall_score(df_eval['true_label'], df_eval['is_anomaly'])
# f1 = f1_score(df_eval['true_label'], df_eval['is_anomaly'])
# print(f"Precision: {precision:.3f}, Recall: {recall:.3f}, F1: {f1:.3f}")

# ---------------------------------------------------------
# 9. Save evaluation results
# ---------------------------------------------------------

df_eval.to_csv("transaction_eval_results.csv", index=False)
print("\nSaved: transaction_eval_results.csv")
print("Evaluation complete!")


Features loaded: (2512, 21)


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


ValueError: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- Channel
- CustomerAge
- MerchantID
- TransactionType
Feature names seen at fit time, yet now missing:
- category
- category_amount_zscore
- category_usage_ratio
- high_risk_category
- new_category_flag
- ...


In [None]:
# ---------------------------------------------------------
# ISOLATION FOREST EVALUATION: K-FOLD & STABILITY
# ---------------------------------------------------------

import pandas as pd
import numpy as np
from sklearn.ensemble import IsolationForest
from sklearn.model_selection import KFold
from sklearn.impute import SimpleImputer
import joblib
import matplotlib.pyplot as plt
import seaborn as sns

# ---------------------------------------------------------
# 1. Load preprocessed features
# ---------------------------------------------------------

df = pd.read_csv("transaction_features.csv")
print("Loaded features:", df.shape)

# Load imputer
imputer = joblib.load("transaction_imputer.pkl")
X = imputer.transform(df)

# ---------------------------------------------------------
# 2. Evaluation parameters
# ---------------------------------------------------------

n_splits = 5          # 5-fold
n_estimators = 350
contamination = 0.03
random_state = 42

kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)

# Store results
all_labels = []
all_scores = []

# ---------------------------------------------------------
# 3. K-Fold Isolation Forest Training & Evaluation
# ---------------------------------------------------------

print(f"\nRunning {n_splits}-Fold Stability Evaluation...")

fold_idx = 1
for train_idx, test_idx in kf.split(X):
    
    X_train, X_test = X[train_idx], X[test_idx]
    
    model = IsolationForest(
        n_estimators=n_estimators,
        contamination=contamination,
        max_samples="auto",
        random_state=random_state,
        n_jobs=-1
    )
    
    model.fit(X_train)
    
    # Predict anomalies on test fold
    pred_labels = model.predict(X_test)
    anomaly_flag = (pred_labels == -1).astype(int)
    scores = model.decision_function(X_test)
    
    all_labels.append(anomaly_flag)
    all_scores.append(scores)
    
    print(f"Fold {fold_idx}: Anomalies detected = {anomaly_flag.sum()} / {len(anomaly_flag)}")
    fold_idx += 1

# ---------------------------------------------------------
# 4. Analyze Stability Across Folds
# ---------------------------------------------------------

all_labels_arr = np.concatenate(all_labels)
all_scores_arr = np.concatenate(all_scores)

print("\n--- Overall Statistics ---")
print(f"Total samples evaluated: {len(all_labels_arr)}")
print(f"Total anomalies detected: {all_labels_arr.sum()}")
print(f"Anomaly percentage: {all_labels_arr.sum() / len(all_labels_arr) * 100:.2f}%")

# ---------------------------------------------------------
# 5. Score distribution visualization
# ---------------------------------------------------------

plt.figure(figsize=(10,5))
sns.histplot(all_scores_arr, bins=100, kde=True)
plt.title("Isolation Forest Decision Score Distribution (All Folds)")
plt.xlabel("Anomaly Score (higher = normal, lower = anomalous)")
plt.ylabel("Frequency")
plt.show()

# ---------------------------------------------------------
# 6. Optional: Stability metric (Jaccard similarity)
# ---------------------------------------------------------
# # Measure how consistent anomalies are across folds

# from itertools import combinations

# def jaccard_similarity(a, b):
#     return np.sum(a & b) / np.sum(a | b)

# similarities = []

# for fold_a, fold_b in combinations(all_labels, 2):
#     similarities.append(jaccard_similarity(fold_a.astype(bool), fold_b.astype(bool)))

# print("\nAnomaly detection stability (Jaccard similarity) across folds:")
# print(f"Mean similarity: {np.mean(similarities):.3f}")
# print(f"Min similarity : {np.min(similarities):.3f}")
# print(f"Max similarity : {np.max(similarities):.3f}")

# ---------------------------------------------------------
# 7. Save evaluation results
# ---------------------------------------------------------

eval_df = pd.DataFrame({
    "anomaly_flag": all_labels_arr,
    "anomaly_score": all_scores_arr
})

eval_df.to_csv("transaction_model_kfold_eval.csv", index=False)
print("\nSaved: transaction_model_kfold_eval.csv")
print("K-Fold Stability Evaluation COMPLETE!")
