In [5]:
import matplotlib.pyplot as plt

import fraud_detection.viz.model_plots as viz
from fraud_detection.core.settings import settings
from fraud_detection.data.loader import DataHandler
from fraud_detection.models.compare import score_models, compare_models, promote_best_model
from fraud_detection.models.pipeline import build_pipeline
from fraud_detection.models.train import train_and_evaluate

In [None]:

# Load the three processed splits from the "DATA" / "processed_dir" registry
# entry, in the same order as before: test, original train, resampled train.
test_original, train_original, train_resampled = (
    DataHandler.from_registry("DATA", "processed_dir", parquet_name).load()
    for parquet_name in (
        "test_original.parquet",
        "train_original.parquet",
        "train_resampled.parquet",
    )
)

# The target column name is read from the "features" section of the settings.
FEATURES = settings.get("features")
TARGET = FEATURES["target"]

# Split each frame into its feature columns (everything except TARGET) and
# the TARGET column itself.
X_train_orig, y_train_orig = train_original.drop(columns=[TARGET]), train_original[TARGET]
X_train_res, y_train_res = train_resampled.drop(columns=[TARGET]), train_resampled[TARGET]
X_test, y_test = test_original.drop(columns=[TARGET]), test_original[TARGET]

# ---------------------------
# Config
# ---------------------------

def get_profile(profile_name: str) -> dict:
    """Return the settings entry stored under ``profiles[profile_name]``.

    Args:
        profile_name: Key to look up inside the ``"profiles"`` section of
            ``settings``.

    Returns:
        The profile entry as stored in the settings.

    Raises:
        ValueError: If the ``"profiles"`` section has no entry for
            ``profile_name``, or the entry is falsy (e.g. an empty dict).
    """
    known_profiles = settings.get("profiles", {})
    selected = known_profiles.get(profile_name)
    if selected:
        return selected
    raise ValueError(f"Profile '{profile_name}' not found in settings.")
# Profile name handed to train_and_evaluate, score_models and promote_best_model.
PROFILE_NAME = "credit_card"
profile = get_profile(PROFILE_NAME)

# Registry identifiers, both read from the profile's "registry" section.
experiment_name, registered_model_name = (
    profile["registry"][registry_key]
    for registry_key in ("experiment_name", "registered_model_name")
)

# Per-model accumulators keyed by model display name; the training cells
# fill them and the comparison / plotting cells read them back.
results, run_ids, thresholds, pipelines = {}, {}, {}, {}


In [None]:

# ---------------------------
# Baseline models (original data)
# ---------------------------
# NOTE(review): "threshold_metric" is declared per model but never read in
# this cell — confirm whether train_and_evaluate is meant to receive it.
baseline_models = {
    "Logistic Regression": {
        "pipeline": build_pipeline("logistic_regression"),
        "threshold_metric": "f1",
    },
}

for model_name, cfg in baseline_models.items():
    print(f"\n=== Training {model_name} (baseline) ===")
    # Baselines receive the original (non-resampled) training split and the
    # shared test split.
    pipe, metrics, threshold, run_id = train_and_evaluate(
        model_name=model_name,
        profile_name=PROFILE_NAME,
        pipeline=cfg["pipeline"],
        X_train=X_train_orig,
        y_train=y_train_orig,
        X_test=X_test,
        y_test=y_test,
    )
    # Store every returned artefact under the model's display name.
    results[model_name], run_ids[model_name] = metrics, run_id
    thresholds[model_name], pipelines[model_name] = threshold, pipe


In [None]:

# ---------------------------
# Ensemble / more complex models (resampled data)
# ---------------------------
# NOTE(review): "threshold_metric" is declared per model but never read in
# this cell — confirm whether train_and_evaluate is meant to receive it.
ensemble_models = {
    display_name: {"pipeline": build_pipeline(pipeline_key), "threshold_metric": "f1"}
    for display_name, pipeline_key in (
        ("Random Forest", "random_forest"),
        ("XGBoost", "xgboost"),
        ("LightGBM", "lightgbm"),
    )
}

for model_name, cfg in ensemble_models.items():
    print(f"\n=== Training {model_name} (resampled) ===")
    # These models receive the resampled training split; the test split is
    # the same one the baselines are evaluated on.
    pipe, metrics, threshold, run_id = train_and_evaluate(
        model_name=model_name,
        profile_name=PROFILE_NAME,
        pipeline=cfg["pipeline"],
        X_train=X_train_res,
        y_train=y_train_res,
        X_test=X_test,
        y_test=y_test,
    )
    # Store every returned artefact under the model's display name.
    results[model_name], run_ids[model_name] = metrics, run_id
    thresholds[model_name], pipelines[model_name] = threshold, pipe

# ---------------------------
# Compare & score models
# ---------------------------
print("\n=== Raw metrics comparison ===")
print(compare_models(results))

print("\n=== Profile-based scoring ===")
scored_df = score_models(results, PROFILE_NAME)
print(scored_df)

# ---------------------------
# Promote best model
# ---------------------------
# Row 0 of scored_df is taken as the winner — assumes score_models returns
# rows ordered best-first and indexed by model name; TODO confirm.
best_model_name = scored_df.index[0]
best_metrics = scored_df.iloc[0].to_dict()
run_id_to_promote = run_ids[best_model_name]
promotion_msg = promote_best_model(run_id=run_id_to_promote, profile_name=PROFILE_NAME)

print(
    f"\nPromotion result: {promotion_msg}",
    f"Best model selected: {best_model_name}",
    f"Metrics: {best_metrics}",
    f"Thresholds: {thresholds}",
    sep="\n",
)

In [None]:
# Score the test set with every stored pipeline and binarise the scores at
# that model's own threshold (score >= threshold -> 1, otherwise 0).
predictions = {}

for model_name, pipe in pipelines.items():
    # Column 1 of predict_proba — presumably the positive (fraud) class;
    # verify the class order of the fitted estimators.
    y_proba = pipe.predict_proba(X_test)[:, 1]
    predictions[model_name] = dict(
        y_pred=(y_proba >= thresholds[model_name]).astype(int),
        y_proba=y_proba,
    )

In [None]:
# Per-model diagnostics: one three-panel figure (confusion matrix,
# precision–recall curve, ROC curve) followed by the classification report.
# Requires `plt` (matplotlib.pyplot) from the notebook's import cell.
y_true = y_test.values  # loop-invariant, so read it once

for model_name, data in predictions.items():
    y_pred = data["y_pred"]
    y_proba = data["y_proba"]

    print(f"\n=== {model_name} ===")

    fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(20, 5))
    cm_ax, pr_ax, roc_ax = axes

    # 1. Confusion matrix (uses the thresholded predictions)
    viz.plot_confusion_matrix(
        y_true=y_true,
        y_pred=y_pred,
        title="Confusion Matrix",
        ax=cm_ax,
    )

    # 2. Precision–recall curve (uses the raw scores)
    viz.plot_precision_recall_curve(
        y_true=y_true,
        y_proba=y_proba,
        model_name=model_name,
        ax=pr_ax,
    )

    # 3. ROC curve (uses the raw scores)
    viz.plot_roc_curve(
        y_true=y_true,
        y_proba=y_proba,
        model_name=model_name,
        ax=roc_ax,
    )

    # y > 1 places the suptitle above the axes so it does not overlap the
    # panel titles.
    fig.suptitle(model_name, fontsize=16, y=1.05)
    # Lay out the figure built in this iteration explicitly, instead of
    # whichever figure pyplot currently considers "current".
    fig.tight_layout()
    plt.show()
    # Release the figure: one is created per model, and backends that do not
    # auto-close on show() would otherwise keep all of them in memory.
    plt.close(fig)

    # 4. Classification report
    report_df = viz.get_classification_report_df(
        y_true=y_true,
        y_pred=y_pred,
    )
    print(report_df)