In [None]:
import matplotlib.pyplot as plt
import fraud_detection.viz.model_plots as viz
from fraud_detection.data.loader import DataHandler
from fraud_detection.models.pipeline import build_pipeline
from fraud_detection.models.train import train_and_evaluate
from fraud_detection.core.settings import settings
import fraud_detection.models.compare as cp

In [None]:
# Load the three processed parquet splits through DataHandler.from_registry,
# using the ("DATA", "processed_dir") lookup for each file. Files are read in
# the order listed: test split, original train split, resampled train split.
test_original, train_original, train_resampled = (
    DataHandler.from_registry("DATA", "processed_dir", parquet_name).load()
    for parquet_name in (
        "test_original.parquet",
        "train_original.parquet",
        "train_resampled.parquet",
    )
)

In [None]:
# Sanity check 1: the shape of every split.
for label, frame in (
    ("Train (original):", train_original),
    ("Train (resampled):", train_resampled),
    ("Test (original):", test_original),
):
    print(label, frame.shape)

# Sanity check 2: relative frequency of each value in the "class" column for
# both training splits, so the effect of resampling is visible side by side.
for heading, frame in (
    ("\nClass distribution (original train):", train_original),
    ("\nClass distribution (resampled train):", train_resampled),
):
    print(heading)
    print(frame["class"].value_counts(normalize=True))

In [None]:
# Feature configuration from the project settings; its "target" entry names
# the label column.
FEATURES = settings.get("features")
TARGET = FEATURES["target"]

# Split each dataset into a feature frame (every column except TARGET) and a
# label series (the TARGET column).
X_train_orig, y_train_orig = (
    train_original.drop(columns=[TARGET]),
    train_original[TARGET],
)
X_train_res, y_train_res = (
    train_resampled.drop(columns=[TARGET]),
    train_resampled[TARGET],
)
X_test, y_test = (
    test_original.drop(columns=[TARGET]),
    test_original[TARGET],
)

#### Train and Evaluate on Original Data

In [None]:
# Per-model bookkeeping, each keyed by model name: evaluation metrics,
# run ids, decision thresholds and fitted pipelines. Four independent dicts.
results, run_ids, thresholds, pipelines = {}, {}, {}, {}


In [None]:
# -------------------------------
# Baseline: Logistic Regression, fitted on the original training split
# -------------------------------

profile_name = "ecommerce"
experiment_name = "fraud_detection_models"

# Each entry pairs a pipeline with the metric name that is forwarded to
# train_and_evaluate as `threshold_metric`.
baseline_models = {
    "Logistic Regression": {
        "pipeline": build_pipeline("logistic_regression"),
        "threshold_metric": "f1",
    },
}

for model_name in baseline_models:
    cfg = baseline_models[model_name]

    # train_and_evaluate hands back four values, unpacked here as the fitted
    # pipeline, its metrics, the chosen threshold and the run id.
    pipe, metrics, threshold, run_id = train_and_evaluate(
        pipeline=cfg["pipeline"],
        X_train=X_train_orig, y_train=y_train_orig,
        X_test=X_test, y_test=y_test,
        optimize_threshold=True,
        threshold_metric=cfg["threshold_metric"],
        model_name=model_name,
        profile_name=profile_name,
    )

    # File every outcome under the model's name for the later cells.
    results[model_name], run_ids[model_name] = metrics, run_id
    thresholds[model_name], pipelines[model_name] = threshold, pipe

### Train and Evaluate on SMOTE Resampled Data

In [None]:

profile_name = "ecommerce"
experiment_name = "fraud_detection_models"

# Tree ensembles, fitted on the resampled training split and evaluated on the
# same untouched test split as the baseline. Each entry pairs a pipeline with
# the metric name forwarded to train_and_evaluate as `threshold_metric`.
ensemble_models = {
    "Random Forest": {
        "pipeline": build_pipeline("random_forest"),
        "threshold_metric": "f1",
    },
    "XGBoost": {
        "pipeline": build_pipeline("xgboost"),
        "threshold_metric": "f1",
    },
}

for model_name in ensemble_models:
    cfg = ensemble_models[model_name]

    # train_and_evaluate hands back four values, unpacked here as the fitted
    # pipeline, its metrics, the chosen threshold and the run id.
    pipe, metrics, threshold, run_id = train_and_evaluate(
        pipeline=cfg["pipeline"],
        X_train=X_train_res, y_train=y_train_res,
        X_test=X_test, y_test=y_test,
        optimize_threshold=True,
        threshold_metric=cfg["threshold_metric"],
        model_name=model_name,
        profile_name=profile_name,
    )

    # File every outcome under the model's name for the later cells.
    results[model_name], run_ids[model_name] = metrics, run_id
    thresholds[model_name], pipelines[model_name] = threshold, pipe

In [None]:
# Score every trained model's metrics under the active profile and show the
# resulting table.
scored_df = cp.score_models(results=results, profile_name=profile_name)

print(scored_df)

In [None]:
# Ask the comparison module for the winning model; it returns the model's
# name, its metrics and a textual justification, which is printed.
selection = cp.select_best_model(results=results, profile_name=profile_name)
best_model, best_metrics, reason = selection

print(reason)

In [None]:
# Promote the run that produced the selected model, then report the outcome.
best_run_id = run_ids[best_model]
promotion_result = cp.promote_best_model(profile_name=profile_name, run_id=best_run_id)

print(f"{best_model} promoted to Production:", promotion_result)

In [None]:
# Promotion happens exactly once, in the preceding cell. Repeating the
# cp.promote_best_model call here would re-run the promotion side effect on
# every top-to-bottom execution (and would bypass `profile_name` if the
# profile were hard-coded), so this cell only reports the outcome of that
# single promotion.
best_run_id = run_ids[best_model]

print(f"Promoted run id for {best_model}:", best_run_id)
print(f"{best_model} promoted to Production:", promotion_result)

In [None]:
# Re-score the test set with every fitted pipeline and binarise the scores
# with that model's own threshold.
predictions = {}

for model_name, pipe in pipelines.items():
    class_probabilities = pipe.predict_proba(X_test)
    # Column 1 of predict_proba: presumably the positive (fraud) class;
    # verify against the pipeline's class ordering.
    y_proba = class_probabilities[:, 1]

    # A sample is labelled 1 when its score reaches the model's threshold.
    meets_threshold = y_proba >= thresholds[model_name]
    y_pred = meets_threshold.astype(int)

    predictions[model_name] = {"y_pred": y_pred, "y_proba": y_proba}

In [None]:
# For every model: one figure with three diagnostic panels, followed by a
# printed classification report.
for model_name, data in predictions.items():
    y_pred, y_proba = data["y_pred"], data["y_proba"]

    print(f"\n=== {model_name} ===")

    # One row of three panels: confusion matrix | precision-recall | ROC.
    fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(20, 5))
    cm_ax, pr_ax, roc_ax = axes

    # Panel 1: confusion matrix of the thresholded predictions.
    viz.plot_confusion_matrix(
        y_true=y_test.values, y_pred=y_pred, title="Confusion Matrix", ax=cm_ax,
    )

    # Panel 2: precision-recall curve from the raw scores.
    viz.plot_precision_recall_curve(
        y_true=y_test.values, y_proba=y_proba, model_name=model_name, ax=pr_ax,
    )

    # Panel 3: ROC curve from the raw scores.
    viz.plot_roc_curve(
        y_true=y_test.values, y_proba=y_proba, model_name=model_name, ax=roc_ax,
    )

    # y=1.05 (figure coordinates) puts the title just above the figure's top edge.
    fig.suptitle(model_name, fontsize=16, y=1.05)
    plt.tight_layout()
    plt.show()

    # Text summary printed beneath the figure.
    report_df = viz.get_classification_report_df(y_true=y_test.values, y_pred=y_pred)
    print(report_df)