In [2]:
!pip install xgboost



In [None]:
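# Tune XGBoost and Random Forest fraud classifiers with randomized search (scored by ROC-AUC), then evaluate both on a held-out test split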

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score, precision_recall_curve, auc
import joblib
import warnings
warnings.filterwarnings("ignore")

# Read the enriched feature table from the processed-data directory.
df = pd.read_csv("../data/processed/features_enriched_v2.csv")

# `is_fraud` is the target; it and `timestamp` are both left out of the features.
X = df.drop(["is_fraud", "timestamp"], axis=1)
y = df["is_fraud"]

print("âœ… Data Loaded:", X.shape, "Fraud %:", round(y.mean()*100, 2))

# Hold out 20% of the rows, stratified on the target, with a fixed seed so the
# split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

print("Train:", X_train.shape, " Test:", X_test.shape)


print("\nðŸš€ XGBoost Optimization Started...")

# Imbalance weight handed to XGBoost: number of training rows labelled 0
# divided by the number labelled 1.
class_counts = y_train.value_counts()
scale_pos_weight = class_counts[0] / class_counts[1]

# Candidate hyper-parameter values the randomized search samples from.
xgb_param_grid = dict(
    n_estimators=[200, 400, 600],
    max_depth=[3, 5, 7],
    learning_rate=[0.01, 0.05, 0.1],
    subsample=[0.7, 0.9],
    colsample_bytree=[0.7, 0.9],
    gamma=[0, 0.1, 0.2],
)

# NOTE(review): the estimator and the search below both request n_jobs=-1;
# this may oversubscribe CPU cores — confirm it is intended.
# NOTE(review): `use_label_encoder` looks like a legacy XGBoost option — verify
# it is still accepted by the installed version.
xgb = XGBClassifier(
    scale_pos_weight=scale_pos_weight,
    eval_metric="logloss",
    use_label_encoder=False,
    random_state=42,
    n_jobs=-1,
)

# 15 sampled candidates x 3 CV folds, ranked by ROC-AUC; seeded for repeatability.
xgb_search = RandomizedSearchCV(
    estimator=xgb,
    param_distributions=xgb_param_grid,
    n_iter=15,
    cv=3,
    scoring="roc_auc",
    random_state=42,
    n_jobs=-1,
    verbose=1,
)
xgb_search.fit(X_train, y_train)

print("\nâœ… Best XGBoost Parameters:")
print(xgb_search.best_params_)
print(f"Best Mean CV ROC-AUC: {xgb_search.best_score_:.4f}")

# Keep the winning estimator for the evaluation and persistence steps.
best_xgb = xgb_search.best_estimator_


print("\nðŸš€ Random Forest Optimization Started...")

# Candidate hyper-parameter values the randomized search samples from.
rf_param_grid = dict(
    n_estimators=[200, 400, 600],
    max_depth=[10, 20, None],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    max_features=["sqrt", "log2"],
)

# Class imbalance is handled through class_weight="balanced".
# NOTE(review): the estimator and the search below both request n_jobs=-1;
# this may oversubscribe CPU cores — confirm it is intended.
rf = RandomForestClassifier(
    class_weight="balanced",
    n_jobs=-1,
    random_state=42,
)

# 10 sampled candidates x 3 CV folds, ranked by ROC-AUC; seeded for repeatability.
rf_search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=rf_param_grid,
    n_iter=10,
    cv=3,
    scoring="roc_auc",
    random_state=42,
    n_jobs=-1,
    verbose=1,
)
rf_search.fit(X_train, y_train)

print("\n Best Random Forest Parameters:")
print(rf_search.best_params_)
print(f"Best Mean CV ROC-AUC: {rf_search.best_score_:.4f}")

# Keep the winning estimator for the evaluation and persistence steps.
best_rf = rf_search.best_estimator_



def evaluate_model(y_true, y_prob, name="Model", top_frac=0.05):
    """Print and return ranking metrics for a set of predicted scores.

    Parameters
    ----------
    y_true : array-like
        Ground-truth binary labels (a pandas Series, NumPy array or list).
    y_prob : array-like
        Predicted scores for the positive class, aligned by position with
        ``y_true``.
    name : str, default "Model"
        Label used as the prefix of the printed summary line.
    top_frac : float, default 0.05
        Fraction of the highest-scored samples used for the precision@top-k
        figure. Must satisfy ``0 < top_frac <= 1``.

    Returns
    -------
    tuple
        ``(roc, pr_auc, precision_topk)``: ROC-AUC, area under the
        precision-recall curve, and the mean of ``y_true`` over the
        top-scored ``top_frac`` of samples.

    Raises
    ------
    ValueError
        If ``top_frac`` is outside ``(0, 1]``.
    """
    if not 0 < top_frac <= 1:
        raise ValueError(f"top_frac must be in (0, 1], got {top_frac!r}")

    # Work on plain arrays so selection is purely positional and the function
    # accepts lists/arrays as well as pandas Series.
    labels = np.asarray(y_true)
    scores = np.asarray(y_prob)

    roc = roc_auc_score(labels, scores)
    precision, recall, _ = precision_recall_curve(labels, scores)
    pr_auc = auc(recall, precision)

    # At least one sample must be selected: with k == 0 the slice [-0:] would
    # select *every* row and silently report the overall positive rate.
    k = max(1, int(top_frac * len(scores)))
    top_k_idx = np.argsort(scores)[-k:]
    precision_topk = labels[top_k_idx].mean()

    print(
        f"{name} | ROC-AUC: {roc:.4f} | PR-AUC: {pr_auc:.4f} | "
        f"Precision@Top{top_frac * 100:g}%: {precision_topk:.4f}"
    )
    return roc, pr_auc, precision_topk


# Score the held-out test rows with each tuned model, keeping column 1 of
# predict_proba (presumably the positive/fraud class — verify against classes_).
xgb_probs = best_xgb.predict_proba(X_test)[:, 1]
rf_probs = best_rf.predict_proba(X_test)[:, 1]

# Report the metrics for both models; the final bare call leaves its returned
# tuple as the cell's displayed value.
evaluate_model(y_test, xgb_probs, name="ðŸŽ¯ XGBoost (Tuned)")
evaluate_model(y_test, rf_probs, name="ðŸŒ² Random Forest (Tuned)")




âœ… Data Loaded: (10000, 53) Fraud %: 2.14
Train: (8000, 53)  Test: (2000, 53)

ðŸš€ XGBoost Optimization Started...
Fitting 3 folds for each of 15 candidates, totalling 45 fits

âœ… Best XGBoost Parameters:
{'subsample': 0.7, 'n_estimators': 200, 'max_depth': 5, 'learning_rate': 0.05, 'gamma': 0, 'colsample_bytree': 0.9}
Best Mean CV ROC-AUC: 0.9883

ðŸš€ Random Forest Optimization Started...
Fitting 3 folds for each of 10 candidates, totalling 30 fits

 Best Random Forest Parameters:
{'n_estimators': 600, 'min_samples_split': 5, 'min_samples_leaf': 4, 'max_features': 'sqrt', 'max_depth': None}
Best Mean CV ROC-AUC: 0.9787
ðŸŽ¯ XGBoost (Tuned) | ROC-AUC: 0.9885 | PR-AUC: 0.8988 | Precision@Top5%: 0.3900
ðŸŒ² Random Forest (Tuned) | ROC-AUC: 0.9747 | PR-AUC: 0.8529 | Precision@Top5%: 0.3800


(0.9747358914332569, 0.8528699684950707, 0.38)

In [12]:


import os
import joblib

# Destination files for the two tuned estimators.
xgb_model_path = "../models/xgboost_tuned.pkl"
rf_model_path = "../models/random_forest_tuned.pkl"

# Make sure the output directory is present; no error if it already exists.
os.makedirs(name="../models", exist_ok=True)

# Persist the tuned XGBoost estimator.
joblib.dump(value=best_xgb, filename=xgb_model_path)
print(f"âœ… XGBoost model saved successfully â†’ {xgb_model_path}")

# Persist the tuned Random Forest estimator.
joblib.dump(value=best_rf, filename=rf_model_path)
print(f"âœ… Random Forest model saved successfully â†’ {rf_model_path}")


âœ… XGBoost model saved successfully â†’ ../models/xgboost_tuned.pkl
âœ… Random Forest model saved successfully â†’ ../models/random_forest_tuned.pkl
