In [1]:
!pip install xgboost

Collecting xgboost
  Using cached xgboost-3.1.1-py3-none-win_amd64.whl.metadata (2.1 kB)
Using cached xgboost-3.1.1-py3-none-win_amd64.whl (72.0 MB)
Installing collected packages: xgboost
Successfully installed xgboost-3.1.1


In [14]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score,classification_report,precision_recall_curve,auc
from xgboost import XGBClassifier
import joblib

In [17]:
df=pd.read_csv("../data/processed/features_processed_cleaned.csv")
y=df["is_fraud"]
X=df.drop(columns=["is_fraud"])
print("shape of data",X.shape)
print("Fraud Percentage",round(y.mean()*100,2),"%")

shape of data (10000, 33)
Fraud Percentage 2.14 %


In [18]:
print(df.nunique())

amount                          9956
is_fraud                           2
hour                              24
is_weekend                         2
user_txn_count                    36
user_txn_sum                     100
user_avg_amount                  100
user_std_amount                  100
amount_zscore_user             10000
time_diff                       1588
txn_velocity                    1588
merchant_txn_count                35
merchant_unique_users             13
merchant_fraud_rate               48
device_risk_score                  3
location_risk_score                5
location_device_interaction       15
txn_count_1h                       4
txn_count_24h                     13
is_high_amount                     2
is_high_velocity                   2
user_risk_score                   15
transaction_type_purchase          2
transaction_type_top-up            2
transaction_type_transfer          2
device_type_Web                    2
device_type_iOS                    2
d

In [19]:
# train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=42,stratify=y)
print("Train Size:",X_train.shape)
print("Test Size:",X_test.shape)

Train Size: (8000, 33)
Test Size: (2000, 33)


In [20]:

models = {
    "Logistic Regression": LogisticRegression(max_iter=500, class_weight="balanced", random_state=42),
    "Random Forest": RandomForestClassifier(
        n_estimators=300, class_weight="balanced", random_state=42, n_jobs=-1
    ),
    "XGBoost": XGBClassifier(
        n_estimators=300,
        learning_rate=0.05,
        max_depth=5,
        subsample=0.8,
        colsample_bytree=0.8,
        scale_pos_weight=(y_train.value_counts()[0] / y_train.value_counts()[1]),
        eval_metric="logloss",
        random_state=42,
        n_jobs=-1
    ),
}


In [None]:
def evaluate_model(y_true, y_prob, name="Model"):
    """Custom function to print key metrics"""
    roc = roc_auc_score(y_true, y_prob)
    precision, recall, thresholds = precision_recall_curve(y_true, y_prob)
    pr_auc = auc(recall, precision)

    # Precision at top 5% (for alert prioritization)
    k = int(0.05 * len(y_prob))
    top_k_idx = np.argsort(y_prob)[-k:]
    precision_topk = y_true.iloc[top_k_idx].mean()

    print(f"\n{name}")
    print(f"ROC-AUC: {roc:.4f} | PR-AUC: {pr_auc:.4f} | Precision@Top5%: {precision_topk:.4f}")

# Loop through each model
for name, model in models.items():
    print(f"\nTraining {name} ...")
    model.fit(X_train, y_train)
    y_prob = model.predict_proba(X_test)[:, 1]
    evaluate_model(y_test, y_prob, name)



ðŸš€ Training Logistic Regression ...


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(



ðŸ“Š Logistic Regression
ROC-AUC: 0.7834 | PR-AUC: 0.5414 | Precision@Top5%: 0.2400

ðŸš€ Training Random Forest ...

ðŸ“Š Random Forest
ROC-AUC: 0.8216 | PR-AUC: 0.6273 | Precision@Top5%: 0.2700

ðŸš€ Training XGBoost ...

ðŸ“Š XGBoost
ROC-AUC: 0.8053 | PR-AUC: 0.6316 | Precision@Top5%: 0.2700


In [None]:
# Phase II

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score, precision_recall_curve, auc
import joblib
import warnings
warnings.filterwarnings("ignore")

df = pd.read_csv("../data/processed/features_enriched.csv")

y = df["is_fraud"]
X = df.drop(columns=["is_fraud", "timestamp"])

print("âœ… Data Loaded:", X.shape, "Fraud %:", round(y.mean()*100, 2))

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print("Train:", X_train.shape, " Test:", X_test.shape)


print("\nðŸš€ XGBoost Optimization Started...")

scale_pos_weight = y_train.value_counts()[0] / y_train.value_counts()[1]

xgb_param_grid = {
    "n_estimators": [200, 400, 600],
    "max_depth": [3, 5, 7],
    "learning_rate": [0.01, 0.05, 0.1],
    "subsample": [0.7, 0.9],
    "colsample_bytree": [0.7, 0.9],
    "gamma": [0, 0.1, 0.2],
}

xgb = XGBClassifier(
    random_state=42,
    eval_metric="logloss",
    scale_pos_weight=scale_pos_weight,
    use_label_encoder=False,
    n_jobs=-1,
)

xgb_search = RandomizedSearchCV(
    xgb,
    param_distributions=xgb_param_grid,
    n_iter=15,
    scoring="roc_auc",
    cv=3,
    verbose=1,
    random_state=42,
    n_jobs=-1
)

xgb_search.fit(X_train, y_train)

print("\nâœ… Best XGBoost Parameters:")
print(xgb_search.best_params_)
print(f"Best Mean CV ROC-AUC: {xgb_search.best_score_:.4f}")

best_xgb = xgb_search.best_estimator_


print("\nðŸš€ Random Forest Optimization Started...")

rf_param_grid = {
    "n_estimators": [200, 400, 600],
    "max_depth": [10, 20, None],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4],
    "max_features": ["sqrt", "log2"],
}

rf = RandomForestClassifier(class_weight="balanced", random_state=42, n_jobs=-1)

rf_search = RandomizedSearchCV(
    rf,
    param_distributions=rf_param_grid,
    n_iter=10,
    scoring="roc_auc",
    cv=3,
    verbose=1,
    random_state=42,
    n_jobs=-1
)

rf_search.fit(X_train, y_train)

print("\n Best Random Forest Parameters:")
print(rf_search.best_params_)
print(f"Best Mean CV ROC-AUC: {rf_search.best_score_:.4f}")

best_rf = rf_search.best_estimator_



def evaluate_model(y_true, y_prob, name="Model"):
    roc = roc_auc_score(y_true, y_prob)
    precision, recall, thresholds = precision_recall_curve(y_true, y_prob)
    pr_auc = auc(recall, precision)
    k = int(0.05 * len(y_prob))
    top_k_idx = np.argsort(y_prob)[-k:]
    precision_topk = y_true.iloc[top_k_idx].mean()
    print(f"{name} | ROC-AUC: {roc:.4f} | PR-AUC: {pr_auc:.4f} | Precision@Top5%: {precision_topk:.4f}")
    return roc, pr_auc, precision_topk


xgb_probs = best_xgb.predict_proba(X_test)[:, 1]
evaluate_model(y_test, xgb_probs, "ðŸŽ¯ XGBoost (Tuned)")


rf_probs = best_rf.predict_proba(X_test)[:, 1]
evaluate_model(y_test, rf_probs, "ðŸŒ² Random Forest (Tuned)")




âœ… Data Loaded: (10000, 43) Fraud %: 2.14
Train: (8000, 43)  Test: (2000, 43)

ðŸš€ XGBoost Optimization Started...
Fitting 3 folds for each of 15 candidates, totalling 45 fits

âœ… Best XGBoost Parameters:
{'subsample': 0.7, 'n_estimators': 600, 'max_depth': 7, 'learning_rate': 0.05, 'gamma': 0, 'colsample_bytree': 0.9}
Best Mean CV ROC-AUC: 0.9879

ðŸš€ Random Forest Optimization Started...
Fitting 3 folds for each of 10 candidates, totalling 30 fits

âœ… Best Random Forest Parameters:
{'n_estimators': 400, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': None}
Best Mean CV ROC-AUC: 0.9805
ðŸŽ¯ XGBoost (Tuned) | ROC-AUC: 0.9784 | PR-AUC: 0.9028 | Precision@Top5%: 0.3900
ðŸŒ² Random Forest (Tuned) | ROC-AUC: 0.9772 | PR-AUC: 0.8727 | Precision@Top5%: 0.3800


(0.9771719884493351, 0.8726805807209803, 0.38)

In [27]:
# Phase III

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score, precision_recall_curve, auc
import joblib
import warnings
warnings.filterwarnings("ignore")

df = pd.read_csv("../data/processed/features_enriched_v2.csv")

y = df["is_fraud"]
X = df.drop(columns=["is_fraud", "timestamp"])

print("âœ… Data Loaded:", X.shape, "Fraud %:", round(y.mean()*100, 2))

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print("Train:", X_train.shape, " Test:", X_test.shape)


print("\nðŸš€ XGBoost Optimization Started...")

scale_pos_weight = y_train.value_counts()[0] / y_train.value_counts()[1]

xgb_param_grid = {
    "n_estimators": [200, 400, 600],
    "max_depth": [3, 5, 7],
    "learning_rate": [0.01, 0.05, 0.1],
    "subsample": [0.7, 0.9],
    "colsample_bytree": [0.7, 0.9],
    "gamma": [0, 0.1, 0.2],
}

xgb = XGBClassifier(
    random_state=42,
    eval_metric="logloss",
    scale_pos_weight=scale_pos_weight,
    use_label_encoder=False,
    n_jobs=-1,
)

xgb_search = RandomizedSearchCV(
    xgb,
    param_distributions=xgb_param_grid,
    n_iter=15,
    scoring="roc_auc",
    cv=3,
    verbose=1,
    random_state=42,
    n_jobs=-1
)

xgb_search.fit(X_train, y_train)

print("\nâœ… Best XGBoost Parameters:")
print(xgb_search.best_params_)
print(f"Best Mean CV ROC-AUC: {xgb_search.best_score_:.4f}")

best_xgb = xgb_search.best_estimator_


print("\nðŸš€ Random Forest Optimization Started...")

rf_param_grid = {
    "n_estimators": [200, 400, 600],
    "max_depth": [10, 20, None],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4],
    "max_features": ["sqrt", "log2"],
}

rf = RandomForestClassifier(class_weight="balanced", random_state=42, n_jobs=-1)

rf_search = RandomizedSearchCV(
    rf,
    param_distributions=rf_param_grid,
    n_iter=10,
    scoring="roc_auc",
    cv=3,
    verbose=1,
    random_state=42,
    n_jobs=-1
)

rf_search.fit(X_train, y_train)

print("\n Best Random Forest Parameters:")
print(rf_search.best_params_)
print(f"Best Mean CV ROC-AUC: {rf_search.best_score_:.4f}")

best_rf = rf_search.best_estimator_



def evaluate_model(y_true, y_prob, name="Model"):
    roc = roc_auc_score(y_true, y_prob)
    precision, recall, thresholds = precision_recall_curve(y_true, y_prob)
    pr_auc = auc(recall, precision)
    k = int(0.05 * len(y_prob))
    top_k_idx = np.argsort(y_prob)[-k:]
    precision_topk = y_true.iloc[top_k_idx].mean()
    print(f"{name} | ROC-AUC: {roc:.4f} | PR-AUC: {pr_auc:.4f} | Precision@Top5%: {precision_topk:.4f}")
    return roc, pr_auc, precision_topk


xgb_probs = best_xgb.predict_proba(X_test)[:, 1]
evaluate_model(y_test, xgb_probs, "ðŸŽ¯ XGBoost (Tuned)")


rf_probs = best_rf.predict_proba(X_test)[:, 1]
evaluate_model(y_test, rf_probs, "ðŸŒ² Random Forest (Tuned)")




âœ… Data Loaded: (10000, 53) Fraud %: 2.14
Train: (8000, 53)  Test: (2000, 53)

ðŸš€ XGBoost Optimization Started...
Fitting 3 folds for each of 15 candidates, totalling 45 fits

âœ… Best XGBoost Parameters:
{'subsample': 0.7, 'n_estimators': 200, 'max_depth': 5, 'learning_rate': 0.05, 'gamma': 0, 'colsample_bytree': 0.9}
Best Mean CV ROC-AUC: 0.9883

ðŸš€ Random Forest Optimization Started...
Fitting 3 folds for each of 10 candidates, totalling 30 fits

 Best Random Forest Parameters:
{'n_estimators': 600, 'min_samples_split': 5, 'min_samples_leaf': 4, 'max_features': 'sqrt', 'max_depth': None}
Best Mean CV ROC-AUC: 0.9787
ðŸŽ¯ XGBoost (Tuned) | ROC-AUC: 0.9885 | PR-AUC: 0.8988 | Precision@Top5%: 0.3900
ðŸŒ² Random Forest (Tuned) | ROC-AUC: 0.9747 | PR-AUC: 0.8529 | Precision@Top5%: 0.3800


(0.9747358914332569, 0.8528699684950707, 0.38)