In [15]:
# ================================================================
#  MOBILE PAYMENT FRAUD DETECTION - SIMPLE MODEL TRAINING PIPELINE
# ================================================================

import joblib
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    auc,
    average_precision_score,
    classification_report,
    precision_recall_curve,
    roc_auc_score,
)
from sklearn.model_selection import train_test_split

# ================================================================
# STEP 1: LOAD THE FINAL FEATURES DATASET
# ================================================================
df = pd.read_csv("features_engineered.csv")

# Separate the target variable from the model inputs.
y = df["is_fraud"]

# Besides the target, three more columns are removed from the inputs:
# the raw timestamp string, the transaction_id identifier and the
# location name (text values such as city names).
X = df.drop(columns=["is_fraud", "timestamp", "transaction_id", "location"])

# Guard against a stray saved-index column: a CSV written with the pandas
# default index=True reloads with an extra "Unnamed: N" column that is just
# the old row number and must not be used as a feature. This is a no-op when
# the file has no such column.
# NOTE(review): confirm whether features_engineered.csv actually contains
# "Unnamed: 0"; if it does, the feature count printed below drops by one.
X = X.loc[:, ~X.columns.str.startswith("Unnamed:")]

print("✅ Data Loaded")
print("Shape of data:", X.shape)
print("Fraud percentage:", round(y.mean() * 100, 2), "%")

# ================================================================
# STEP 2: SPLIT INTO TRAIN AND TEST SETS
# ================================================================
# Hold out 20% of the rows for testing. stratify=y asks for roughly the same
# class proportions in both partitions; random_state pins the shuffle so the
# split is repeatable.
# NOTE(review): this is a random split, not a time-ordered one - confirm that
# is acceptable for how the features were built.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

print("Train size:", X_train.shape, "| Test size:", X_test.shape)

# ================================================================
# STEP 3: TRAIN A RANDOM FOREST MODEL
# ================================================================
# Baseline classifier for this tabular dataset: a random forest.
model = RandomForestClassifier(
    n_jobs=-1,                # run on every available CPU core
    class_weight="balanced",  # weight classes inversely to their frequency
    n_estimators=200,         # how many trees make up the forest
    max_depth=None,           # no cap on tree depth
    random_state=42,          # fixed seed so training is repeatable
)

# Fit on the training partition only; the test partition stays unseen.
model.fit(X_train, y_train)

print("✅ Model Training Complete")

# ================================================================
# STEP 4: EVALUATE THE MODEL
# ================================================================
# Get fraud probabilities (not just 0/1). Column 1 of predict_proba is the
# second entry of model.classes_, i.e. is_fraud == 1 for 0/1 labels.
y_prob = model.predict_proba(X_test)[:, 1]

# ROC-AUC score (measures how well we rank frauds)
roc = roc_auc_score(y_test, y_prob)
print(f"ROC-AUC Score: {roc:.4f}")

# Precision-Recall Curve (important for fraud).
# auc() integrates the curve with the trapezoidal rule, i.e. it linearly
# interpolates between operating points, which can be too optimistic.
precision, recall, thresholds = precision_recall_curve(y_test, y_prob)
pr_auc = auc(recall, precision)
print(f"PR-AUC Score: {pr_auc:.4f}")

# Average precision summarises the same curve without interpolation, so it is
# reported next to the trapezoidal figure as the more conservative number.
avg_precision = average_precision_score(y_test, y_prob)
print(f"Average Precision: {avg_precision:.4f}")

# Print classification report at default threshold = 0.5
y_pred = (y_prob >= 0.5).astype(int)
print("\nClassification Report:")
print(classification_report(y_test, y_pred, digits=4))

# ================================================================
# STEP 5: SAVE THE MODEL
# ================================================================
# Persist the fitted forest to the working directory with joblib.
# The file is pickle-based, so only load it back from a trusted location.
joblib.dump(value=model, filename="fraud_rf_model.pkl")
print("✅ Model saved as fraud_rf_model.pkl")

# ================================================================
# STEP 6: SCORE NEW TRANSACTIONS (Example)
# ================================================================
# Take the first test row as an example. The list indexer iloc[[0]] keeps it
# as a one-row DataFrame, so the column names and per-column dtypes the model
# was fitted with are passed through unchanged. Converting the row to a bare
# NumPy array would drop the names and make scikit-learn warn that X does not
# have valid feature names.
sample = X_test.iloc[[0]]

# Row 0, column 1: the fraud-class probability for that single transaction.
sample_prob = model.predict_proba(sample)[0][1]

print("\nExample Fraud Probability for one transaction:")
print(f"Risk Score = {sample_prob:.3f} (1.0 = high fraud risk)")


✅ Data Loaded
Shape of data: (10000, 26)
Fraud percentage: 2.14 %
Train size: (8000, 26) | Test size: (2000, 26)
✅ Model Training Complete
ROC-AUC Score: 0.7709
PR-AUC Score: 0.6331

Classification Report:
              precision    recall  f1-score   support

           0     0.9914    1.0000    0.9957      1957
           1     1.0000    0.6047    0.7536        43

    accuracy                         0.9915      2000
   macro avg     0.9957    0.8023    0.8746      2000
weighted avg     0.9916    0.9915    0.9905      2000

✅ Model saved as fraud_rf_model.pkl

Example Fraud Probability for one transaction:
Risk Score = 0.005 (1.0 = high fraud risk)


In [14]:
# Preview the first five rows of the loaded frame, including the columns that
# are dropped before training.
# NOTE(review): this cell's execution count is lower than the training cell's
# although it sits below it; it needs `df` from the loading step, so re-check
# with Restart & Run All.
df.head(n=5)

Unnamed: 0,transaction_id,timestamp,user_id,amount,merchant_id,location,device_type,transaction_type,is_fraud,hour,...,user_txn_sum,merchant_txn_count,merchant_avg_amount,merchant_std_amount,merchant_fraud_rate,hour_fraud_rate,dow_fraud_rate,user_merchant_ratio,unusual_amount_flag,merchant_unique_users
0,TXN009738,2025-05-05 01:02:00,1000,3518.44,2027,Kolkata,0.023056,0.023336,0,1,...,486639.3,204,5194.839755,3318.624361,0.02451,0.019048,0.020371,0.482927,0,90
1,TXN003367,2025-05-05 05:33:00,1000,7672.91,2009,Kolkata,0.015703,0.026379,0,5,...,486639.3,187,5606.06385,4903.170978,0.048128,0.034014,0.020371,0.526596,0,84
2,TXN008530,2025-05-05 07:58:00,1000,3881.75,2027,Bangalore,0.015703,0.023336,0,7,...,486639.3,204,5194.839755,3318.624361,0.02451,0.016055,0.020371,0.482927,0,90
3,TXN000201,2025-05-05 10:04:00,1000,5997.26,2013,Bangalore,0.015927,0.021967,0,10,...,486639.3,181,5122.96453,3586.81026,0.022099,0.016317,0.020371,0.543956,0,84
4,TXN002371,2025-05-05 10:08:00,1000,3780.48,2018,Kolkata,0.024601,0.020039,0,10,...,486639.3,212,5284.337877,3146.784533,0.009434,0.016317,0.020371,0.464789,0,89
