In [11]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style="whitegrid", palette="muted", font_scale=1.1)

from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, precision_recall_curve, auc, classification_report

from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
import joblib


In [12]:
# Restrict the read to the columns referenced later in the notebook
# (feature engineering inputs plus the isFraud target).
use_cols = ["step", "type", "amount", "oldbalanceOrg", "newbalanceOrig",
            "oldbalanceDest", "newbalanceDest", "isFraud"]

# Upper bound on the number of rows kept for modelling (sampled for faster processing).
SAMPLE_SIZE = 1_000_000

df = pd.read_csv("Fraud.csv", usecols=use_cols, low_memory=True)

# DataFrame.sample raises ValueError when n exceeds the number of rows
# (sampling is without replacement), so cap the request at the rows actually
# loaded. random_state keeps the subsample reproducible across runs.
df = df.sample(n=min(SAMPLE_SIZE, len(df)), random_state=42)


In [13]:
def create_features(X):
    """Return a copy of ``X`` extended with the engineered transaction features.

    The input frame is not modified. It must provide the columns
    ``step``, ``amount``, ``oldbalanceOrg``, ``newbalanceOrig``,
    ``oldbalanceDest`` and ``newbalanceDest``.

    Added columns, in this order:
        orig_balance_diff : oldbalanceOrg - newbalanceOrig
        dest_balance_diff : newbalanceDest - oldbalanceDest
        amount_log        : log1p(amount)
        hour              : step modulo 24
        is_night          : 1 when hour < 6, else 0
        orig_empty        : 1 when oldbalanceOrg == 0, else 0
        dest_empty        : 1 when newbalanceDest == 0, else 0
                            (note: based on the *new* destination balance,
                            whereas orig_empty uses the *old* origin balance)
    """
    # Computed once because both `hour` and `is_night` are derived from it.
    hour_of_day = X['step'] % 24

    # assign() works on a copy and appends the keyword columns in the order given.
    return X.assign(
        orig_balance_diff=X['oldbalanceOrg'] - X['newbalanceOrig'],
        dest_balance_diff=X['newbalanceDest'] - X['oldbalanceDest'],
        amount_log=np.log1p(X['amount']),
        hour=hour_of_day,
        is_night=(hour_of_day < 6).astype(int),
        orig_empty=(X['oldbalanceOrg'] == 0).astype(int),
        dest_empty=(X['newbalanceDest'] == 0).astype(int),
    )

# Wrap create_features in a transformer object so it can be used as the first
# step of the modelling pipeline.
feature_transformer = FunctionTransformer(create_features)


In [14]:
# Separate the target label from the predictor columns.
y = df["isFraud"]
X = df.drop(columns=["isFraud"])

# Hold out 20% of the rows for the final evaluation; stratifying on the label
# keeps the class proportions the same in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)


In [15]:
# Columns that are standard-scaled. Apart from the four raw balance columns,
# all of these are produced by create_features.
numeric_features = [
    'amount_log',
    'oldbalanceOrg',
    'newbalanceOrig',
    'oldbalanceDest',
    'newbalanceDest',
    'orig_balance_diff',
    'dest_balance_diff',
    'hour',
]

# Columns that are one-hot encoded: the transaction type plus the engineered 0/1 flags.
categorical_features = ['type', 'orig_empty', 'dest_empty', 'is_night']

numeric_transformer = Pipeline(steps=[('scaler', StandardScaler())])

# handle_unknown='ignore': categories not seen during fit do not raise at transform time.
categorical_transformer = Pipeline(
    steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))]
)

# Order matters: the output is the numeric block followed by the encoded block,
# and the feature-importance cell rebuilds column names in that same order.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features),
    ]
)


In [16]:
# Oversample the minority class with SMOTE; a float sampling_strategy is the
# requested minority/majority ratio after resampling.
minority_oversampler = SMOTE(sampling_strategy=0.1, random_state=42)

# Random forest with shallow-ish trees and large leaves, class-weighted on top
# of the oversampling step.
rf_classifier = RandomForestClassifier(
    n_estimators=200,
    max_depth=15,
    min_samples_leaf=50,
    class_weight='balanced',
    n_jobs=-1,
    random_state=42,
)

# imblearn's Pipeline is used (rather than sklearn's) because it accepts a
# resampling step between the preprocessing and the classifier.
pipeline = ImbPipeline(
    steps=[
        ('feature_engineering', feature_transformer),
        ('preprocessor', preprocessor),
        ('smote', minority_oversampler),
        ('classifier', rf_classifier),
    ]
)


In [17]:
# Shuffled, stratified 5-fold splitter with a fixed seed so folds are reproducible.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# The whole pipeline (feature engineering, preprocessing, SMOTE, classifier) is
# passed as the estimator, so it is re-fitted from scratch on every fold's
# training part and scored by ROC-AUC on the held-out part.
cv_scores = cross_val_score(
    estimator=pipeline,
    X=X_train,
    y=y_train,
    scoring='roc_auc',
    cv=cv,
    n_jobs=-1,
)

print("5-Fold CV ROC-AUC scores:", cv_scores)
print("Mean CV ROC-AUC:", cv_scores.mean())


5-Fold CV ROC-AUC scores: [0.99914288 0.99986446 0.99953146 0.9995063  0.99976248]
Mean CV ROC-AUC: 0.9995615161967407


In [18]:
# This cell used to be a verbatim copy of the previous cross-validation cell:
# same seeded splitter, same pipeline, same data and scoring. Re-running it
# costs five more full pipeline fits and reproduces the same numbers, so the
# scores computed by the previous cell (`cv_scores`) are reported instead.
print("5-Fold CV ROC-AUC scores:", cv_scores)
print("Mean CV ROC-AUC:", cv_scores.mean())


5-Fold CV ROC-AUC scores: [0.99914288 0.99986446 0.99953146 0.9995063  0.99976248]
Mean CV ROC-AUC: 0.9995615161967407


In [None]:
# Fit the full pipeline on the training partition only; X_test / y_test stay
# untouched for the hold-out evaluation below.
pipeline.fit(X_train, y_train)


In [None]:
# Second predict_proba column is used as the fraud score
# (assumes isFraud is encoded 0/1 so that column 1 is the positive class).
y_pred_proba = pipeline.predict_proba(X_test)[:, 1]

# Threshold-free metrics on the hold-out set.
roc_auc = roc_auc_score(y_test, y_pred_proba)
precision, recall, _ = precision_recall_curve(y_test, y_pred_proba)
pr_auc = auc(recall, precision)  # area under the precision-recall curve

print(f"Test ROC-AUC: {roc_auc:.3f}")
print(f"PR-AUC: {pr_auc:.3f}")

# Hard labels at a deliberately high cut-off: only scores above 0.9 count as fraud.
y_pred = np.where(y_pred_proba > 0.9, 1, 0)
print(classification_report(y_test, y_pred))


In [None]:
# Pull the fitted pieces out of the pipeline.
preproc = pipeline.named_steps['preprocessor']
rf = pipeline.named_steps['classifier']
ohe = preproc.named_transformers_['cat']['onehot']

# Rebuild the column names in the order the ColumnTransformer emits them:
# numeric block first, then the one-hot encoded categorical block.
cat_features = ohe.get_feature_names_out(categorical_features)
all_features = numeric_features + list(cat_features)

importances = pd.Series(
    rf.feature_importances_, index=all_features
).sort_values(ascending=False)

# Plot only the 15 highest-ranked features.
top_importances = importances.iloc[:15]

fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(
    x=top_importances.values,
    y=top_importances.index,
    palette='viridis',
    ax=ax,
)
ax.set_title("Top 15 Features by Importance")
plt.show()


In [None]:
# NOTE(review): the pipeline embeds FunctionTransformer(create_features), and
# pickle stores plain functions by reference (module + name). create_features
# is defined in this notebook, so whatever process loads this file must be able
# to resolve the same function - confirm before treating the artifact as
# deployable.
model_path = "fraud_detection_advanced_pipeline.pkl"
joblib.dump(pipeline, model_path)
print("Advanced pipeline saved! Ready for deployment.")
