# Fraud Detection — End-to-End Solution

**Goal:** Build a model that predicts fraudulent transactions and provide business insights and an action plan.

Dataset: fraud.csv (6.36M rows, 11 columns)

Target: `isFraud`

In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import roc_auc_score, precision_recall_curve, auc, classification_report, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

# Single seed reused by the split and the models below so reruns are repeatable.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

# Allow wide frames to be displayed without pandas truncating columns.
pd.set_option("display.max_columns", 200)

In [None]:
# Read the transactions file into memory, report its dimensions, and preview
# the first rows as the cell output.
df = pd.read_csv("fraud.csv")
dataset_shape = df.shape
print("Shape:", dataset_shape)
df.head()

In [None]:
# Relative frequency of each isFraud value (normalize=True yields fractions
# rather than raw counts).
class_share = df['isFraud'].value_counts(normalize=True)
print(class_share)

In [None]:
# Remove the two account-identifier columns and isFlaggedFraud (treated in this
# notebook as not predictive) before modelling.
df_model = df.drop(columns=['nameOrig', 'nameDest', 'isFlaggedFraud'])

# Separate the target from the feature matrix.
y = df_model['isFraud']
X = df_model.drop(columns=['isFraud'])

# Stratified 80/20 train/validation split, so both parts keep the same
# isFraud class proportions; seeded for repeatability.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=RANDOM_SEED,
)

# Column groups routed to the two preprocessing branches below.
categorical_features = ['type']
numeric_features = ['step','amount','oldbalanceOrg','newbalanceOrig','oldbalanceDest','newbalanceDest']

# Numeric branch: fill gaps with the column median, then standardise.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical branch: fill gaps with the most frequent value, then one-hot
# encode; categories unseen during fit are ignored instead of raising.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Apply each branch to its own column group.
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

In [None]:
from sklearn.metrics import average_precision_score
from sklearn.pipeline import Pipeline as SkPipeline

# Baseline model: logistic regression on top of the shared preprocessing.
# class_weight='balanced' reweights the classes inversely to their frequency.
lr = LogisticRegression(max_iter=1000, class_weight='balanced', random_state=RANDOM_SEED)
lr_pipe = SkPipeline(steps=[('pre', preprocessor), ('clf', lr)])

print("Training Logistic Regression...")
lr_pipe.fit(X_train, y_train)

# Column 1 of predict_proba is the second entry of classes_, i.e. the fraud
# class when isFraud is coded 0/1.
y_val_proba = lr_pipe.predict_proba(X_val)[:,1]
print("ROC AUC:", roc_auc_score(y_val, y_val_proba))

precision, recall, _ = precision_recall_curve(y_val, y_val_proba)
print("PR AUC:", auc(recall, precision))
# The trapezoidal area above linearly interpolates between curve points and
# can be too optimistic; average precision is the non-interpolated summary
# of the same curve, so report it alongside.
print("Average precision:", average_precision_score(y_val, y_val_proba))

In [None]:
from sklearn.base import clone
from sklearn.metrics import average_precision_score

# Random forest with 200 trees, trained on all available cores.
# class_weight='balanced' reweights the classes inversely to their frequency.
rf = RandomForestClassifier(n_estimators=200, class_weight='balanced', n_jobs=-1, random_state=RANDOM_SEED)
# Pipeline does not copy its steps, so passing `preprocessor` directly would
# make this fit re-fit the very transformer object embedded in lr_pipe.
# An unfitted clone with identical parameters keeps the two pipelines independent.
rf_pipe = SkPipeline(steps=[('pre', clone(preprocessor)), ('clf', rf)])

print("Training RandomForest...")
rf_pipe.fit(X_train, y_train)

# Column 1 of predict_proba is the second entry of classes_, i.e. the fraud
# class when isFraud is coded 0/1.
y_val_proba_rf = rf_pipe.predict_proba(X_val)[:,1]
print("ROC AUC:", roc_auc_score(y_val, y_val_proba_rf))

precision, recall, _ = precision_recall_curve(y_val, y_val_proba_rf)
print("PR AUC:", auc(recall, precision))
# The trapezoidal area above linearly interpolates between curve points and
# can be too optimistic; average precision is the non-interpolated summary
# of the same curve, so report it alongside.
print("Average precision:", average_precision_score(y_val, y_val_proba_rf))

In [None]:
import seaborn as sns

def eval_at_threshold(y_true, y_proba, threshold=0.5):
    """Print a classification report and plot the confusion matrix at a cut-off.

    Args:
        y_true: True binary labels, coded 0/1.
        y_proba: Predicted positive-class scores, one per entry of ``y_true``;
            any array-like (list, NumPy array, pandas Series).
        threshold: Scores greater than or equal to this value are predicted
            as class 1, all others as class 0.
    """
    # Coerce to an array so plain lists/tuples support the element-wise compare.
    y_pred = (np.asarray(y_proba) >= threshold).astype(int)
    print(classification_report(y_true, y_pred, digits=4))
    # Pin the label order: without it the matrix shrinks to 1x1 whenever only
    # one class occurs in both y_true and y_pred (e.g. a very high threshold
    # on a fraud-free slice), which makes the heatmap cells ambiguous.
    cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
    plt.xlabel("Predicted")
    plt.ylabel("True")
    plt.show()

# Report and confusion matrix for the random forest validation scores at the
# 0.5 cut-off.
eval_at_threshold(y_true=y_val, y_proba=y_val_proba_rf, threshold=0.5)