# Credit Card Fraud Detection — Python Reproduction & Optimization

In [None]:

# --- Run configuration -------------------------------------------------------
# Path of the input CSV read by pd.read_csv below; change it to point at your copy.
DATA_PATH: str = "/mnt/data/your_creditcard.csv"   # <-- CHANGE THIS
# Name of the label column; it is split off from the features and cast to int.
TARGET_COL: str = "Class"
# Columns to drop from the frame before modelling (nothing is dropped when empty).
ID_COLS: list = []

import os, warnings, numpy as np, pandas as pd, matplotlib.pyplot as plt
warnings.filterwarnings("ignore")

from sklearn.model_selection import train_test_split, StratifiedKFold, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score, average_precision_score, precision_recall_curve, roc_curve, confusion_matrix, classification_report
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.calibration import CalibratedClassifierCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils.class_weight import compute_class_weight
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
import shap, joblib

# Load the dataset and sanity-check it before any modelling.
# Explicit raises are used instead of `assert` so these checks still run when
# Python is started with -O (which strips assert statements).
if not os.path.exists(DATA_PATH):
    raise FileNotFoundError(f"Dataset not found: {DATA_PATH}")
df = pd.read_csv(DATA_PATH)
print("Shape:", df.shape)
print(df.head())
if TARGET_COL not in df.columns:
    raise KeyError(
        f"Target column {TARGET_COL!r} not found in dataset; "
        f"available columns: {list(df.columns)}"
    )
# Class counts of the label column, to show how imbalanced the data is.
print(df[TARGET_COL].value_counts())

# Drop configured identifier columns; names not present in the frame are ignored.
if ID_COLS:
    df = df.drop(columns=ID_COLS, errors="ignore")

# Names of the numeric columns, with the label column excluded.
num_cols = [
    col
    for col in df.select_dtypes(include=[np.number]).columns.tolist()
    if col != TARGET_COL
]

# Label vector (cast to int) and feature matrix (everything except the label).
y = df[TARGET_COL].astype(int)
X = df.drop(columns=[TARGET_COL])

# Stratified 80/20 split with a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Standard-scale every numeric training column; remaining columns are dropped
# (remainder="drop").
numeric_features = X_train.select_dtypes(include=[np.number]).columns.tolist()
preprocess = ColumnTransformer(
    [("num", StandardScaler(), numeric_features)],
    remainder="drop",
)

# Baseline model: logistic regression with "balanced" class weights computed
# from the training labels and passed in as an explicit {class: weight} mapping.
classes = np.unique(y_train)
cw = dict(zip(classes, compute_class_weight("balanced", classes=classes, y=y_train)))
log_reg = LogisticRegression(max_iter=2000, class_weight=cw)

pipe_lr = Pipeline([("prep", preprocess), ("clf", log_reg)])
pipe_lr.fit(X_train, y_train)

# Probability of class 1 on the held-out test set.
proba_lr = pipe_lr.predict_proba(X_test)[:, 1]
print(
    "LR ROC-AUC, PR-AUC:",
    roc_auc_score(y_test, proba_lr),
    average_precision_score(y_test, proba_lr),
)

# Random forest candidate. SMOTE sits inside the pipeline (after preprocessing)
# rather than being applied to X_train up front.
rf = RandomForestClassifier(n_estimators=300, n_jobs=-1, random_state=42)
imb_pipe_rf = ImbPipeline(
    [
        ("prep", preprocess),
        ("smote", SMOTE(random_state=42)),
        ("model", rf),
    ]
)

# Hyper-parameter candidates for the "model" step of the pipeline.
param_dist = {
    "model__n_estimators": [200, 300, 500],
    "model__max_depth": [None, 8, 12, 16, 24],
    "model__min_samples_split": [2, 5, 10],
    "model__min_samples_leaf": [1, 2, 4],
    "model__max_features": ["sqrt", "log2", None],
}

# 12 random configurations, each scored by average precision over 5 stratified folds.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
search_rf = RandomizedSearchCV(
    imb_pipe_rf,
    param_distributions=param_dist,
    n_iter=12,
    scoring="average_precision",
    n_jobs=-1,
    cv=cv,
    random_state=42,
    verbose=1,
)
search_rf.fit(X_train, y_train)

# Evaluate the best pipeline found by the search on the test set.
best_rf = search_rf.best_estimator_
proba_rf = best_rf.predict_proba(X_test)[:, 1]
print(
    "RF ROC-AUC, PR-AUC:",
    roc_auc_score(y_test, proba_rf),
    average_precision_score(y_test, proba_rf),
)

# Calibration on the better of LR vs RF.
# The winner is chosen by cross-validated PR-AUC on the TRAINING data. Choosing
# it by test-set score would leak the hold-out into model selection and make
# the test metrics printed below optimistically biased.
from sklearn.model_selection import cross_val_score

# Mean CV average precision of the best RF configuration (already computed by the search).
cv_ap_rf = search_rf.best_score_
# Same metric and same folds for the LR pipeline; cross_val_score fits clones,
# so the already-fitted pipe_lr is left untouched.
cv_ap_lr = cross_val_score(
    pipe_lr, X_train, y_train, scoring="average_precision", cv=cv
).mean()
print("CV PR-AUC (LR, RF):", cv_ap_lr, cv_ap_rf)

use_rf = cv_ap_rf >= cv_ap_lr
base = best_rf if use_rf else pipe_lr

# Isotonic calibration with 5-fold internal CV, fitted on the training data only.
cal = CalibratedClassifierCV(base, method="isotonic", cv=5)
cal.fit(X_train, y_train)

# Calibrated probability of class 1 on the test set.
proba_cal = cal.predict_proba(X_test)[:, 1]
print(
    "Calibrated ROC-AUC, PR-AUC:",
    roc_auc_score(y_test, proba_cal),
    average_precision_score(y_test, proba_cal),
)

# Threshold tuning with a cost matrix
import numpy as np

# Per-error costs: a false negative is weighted 25x as heavily as a false positive.
COST_FN, COST_FP = 25.0, 1.0


def expected_cost(y_true, y_prob, t):
    """Return the total misclassification cost at decision threshold ``t``.

    A sample is predicted positive when ``y_prob >= t``. The cost is
    ``false_positives * COST_FP + false_negatives * COST_FN``.

    Args:
        y_true: Array-like of binary labels, where 1 marks the positive class.
        y_prob: Array-like of predicted positive-class probabilities.
        t: Decision threshold (inclusive).

    Returns:
        The total cost as a float.
    """
    # Errors are counted directly rather than unpacking a confusion matrix:
    # a confusion matrix collapses to 1x1 when labels and predictions hold a
    # single shared class, and unpacking four cells from it then fails.
    actual_pos = np.asarray(y_true) == 1
    flagged = np.asarray(y_prob) >= t
    fp = int(np.count_nonzero(flagged & ~actual_pos))
    fn = int(np.count_nonzero(~flagged & actual_pos))
    return fp * COST_FP + fn * COST_FN

# Pick the decision threshold on out-of-fold predictions from the TRAINING data
# and only then evaluate it on the test set. Tuning the threshold on the test
# set itself would make the confusion matrix and report below optimistically
# biased, because the same data would be used to choose and to judge it.
from sklearn.model_selection import cross_val_predict

# cross_val_predict refits a clone of `cal` on each training fold, so every
# training row is scored by a model that never saw it. This costs extra fits.
proba_oof = cross_val_predict(
    cal, X_train, y_train, cv=cv, method="predict_proba"
)[:, 1]

thresholds = np.linspace(0.01, 0.99, 99)
costs = [expected_cost(y_train, proba_oof, t) for t in thresholds]
best_idx = int(np.argmin(costs))
best_t = thresholds[best_idx]
print("Best threshold:", best_t, "Min expected cost (train, out-of-fold):", costs[best_idx])

# Unbiased hold-out evaluation at the chosen threshold.
print("Expected cost on test at best threshold:", expected_cost(y_test, proba_cal, best_t))
y_pred_best = (proba_cal >= best_t).astype(int)
print("Confusion matrix at best threshold:\n", confusion_matrix(y_test, y_pred_best))
print(classification_report(y_test, y_pred_best, digits=4))

# Save artifacts
import os, json

# Output directory; created if it does not exist yet.
out_dir = "/mnt/data/cc_fraud_outputs"
os.makedirs(out_dir, exist_ok=True)

# Persist the calibrated model and the preprocessing transformer with joblib.
for artifact, filename in (
    (cal, "model_calibrated.joblib"),
    (preprocess, "preprocess.joblib"),
):
    joblib.dump(artifact, os.path.join(out_dir, filename))

# Store the chosen decision threshold alongside the model files.
metadata = {"best_threshold": float(best_t)}
metadata_path = os.path.join(out_dir, "metadata.json")
with open(metadata_path, "w") as f:
    f.write(json.dumps(metadata, indent=2))

print("Saved to:", out_dir)
