In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and print the full path of every file found.
for path in (
    os.path.join(folder, entry)
    for folder, _subdirs, entries in os.walk('/kaggle/input')
    for entry in entries
):
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [1]:
# Credit Card Fraud Detection – Single Kaggle Notebook
# Concepts: Imbalanced datasets, anomaly detection, precision-recall metrics
# Models: Logistic Regression, Random Forest, XGBoost, Isolation Forest baseline

# =========================
# 1) Imports & Setup
# =========================
# os / numpy / pandas were already imported by the first cell; re-importing is harmless
# and keeps this cell runnable on its own.
import os, sys, warnings
# Silences every warning category for the rest of the session (deprecations included).
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd

# NOTE(review): StratifiedKFold, Pipeline and ColumnTransformer are not referenced in the
# visible part of this notebook — confirm whether a later section needs them.
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    roc_auc_score,
    average_precision_score,
    precision_recall_curve,
    roc_curve,
    classification_report,
    confusion_matrix
)
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, IsolationForest
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# NOTE(review): the recorded run of this notebook ended with
# "ModuleNotFoundError: No module named 'sklearn.utils._metadata_requests'".
# Presumably that is raised while importing imblearn against an older scikit-learn
# (version mismatch) — confirm in the target environment and pin compatible versions.
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline  # pipeline class used below so SMOTE can be a step

import matplotlib.pyplot as plt

# XGBoost
from xgboost import XGBClassifier

# Single seed reused for the train/validation split, SMOTE and every model below.
RANDOM_STATE = 42
# Also seed NumPy's legacy global generator.
np.random.seed(RANDOM_STATE)

# =========================
# 2) Load Data
# =========================
# Adjust if your dataset path is different
DATA_PATH = "/kaggle/input/creditcardfraud/creditcard.csv"

# Fail fast with an actionable message when the CSV is not where we expect it.
if not os.path.exists(DATA_PATH):
    raise FileNotFoundError(
        f"Dataset not found at {DATA_PATH}. "
        "In Kaggle, add the dataset 'Credit Card Fraud Detection' to the notebook, "
        "or update DATA_PATH."
    )

# Read the whole file into memory and show its size plus the first five rows.
df = pd.read_csv(DATA_PATH)
print("Shape:", df.shape)
print(df.head(5))

# Everything downstream indexes by the label column, so verify it exists up front.
assert "Class" in df.columns, "Expected target column 'Class' not found."
target = "Class"

# =========================
# 3) Quick EDA: Class Imbalance
# =========================
# Rows per label, ordered by label value (0 first, then 1).
class_counts = df[target].value_counts().sort_index()
# Share of rows labelled 1 among all rows.
fraud_ratio = class_counts[1] / class_counts.sum()

print("\nClass distribution:", class_counts, sep="\n")
print(f"Fraud ratio: {fraud_ratio:.6f} ({fraud_ratio*100:.4f}%)")

# Bar chart of the two class counts.
plt.figure()
class_ax = class_counts.plot(kind="bar")
class_ax.set_title("Class Distribution (0=Legit, 1=Fraud)")
class_ax.set_xlabel("Class")
class_ax.set_ylabel("Count")
plt.show()

# =========================
# 4) Train/Validation Split
# =========================
# Features are every column except the label; the label is cast to plain integers.
y = df[target].astype(int)
X = df.drop(columns=target)

# Every feature column is treated as numeric.
num_cols = list(X.columns)

# 80/20 split that keeps the class proportions identical in both parts.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=RANDOM_STATE,
)

print(f"\nTrain shape: {X_train.shape}, Valid shape: {X_valid.shape}")
for split_label, split_y in (("Train", y_train), ("Valid", y_valid)):
    print(f"{split_label} class balance:", np.bincount(split_y))

# =========================
# 5) Helpers: Metrics & Threshold Tuning
# =========================
def evaluate_probs(y_true, y_prob, name="model", plot_curves=True):
    """Summarise how well predicted scores rank the positive class.

    Computes ROC-AUC and average precision (reported as "pr_auc") for the given
    labels and scores. When `plot_curves` is true, a precision-recall figure and
    a ROC figure are shown, each titled with `name` and its metric.

    Returns a dict with the keys "model", "roc_auc" and "pr_auc".
    """
    auc_value = roc_auc_score(y_true, y_prob)
    ap_value = average_precision_score(y_true, y_prob)
    summary = {"model": name, "roc_auc": auc_value, "pr_auc": ap_value}

    # Nothing to draw: hand back the metrics straight away.
    if not plot_curves:
        return summary

    # Precision-recall curve.
    prec, rec, _ = precision_recall_curve(y_true, y_prob)
    _, pr_ax = plt.subplots()
    pr_ax.plot(rec, prec)
    pr_ax.set_xlabel("Recall")
    pr_ax.set_ylabel("Precision")
    pr_ax.set_title(f"Precision-Recall Curve: {name} (AP={ap_value:.4f})")
    plt.show()

    # ROC curve, with the chance diagonal drawn dashed for reference.
    false_pos_rate, true_pos_rate, _ = roc_curve(y_true, y_prob)
    _, roc_ax = plt.subplots()
    roc_ax.plot(false_pos_rate, true_pos_rate)
    roc_ax.plot([0,1],[0,1], linestyle="--")
    roc_ax.set_xlabel("FPR")
    roc_ax.set_ylabel("TPR")
    roc_ax.set_title(f"ROC Curve: {name} (AUC={auc_value:.4f})")
    plt.show()

    return summary

def tune_threshold_for_precision(y_true, y_prob, min_precision=0.90):
    """Pick a decision threshold that meets a precision target while keeping the most recall.

    Among all candidate thresholds whose precision is at least `min_precision`,
    the lowest one is returned: recall can only shrink as the threshold rises,
    so the lowest viable threshold is the one with the highest recall. If no
    threshold reaches the target, the threshold that maximizes F1 is returned.

    Parameters
    ----------
    y_true : array-like
        Ground-truth binary labels.
    y_prob : array-like
        Scores for the positive class (higher means more likely positive).
    min_precision : float, default 0.90
        Precision the chosen operating point must reach.

    Returns
    -------
    tuple
        ``(threshold, {"precision": p, "recall": r})`` where ``p`` and ``r`` are the
        curve values obtained when predicting positive for ``y_prob >= threshold``.
    """
    precision, recall, thresholds = precision_recall_curve(y_true, y_prob)

    # precision/recall have one more entry than thresholds: a trailing
    # (precision=1, recall=0) point with no threshold attached. Dropping it keeps
    # index i of all three arrays on the same cut-off, and stops that artificial
    # point from always "satisfying" the precision target with zero recall.
    precision, recall = precision[:-1], recall[:-1]

    # Find thresholds where precision >= min_precision
    mask = precision >= min_precision

    if mask.any():
        # thresholds are sorted in increasing order, so the first viable entry is the
        # lowest threshold and therefore the viable point with the highest recall.
        return thresholds[mask][0], {"precision": precision[mask][0], "recall": recall[mask][0]}

    # Target not reachable: fall back to the best F1 (epsilon avoids 0/0).
    f1s = (2 * precision * recall) / (precision + recall + 1e-12)
    idx = np.nanargmax(f1s)
    return thresholds[idx], {"precision": precision[idx], "recall": recall[idx]}

def print_confusion_and_report(y_true, y_pred):
    """Print the confusion matrix, then the classification report with 4-digit precision."""
    matrix = confusion_matrix(y_true, y_pred)
    print("Confusion Matrix:\n", matrix)

    report_text = classification_report(y_true, y_pred, digits=4)
    print("\nClassification Report:\n", report_text)

# =========================
# 6) Baseline Anomaly Detection (Isolation Forest)
#    Trained on majority class only
# =========================
# IsolationForest scores: negative values anomalous; we convert to probabilities-ish via min-max scaling
iso = IsolationForest(
    random_state=RANDOM_STATE,
    contamination=fraud_ratio,  # share of class-1 rows computed from the class counts
    n_estimators=200,
)

# Train on the legitimate (label 0) training rows only.
legit_mask = y_train == 0
X_train_legit = X_train[legit_mask]
iso.fit(X_train_legit)

# Negate the decision function so that larger values mean "more anomalous".
iso_scores = -iso.decision_function(X_valid)

# Min-max rescale onto [0, 1]; the small epsilon guards against a zero range.
iso_lo, iso_hi = iso_scores.min(), iso_scores.max()
iso_prob = (iso_scores - iso_lo) / (iso_hi - iso_lo + 1e-9)

# Start the shared results list with the baseline's metrics.
res_table = []
iso_result = evaluate_probs(y_valid, iso_prob, name="IsolationForest", plot_curves=True)
res_table.append(iso_result)

# =========================
# 7) Logistic Regression (with Standardization + SMOTE)
# =========================
logreg_pipeline = ImbPipeline([
    # Divide by the standard deviation without centering.
    ("scaler", StandardScaler(with_mean=False)),
    # Oversample the minority class with SMOTE.
    ("smote", SMOTE(k_neighbors=5, sampling_strategy="auto", random_state=RANDOM_STATE)),
    # Unweighted logistic regression fitted with liblinear.
    (
        "clf",
        LogisticRegression(
            solver="liblinear",
            max_iter=500,
            class_weight=None,
            random_state=RANDOM_STATE,
        ),
    ),
])

logreg_pipeline.fit(X_train, y_train)

# Column 1 of predict_proba is the score for class 1.
logreg_prob = logreg_pipeline.predict_proba(X_valid)[:, 1]
logreg_result = evaluate_probs(y_valid, logreg_prob, name="LogReg+SMOTE", plot_curves=True)
res_table.append(logreg_result)

# =========================
# 8) Random Forest (with SMOTE)
# =========================
rf_pipeline = ImbPipeline([
    # Oversample the minority class with SMOTE before the forest sees the data.
    ("smote", SMOTE(k_neighbors=5, sampling_strategy="auto", random_state=RANDOM_STATE)),
    # 200 unrestricted-depth trees, trained on all available cores.
    (
        "clf",
        RandomForestClassifier(
            random_state=RANDOM_STATE,
            n_jobs=-1,
            max_depth=None,
            n_estimators=200,
        ),
    ),
])

rf_pipeline.fit(X_train, y_train)

# Column 1 of predict_proba is the score for class 1.
rf_prob = rf_pipeline.predict_proba(X_valid)[:, 1]
rf_result = evaluate_probs(y_valid, rf_prob, name="RandomForest+SMOTE", plot_curves=True)
res_table.append(rf_result)

# =========================
# 9) XGBoost (scale_pos_weight instead of SMOTE)
#    scale_pos_weight ~ (neg / pos) helps skewed data
# =========================
# Count training rows per label; the negative/positive ratio becomes the positive-class weight.
neg, pos = np.bincount(y_train)
scale_pos_weight = neg / max(pos, 1)  # max(..., 1) avoids dividing by zero

xgb = XGBClassifier(
    # Task and evaluation
    objective="binary:logistic",
    eval_metric="aucpr",  # precision-recall AUC as the evaluation metric
    scale_pos_weight=scale_pos_weight,
    # Tree ensemble shape
    n_estimators=300,
    max_depth=6,
    learning_rate=0.05,
    tree_method="hist",
    # Regularisation / sampling
    subsample=0.9,
    colsample_bytree=0.9,
    reg_lambda=1.0,
    # Runtime
    n_jobs=-1,
    random_state=RANDOM_STATE,
)

xgb.fit(X_train, y_train)

# Column 1 of predict_proba is the score for class 1.
xgb_prob = xgb.predict_proba(X_valid)[:, 1]
xgb_result = evaluate_probs(y_valid, xgb_prob, name="XGBoost(spw)", plot_curves=True)
res_table.append(xgb_result)

# =========================
# 10) Compare Models by PR-AUC (Average Precision)
# =========================
# One row per evaluated model, best average precision first.
results_df = pd.DataFrame(res_table).sort_values(by="pr_auc", ascending=False)
print("\nModel comparison (sorted by PR-AUC):", results_df, sep="\n")

# =========


ModuleNotFoundError: No module named 'sklearn.utils._metadata_requests'