In [1]:
# Credit Card Fraud Detection
# ===============================

import pandas as pd
import numpy as np
import joblib

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix

from xgboost import XGBClassifier

In [2]:
df = pd.read_csv("credit_card_dataset.csv")
print(df.head())

   TransactionID TransactionDate   Amount  MerchantID TransactionType  \
0              1         15:35.5  4189.27         688          refund   
1              2         20:35.5  2659.71         109          refund   
2              3         08:35.5   784.00         394        purchase   
3              4         50:35.5  3514.40         944        purchase   
4              5         51:35.5   369.07         475        purchase   

       Location  IsFraud  
0   San Antonio        0  
1        Dallas        0  
2      New York        0  
3  Philadelphia        0  
4       Phoenix        0  


In [3]:
# -------------------------------
# STEP 2: Basic Cleaning
# -------------------------------

# TransactionID has no predictive value
df.drop(columns=["TransactionID"], inplace=True)

In [4]:
# -------------------------------
# STEP 3: Feature & Target Split
# -------------------------------

TARGET = "IsFraud"

X = df.drop(columns=[TARGET])
y = df[TARGET]

In [5]:
# -------------------------------
# STEP 4: Feature Engineering
# -------------------------------

# Log transform amount (real-world finance practice)
X["Amount_log"] = np.log1p(X["Amount"])
X.drop(columns=["Amount"], inplace=True)

# TransactionDate not clean â†’ drop for now
X.drop(columns=["TransactionDate"], inplace=True)

In [6]:
# -------------------------------
# STEP 5: Column Types
# -------------------------------

categorical_cols = ["MerchantID", "TransactionType", "Location"]
numerical_cols = ["Amount_log"]

In [7]:
# -------------------------------
# STEP 6: Preprocessing Pipeline
# -------------------------------

preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numerical_cols),
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_cols)
    ]
)

In [8]:
# -------------------------------
# STEP 7: Train-Test Split
# -------------------------------

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

print("Train size:", X_train.shape)
print("Test size:", X_test.shape)

Train size: (80000, 4)
Test size: (20000, 4)


In [9]:
# -------------------------------
# STEP 8: Model Selection
# -------------------------------

model = XGBClassifier(
    n_estimators=300,
    max_depth=6,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    scale_pos_weight=10,   # ðŸ”¥ VERY IMPORTANT
    eval_metric="logloss",
    random_state=42
)

In [10]:
print(y.value_counts())

IsFraud
0    99000
1     1000
Name: count, dtype: int64


In [11]:
# -------------------------------
# STEP 9: Full ML Pipeline
# -------------------------------

pipeline = Pipeline(steps=[
    ("preprocessing", preprocessor),
    ("model", model)
])

In [12]:
# -------------------------------
# STEP 10: Train Model
# -------------------------------

print("Training model...")
pipeline.fit(X_train, y_train)
print("Model training completed")

Training model...
Model training completed


In [13]:
# -------------------------------
# STEP 11: Evaluation
# -------------------------------

y_pred = pipeline.predict(X_test)

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))


Classification Report:
              precision    recall  f1-score   support

           0       0.99      1.00      0.99     19800
           1       0.00      0.00      0.00       200

    accuracy                           0.99     20000
   macro avg       0.49      0.50      0.50     20000
weighted avg       0.98      0.99      0.98     20000


Confusion Matrix:
[[19797     3]
 [  200     0]]


In [14]:
# -------------------------------
# STEP 12: Save Model
# -------------------------------

MODEL_PATH = "fraud_detection_pipeline.pkl"
joblib.dump(pipeline, MODEL_PATH)

print(f"\nâœ… Model saved successfully at: {MODEL_PATH}")


âœ… Model saved successfully at: fraud_detection_pipeline.pkl
