# Project: Credit Card Fraud Detection

### Problem Statement: Build a machine learning model to detect fraudulent credit card transactions using historical transaction data.

# Import Libraries

In [2]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import(
    accuracy_score,
    classification_report,
    confusion_matrix,
    roc_auc_score
)

# Load Dataset

In [3]:
train_df = pd.read_csv("fraudTrain.csv")
test_df = pd.read_csv("fraudTest.csv")

print("Train Shape:", train_df.shape)
print("Test Shape:", test_df.shape)

Train Shape: (1296675, 23)
Test Shape: (555719, 23)


# Reduce Dataset Size

In [4]:
train_df = train_df.sample(200000, random_state=42)

# Drop High - Cardinality & Useless Columns

In [5]:
cols_to_drop = [
    'unnamed: 0',
    'trans_num',
    'cc_num',
    'first',
    'last',
    'street',
    'merchant',
    'city',
    'zip'
]

train_df.drop(columns=cols_to_drop, inplace=True, errors='ignore')
test_df.drop(columns=cols_to_drop, inplace=True, errors='ignore')

In [7]:
print(train_df.columns)

Index(['Unnamed: 0', 'trans_date_trans_time', 'category', 'amt', 'gender',
       'state', 'lat', 'long', 'city_pop', 'job', 'dob', 'unix_time',
       'merch_lat', 'merch_long', 'is_fraud', 'hour'],
      dtype='object')


# Feature Engineering(Date - Hour feature)

In [8]:
train_df['trans_date_trans_time'] = pd.to_datetime(train_df['trans_date_trans_time'])
test_df['trans_date_trans_time'] = pd.to_datetime(test_df['trans_date_trans_time'])

train_df['hour'] = train_df['trans_date_trans_time'].dt.hour
test_df['hour'] = test_df['trans_date_trans_time'].dt.hour

train_df.drop(columns=['trans_date_trans_time'], inplace=True)
test_df.drop(columns=['trans_date_trans_time'], inplace=True)

# Convert DOB to Age

In [14]:
train_df['dob'] = pd.to_datetime(train_df['dob'])
test_df['dob'] = pd.to_datetime(test_df['dob'])

train_df['age'] = 2025 - train_df['dob'].dt.year
test_df['age'] = 2025 -test_df['dob'].dt.year

train_df.drop(columns=['dob'], inplace=True)
test_df.drop(columns=['dob'], inplace=True)

# Drop High Cardinality Columns

In [15]:
cols_to_drop = ['first', 'last', 'street', 'merchant', 'city', 'job']

train_df.drop(columns=cols_to_drop, inplace=True, errors='ignore')
test_df.drop(columns=cols_to_drop, inplace=True, errors='ignore')

# Encode Categorical Columns

In [16]:
from sklearn.preprocessing import LabelEncoder

categorical_cols = train_df.select_dtypes(include='object').columns

for col in categorical_cols:
    combined = pd.concat([train_df[col], test_df[col]], axis=0)
    le = LabelEncoder()
    le.fit(combined)
    train_df[col] = le.fit_transform(train_df[col])
    test_df[col] = le.transform(test_df[col])

# Define Features & Target

In [17]:
X_train = train_df.drop("is_fraud", axis=1)
y_train = train_df["is_fraud"]

X_test = test_df.drop("is_fraud", axis=1)
y_test = test_df["is_fraud"]

# Feature Scaling

In [18]:
scaler = StandardScaler()

X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Model 1: Logistic Regression

In [19]:
lr = LogisticRegression(class_weight='balanced', max_iter=1000)
lr.fit(X_train, y_train)

y_pred_lr = lr.predict(X_test)

print("===== Logistic Regression =====")
print("Accuracy:", accuracy_score(y_test, y_pred_lr))
print("ROC-AUC:", roc_auc_score(y_test, y_pred_lr))
print(classification_report(y_test, y_pred_lr))

===== Logistic Regression =====
Accuracy: 0.9941157311518951
ROC-AUC: 0.6865990609712317
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    553574
           1       0.29      0.38      0.33      2145

    accuracy                           0.99    555719
   macro avg       0.65      0.69      0.66    555719
weighted avg       0.99      0.99      0.99    555719



# Model 2: Decision Tree

In [21]:
dt = DecisionTreeClassifier(class_weight='balanced', max_depth=10)
dt.fit(X_train, y_train)

y_pred_dt = dt.predict(X_test)

print("===== Decision Tree =====")
print("Accuracy:", accuracy_score(y_test, y_pred_dt))
print("ROC-AUC:", roc_auc_score(y_test, y_pred_dt))
print(classification_report(y_test, y_pred_dt))

===== Decision Tree =====
Accuracy: 0.9673468065695072
ROC-AUC: 0.9243999027198745
              precision    recall  f1-score   support

           0       1.00      0.97      0.98    553574
           1       0.10      0.88      0.17      2145

    accuracy                           0.97    555719
   macro avg       0.55      0.92      0.58    555719
weighted avg       1.00      0.97      0.98    555719



# Model 3: Random Forest

In [22]:
rf = RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=42, n_jobs=-1)
rf.fit(X_train, y_train)

y_pred_rf = rf.predict(X_test)

print("===== Random Forest =====")
print("Accuracy:", accuracy_score(y_test, y_pred_rf))
print("ROC_AUC:", roc_auc_score(y_test, y_pred_rf))
print(classification_report(y_test, y_pred_rf))

===== Random Forest =====
Accuracy: 0.997660688225524
ROC_AUC: 0.7060253804177832
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    553574
           1       0.96      0.41      0.58      2145

    accuracy                           1.00    555719
   macro avg       0.98      0.71      0.79    555719
weighted avg       1.00      1.00      1.00    555719



# Confusion Matrix

In [23]:
print("Confusion Matrix (Random Forest):")
print(confusion_matrix(y_test, y_pred_rf))

Confusion Matrix (Random Forest):
[[553535     39]
 [  1261    884]]
