In [1]:
"""
Credit Card Fraud Detection
Description: Classifies fraudulent vs. non-fraudulent transactions using multiple classification models and compares them
Dataset: https://www.kaggle.com/datasets/mlg-ulb/creditcardfraud
"""

# --- Imports --- #
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Scikit-learn modules
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression, SGDClassifier, RidgeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

# Imbalanced-learn module and preprocessing (Very crucial as we will later see)
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler

In [2]:
# Load the dataset 
df = pd.read_csv('creditcard.csv')

In [3]:
# Check the data
print(df.describe())

                Time            V1            V2            V3            V4  \
count  284807.000000  2.848070e+05  2.848070e+05  2.848070e+05  2.848070e+05   
mean    94813.859575  1.168375e-15  3.416908e-16 -1.379537e-15  2.074095e-15   
std     47488.145955  1.958696e+00  1.651309e+00  1.516255e+00  1.415869e+00   
min         0.000000 -5.640751e+01 -7.271573e+01 -4.832559e+01 -5.683171e+00   
25%     54201.500000 -9.203734e-01 -5.985499e-01 -8.903648e-01 -8.486401e-01   
50%     84692.000000  1.810880e-02  6.548556e-02  1.798463e-01 -1.984653e-02   
75%    139320.500000  1.315642e+00  8.037239e-01  1.027196e+00  7.433413e-01   
max    172792.000000  2.454930e+00  2.205773e+01  9.382558e+00  1.687534e+01   

                 V5            V6            V7            V8            V9  \
count  2.848070e+05  2.848070e+05  2.848070e+05  2.848070e+05  2.848070e+05   
mean   9.604066e-16  1.487313e-15 -5.556467e-16  1.213481e-16 -2.406331e-15   
std    1.380247e+00  1.332271e+00  1.23709

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 284807 entries, 0 to 284806
Data columns (total 31 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   Time    284807 non-null  float64
 1   V1      284807 non-null  float64
 2   V2      284807 non-null  float64
 3   V3      284807 non-null  float64
 4   V4      284807 non-null  float64
 5   V5      284807 non-null  float64
 6   V6      284807 non-null  float64
 7   V7      284807 non-null  float64
 8   V8      284807 non-null  float64
 9   V9      284807 non-null  float64
 10  V10     284807 non-null  float64
 11  V11     284807 non-null  float64
 12  V12     284807 non-null  float64
 13  V13     284807 non-null  float64
 14  V14     284807 non-null  float64
 15  V15     284807 non-null  float64
 16  V16     284807 non-null  float64
 17  V17     284807 non-null  float64
 18  V18     284807 non-null  float64
 19  V19     284807 non-null  float64
 20  V20     284807 non-null  float64
 21  V21     28

In [5]:
df['Class'].value_counts(normalize=True)
# Heavily imbalanced dataset, only 0.17% of the samples are frauds.

Class
0    0.998273
1    0.001727
Name: proportion, dtype: float64

In [7]:
# --- Preprocessing the data --- #
# Split into features and target 
X = df.drop('Class', axis=1)
y = df['Class']
# Split the dataset into training and testing data; 80% and 20% of the original dataset respectively
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [8]:
#Scale the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [9]:
# To tackle the huge imbalance, apply SMOTE to oversample the training set only.
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train_scaled, y_train)

In [10]:
print("Class dist. before resampling:\n\n",y_train.value_counts(normalize=True),end='\n\n') # HUGE IMBALANCE
print("Class dist. after resampling:\n\n",y_train_res.value_counts(normalize=True)) # PERFECTLY BALANCED AFTER OVERSAMPLING

Class dist. before resampling:

 Class
0    0.998293
1    0.001707
Name: proportion, dtype: float64

Class dist. after resampling:

 Class
0    0.5
1    0.5
Name: proportion, dtype: float64


In [15]:
# --- Evaluation Function --- #
"""
Evaluates a classification model and prints metrics.
May use customed threshold optionally.
"""
def evaluate_model(name, model, X_test, y_test, threshold=None):
    
    print(f"\n---{name.upper()}---")
    # Evaluates the classification model
    if threshold:
        y_prob = model.predict_proba(X_test)[:, 1]
        y_pred = (y_prob >= threshold).astype(int)
    else:
        y_pred = model.predict(X_test)
        
    # Prints metrics; Classification report, Confusion matrix and ROC AUC Score
    print(classification_report(y_test, y_pred))
    print(pd.crosstab(y_test, y_pred, rownames=['Actual'], colnames=['Predicted']))
    print('\nROC AUC SCORE:',roc_auc_score(y_test, y_pred))

In [16]:
"""--- Model 1: Logistic Regression ---"""
clf1 = LogisticRegression(max_iter=1000, random_state=42)
clf1.fit(X_train_res, y_train_res)

In [17]:
evaluate_model("Logistic Regression", clf1, X_test_scaled, y_test, threshold=0.7)


---LOGISTIC REGRESSION---
              precision    recall  f1-score   support

           0       1.00      0.99      0.99     56859
           1       0.12      0.91      0.21       103

    accuracy                           0.99     56962
   macro avg       0.56      0.95      0.60     56962
weighted avg       1.00      0.99      0.99     56962

Predicted      0    1
Actual               
0          56139  720
1              9   94

ROC AUC SCORE: 0.9499792281263975


RESULTS USING LOGISTIC REGRESSION:
    POOR PRECISION: ONLY 12% OF POSITIVE PREDICTIONS WERE ACTUALLY FRAUD
    743 FALSE ALARMS
    GOOD RECALL AS YOU CAUGHT 93% OF ALL ACTUAL FRAUDS AVAILBLE
    ROC AUC SCORE: 96.076%

In [18]:
"""--- Model 2: Stochastic Gradient Descent ---"""
clf2 = SGDClassifier(loss='log_loss', random_state=42)
clf2.fit(X_train_res, y_train_res)

In [19]:
evaluate_model("Stochastic Gradient Descent", clf2, X_test_scaled, y_test, threshold=1.0)


---STOCHASTIC GRADIENT DESCENT---
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     56859
           1       0.82      0.41      0.55       103

    accuracy                           1.00     56962
   macro avg       0.91      0.70      0.77     56962
weighted avg       1.00      1.00      1.00     56962

Predicted      0   1
Actual              
0          56850   9
1             61  42

ROC AUC SCORE: 0.7038043520020654


RESULTS USING SGD CLASSIFIER:
    GOOD PRECISION: 85% OF POSITIVE PREDICTIONS WERE ACTUALLY FRAUD
    only 8 FALSE ALARMS
    POOR RECALL AS YOU ONLY CAUGHT 42% OF ALL ACTUAL FRAUDS AVAILBLE
    ROC AUC SCORE: 71.021%

In [20]:
"""--- Model 3: Ridge Classifier ---"""
clf3 = RidgeClassifier(max_iter=1000, random_state=42)
clf3.fit(X_train_res, y_train_res)

In [21]:
evaluate_model("Ridge Classifier", clf3, X_test_scaled, y_test)


---RIDGE CLASSIFIER---
              precision    recall  f1-score   support

           0       1.00      0.99      0.99     56859
           1       0.09      0.83      0.17       103

    accuracy                           0.99     56962
   macro avg       0.55      0.91      0.58     56962
weighted avg       1.00      0.99      0.99     56962

Predicted      0    1
Actual               
0          56047  812
1             18   85

ROC AUC SCORE: 0.9054808889371546


RESULTS USING RIDGE CLASSIFIER:
    POOR PRECISION: ONLY 11% OF POSITIVE PREDICTIONS WERE ACTUALLY FRAUD
    769 FALSE ALARMS
    GOOD RECALL AS YOU CAUGHT 89% OF ALL ACTUAL FRAUDS AVAILABLE
    ROC AUC SCORE: 93.716%

In [22]:
"""--- Model 4: Naive Bayes Classifier ---"""
clf4 = GaussianNB()
clf4.fit(X_train_res, y_train_res)

In [23]:
evaluate_model("Gaussian Naive Bayes Classifier", clf4, X_test_scaled, y_test)


---GAUSSIAN NAIVE BAYES CLASSIFIER---
              precision    recall  f1-score   support

           0       1.00      0.98      0.99     56859
           1       0.06      0.89      0.11       103

    accuracy                           0.97     56962
   macro avg       0.53      0.93      0.55     56962
weighted avg       1.00      0.97      0.99     56962

Predicted      0     1
Actual                
0          55441  1418
1             11    92

ROC AUC SCORE: 0.9341324997946717


RESULTS USING NAIVE BAYES CLASSIFIER:
    POOR PRECISION: ONLY 6% OF POSITIVE PREDICTIONS WERE ACTUALLY FRAUD
    1382 FALSE ALARMS
    GOOD RECALL AS YOU CAUGHT 90% OF ALL ACTUAL FRAUDS AVAILBLE
    ROC AUC SCORE: 93.644%

# --- Conclusion --- #

-> Logistic and Ridge Regression both performed similarly with solid recall (86-88%) but poor precision ( <12%).
-> SGD performed well in precision; most of positive predictions were correct, perhaps due
   to the low threshold given. However, performed bad in recall, as it flagged only 42% of all
   fraudulent transactions in the dataset.
-> Naive-Bayes performed slightly worse than Logistc Regression and Ridge Regression in both precision and recall.
-> ROC AUC SCORES are consistently high~ except in SGD case; may be improperly tuned.
