
​
# Presentation on: Credit Card Fraud Detection with Machine Learning



## *Importing all the required libraries*

In [1]:
# Load Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# Enable inline plotting for Jupyter Notebooks
%matplotlib inline


## Uploading the Data Set

In [4]:
import pandas as pd

# Location of the uploaded CSV, then parse it into the working DataFrame `data`
file_path = '/content/Data.csv.csv'
data = pd.read_csv(filepath_or_buffer=file_path)

## Data Exploration

In [6]:
# Show the column labels of the loaded DataFrame.
data.columns

Index(['CUST_ID', 'BALANCE', 'BALANCE_FREQUENCY', 'PURCHASES',
       'ONEOFF_PURCHASES', 'INSTALLMENTS_PURCHASES', 'CASH_ADVANCE',
       'PURCHASES_FREQUENCY', 'ONEOFF_PURCHASES_FREQUENCY',
       'PURCHASES_INSTALLMENTS_FREQUENCY', 'CASH_ADVANCE_FREQUENCY',
       'CASH_ADVANCE_TRX', 'PURCHASES_TRX', 'CREDIT_LIMIT', 'PAYMENTS',
       'MINIMUM_PAYMENTS', 'PRC_FULL_PAYMENT', 'TENURE', 'TRANSACTION_TIME',
       'DEVICE_TYPE', 'TRANSACTION_LOCATION', 'CUSTOMER_LOCATION',
       'FRAUD_FLAG'],
      dtype='object')

In [12]:
# Dimensions of the DataFrame as (rows, columns).
data.shape

(8950, 23)

In [7]:
# Preview the first rows of the DataFrame.
data.head()

Unnamed: 0,CUST_ID,BALANCE,BALANCE_FREQUENCY,PURCHASES,ONEOFF_PURCHASES,INSTALLMENTS_PURCHASES,CASH_ADVANCE,PURCHASES_FREQUENCY,ONEOFF_PURCHASES_FREQUENCY,PURCHASES_INSTALLMENTS_FREQUENCY,...,CREDIT_LIMIT,PAYMENTS,MINIMUM_PAYMENTS,PRC_FULL_PAYMENT,TENURE,TRANSACTION_TIME,DEVICE_TYPE,TRANSACTION_LOCATION,CUSTOMER_LOCATION,FRAUD_FLAG
0,C10001,-0.731989,-0.249434,-0.4249,-0.356934,-0.349079,-0.466786,-0.80649,-0.678661,-0.707313,...,-0.960433,-0.528979,-0.3024,-0.525551,12,1,2,9,6,0
1,C10002,0.786961,0.134325,-0.469552,-0.356934,-0.454576,2.605605,-1.221758,-0.678661,-0.916995,...,0.688639,0.818642,0.0975,0.234227,12,3,2,2,2,0
2,C10003,0.447135,0.518084,-0.107668,0.108889,-0.454576,-0.466786,1.269843,2.673451,-0.916995,...,0.826062,-0.383805,-0.093293,-0.525551,12,2,2,7,9,0
3,C10004,0.049099,-1.016953,0.232058,0.546189,-0.454576,-0.368653,-1.014125,-0.399319,-0.916995,...,0.826062,-0.598688,-0.228307,-0.525551,12,1,1,9,6,0
4,C10005,-0.358775,0.518084,-0.462063,-0.347294,-0.454576,-0.466786,-1.014125,-0.399319,-0.916995,...,-0.905464,-0.364368,-0.257266,-0.525551,12,1,0,5,3,0


In [11]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
data.describe()

Unnamed: 0,BALANCE,BALANCE_FREQUENCY,PURCHASES,ONEOFF_PURCHASES,INSTALLMENTS_PURCHASES,CASH_ADVANCE,PURCHASES_FREQUENCY,ONEOFF_PURCHASES_FREQUENCY,PURCHASES_INSTALLMENTS_FREQUENCY,CASH_ADVANCE_FREQUENCY,...,CREDIT_LIMIT,PAYMENTS,MINIMUM_PAYMENTS,PRC_FULL_PAYMENT,TENURE,TRANSACTION_TIME,DEVICE_TYPE,TRANSACTION_LOCATION,CUSTOMER_LOCATION,FRAUD_FLAG
count,8950.0,8950.0,8950.0,8950.0,8950.0,8950.0,8950.0,8950.0,8950.0,8950.0,...,8950.0,8950.0,8950.0,8950.0,8950.0,8950.0,8950.0,8950.0,8950.0,8950.0
mean,-6.35122e-18,1.492537e-16,3.17561e-18,-4.763415e-17,3.8107320000000006e-17,-4.763415e-18,1.063829e-16,1.9053660000000003e-17,5.874878e-17,-1.492537e-16,...,1.905366e-16,-2.5404880000000003e-17,2.3817070000000002e-17,-6.35122e-18,11.517318,1.509944,0.986704,4.504134,4.491844,0.048268
std,1.000056,1.000056,1.000056,1.000056,1.000056,1.000056,1.000056,1.000056,1.000056,1.000056,...,1.000056,1.000056,1.000056,1.000056,1.338331,1.116952,0.815772,2.871349,2.874336,0.214344
min,-0.7516398,-3.703271,-0.4695519,-0.356934,-0.4545762,-0.4667856,-1.221758,-0.6786608,-0.9169952,-0.6753489,...,-1.221536,-0.5986883,-0.3621989,-0.525551,6.0,0.0,0.0,0.0,0.0,0.0
25%,-0.6900078,0.04904486,-0.4510006,-0.356934,-0.4545762,-0.4667856,-1.014125,-0.6786608,-0.9169952,-0.6753489,...,-0.7955261,-0.4662913,-0.2889613,-0.525551,12.0,1.0,0.0,2.0,2.0,0.0
50%,-0.3320286,0.5180838,-0.3004541,-0.3340396,-0.3561562,-0.4667856,0.02404259,-0.3993193,-0.4976286,-0.6753489,...,-0.4107426,-0.3026846,-0.2283069,-0.525551,12.0,2.0,1.0,4.0,5.0,0.0
75%,0.2352559,0.5180838,0.05004652,-0.009056763,0.06366321,0.06435242,1.062211,0.3269728,0.9701506,0.4351492,...,0.5512163,0.05802976,-0.02408976,-0.03712234,12.0,3.0,2.0,7.0,7.0,0.0
max,8.397489,0.5180838,22.48351,24.20107,24.42689,22.01112,1.269843,2.673451,1.599199,6.820521,...,7.010083,16.92228,32.39273,2.893453,12.0,3.0,2.0,9.0,9.0,1.0


In [13]:
# Print each column's non-null count and dtype.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8950 entries, 0 to 8949
Data columns (total 23 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   CUST_ID                           8950 non-null   object 
 1   BALANCE                           8950 non-null   float64
 2   BALANCE_FREQUENCY                 8950 non-null   float64
 3   PURCHASES                         8950 non-null   float64
 4   ONEOFF_PURCHASES                  8950 non-null   float64
 5   INSTALLMENTS_PURCHASES            8950 non-null   float64
 6   CASH_ADVANCE                      8950 non-null   float64
 7   PURCHASES_FREQUENCY               8950 non-null   float64
 8   ONEOFF_PURCHASES_FREQUENCY        8950 non-null   float64
 9   PURCHASES_INSTALLMENTS_FREQUENCY  8950 non-null   float64
 10  CASH_ADVANCE_FREQUENCY            8950 non-null   float64
 11  CASH_ADVANCE_TRX                  8950 non-null   float64
 12  PURCHA

In [15]:
# Count the null entries in every column and print the per-column totals
missing_values = data.isna().sum()
print(missing_values)

CUST_ID                             0
BALANCE                             0
BALANCE_FREQUENCY                   0
PURCHASES                           0
ONEOFF_PURCHASES                    0
INSTALLMENTS_PURCHASES              0
CASH_ADVANCE                        0
PURCHASES_FREQUENCY                 0
ONEOFF_PURCHASES_FREQUENCY          0
PURCHASES_INSTALLMENTS_FREQUENCY    0
CASH_ADVANCE_FREQUENCY              0
CASH_ADVANCE_TRX                    0
PURCHASES_TRX                       0
CREDIT_LIMIT                        0
PAYMENTS                            0
MINIMUM_PAYMENTS                    0
PRC_FULL_PAYMENT                    0
TENURE                              0
TRANSACTION_TIME                    0
DEVICE_TYPE                         0
TRANSACTION_LOCATION                0
CUSTOMER_LOCATION                   0
FRAUD_FLAG                          0
dtype: int64


## *Linear Regression*

In [23]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Predictors: every float64/int64 column except the label; target: FRAUD_FLAG
X = (
    data.drop(columns='FRAUD_FLAG')
    .select_dtypes(include=['float64', 'int64'])
    .to_numpy()
)
y = data['FRAUD_FLAG'].to_numpy()

# Hold out 20% for testing, stratified on the label so both splits keep
# the same fraud / non-fraud proportions
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, stratify=y, random_state=100
)

# Learn the scaling parameters from the training split only, then apply
# that same transform to both splits
sc = StandardScaler().fit(X_train)
X_train_scaled = sc.transform(X_train)
X_test_scaled = sc.transform(X_test)

In [45]:
# Linear regression: predict CREDIT_LIMIT from a handful of spending / payment
# features and report the fitted coefficients plus hold-out error metrics.
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import metrics
import numpy as np

df = pd.read_csv(file_path)

# Selecting relevant numeric features and dropping non-numeric or non-relevant columns
# We'll predict 'CREDIT_LIMIT' using features like 'BALANCE', 'PURCHASES', etc.
target = 'CREDIT_LIMIT'
features = ['BALANCE', 'PURCHASES', 'ONEOFF_PURCHASES', 'INSTALLMENTS_PURCHASES',
            'CASH_ADVANCE', 'PAYMENTS', 'MINIMUM_PAYMENTS', 'PRC_FULL_PAYMENT']

# Create a dataset for regression
df_regression = df[features + [target]].dropna()  # Dropping rows with missing values

# Define X (independent variables) and y (dependent variable).
# Regression-specific names are used on purpose: the classification cells
# read `X_train_scaled` / `y_train` / `y_test` built from FRAUD_FLAG, and
# rebinding `X_train`, `y_train`, ... here would leave the Logistic
# Regression cell fitting against continuous CREDIT_LIMIT targets when the
# notebook is run top to bottom.
X_reg = df_regression[features].to_numpy()
y_reg = df_regression[target].to_numpy()

# Splitting into training and testing sets (80/20, fixed seed)
X_reg_train, X_reg_test, y_reg_train, y_reg_test = train_test_split(
    X_reg, y_reg, test_size=0.2, random_state=100
)

# Initialize and fit the model
model = LinearRegression()
model.fit(X_reg_train, y_reg_train)

# Make predictions on the held-out rows
predictions = model.predict(X_reg_test)

# Coefficients and performance metrics
intercept = model.intercept_
coefficients = model.coef_
r2_score = metrics.r2_score(y_reg_test, predictions)
mae = metrics.mean_absolute_error(y_reg_test, predictions)
mse = metrics.mean_squared_error(y_reg_test, predictions)
rmse = np.sqrt(mse)

# Creating a forecast table: actual vs. predicted value and their residual
forecast_table = pd.DataFrame({
    'Actual': y_reg_test,
    'Predicted': predictions.round(2),
    'Difference': (y_reg_test - predictions).round(2)
})

# Displaying results
results = {
    'Intercept': intercept,
    'Coefficients': coefficients,
    'R2 Score': r2_score,
    'MAE': mae,
    'MSE': mse,
    'RMSE': rmse,
    'Forecast Table': forecast_table.head()
}

results

{'Intercept': -0.0019145602119166102,
 'Coefficients': array([ 0.54444297,  1.02325184, -0.67330946, -0.34940698,  0.02749706,
         0.13559714, -0.10255139,  0.16970045]),
 'R2 Score': 0.4059539174225809,
 'MAE': 0.5771376408587707,
 'MSE': 0.6252576219847104,
 'RMSE': 0.7907323327047594,
 'Forecast Table':      Actual  Predicted  Difference
 0 -0.685588      -0.09       -0.59
 1  0.413794       0.57       -0.16
 2 -0.135897       0.08       -0.22
 3 -0.410743      -0.05       -0.36
 4  0.138948      -0.13        0.27}

---------------------------------

## Logistic Regression


In [19]:
# Remove the customer identifier column; the remaining columns are kept as-is
data2 = data.drop(columns=['CUST_ID'])

# Summary statistics of the frame without the ID column
data2.describe()


Unnamed: 0,BALANCE,BALANCE_FREQUENCY,PURCHASES,ONEOFF_PURCHASES,INSTALLMENTS_PURCHASES,CASH_ADVANCE,PURCHASES_FREQUENCY,ONEOFF_PURCHASES_FREQUENCY,PURCHASES_INSTALLMENTS_FREQUENCY,CASH_ADVANCE_FREQUENCY,...,CREDIT_LIMIT,PAYMENTS,MINIMUM_PAYMENTS,PRC_FULL_PAYMENT,TENURE,TRANSACTION_TIME,DEVICE_TYPE,TRANSACTION_LOCATION,CUSTOMER_LOCATION,FRAUD_FLAG
count,8950.0,8950.0,8950.0,8950.0,8950.0,8950.0,8950.0,8950.0,8950.0,8950.0,...,8950.0,8950.0,8950.0,8950.0,8950.0,8950.0,8950.0,8950.0,8950.0,8950.0
mean,-6.35122e-18,1.492537e-16,3.17561e-18,-4.763415e-17,3.8107320000000006e-17,-4.763415e-18,1.063829e-16,1.9053660000000003e-17,5.874878e-17,-1.492537e-16,...,1.905366e-16,-2.5404880000000003e-17,2.3817070000000002e-17,-6.35122e-18,11.517318,1.509944,0.986704,4.504134,4.491844,0.048268
std,1.000056,1.000056,1.000056,1.000056,1.000056,1.000056,1.000056,1.000056,1.000056,1.000056,...,1.000056,1.000056,1.000056,1.000056,1.338331,1.116952,0.815772,2.871349,2.874336,0.214344
min,-0.7516398,-3.703271,-0.4695519,-0.356934,-0.4545762,-0.4667856,-1.221758,-0.6786608,-0.9169952,-0.6753489,...,-1.221536,-0.5986883,-0.3621989,-0.525551,6.0,0.0,0.0,0.0,0.0,0.0
25%,-0.6900078,0.04904486,-0.4510006,-0.356934,-0.4545762,-0.4667856,-1.014125,-0.6786608,-0.9169952,-0.6753489,...,-0.7955261,-0.4662913,-0.2889613,-0.525551,12.0,1.0,0.0,2.0,2.0,0.0
50%,-0.3320286,0.5180838,-0.3004541,-0.3340396,-0.3561562,-0.4667856,0.02404259,-0.3993193,-0.4976286,-0.6753489,...,-0.4107426,-0.3026846,-0.2283069,-0.525551,12.0,2.0,1.0,4.0,5.0,0.0
75%,0.2352559,0.5180838,0.05004652,-0.009056763,0.06366321,0.06435242,1.062211,0.3269728,0.9701506,0.4351492,...,0.5512163,0.05802976,-0.02408976,-0.03712234,12.0,3.0,2.0,7.0,7.0,0.0
max,8.397489,0.5180838,22.48351,24.20107,24.42689,22.01112,1.269843,2.673451,1.599199,6.820521,...,7.010083,16.92228,32.39273,2.893453,12.0,3.0,2.0,9.0,9.0,1.0


In [38]:
# Import required libraries
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

# Fit a logistic-regression classifier for FRAUD_FLAG on the scaled training
# split and evaluate it on the scaled test split. Relies on `X_train_scaled`,
# `X_test_scaled`, `y_train` and `y_test` produced by the FRAUD_FLAG
# train/test split cell.
for name, method in [('Logistic Regression', LogisticRegression(solver='liblinear', random_state=100))]:
    # Train the model
    method.fit(X_train_scaled, y_train)  # Use scaled training data
    # Make predictions
    predictions = method.predict(X_test_scaled)  # Predict on scaled test data

    # Confusion Matrix
    print(f"\nConfusion Matrix for {name}:\n")
    print(confusion_matrix(y_test, predictions))

    # Classification Report.
    # zero_division=0: if the model predicts no 'Fraudulent' samples, the
    # precision for that class is 0/0; report it as 0.0 explicitly rather than
    # emitting an UndefinedMetricWarning for every affected metric.
    # NOTE(review): an all-zero 'Fraudulent' row means the classifier never
    # flags fraud; consider class_weight='balanced' or resampling - TODO confirm.
    print(f"\nClassification Report for {name}:\n")
    print(classification_report(y_test, predictions,
                                target_names=['Non-Fraudulent', 'Fraudulent'],
                                zero_division=0))



Confusion Matrix for Logistic Regression:

[[1704    0]
 [  86    0]]

Classification Report for Logistic Regression:

                precision    recall  f1-score   support

Non-Fraudulent       0.95      1.00      0.98      1704
    Fraudulent       0.00      0.00      0.00        86

      accuracy                           0.95      1790
     macro avg       0.48      0.50      0.49      1790
  weighted avg       0.91      0.95      0.93      1790



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


----------------------------------------------------


## Naive Bayes

In [28]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Feature matrix: all float64/int64 columns other than the label
numeric_predictors = data.drop(columns='FRAUD_FLAG').select_dtypes(include=['float64', 'int64'])
X = numeric_predictors.to_numpy()
# Label vector
y = data['FRAUD_FLAG'].to_numpy()

# 80/20 split, stratified on the label, with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, stratify=y, random_state=100
)

# Fit the scaler on the training rows, then transform both splits with it
sc = StandardScaler()
sc.fit(X_train)
X_train_scaled = sc.transform(X_train)
X_test_scaled = sc.transform(X_test)

In [30]:
# Libraries for the Gaussian Naive Bayes model and its evaluation
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report, confusion_matrix

# Estimator label and instance
name = 'Naive Bayes - Gaussian'
method = GaussianNB()

# Train on the scaled training split, then predict the scaled test split
method.fit(X_train_scaled, y_train)
predict = method.predict(X_test_scaled)

# Display names for FRAUD_FLAG values 0 and 1
target_names = ['Non-Fraudulent', 'Fraudulent']

# Report: estimator label, confusion matrix, per-class metrics
print('\nEstimator: {}'.format(name))
print('\nConfusion Matrix:\n', confusion_matrix(y_test, predict))
print("\nClassification Report:\n")
print(classification_report(y_test, predict, target_names=target_names))



Estimator: Naive Bayes - Gaussian

Confusion Matrix:
 [[1666   38]
 [  83    3]]

Classification Report:

                precision    recall  f1-score   support

Non-Fraudulent       0.95      0.98      0.96      1704
    Fraudulent       0.07      0.03      0.05        86

      accuracy                           0.93      1790
     macro avg       0.51      0.51      0.51      1790
  weighted avg       0.91      0.93      0.92      1790



## Neural Network

In [34]:
#import required libraries
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Initialize and train the Neural Network on the scaled training split.
# Three hidden layers of 4, 3 and 3 units; the seed is fixed so the fit is
# repeatable.
mlp = MLPClassifier(hidden_layer_sizes=(4, 3, 3),  # Adjust hidden layers if needed
                    activation='relu', solver='adam',
                    max_iter=10000, random_state=100)
mlp.fit(X_train_scaled, y_train)

# Make predictions on the scaled test split
predictions = mlp.predict(X_test_scaled)

# Evaluation Report and Matrix
target_names = ['Non-Fraudulent', 'Fraudulent']  # Class names based on FRAUD_FLAG values (0 and 1)
print("Confusion Matrix:")
print(confusion_matrix(y_test, predictions))

# zero_division=0: if no sample is predicted 'Fraudulent', that class's
# precision is 0/0; report it as 0.0 explicitly instead of triggering an
# UndefinedMetricWarning for each affected metric.
# NOTE(review): an all-zero 'Fraudulent' row means the network never flags
# fraud; the class imbalance likely needs handling - TODO confirm.
print("\nClassification Report:")
print(classification_report(y_test, predictions, target_names=target_names,
                            zero_division=0))


Confusion Matrix:
[[1704    0]
 [  86    0]]

Classification Report:
                precision    recall  f1-score   support

Non-Fraudulent       0.95      1.00      0.98      1704
    Fraudulent       0.00      0.00      0.00        86

      accuracy                           0.95      1790
     macro avg       0.48      0.50      0.49      1790
  weighted avg       0.91      0.95      0.93      1790



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## Decision Tree

In [35]:
# Libraries for the decision-tree model and its evaluation
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Estimator label and instance (seeded for a repeatable tree)
name = 'DT'
method = DecisionTreeClassifier(random_state=100)

# Train on the scaled training split, then predict the scaled test split
method.fit(X_train_scaled, y_train)
predict = method.predict(X_test_scaled)

# Display names for FRAUD_FLAG values 0 and 1
target_names = ['Non-Fraudulent', 'Fraudulent']

# Report: estimator label, confusion matrix, per-class metrics
print('\nEstimator: {}'.format(name))
print("Confusion Matrix:")
print(confusion_matrix(y_test, predict))

print("\nClassification Report:")
print(classification_report(y_test, predict, target_names=target_names))



Estimator: DT
Confusion Matrix:
[[1600  104]
 [  80    6]]

Classification Report:
                precision    recall  f1-score   support

Non-Fraudulent       0.95      0.94      0.95      1704
    Fraudulent       0.05      0.07      0.06        86

      accuracy                           0.90      1790
     macro avg       0.50      0.50      0.50      1790
  weighted avg       0.91      0.90      0.90      1790

