In [None]:
import pandas as pd
import seaborn as sns
import numpy as np
import sklearn
import re

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

import plotly.express as px
import plotly.graph_objs as go
from plotly.subplots import make_subplots
import matplotlib.pyplot as plt
import matplotlib as mpl

import tensorflow as tf
from tensorflow import keras

import warnings
warnings.filterwarnings('ignore')

2023-11-04 11:12:59.634677: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [None]:
# Location of the raw transactions file, relative to the notebook's working directory
DATA_PATH = "bs_dataset.csv"

# Read the raw CSV and preview the first rows
df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0,step,customer,age,gender,zipcodeOri,merchant,zipMerchant,category,amount,fraud
0,0,'C1093826151','4','M','28007','M348934600','28007','es_transportation',4.55,0
1,0,'C352968107','2','M','28007','M348934600','28007','es_transportation',39.68,0
2,0,'C2054744914','4','F','28007','M1823072687','28007','es_transportation',26.89,0
3,0,'C1760612790','3','M','28007','M348934600','28007','es_transportation',17.25,0
4,0,'C757503768','5','M','28007','M348934600','28007','es_transportation',35.72,0


In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
df.describe()

Unnamed: 0,step,amount,fraud
count,594643.0,594643.0,594643.0
mean,94.986827,37.890135,0.012108
std,51.053632,111.402831,0.109369
min,0.0,0.0,0.0
25%,52.0,13.74,0.0
50%,97.0,26.9,0.0
75%,139.0,42.54,0.0
max,179.0,8329.96,1.0


In [None]:
# Column dtypes, non-null counts and memory footprint of the loaded frame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 594643 entries, 0 to 594642
Data columns (total 10 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   step         594643 non-null  int64  
 1   customer     594643 non-null  object 
 2   age          594643 non-null  object 
 3   gender       594643 non-null  object 
 4   zipcodeOri   594643 non-null  object 
 5   merchant     594643 non-null  object 
 6   zipMerchant  594643 non-null  object 
 7   category     594643 non-null  object 
 8   amount       594643 non-null  float64
 9   fraud        594643 non-null  int64  
dtypes: float64(1), int64(2), object(7)
memory usage: 45.4+ MB


**DATA PREPROCESSING**

In [None]:
print(df.dtypes)

# The raw CSV stores categorical values wrapped in literal single quotes
# (e.g. "'4'", "'M'"). Strip them into standalone Series first.
# astype(str) keeps this cell re-runnable: on a second run `age` is already
# an integer column and a bare `.str` accessor would raise.
age_clean = df["age"].astype(str).str.replace("'", "", regex=False)
gender_clean = df["gender"].astype(str).str.replace("'", "", regex=False)

# Drop rows with an unknown age ("U") or a gender of "U" / "E"
keep_rows = (age_clean != "U") & ~gender_clean.isin(["U", "E"])

# Build a new frame instead of assigning into a `.loc` slice, which would
# trigger pandas' SettingWithCopy behaviour (silenced by the global warnings filter).
df = df.loc[keep_rows].assign(
    age=age_clean[keep_rows].astype("int"),  # convert age into integer
    gender=gender_clean[keep_rows],
)
print(df[["age", "gender"]].head())



step             int64
customer        object
age             object
gender          object
zipcodeOri      object
merchant        object
zipMerchant     object
category        object
amount         float64
fraud            int64
dtype: object
   age gender
0    4      M
1    2      M
2    4      F
3    3      M
4    5      M


In [None]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# Categorical columns that get an integer-coded "<name>_encoded" companion column.
# The order here fixes the order of the new columns in `temp`.
categorical_columns = ['gender', 'customer', 'zipcodeOri', 'merchant', 'zipMerchant', 'category']

# Work on an independent copy so that `df` keeps only its original columns.
# (Previously `temp = df` re-bound temp to the same object, so every encoded
# column was silently written into `df` as well.)
temp = df.copy()

labelencoder = LabelEncoder()

# Label-encode each categorical column; fit_transform re-fits the encoder on
# every call, so one encoder instance can be reused across columns.
for column in categorical_columns:
    temp[f'{column}_encoded'] = labelencoder.fit_transform(temp[column])

# Keep only the numeric / encoded columns for modelling
temp = temp.drop(columns=categorical_columns)
temp.head()

Unnamed: 0,step,age,amount,fraud,gender_encoded,customer_encoded,zipcodeOri_encoded,merchant_encoded,zipMerchant_encoded,category_encoded
0,0,4,4.55,0,1,210,0,30,0,12
1,0,2,39.68,0,1,2746,0,30,0,12
2,0,4,26.89,0,0,2280,0,18,0,12
3,0,3,17.25,0,1,1646,0,30,0,12
4,0,5,35.72,0,1,3578,0,30,0,12


In [None]:
# Fixed seed so the split and the oversampling are reproducible across runs
SEED = 42

X = temp.drop(columns=["fraud"])
y = temp["fraud"]

# Split the data into training (70%), testing (20%), and validation (10%) sets.
# Stratify on the label so each subset keeps the same (small) share of fraud rows.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=SEED
)
# `test_size` is the share given to the SECOND returned pair (validation here):
# 1/3 of the 30% hold-out = 10% validation, leaving 20% for the test set.
X_test, X_val, y_test, y_val = train_test_split(
    X_temp, y_temp, test_size=1 / 3, stratify=y_temp, random_state=SEED
)

# Oversample the minority class on the training portion only, so the test and
# validation sets keep their original class balance.
smote = SMOTE(random_state=SEED)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# RandomForest Start

In [None]:
def evaluate_classifier(model, features, labels, split_name):
    """Predict with `model` on one data split and print its metrics.

    Parameters
    ----------
    model : fitted estimator exposing `predict`.
    features : feature matrix of the split.
    labels : true labels of the split.
    split_name : str
        Name used in the printed headings (e.g. "Test", "Validation").

    Returns
    -------
    tuple
        (predictions, accuracy in percent, classification report text, confusion matrix)
    """
    predictions = model.predict(features)
    accuracy = accuracy_score(labels, predictions) * 100
    report = classification_report(labels, predictions)
    matrix = confusion_matrix(labels, predictions)

    print(f"{split_name} Set Accuracy:", accuracy)
    print(f"Classification Report for {split_name} Set:\n", report)
    print(f"Confusion Matrix for {split_name} Set:\n", matrix)
    return predictions, accuracy, report, matrix


# Seeded so the forest is reproducible; n_jobs=-1 builds the trees in parallel.
clf = RandomForestClassifier(random_state=42, n_jobs=-1)
clf.fit(X_train_resampled, y_train_resampled)

# Predict on the test data
pred_rf_test, accuracy_test, report_test, conf_matrix_test = evaluate_classifier(
    clf, X_test, y_test, "Test"
)

# Predict on the validation data
pred_rf_val, accuracy_val, report_val, conf_matrix_val = evaluate_classifier(
    clf, X_val, y_val, "Validation"
)

Test Set Accuracy: 99.08761278353992
Classification Report for Test Set:
               precision    recall  f1-score   support

           0       1.00      0.99      1.00     58555
           1       0.60      0.82      0.69       740

    accuracy                           0.99     59295
   macro avg       0.80      0.91      0.84     59295
weighted avg       0.99      0.99      0.99     59295

Confusion Matrix for Test Set:
 [[58145   410]
 [  131   609]]
Validation Set Accuracy: 99.24782865334345
Classification Report for Validation Set:
               precision    recall  f1-score   support

           0       1.00      0.99      1.00    117154
           1       0.64      0.88      0.74      1436

    accuracy                           0.99    118590
   macro avg       0.82      0.94      0.87    118590
weighted avg       0.99      0.99      0.99    118590

Confusion Matrix for Validation Set:
 [[116437    717]
 [   175   1261]]


# NN Model

In [None]:
# Count rows per class in the cleaned frame: index 0 = not fraud, index 1 = fraud
class_counts = np.bincount(df["fraud"])
not_fraud_count, fraud_count = class_counts

total_count = not_fraud_count + fraud_count
fraud_share = 100 * fraud_count / total_count

summary = (
    f"Data:\n"
    f"    Total: {total_count}\n"
    f"    Fraud: {fraud_count} ({fraud_share:.2f}% of total)\n"
)
print(summary)

Data:
    Total: 592950
    Fraud: 7193 (1.21% of total)



In [None]:
# bias fix to speed up training
# see https://www.tensorflow.org/tutorials/structured_data/imbalanced_data#optional_set_the_correct_initial_bias
#
# The initial bias log(pos / neg) has to describe the labels the network is
# actually fitted on. The model is trained on the SMOTE-resampled training set,
# so the counts are taken from `y_train_resampled` rather than from the full
# imbalanced `df` (which would also include the test and validation rows).
train_labels = np.asarray(y_train_resampled)
train_fraud_count = int((train_labels == 1).sum())
train_not_fraud_count = int(train_labels.size - train_fraud_count)

output_bias = tf.keras.initializers.Constant(
    np.log([train_fraud_count / train_not_fraud_count])
)

In [None]:
# Width of the input layer = number of feature columns in the training data
n_features = X_train_resampled.shape[1]

# NOTE(review): features reach the first Dense layer unscaled in the visible
# cells (StandardScaler is imported at the top but not applied) — consider scaling.

# Two 500-unit ReLU layers (the second with L2 weight decay), dropout, and a
# single sigmoid unit whose bias starts from `output_bias`.
model = keras.Sequential()
model.add(keras.layers.Dense(500, activation="relu", input_dim=n_features))
model.add(
    keras.layers.Dense(
        500,
        activation="relu",
        kernel_regularizer=tf.keras.regularizers.l2(0.01),
    )
)
model.add(keras.layers.Dropout(0.2))
model.add(keras.layers.Dense(1, activation="sigmoid", bias_initializer=output_bias))

adam = keras.optimizers.Adam(learning_rate=1e-3)
bce_loss = keras.losses.BinaryCrossentropy()

# Metrics tracked during training: precision, recall, ROC AUC and PR AUC
tracked_metrics = [
    keras.metrics.Precision(name="precision"),
    keras.metrics.Recall(name="recall"),
    keras.metrics.AUC(name="auc"),
    keras.metrics.AUC(name="prc", curve="PR"),
]

model.compile(optimizer=adam, loss=bce_loss, metrics=tracked_metrics)

2023-11-04 11:14:18.152236: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [None]:
# Train on the resampled training data; the returned History object is the cell output
model.fit(
    x=X_train_resampled,
    y=y_train_resampled,
    batch_size=64,
    epochs=10,
    verbose=1,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7fa7bcf27d30>

In [None]:
# Save the model weights to a file
# model.save_weights('fraud_detection_model_weights_2.h5')

### Load Model

In [None]:
# model = keras.Sequential(
#     [
#         keras.layers.Dense(
#             500, activation="relu", input_dim=X_train_resampled.shape[1]
#         ),
#         keras.layers.Dense(
#             500, activation="relu", kernel_regularizer=tf.keras.regularizers.l2(0.01)
#         ),
#         keras.layers.Dropout(0.2),
#         keras.layers.Dense(1, activation="sigmoid", bias_initializer=output_bias),
#     ]
# )

# # Load the saved model weights
# model.load_weights('fraud_detection_model_weights_2.h5')

In [None]:
# Probability above which a transaction is labelled as fraud
threshold = 0.5

# --- Test set: predicted probabilities, then hard 0/1 labels ---
y_pred_nn_test = model.predict(X_test)
y_pred_test_binary = np.greater(y_pred_nn_test, threshold).astype(int)

conf_matrix_test = confusion_matrix(y_test, y_pred_test_binary)
print("Confusion Matrix for Test Set:")
print(conf_matrix_test)

# --- Validation set: predicted probabilities, then hard 0/1 labels ---
y_pred_nn_val = model.predict(X_val)
y_pred_val_binary = np.greater(y_pred_nn_val, threshold).astype(int)

# Per-class precision / recall / F1 on the test set
report_test = classification_report(y_test, y_pred_test_binary)
print("Classification Report for Test Set:\n", report_test)

conf_matrix_val = confusion_matrix(y_val, y_pred_val_binary)
print("Confusion Matrix for Validation Set:")
print(conf_matrix_val)

# Per-class precision / recall / F1 on the validation set
report_val = classification_report(y_val, y_pred_val_binary)
print("Classification Report for Validation Set:\n", report_val)

Confusion Matrix for Test Set:
[[56502  2053]
 [   55   685]]
Classification Report for Test Set:
               precision    recall  f1-score   support

           0       1.00      0.96      0.98     58555
           1       0.25      0.93      0.39       740

    accuracy                           0.96     59295
   macro avg       0.62      0.95      0.69     59295
weighted avg       0.99      0.96      0.97     59295

Confusion Matrix for Validation Set:
[[113266   3888]
 [    78   1358]]
Classification Report for Validation Set:
               precision    recall  f1-score   support

           0       1.00      0.97      0.98    117154
           1       0.26      0.95      0.41      1436

    accuracy                           0.97    118590
   macro avg       0.63      0.96      0.69    118590
weighted avg       0.99      0.97      0.98    118590



# Ensemble

In [None]:
# Define the number of samples you want to include
num_samples = 25000

# Flatten both base-model outputs to 1-D before combining them.
# The network returns an (n, 1) array while the forest returns (n,); adding
# those directly broadcasts to an (n, n) matrix instead of an element-wise sum.
rf_val_scores = np.asarray(pred_rf_val).ravel()
nn_val_scores = np.asarray(y_pred_nn_val).ravel()

# Create an array of random indices to select samples (never more than are available)
n_val_samples = min(num_samples, len(rf_val_scores))
random_indices = np.random.choice(len(rf_val_scores), n_val_samples, replace=False)

# Weighted blend of the two models for the selected samples: 60% random forest,
# 40% neural network. Stored as an (n, 1) column, i.e. one feature per sample,
# which is the 2-D layout scikit-learn estimators expect.
combined_predictions_val = (
    0.6 * rf_val_scores[random_indices] + 0.4 * nn_val_scores[random_indices]
).reshape(-1, 1)


In [None]:
# Flatten both base-model outputs to 1-D before combining them.
# The network returns an (n, 1) array while the forest returns (n,); adding
# those directly broadcasts to an (n, n) matrix instead of an element-wise sum.
rf_test_scores = np.asarray(pred_rf_test).ravel()
nn_test_scores = np.asarray(y_pred_nn_test).ravel()

# Create an array of random indices for the test set (never more than are available)
n_test_samples = min(num_samples, len(rf_test_scores))
random_indices_test = np.random.choice(len(rf_test_scores), n_test_samples, replace=False)

# Weighted blend of the two models for the selected test samples: 60% random
# forest, 40% neural network, stored as an (n, 1) column (one feature per sample).
combined_predictions_test = (
    0.6 * rf_test_scores[random_indices_test] + 0.4 * nn_test_scores[random_indices_test]
).reshape(-1, 1)

In [None]:
# True validation labels for the same randomly selected rows, as a NumPy array
y_val_resampled = y_val.to_numpy()[random_indices]

In [None]:
# Meta-model: logistic regression fitted on the combined base-model predictions
# of the sampled validation rows, with their true labels as the target
lr_model = LogisticRegression()
lr_model.fit(X=combined_predictions_val, y=y_val_resampled)

LogisticRegression()

In [None]:
from sklearn.metrics import confusion_matrix

# Score the ensemble on the sampled validation rows.
# NOTE(review): lr_model was fitted on these same rows, so these are in-sample figures.
ensemble_predictions_val = lr_model.predict(combined_predictions_val)

# Confusion matrix, computed once and kept under both names used in this notebook
confusion = confusion_matrix(y_val_resampled, ensemble_predictions_val)
conf_matrix_val = confusion

accuracy_val = accuracy_score(y_val_resampled, ensemble_predictions_val) * 100
precision_val = precision_score(y_val_resampled, ensemble_predictions_val)
recall_val = recall_score(y_val_resampled, ensemble_predictions_val)
f1_val = f1_score(y_val_resampled, ensemble_predictions_val)

# Headline metrics, printed in a fixed order
headline_metrics = (
    ("Ensemble Model Accuracy on Validation Set:", accuracy_val),
    ("Ensemble Model Precision on Validation Set:", precision_val),
    ("Ensemble Model Recall on Validation Set:", recall_val),
    ("Ensemble Model F1-Score on Validation Set:", f1_val),
)
for heading, value in headline_metrics:
    print(heading, value)

# Per-class precision / recall / F1
report_val = classification_report(y_val_resampled, ensemble_predictions_val)
print("Classification Report for Validation Set:\n", report_val)

print("Confusion Matrix:")
print(confusion)

Ensemble Model Accuracy on Validation Set: 99.35199999999999
Ensemble Model Precision on Validation Set: 0.7830188679245284
Ensemble Model Recall on Validation Set: 0.7280701754385965
Ensemble Model F1-Score on Validation Set: 0.7545454545454546
Classification Report for Validation Set:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     24658
           1       0.78      0.73      0.75       342

    accuracy                           0.99     25000
   macro avg       0.89      0.86      0.88     25000
weighted avg       0.99      0.99      0.99     25000

Confusion Matrix:
[[24589    69]
 [   93   249]]
