# Unmasking the Web of Deceit: An Analysis of Online Payment Fraud

## Overview
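This notebook analyzes an online payments fraud dataset from Kaggle. It preprocesses the data, rebalances the fraud and non-fraud classes, and compares Naive Bayes, Decision Tree, and Support Vector Machine classifiers on a validation set.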

## Setup

### Import dependencies

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from kaggle.api.kaggle_api_extended import KaggleApi
import os.path
import zipfile
from sklearn.model_selection import train_test_split
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import time


### Import Dataset
This cell downloads the dataset from Kaggle and reads it into a Pandas DataFrame. See the README for instructions on setting up a Kaggle API key. Alternatively, download the file manually and place it in the `./data` folder with the name `online-payments-fraud-detection-dataset.csv`.

In [None]:
# Load the fraud dataset into `df`, downloading it from Kaggle only when no
# local CSV is present, and keep an untouched copy of the raw data.

# Where the dataset lives on Kaggle and on the local disk
dataset_name = 'online-payments-fraud-detection-dataset'
dataset_folder_path = './data/'
kaggle_dataset_path = 'rupakroy/' + dataset_name
csv_file_path = os.path.join(dataset_folder_path, dataset_name + '.csv')

if os.path.isfile(csv_file_path):
    # A local copy exists, so no Kaggle credentials are required
    print("Found dataset archive.")
else:
    # Authenticate only when a download is actually needed, so a manually
    # placed CSV works without a Kaggle API key
    api = KaggleApi()
    api.authenticate()

    print("Downloading dataset from Kaggle.")
    api.dataset_download_files(kaggle_dataset_path, path=dataset_folder_path)

    zip_file_path = os.path.join(dataset_folder_path, dataset_name + '.zip')

    with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
        # Capture the member list while the archive is still open
        extracted_names = zip_ref.namelist()
        zip_ref.extractall(dataset_folder_path)

    # The archive is no longer needed once its contents are extracted
    os.remove(zip_file_path)

    # Rename only the CSV member; renaming every member onto the same target
    # would silently overwrite files if the archive held more than one
    csv_members = [name for name in extracted_names if name.lower().endswith('.csv')]
    if not csv_members:
        raise FileNotFoundError(
            f"No CSV file found in the downloaded archive for {kaggle_dataset_path}"
        )
    os.replace(os.path.join(dataset_folder_path, csv_members[0]), csv_file_path)
    print("Downloaded dataset from Kaggle.")

df = pd.read_csv(csv_file_path)
# make a copy to preserve the original data
dff = df.copy()

## Data Preprocessing

### Check for empty data

In [None]:
# Report whether any cell in the working DataFrame is null
has_missing_values = dff.isnull().values.any()
print("Error: Missing data" if has_missing_values else "No missing values found.")

### Rename Columns

We rename the columns to be easier to understand, based on descriptions of the dataset.

In [None]:
# Replacement headers, applied positionally to the existing columns
renamed_columns = [
    'Transaction_Hours',
    'Type',
    'Transaction_Amount',
    'Sender',
    'Sender_Balance_Previous_Transaction',
    'Sender_Balance_After_Transaction',
    'Receiver',
    'Receiver_Balance_Previous_Transaction',
    'Receiver_Balance_After_Transaction',
    'Is_Fraud',
    'Is_Flagged_Fraud',
]

print(f'Before: {dff.columns}')
dff.columns = renamed_columns
print(f'After: {dff.columns}')


### Drop irrelevant columns

In [None]:
# Columns removed from the working DataFrame before modelling
columns_to_drop = ['Sender', 'Receiver', 'Is_Flagged_Fraud']

print(f'Before: {dff.columns}')
dff = dff.drop(columns=columns_to_drop)
print(f'After: {dff.columns}')


### Check for correct data types
We inspect the data type of each column; all of them look correct. The `Type` column holds nominal data, which we convert later. `Is_Fraud` correctly contains only the values 0 (not fraud) and 1 (fraud); we convert it to a boolean later for ease of use.

In [None]:
# Show each column's dtype, then the value frequencies of the two
# columns that get converted in later cells
dff.info()
for column_name in ('Type', 'Is_Fraud'):
    print(dff[column_name].value_counts())


### Convert Nominal Features to Boolean
We convert the Type category to columns using one-hot encoding. This creates a new binary column for each category and is appropriate for nominal variables without an intrinsic order.

In [None]:

# Collect every object/category column still present in dff
categorical_columns = dff.select_dtypes(include=['object', 'category']).columns

# One-hot encode those columns: one indicator column per category, with the
# first category of each column omitted (drop_first=True)
dff = pd.get_dummies(
    dff,
    columns=categorical_columns,
    drop_first=True,
)

dff.info()

### Update Is_Fraud to Boolean type

In [None]:
# Cast the Is_Fraud column to bool, leaving every other column unchanged
dff = dff.astype({'Is_Fraud': bool})
# Confirm the new dtype
dff.info()

### Check for skewed distributions

In [None]:
# Histogram of every int64/float64 column, laid out on a 3x3 grid
numerical_features = dff.select_dtypes(include=['int64', 'float64']).columns

fig = plt.figure(figsize=(20, 15))

for i, feature in enumerate(numerical_features, 1):
    ax = fig.add_subplot(3, 3, i)
    ax.hist(dff[feature], bins=50, color='skyblue', edgecolor='black')
    ax.set_title(f'Distribution for {feature}')
    ax.set_xlabel(feature)
    ax.set_ylabel('Frequency')
    # Plain tick labels on both axes instead of scientific notation
    ax.ticklabel_format(style='plain', axis='x')
    ax.ticklabel_format(style='plain', axis='y')

plt.show()

### Remove outliers

In [None]:
# Plot each numerical feature's histogram with IQR outliers filtered out.
# The filter is applied per feature for plotting only; dff itself is not
# modified by this cell.
plt.figure(figsize=(20,15))

for i, feature in enumerate(numerical_features, 1):
    # Interquartile range of the current feature
    Q1 = dff[feature].quantile(0.25)
    Q3 = dff[feature].quantile(0.75)
    IQR = Q3 - Q1

    # Values more than 1.5 * IQR outside the quartiles count as outliers
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    # Keep only the rows whose value for this feature is within bounds
    df_no_outliers = dff[(dff[feature] >= lower_bound) & (dff[feature] <= upper_bound)]

    # Histogram of the current feature after filtering
    plt.subplot(3, 3, i)
    plt.hist(df_no_outliers[feature], bins=50, color='skyblue', edgecolor='black')
    plt.title(f'{feature} Distribution (No Outliers)')
    plt.xlabel(feature)
    plt.ylabel('Frequency')
    # Plain tick labels on both axes (the x axis was previously skipped
    # because axis='y' was passed twice)
    plt.ticklabel_format(style='plain', axis='x')
    plt.ticklabel_format(style='plain', axis='y')

plt.show()


### Check for Class Imbalance

In [None]:
# Bar chart of how many rows fall into each Is_Fraud class
target = dff['Is_Fraud']

fig, ax = plt.subplots(figsize=(8, 5))

sns.countplot(
    x=target,
    hue=target,
    palette=["skyblue", "coral"],
    legend=False,
    edgecolor='black',
    ax=ax,
)

ax.set_xlabel('Class')
ax.set_ylabel('Count')
ax.set_title('Class Imbalance in Is_Fraud Column')

plt.show()

### Undersample data

In [None]:
# Separate the label from the predictors
target = dff['Is_Fraud']
features = dff.drop(columns='Is_Fraud')

# Randomly undersample with sampling_strategy='majority'; the seed is fixed
# so the same rows are kept on every run
undersampler = RandomUnderSampler(sampling_strategy='majority', random_state=42)
features_undersampled, target_undersampled = undersampler.fit_resample(features, target)

# Side-by-side class counts: original labels on the left, undersampled on the right
fig, axes = plt.subplots(1, 2, figsize=(12, 6))
panels = (
    (target, 'Distribution before Undersampling'),
    (target_undersampled, 'Distribution after Undersampling'),
)
for ax, (labels, panel_title) in zip(axes, panels):
    sns.countplot(x=labels, hue=labels, palette=["skyblue", "coral"], legend=False, edgecolor='black', ax=ax)
    ax.set_title(panel_title)

plt.show()


### Oversample the data

In [None]:
# Target row count requested for each class after SMOTE
desired_samples = 100000

# NOTE(review): both classes are requested at desired_samples, so SMOTE is
# asked to synthesize rows for the class that was just undersampled as well
# as for the fraud class - confirm this is intended.
sampling_strategy = {0: desired_samples, 1: desired_samples}
smote = SMOTE(sampling_strategy=sampling_strategy, random_state=42)
features_oversampled_smote, target_oversampled_smote = smote.fit_resample(features_undersampled, target_undersampled)

# Side-by-side class counts: undersampled labels on the left, SMOTE output on the right
fig, axes = plt.subplots(1, 2, figsize=(12, 6))
panels = (
    (target_undersampled, 'Distribution before SMOTE Oversampling'),
    (target_oversampled_smote, 'Distribution after SMOTE Oversampling'),
)
for ax, (labels, panel_title) in zip(axes, panels):
    sns.countplot(x=labels, hue=labels, palette=["skyblue", "coral"], legend=False, edgecolor='black', ax=ax)
    ax.set_title(panel_title)

plt.show()


### Split Data
We pass `stratify` to `train_test_split` so that the training, validation, and test sets each keep the same class proportions as the resampled data.

In [None]:
# Split the resampled data into training, validation, and testing sets
# (80/10/10): first hold out 20%, then halve that hold-out into validation
# and test. Both splits are stratified on the class label.
X_train, X_temp, y_train, y_temp = train_test_split(features_oversampled_smote, target_oversampled_smote, test_size=0.2, random_state=42, stratify=target_oversampled_smote)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp)

# Plot the class distribution of all three sets
plt.figure(figsize=(15, 5))

# Training Set Class Distribution
plt.subplot(1, 3, 1)
pd.Series(y_train).value_counts().plot(kind='bar', color=['skyblue', 'coral'], edgecolor='black')
plt.title('Training Set Class Distribution')
plt.xlabel('Class')
plt.ylabel('Count')

# Testing Set Class Distribution
plt.subplot(1, 3, 2)
pd.Series(y_test).value_counts().plot(kind='bar', color=['skyblue', 'coral'], edgecolor='black')
plt.title('Testing Set Class Distribution')
plt.xlabel('Class')
plt.ylabel('Count')

# Validation Set Class Distribution
# (uses y_val, the name produced by the split above; y_validation was never defined)
plt.subplot(1, 3, 3)
pd.Series(y_val).value_counts().plot(kind='bar', color=['skyblue', 'coral'], edgecolor='black')
plt.title('Validation set Class Distribution')
plt.xlabel('Class')
plt.ylabel('Count')

plt.tight_layout()
plt.show()


### Naive Bayes Classifier

In [None]:
# Fit a Gaussian Naive Bayes model and score it on the validation split
nb_model = GaussianNB()

# The timed window covers training, prediction and metric computation
start_time = time.time()
nb_model.fit(X_train, y_train)
y_val_pred_nb = nb_model.predict(X_val)
accuracy_val_nb = accuracy_score(y_val, y_val_pred_nb)
conf_matrix_val_nb = confusion_matrix(y_val, y_val_pred_nb)
class_report_val_nb = classification_report(y_val, y_val_pred_nb)
end_time = time.time()

# Break the elapsed wall-clock time into whole minutes and leftover seconds
elapsed_time = end_time - start_time
minutes, seconds = divmod(elapsed_time, 60)

for output_line in (
    "Naive Bayes - Validation Set Results:",
    f"Accuracy: {accuracy_val_nb:.2f}",
    f"{int(minutes)} minutes and {seconds:.2f} seconds",
    "Confusion Matrix:",
    conf_matrix_val_nb,
    "Classification Report:",
    class_report_val_nb,
):
    print(output_line)

### Decision Trees Classifier

In [None]:
# Fit a decision tree and score it on the validation split.
# random_state is fixed, matching the samplers and splits elsewhere in this
# notebook, so repeated runs build the same tree and report the same metrics.
dt_model = DecisionTreeClassifier(random_state=42)

# Train the Decision Trees model (the timer also covers prediction and scoring)
start_time = time.time()
dt_model.fit(X_train, y_train)

# Make predictions on the validation set for Decision Trees
y_val_pred_dt = dt_model.predict(X_val)

# Evaluate the Decision Trees model on the validation set
accuracy_val_dt = accuracy_score(y_val, y_val_pred_dt)
conf_matrix_val_dt = confusion_matrix(y_val, y_val_pred_dt)
class_report_val_dt = classification_report(y_val, y_val_pred_dt)

# Record the run time as whole minutes plus leftover seconds
end_time = time.time()
elapsed_time = end_time - start_time
minutes, seconds = divmod(elapsed_time, 60)

print("\nDecision Trees - Validation Set Results:")
print(f"Accuracy: {accuracy_val_dt:.2f}")
print(f"{int(minutes)} minutes and {seconds:.2f} seconds")
print("Confusion Matrix:")
print(conf_matrix_val_dt)
print("Classification Report:")
print(class_report_val_dt)

In [None]:
### Support Vector Machine Classifier

In [None]:
# Fit a support vector classifier with default settings and score it on the
# validation split.
# NOTE(review): the features are passed to SVC as-is; no scaling step is
# visible in this notebook - confirm whether standardization is needed.
svm_model = SVC()

# The timed window covers training, prediction and metric computation
start_time = time.time()
svm_model.fit(X_train, y_train)
y_val_pred_svm = svm_model.predict(X_val)
accuracy_val_svm = accuracy_score(y_val, y_val_pred_svm)
conf_matrix_val_svm = confusion_matrix(y_val, y_val_pred_svm)
class_report_val_svm = classification_report(y_val, y_val_pred_svm)
end_time = time.time()

# Break the elapsed wall-clock time into whole minutes and leftover seconds
elapsed_time = end_time - start_time
minutes, seconds = divmod(elapsed_time, 60)

for output_line in (
    "\nSVM - Validation Set Results:",
    f"Accuracy: {accuracy_val_svm:.2f}",
    f"{int(minutes)} minutes and {seconds:.2f} seconds",
    "Confusion Matrix:",
    conf_matrix_val_svm,
    "Classification Report:",
    class_report_val_svm,
):
    print(output_line)