# Project Title

## Overview
Lorem ipsum dolor sit amet

## Setup

### Import dependencies

In [31]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from kaggle.api.kaggle_api_extended import KaggleApi
import os.path
import zipfile




### Import Dataset

In [35]:
# Authenticate with your Kaggle credentials
api = KaggleApi()
api.authenticate()

# Specify the dataset name
dataset_name = 'online-payments-fraud-detection-dataset'
dataset_folder_path = './data/'
kaggle_dataset_path = 'rupakroy/' + dataset_name

if os.path.isfile(dataset_folder_path + dataset_name + '.csv'):
    print("Found dataset archive")
else:
    # Download the dataset files
    api.dataset_download_files(kaggle_dataset_path, path=dataset_folder_path, unzip=True)
    # Specify the path to the zip file
    zip_file_path = dataset_folder_path + dataset_name + '.zip'

    # Open the zip file
    with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
        # Extract all contents to the specified directory
        zip_ref.extractall(dataset_folder_path)

    # Delete the zip file
    os.remove(zip_file_path)
    for filename in zip_ref.namelist():
        new_filename = dataset_name + '.csv'
        old_filepath = os.path.join(dataset_folder_path, filename)
        new_filepath = os.path.join(dataset_folder_path, new_filename)
        os.rename(old_filepath, new_filepath)

df = pd.read_csv(dataset_folder_path + dataset_name + '.csv')
# # make a copy to preserve the original data
dff = df.copy()

Found dataset archive
   step      type    amount     nameOrig  oldbalanceOrg  newbalanceOrig  \
0     1   PAYMENT   9839.64  C1231006815       170136.0       160296.36   
1     1   PAYMENT   1864.28  C1666544295        21249.0        19384.72   
2     1  TRANSFER    181.00  C1305486145          181.0            0.00   
3     1  CASH_OUT    181.00   C840083671          181.0            0.00   
4     1   PAYMENT  11668.14  C2048537720        41554.0        29885.86   

      nameDest  oldbalanceDest  newbalanceDest  isFraud  isFlaggedFraud  
0  M1979787155             0.0             0.0        0               0  
1  M2044282225             0.0             0.0        0               0  
2   C553264065             0.0             0.0        1               0  
3    C38997010         21182.0             0.0        1               0  
4  M1230701703             0.0             0.0        0               0  


## Data Preprocessing

### Check for empty data

In [None]:
if dff.isnull().values.any():
    print("Error: Missing data")
else:
    print("No missing values found.")

No missing values found.


### Check for correct data types

In [None]:
# Print datatypes of features to ensure they are the correct type
dff.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6362620 entries, 0 to 6362619
Data columns (total 11 columns):
 #   Column          Dtype  
---  ------          -----  
 0   step            int64  
 1   type            object 
 2   amount          float64
 3   nameOrig        object 
 4   oldbalanceOrg   float64
 5   newbalanceOrig  float64
 6   nameDest        object 
 7   oldbalanceDest  float64
 8   newbalanceDest  float64
 9   isFraud         int64  
 10  isFlaggedFraud  int64  
dtypes: float64(5), int64(3), object(3)
memory usage: 534.0+ MB


### Drop irrelevant columns

In [None]:
dff = dff.drop('isFlaggedFraud', axis=1)
print(dff.columns)

Index(['step', 'type', 'amount', 'nameOrig', 'oldbalanceOrg', 'newbalanceOrig',
       'nameDest', 'oldbalanceDest', 'newbalanceDest', 'isFraud'],
      dtype='object')


### Rename Columns

In [None]:
dff.columns = ['Transaction_Hours','Type','Transaction_Amount','Sender','Sender_Balance_Previous_Transaction','Sender_Balance_After_Transaction','Receiver','Receiver_Balance_Previous_Transaction','Receiver_Balance_After_Transaction','isFraud','isFlaggedFraud']

ValueError: Length mismatch: Expected axis has 10 elements, new values have 11 elements

### Pick Numerical Features

In [None]:
# Pick features with numerical value, excluding the class column
features = ['Transaction_Hours','Transaction_Amount', 'Sender_Balance_Previous_Transaction','Sender_Balance_After_Transaction','Receiver_Balance_Previous_Transaction','Receiver_Balance_After_Transaction']

### Check for skewed distributions

In [None]:
plt.figure(figsize=(20,8))

# Select the numerical columns and plot them

# Create separate histograms with density plots for each numerical column
for i, feature in enumerate(features, 1):
    plt.subplot(2, 3, i)
    sns.distplot(dff[feature])
    plt.title(f'Distribution for {feature}')

plt.show()

### Check for Class Imbalance

In [None]:
# Set the figure size
plt.figure(figsize=(8, 5))

# Plot the class imbalance
sns.countplot(x='isFraud', data=dff)

# Add labels and title
plt.xlabel('Class')
plt.ylabel('Count')
plt.title('Class Imbalance in isFraud Column')

# Show the plot
plt.show()

NameError: name 'plt' is not defined