# This code is to go in the flask file

# 1. Load and Inspect the Data

In [1]:
# Import the modules
import numpy as np # For numerical operations and calculations
import pandas as pd # To read and manipulate the lending data as a dataframe
from pathlib import Path # To specify the the file path for reading the csv file
from sklearn.preprocessing import StandardScaler # To scale the data
import random
from datetime import datetime, timedelta

In [2]:
# Load the credit card transaction sample ("sample1.csv", resolved relative to the
# current working directory) into a pandas dataframe.
# sample_df is kept as loaded so that is_fraud predictions can be appended to it.
sample_csv = Path("sample1.csv")
sample_df = pd.read_csv(sample_csv)

In [3]:
# Work on an independent (deep) copy so the preprocessing below leaves sample_df untouched
fraud_df = sample_df.copy(deep=True)

In [4]:
# Remove the 'cc_num' and 'trans_num' columns in place: card numbers are randomly
# generated by the banks, so they are treated as having no link to whether fraud
# will be committed
fraud_df.drop(columns=['cc_num', 'trans_num'], inplace=True)

In [5]:
# Check datatypes of each column
# fraud_df.dtypes

In [6]:
# Parse 'trans_date_trans_time' (read from the CSV as text) into pandas datetimes,
# expecting the 'YYYY-MM-DD HH:MM:SS' layout
raw_transaction_times = fraud_df['trans_date_trans_time']
fraud_df['trans_date_trans_time'] = pd.to_datetime(
    raw_transaction_times,
    format='%Y-%m-%d %H:%M:%S',
)

In [7]:
# Order the rows from earliest to latest transaction (ascending is the default).
# Note: the original row labels are kept; the index is not renumbered here.
fraud_df = fraud_df.sort_values('trans_date_trans_time')

In [8]:
# Number of rows to generate based on rows in sample file uploaded
num_rows = len(fraud_df)

# Renumber the rows 0..n-1 BEFORE flagging anything. The frame was sorted by
# transaction time above without resetting its index, so row labels no longer
# match row positions; label-based `.loc[...]` would otherwise flag whichever
# rows happened to carry those labels in the CSV, not the rows being compared.
fraud_df.reset_index(drop=True, inplace=True)

# Time elapsed since the previous (chronologically earlier) transaction.
# The first row has no predecessor, so its gap is NaT.
time_since_previous = fraud_df['trans_date_trans_time'].diff()

# Set 'is_fraud' to 1 for every transaction that occurs at least 7 days after the
# one before it, and to 0 otherwise (a NaT gap compares as False).
fraud_df['is_fraud'] = (time_since_previous >= timedelta(days=7)).astype('int64')

# Set 'is_fraud' to 1 for the first (earliest) transaction. Skipped when there are
# no rows, because assigning to a missing label with .loc would append a new row.
if num_rows > 0:
    fraud_df.loc[0, 'is_fraud'] = 1

## 2. Convert the date-time and date columns into Unix timestamps
### Columns: 'trans_date_trans_time' and 'dob'

In [9]:
# Turn 'trans_date_trans_time' and 'dob' into whole seconds since 1970-01-01.
# Each column is parsed with its own format string, the epoch is subtracted, and
# the resulting duration is floor-divided by one second.
unix_epoch = pd.Timestamp("1970-01-01")
one_second = pd.Timedelta('1s')

for column_name, date_format in (
    ('trans_date_trans_time', '%Y-%m-%d %H:%M:%S'),
    ('dob', '%Y-%m-%d'),
):
    parsed_values = pd.to_datetime(fraud_df[column_name], format=date_format)
    fraud_df[column_name] = (parsed_values - unix_epoch) // one_second

In [10]:
# Standardise the numeric columns.
# Features measured in different units or magnitudes are put on a common scale so
# that they have an equal influence on machine learning algorithms and so that
# training can converge efficiently.

# Columns to standardise (all assumed to be numeric at this point)
columns_to_scale = [
    'trans_date_trans_time',
    'amt',
    'zip',
    'lat',
    'long',
    'city_pop',
    'dob',
    'unix_time',
    'merch_lat',
    'merch_long',
]

# Create the scaler
scaler = StandardScaler()

# Fit it on the selected columns and write the scaled values back in their place
numeric_values = fraud_df[columns_to_scale]
fraud_df[columns_to_scale] = scaler.fit_transform(numeric_values)

## 3. Target-encode each categorical feature (except 'gender') against the 'is_fraud' target variable

In [11]:
# Target-encode the 'merchant' feature against the 'is_fraud' target variable

# Average 'is_fraud' over the rows of each distinct merchant (indexed by merchant)
target_mean = fraud_df['is_fraud'].groupby(fraud_df['merchant']).mean()

# Look up every row's merchant in that table and overwrite the column with the result
fraud_df['merchant'] = fraud_df['merchant'].map(target_mean)

In [12]:
# Target-encode the 'category' feature against the 'is_fraud' target variable

# Average 'is_fraud' over the rows of each distinct category (indexed by category)
target_mean = fraud_df['is_fraud'].groupby(fraud_df['category']).mean()

# Look up every row's category in that table and overwrite the column with the result
fraud_df['category'] = fraud_df['category'].map(target_mean)

In [13]:
# Target-encode the 'first' feature against the 'is_fraud' target variable

# Average 'is_fraud' over the rows of each distinct 'first' value (indexed by 'first')
target_mean = fraud_df['is_fraud'].groupby(fraud_df['first']).mean()

# Look up every row's 'first' value in that table and overwrite the column with the result
fraud_df['first'] = fraud_df['first'].map(target_mean)

In [14]:
# Target-encode the 'last' feature against the 'is_fraud' target variable

# Average 'is_fraud' over the rows of each distinct 'last' value (indexed by 'last')
target_mean = fraud_df['is_fraud'].groupby(fraud_df['last']).mean()

# Look up every row's 'last' value in that table and overwrite the column with the result
fraud_df['last'] = fraud_df['last'].map(target_mean)

In [15]:
# Target-encode the 'street' feature against the 'is_fraud' target variable

# Average 'is_fraud' over the rows of each distinct street (indexed by street)
target_mean = fraud_df['is_fraud'].groupby(fraud_df['street']).mean()

# Look up every row's street in that table and overwrite the column with the result
fraud_df['street'] = fraud_df['street'].map(target_mean)

In [16]:
# Target-encode the 'city' feature against the 'is_fraud' target variable

# Average 'is_fraud' over the rows of each distinct city (indexed by city)
target_mean = fraud_df['is_fraud'].groupby(fraud_df['city']).mean()

# Look up every row's city in that table and overwrite the column with the result
fraud_df['city'] = fraud_df['city'].map(target_mean)

In [17]:
# Target-encode the 'state' feature against the 'is_fraud' target variable

# Average 'is_fraud' over the rows of each distinct state (indexed by state)
target_mean = fraud_df['is_fraud'].groupby(fraud_df['state']).mean()

# Look up every row's state in that table and overwrite the column with the result
fraud_df['state'] = fraud_df['state'].map(target_mean)

In [18]:
# Target-encode the 'job' feature against the 'is_fraud' target variable

# Average 'is_fraud' over the rows of each distinct job (indexed by job)
target_mean = fraud_df['is_fraud'].groupby(fraud_df['job']).mean()

# Look up every row's job in that table and overwrite the column with the result
fraud_df['job'] = fraud_df['job'].map(target_mean)

## 4. Convert gender feature from categorical to numerical (male-1, female-0)

In [19]:
# Recode the "gender" column: "M" becomes 1 and "F" becomes 0.
# Any value other than these two is left as it is by replace().
gender_codes = {'M': 1, 'F': 0}
fraud_df['gender'] = fraud_df['gender'].replace(gender_codes)


In [20]:
# Remove the 'is_fraud' column in place now that the encodings above no longer need it
fraud_df.drop(columns='is_fraud', inplace=True)

# File is now ready for machine learning model

In [21]:
# Save the fraud_df to CSV file
#file_path = "sample1_encoded.csv"

# Use the to_csv method to export the DataFrame to a CSV file
#fraud_df.to_csv(file_path, index=False)