# Setup environment

## Import libraries

In [41]:
import kagglehub
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import os
from geopy import distance
#from sklearn.model_selection import train_test_split

## import the dataset

In [2]:
# Download latest version of dataset
path = kagglehub.dataset_download("kartik2112/fraud-detection")
print("Path to dataset files:", path)

Path to dataset files: /Users/valeria.verzi/.cache/kagglehub/datasets/kartik2112/fraud-detection/versions/1


In [47]:
card_transaction_dataset = pd.concat((pd.read_csv(path+'/'+f) for f in os.listdir(path)), ignore_index=True)
card_transaction_dataset.head()

Unnamed: 0.1,Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,...,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud
0,0,2019-01-01 00:00:18,2703186189652095,"fraud_Rippin, Kub and Mann",misc_net,4.97,Jennifer,Banks,F,561 Perry Cove,...,36.0788,-81.1781,3495,"Psychologist, counselling",1988-03-09,0b242abb623afc578575680df30655b9,1325376018,36.011293,-82.048315,0
1,1,2019-01-01 00:00:44,630423337322,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,Stephanie,Gill,F,43039 Riley Greens Suite 393,...,48.8878,-118.2105,149,Special educational needs teacher,1978-06-21,1f76529f8574734946361c461b024d99,1325376044,49.159047,-118.186462,0
2,2,2019-01-01 00:00:51,38859492057661,fraud_Lind-Buckridge,entertainment,220.11,Edward,Sanchez,M,594 White Dale Suite 530,...,42.1808,-112.262,4154,Nature conservation officer,1962-01-19,a1a22d70485983eac12b5b88dad1cf95,1325376051,43.150704,-112.154481,0
3,3,2019-01-01 00:01:16,3534093764340240,"fraud_Kutch, Hermiston and Farrell",gas_transport,45.0,Jeremy,White,M,9443 Cynthia Court Apt. 038,...,46.2306,-112.1138,1939,Patent attorney,1967-01-12,6b849c168bdad6f867558c3793159a81,1325376076,47.034331,-112.561071,0
4,4,2019-01-01 00:03:06,375534208663984,fraud_Keeling-Crist,misc_pos,41.96,Tyler,Garcia,M,408 Bradley Rest,...,38.4207,-79.4629,99,Dance movement psychotherapist,1986-03-28,a41d7549acf90789359a9aa5346dcb46,1325376186,38.674999,-78.632459,0


# Prepare feature transformation for modeling

Remove or transform potentially biasing columns

In [59]:
# Remove names
postprocessed_dataframe = card_transaction_dataset.drop(['first', 'last'], axis=1, inplace=False)

In [60]:
## drop cc_num (in real world can be replaces with cc_num_group as can be used to identify the bank)
postprocessed_dataframe.drop(['cc_num'], axis=1, inplace=True)

In [61]:
postprocessed_dataframe['trans_date_trans_time'] = postprocessed_dataframe['trans_date_trans_time'].astype('datetime64[ns]')

In [63]:
# transform birthdate into age groups, cc_num
postprocessed_dataframe['age_at_purchase'] = postprocessed_dataframe.trans_date_trans_time.dt.year - postprocessed_dataframe['dob'].apply(lambda x: int(x.split('-')[0]))
postprocessed_dataframe.drop(['dob'], axis=1, inplace=True)
bins = [16, 18, 30, 45, 57, 100] # 16-18, 19-30, 31-45, 46-57, 58-100: 5 groups, consider all before and after as outliers (set to Nas)
labels = [1,2,3,4,5]
postprocessed_dataframe['age_group'] = pd.cut(postprocessed_dataframe['age_at_purchase'], bins=bins, labels=labels)
postprocessed_dataframe.age_group = postprocessed_dataframe.age_group.astype(float).fillna(-1)

In [70]:
## create features on transaction date (e.g. day of the week, month, time of the day)
postprocessed_dataframe['trans_date_trans_time'] = postprocessed_dataframe['trans_date_trans_time'].astype('datetime64[ns]')
postprocessed_dataframe['transaction_day_of_the_week'] = postprocessed_dataframe.trans_date_trans_time.dt.day_of_week
day_bins = [-1, 7, 12, 17, 20, 24] # 0-7, 8-12, 13-17, 18-20, 21-24: 5 groups Early morning, Morning, Afternoon, Evening, Night
day_labels = [1,2,3,4,5]
postprocessed_dataframe['transaction_time_of_the_day'] = pd.cut(postprocessed_dataframe.trans_date_trans_time.dt.hour, bins=day_bins, labels=day_labels)
postprocessed_dataframe['transaction_month'] = postprocessed_dataframe.trans_date_trans_time.dt.month

In [72]:
postprocessed_dataframe.drop(columns=['Unnamed: 0'], inplace=True)

### calculate distance from merchant

In [73]:
postprocessed_dataframe['distance_from_mercant_km'] = postprocessed_dataframe.apply(lambda x: distance.distance((x['merch_lat'], x['merch_long']), (x['lat'],x['long'])).km, axis=1)

In [74]:
postprocessed_dataframe.drop(columns=['lat','long', 'merch_lat','merch_long'], inplace=True)

In [75]:
postprocessed_dataframe.head()

Unnamed: 0,trans_date_trans_time,merchant,category,amt,gender,street,city,state,zip,city_pop,job,trans_num,unix_time,is_fraud,age_at_purchase,age_group,transaction_day_of_the_week,transaction_time_of_the_day,transaction_month,distance_from_mercant_km
0,2019-01-01 00:00:18,"fraud_Rippin, Kub and Mann",misc_net,4.97,F,561 Perry Cove,Moravian Falls,NC,28654,3495,"Psychologist, counselling",0b242abb623afc578575680df30655b9,1325376018,0,31,3.0,1,1,1,78.773821
1,2019-01-01 00:00:44,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,F,43039 Riley Greens Suite 393,Orient,WA,99160,149,Special educational needs teacher,1f76529f8574734946361c461b024d99,1325376044,0,41,3.0,1,1,1,30.216618
2,2019-01-01 00:00:51,fraud_Lind-Buckridge,entertainment,220.11,M,594 White Dale Suite 530,Malad City,ID,83252,4154,Nature conservation officer,a1a22d70485983eac12b5b88dad1cf95,1325376051,0,57,4.0,1,1,1,108.102912
3,2019-01-01 00:01:16,"fraud_Kutch, Hermiston and Farrell",gas_transport,45.0,M,9443 Cynthia Court Apt. 038,Boulder,MT,59632,1939,Patent attorney,6b849c168bdad6f867558c3793159a81,1325376076,0,52,4.0,1,1,1,95.685115
4,2019-01-01 00:03:06,fraud_Keeling-Crist,misc_pos,41.96,M,408 Bradley Rest,Doe Hill,VA,24433,99,Dance movement psychotherapist,a41d7549acf90789359a9aa5346dcb46,1325376186,0,33,3.0,1,1,1,77.702395


## Dump the dataset

In [76]:
postprocessed_dataframe.to_parquet('data/credit_card_transactions.parquet')