In [31]:
import numpy as np
import pandas as pd

# Locations of the raw train and test CSV splits.
data_train_path = './datasets/Simulated-Credit-Card-Transactions-generated-using-Sparkov/fraudTrain.csv'
data_test_path = './datasets/Simulated-Credit-Card-Transactions-generated-using-Sparkov/fraudTest.csv'

# Read each split into its own DataFrame (train first, then test).
df_train, df_test = map(pd.read_csv, (data_train_path, data_test_path))


In [33]:
# Remember how many rows each split has, so the combined frame can later be
# cut back into train/test by position.
train_samples_len = len(df_train)
test_samples_len = len(df_test)
print(train_samples_len, test_samples_len, sep='\n')

# Stack train on top of test so both go through the same preprocessing.
# The original row labels of each split are kept (no index reset).
df_combined = pd.concat([df_train, df_test])
df_combined.shape

1296675
555719


(1852394, 23)

In [34]:
# Rename the raw CSV headers to descriptive snake_case names.
# The mapping is positional: the i-th raw column receives the i-th new name.
feature_name = df_train.columns
old_feature_names = list(feature_name)
new_feature_name = ['index', 'transaction_time', 'account_number', 'merchant_name', 'category',
       'transaction_amount', 'first_name', 'last_name', 'gender', 'street', 'city', 'state', 'zip',
       'client_latitude', 'client_longitude', 'city_population', 'job', 'birthday', 'transaction_number', 'unix_time',
       'merchant_latitude', 'merchant_longitude', 'is_fraud']

# Because the pairing is purely positional, a changed column count in the
# input would silently mislabel (or skip) columns. Fail loudly instead.
if len(old_feature_names) != len(new_feature_name):
    raise ValueError(
        f"Expected {len(new_feature_name)} columns to rename, "
        f"found {len(old_feature_names)}"
    )

feature_name_dict = dict(zip(old_feature_names, new_feature_name))

# Apply the new column names to the combined frame and preview the result.
df_combined = df_combined.rename(columns=feature_name_dict)
df_combined.head()

Unnamed: 0,index,transaction_time,account_number,merchant_name,category,transaction_amount,first_name,last_name,gender,street,...,client_latitude,client_longitude,city_population,job,birthday,transaction_number,unix_time,merchant_latitude,merchant_longitude,is_fraud
0,0,2019-01-01 00:00:18,2703186189652095,"fraud_Rippin, Kub and Mann",misc_net,4.97,Jennifer,Banks,F,561 Perry Cove,...,36.0788,-81.1781,3495,"Psychologist, counselling",1988-03-09,0b242abb623afc578575680df30655b9,1325376018,36.011293,-82.048315,0
1,1,2019-01-01 00:00:44,630423337322,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,Stephanie,Gill,F,43039 Riley Greens Suite 393,...,48.8878,-118.2105,149,Special educational needs teacher,1978-06-21,1f76529f8574734946361c461b024d99,1325376044,49.159047,-118.186462,0
2,2,2019-01-01 00:00:51,38859492057661,fraud_Lind-Buckridge,entertainment,220.11,Edward,Sanchez,M,594 White Dale Suite 530,...,42.1808,-112.262,4154,Nature conservation officer,1962-01-19,a1a22d70485983eac12b5b88dad1cf95,1325376051,43.150704,-112.154481,0
3,3,2019-01-01 00:01:16,3534093764340240,"fraud_Kutch, Hermiston and Farrell",gas_transport,45.0,Jeremy,White,M,9443 Cynthia Court Apt. 038,...,46.2306,-112.1138,1939,Patent attorney,1967-01-12,6b849c168bdad6f867558c3793159a81,1325376076,47.034331,-112.561071,0
4,4,2019-01-01 00:03:06,375534208663984,fraud_Keeling-Crist,misc_pos,41.96,Tyler,Garcia,M,408 Bradley Rest,...,38.4207,-79.4629,99,Dance movement psychotherapist,1986-03-28,a41d7549acf90789359a9aa5346dcb46,1325376186,38.674999,-78.632459,0


In [35]:
# Convert transaction_time from string to datetime and derive calendar features.
# The source is df_combined itself; the previous reference to `df_renamed`
# pointed at a variable that is never defined in this notebook.
df_combined['transaction_time'] = pd.to_datetime(df_combined['transaction_time'])
df_combined['transaction_date'] = df_combined.transaction_time.dt.strftime('%Y-%m-%d')
df_combined['transaction_hour'] = df_combined.transaction_time.dt.strftime('%H')
df_combined['transaction_month'] = df_combined.transaction_time.dt.strftime('%m')

# Convert birthday from string to datetime and derive the cardholder's age.
df_combined['birthday'] = pd.to_datetime(df_combined['birthday'])
df_combined['birthday_date'] = df_combined.birthday.dt.strftime('%Y-%m-%d')

# Age at transaction time in completed years, stored as float.
# `.astype('timedelta64[Y]')` is rejected by pandas >= 2.0 (only s/ms/us/ns are
# supported), so divide by the mean Gregorian year (365.2425 days, the same
# length numpy uses for its 'Y' unit) and floor the result.
age_delta = df_combined.transaction_time - df_combined.birthday
df_combined['age'] = np.floor(age_delta / pd.Timedelta(days=365.2425))

# Calculate the client-to-merchant offset per transaction.
# NOTE: these are differences in raw degrees, so `transaction_distance` is a
# Euclidean distance in degree space, not a geographic distance in km/miles.
distance_trans_longitude = df_combined.merchant_longitude - df_combined.client_longitude
distance_trans_lattitude = df_combined.merchant_latitude - df_combined.client_latitude
distance_trans = np.sqrt(distance_trans_longitude**2 + distance_trans_lattitude**2)

# Column names (including the 'lattitude' spelling) are kept as-is so that
# anything reading the exported CSVs keeps working.
df_combined['transaction_longitude_distance'] = distance_trans_longitude
df_combined['transaction_lattitude_distance'] = distance_trans_lattitude
df_combined['transaction_distance'] = distance_trans

In [36]:
# Subset of the combined data holding only rows flagged as fraud (is_fraud == 1).
df_fraud = df_combined.loc[df_combined['is_fraud'] == 1]


def age_interval(x):
    """Return the age-bracket label for a numeric age ``x``.

    Brackets are half-open: the lower bound is included and the upper bound
    is excluded (e.g. 30 falls in "Between 30 and 40"). Any value that is not
    below 60 -- including NaN, for which every comparison is false -- gets
    the label "Larger than 60".
    """
    # (exclusive upper bound, label), checked in ascending order.
    brackets = (
        (20, "Less than 20"),
        (30, "Between 20 and 30"),
        (40, "Between 30 and 40"),
        (50, "Between 40 and 50"),
        (60, "Between 50 and 60"),
    )
    for upper_bound, label in brackets:
        if x < upper_bound:
            return label
    return "Larger than 60"

def gender_rename(x):
    """Expand a gender code into its full label.

    Returns "Male" for 'M' and "Female" for anything else. The already
    expanded label 'Male' is accepted too, so applying the function to a
    column that has been renamed before (e.g. when the cell is re-run)
    does not relabel every "Male" row as "Female".
    """
    if x in ('M', 'Male'):
        return "Male"
    # Fallback kept from the original behaviour: every other value is "Female".
    return "Female"
    
    
# Replace the gender codes with full labels and bucket ages into brackets.
df_combined['gender'] = df_combined['gender'].map(gender_rename)
df_combined['age_intervals'] = df_combined['age'].map(age_interval)

# Age-bracket labels listed from oldest to youngest.
age_order = [
    'Larger than 60',
    'Between 50 and 60',
    'Between 40 and 50',
    'Between 30 and 40',
    'Between 20 and 30',
    'Less than 20',
]

In [37]:
# Cut the combined frame back into its two splits by row position: the first
# train_samples_len rows are the train set, everything after is the test set.
df_train_preprocessed = df_combined.iloc[:train_samples_len]
df_test_preprocessed = df_combined.iloc[train_samples_len:]

In [29]:
# Persist the preprocessed train split (the row index is written as the first column).
df_train_preprocessed.to_csv(
    path_or_buf='./datasets/Simulated-Credit-Card-Transactions-generated-using-Sparkov/df_train_preprocessed.csv'
)

In [30]:
# Persist the preprocessed test split (the row index is written as the first column).
df_test_preprocessed.to_csv(
    path_or_buf='./datasets/Simulated-Credit-Card-Transactions-generated-using-Sparkov/df_test_preprocessed.csv'
)