# Load the train and test datasets with the pandas library

In [1]:
# Load the required pandas package
import pandas as pd

# Locations of the raw train and test CSV files
train_path, test_path = './datasets/fraudTrain.csv', './datasets/fraudTest.csv'

# Read the train split and preview its first rows (columns and sample values)
df_train = pd.read_csv(train_path)
print(df_train.head())

# Read the test split and preview its first rows in the same way
df_test = pd.read_csv(test_path)
print(df_test.head())

   Unnamed: 0 trans_date_trans_time            cc_num  \
0           0   2019-01-01 00:00:18  2703186189652095   
1           1   2019-01-01 00:00:44      630423337322   
2           2   2019-01-01 00:00:51    38859492057661   
3           3   2019-01-01 00:01:16  3534093764340240   
4           4   2019-01-01 00:03:06   375534208663984   

                             merchant       category     amt      first  \
0          fraud_Rippin, Kub and Mann       misc_net    4.97   Jennifer   
1     fraud_Heller, Gutmann and Zieme    grocery_pos  107.23  Stephanie   
2                fraud_Lind-Buckridge  entertainment  220.11     Edward   
3  fraud_Kutch, Hermiston and Farrell  gas_transport   45.00     Jeremy   
4                 fraud_Keeling-Crist       misc_pos   41.96      Tyler   

      last gender                        street  ...      lat      long  \
0    Banks      F                561 Perry Cove  ...  36.0788  -81.1781   
1     Gill      F  43039 Riley Greens Suite 393  ...  48

# Show basic statistical summaries of the train / test datasets

In [2]:
# Report the (rows, columns) shape of each dataframe
print('Shape of train dataframe:', df_train.shape)
print('Shape of test dataframe:', df_test.shape)

# Train data overview: row count, column names, dtypes and non-null counts.
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() would append a stray "None" line.
print('------------------Train Dataframe info------------------')
df_train.info()

# Summary statistics of the train dataframe as computed by DataFrame.describe()
print('------------------Train Dataframe describe------------------')
print(df_train.describe())

Shape of train dataframe: (1296675, 23)
Shape of test dataframe: (555719, 23)
------------------Train Dataframe info------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1296675 entries, 0 to 1296674
Data columns (total 23 columns):
 #   Column                 Non-Null Count    Dtype  
---  ------                 --------------    -----  
 0   Unnamed: 0             1296675 non-null  int64  
 1   trans_date_trans_time  1296675 non-null  object 
 2   cc_num                 1296675 non-null  int64  
 3   merchant               1296675 non-null  object 
 4   category               1296675 non-null  object 
 5   amt                    1296675 non-null  float64
 6   first                  1296675 non-null  object 
 7   last                   1296675 non-null  object 
 8   gender                 1296675 non-null  object 
 9   street                 1296675 non-null  object 
 10  city                   1296675 non-null  object 
 11  state                  1296675 non-null  obje

# Combine train and test dataframes and rename feature names

In [3]:
# Stack the train rows on top of the test rows so both splits go through
# the same preprocessing steps
df_combined = pd.concat([df_train, df_test])
print('\n Shape of combined dataframe:\n', df_combined.shape)

# Original column names, in the order they appear in the train dataframe
old_feature_names = df_train.columns.tolist()
print('\n Old feature names:\n', old_feature_names)

# Descriptive replacement names, listed in the same order as the original columns
new_feature_names = [
    'index', 'transaction_time', 'account_number', 'merchant_name', 'category',
    'transaction_amount', 'first_name', 'last_name', 'gender', 'street', 'city', 'state', 'zip',
    'client_latitude', 'client_longitude', 'city_population', 'job', 'birthday',
    'transaction_number', 'unix_time',
    'merchant_latitude', 'merchant_longitude', 'is_fraud',
]

# Pair every original name with the replacement found at the same position
feature_name_dict = {
    old_name: new_feature_names[position]
    for position, old_name in enumerate(old_feature_names)
}
print('\n Dictionary of old-new feature names:\n', feature_name_dict)

# Apply the mapping to the column labels of the combined dataframe
df_combined = df_combined.rename(columns=feature_name_dict)
print('\n Combine dataframe with new feature names:\n', df_combined.head(2))


 Shape of combined dataframe:
 (1852394, 23)

 Old feature names:
 ['Unnamed: 0', 'trans_date_trans_time', 'cc_num', 'merchant', 'category', 'amt', 'first', 'last', 'gender', 'street', 'city', 'state', 'zip', 'lat', 'long', 'city_pop', 'job', 'dob', 'trans_num', 'unix_time', 'merch_lat', 'merch_long', 'is_fraud']

 Dictionary of old-new feature names:
 {'Unnamed: 0': 'index', 'trans_date_trans_time': 'transaction_time', 'cc_num': 'account_number', 'merchant': 'merchant_name', 'category': 'category', 'amt': 'transaction_amount', 'first': 'first_name', 'last': 'last_name', 'gender': 'gender', 'street': 'street', 'city': 'city', 'state': 'state', 'zip': 'zip', 'lat': 'client_latitude', 'long': 'client_longitude', 'city_pop': 'city_population', 'job': 'job', 'dob': 'birthday', 'trans_num': 'transaction_number', 'unix_time': 'unix_time', 'merch_lat': 'merchant_latitude', 'merch_long': 'merchant_longitude', 'is_fraud': 'is_fraud'}

 Combine dataframe with new feature names:
    index     tr

# Analyze class labels with is_fraud feature

In [4]:
# Distinct class labels present in the is_fraud column
print('\n Unique values:\n', df_train.is_fraud.unique())

# Number of rows per class label (indexed by the label itself)
fraud_counts = df_train.is_fraud.value_counts()
print('\n Count unique values:\n', fraud_counts)

# Fraction of rows per class label (indexed by the label itself)
fraud_shares = df_train.is_fraud.value_counts(normalize=True)
print('\n Percetage of unique values:\n', fraud_shares)

# Summary table of the class labels.
# Count and Percentage are aligned on their shared label index, and Class is taken
# from that same index. Using is_fraud.unique() for Class would attach the labels
# positionally in first-appearance order, which only matches the frequency order
# of value_counts() by coincidence and can pair a label with the wrong count.
df_fraud_stat = pd.DataFrame({'Count': fraud_counts, 'Percentage': fraud_shares})
df_fraud_stat.insert(0, 'Class', df_fraud_stat.index)
print('\n Dataframe for fraud labels:\n', df_fraud_stat)


 Unique values:
 [0 1]

 Count unique values:
 0    1289169
1       7506
Name: is_fraud, dtype: int64

 Percetage of unique values:
 0    0.994211
1    0.005789
Name: is_fraud, dtype: float64

 Dataframe for fraud labels:
    Class    Count  Percentage
0      0  1289169    0.994211
1      1     7506    0.005789


# Feature engineering techniques on the credit card transaction dataset

In [8]:
import numpy as np
# Parse transaction_time from string to datetime, then derive the transaction
# date, hour and month from it. The derived columns are strftime output, i.e.
# zero-padded strings such as '2019-01-01', '00' and '01'.
df_combined['transaction_time'] = pd.to_datetime(df_combined['transaction_time'])
df_combined['transaction_date'] = df_combined['transaction_time'].dt.strftime('%Y-%m-%d')
df_combined['transaction_hour'] = df_combined['transaction_time'].dt.strftime('%H')
df_combined['transaction_month'] = df_combined['transaction_time'].dt.strftime('%m')

# Parse birthday from string to datetime and keep a formatted date string as well
df_combined['birthday'] = pd.to_datetime(df_combined['birthday'])
df_combined['birthday_date'] = df_combined['birthday'].dt.strftime('%Y-%m-%d')

# Age at transaction time in whole years, stored as float.
# `.astype('timedelta64[Y]')` is rejected by pandas >= 2.0 (only the s/ms/us/ns
# resolutions are supported), so the elapsed time is divided by the mean Gregorian
# year (365.2425 days) and floored instead, which yields whole-year floats as before.
time_since_birth = df_combined['transaction_time'] - df_combined['birthday']
df_combined['age'] = np.floor(time_since_birth / pd.Timedelta(days=365.2425))

# Offset between merchant and client coordinates.
# NOTE: this is a straight-line distance measured in degrees of latitude/longitude,
# not a geographic distance in km or miles.
distance_trans_longitude = df_combined['merchant_longitude'] - df_combined['client_longitude']
distance_trans_lattitude = df_combined['merchant_latitude'] - df_combined['client_latitude']
distance_trans = np.sqrt(distance_trans_longitude**2 + distance_trans_lattitude**2)

# Add the distance features to the combined dataframe
# (the 'lattitude' spelling of the column name is kept as-is so existing consumers still work)
df_combined['transaction_longitude_distance'] = distance_trans_longitude
df_combined['transaction_lattitude_distance'] = distance_trans_lattitude
df_combined['transaction_distance'] = distance_trans

# Create age_interval function
def age_interval(x):
    """
    Map a numeric age onto one of six labelled age bands:
    'Less than 20', 'Between 20 and 30', 'Between 30 and 40',
    'Between 40 and 50', 'Between 50 and 60' and 'Larger than 60'.

    Each band includes its lower bound and excludes its upper bound.
    Any value that is not below 60 is labelled 'Larger than 60'; this
    includes NaN, for which every comparison is false.
    """
    # (exclusive upper bound, label) pairs, checked in ascending order
    bands = (
        (20, "Less than 20"),
        (30, "Between 20 and 30"),
        (40, "Between 30 and 40"),
        (50, "Between 40 and 50"),
        (60, "Between 50 and 60"),
    )
    for upper_bound, label in bands:
        if x < upper_bound:
            return label
    return "Larger than 60"

# Create rename function for gender feature
def gender_rename(x):
    """
        Rename gender value: 'M' to 'Male'
        and 'F' to 'Female'.

        Values that are already 'Male' or 'Female' are returned unchanged,
        so applying the rename a second time (e.g. re-running the cell)
        does not alter the column. Any other value (missing or unexpected)
        is returned as-is instead of being silently labelled 'Female'.
    """
    if x in ('M', 'Male'):
        return "Male"
    if x in ('F', 'Female'):
        return "Female"
    # Unknown or missing value: leave it untouched rather than guess a gender
    return x
    
# Bucket every age value into its labelled interval
df_combined['age_intervals'] = df_combined['age'].map(age_interval)

# Replace the gender codes with their spelled-out labels
df_combined['gender'] = df_combined['gender'].map(gender_rename)


# Get and save preprocessed train and test dataframes

In [10]:
# Number of rows that came from the train dataset
train_samples_len = len(df_train)

# The combined dataframe was built as train rows followed by test rows,
# so a positional split at the train length recovers the two splits
df_train_preprocessed = df_combined.iloc[:train_samples_len]
df_test_preprocessed = df_combined.iloc[train_samples_len:]

# Write each preprocessed split to its own CSV file for later analysis
for split_frame, csv_path in (
    (df_train_preprocessed, './datasets/df_train_preprocessed.csv'),
    (df_test_preprocessed, './datasets/df_test_preprocessed.csv'),
):
    split_frame.to_csv(csv_path)