In [39]:
from pathlib import Path
import pandas as pd
import numpy as np
import pgeocode

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [2]:
pd.set_option('display.max_columns', None)

In [3]:
# Functions for data import

def add_leading_zero_to_zipcode(item):
    item_str = str(item)  # Ensure the item is a string
    item_str = item_str.replace('.0', '') # remove trailing '.0'
    
    if len(item_str) == 4:
        return '0' + item_str
    elif len(item_str) == 3:
        return '00' + item_str
    
    elif item_str == '10072': # catch bad New York Zipcode
        return '10001'
    elif item_str == '30399': # catch bad Atlanta Zipcode
        return '30303'
    elif item_str == '94101': # catch bad San Francisco Zipcode
        return '94102'
    elif item_str == '92164': # catch bad San Diego
        return '92101'
    elif item_str == '98205': # catch bad Everett WA
        return '98201'
    elif item_str == '29573':
        return '29574'
    elif item_str == '19388':
        return '19390'
    elif item_str == '19640': # Reading PA
        return '19601'
    elif item_str == '16532': # Erie PA
        return '16501'
    elif item_str == '14645': # Rochester NY
        return '14604'
    elif item_str == '19483':
        return '19481'      # Valley Forge PA
    elif item_str == '17767':
        return '17751'      # Salona PA
    elif item_str == '45418':
        return '45390'      # Dayton OH
    elif item_str == '30330':
        return '30329'      # Atlanta GA
    elif item_str == '25965': 
        return '25976'      # Elton WV
    
    return item_str

def remove_dollar_and_convert(item):
    # Remove the dollar sign and convert to integer
    return np.int32(item.replace('$', ''))

def remove_dollar_and_convert_float(item):
    # Remove the dollar sign and convert to float
    return np.float64(item.replace('$', ''))

def convert_yes_no_to_binary(item):
    item_lower = str(item).lower()
    if item_lower == 'yes':
        result = 1
    else:
        result = 0 
    return np.int8(result)


In [4]:
# Import users from csv

users_path = Path.cwd() / 'data/sd254_users.csv'


users_columns_import = ['Birth Year', 
                        'Zipcode', 
                        'Per Capita Income - Zipcode',
                        'Yearly Income - Person', 
                        'Total Debt',
                        'FICO Score',
                        'Num Credit Cards']

user_converters = {'Zipcode': add_leading_zero_to_zipcode,
                   'Per Capita Income - Zipcode': remove_dollar_and_convert,
                   'Yearly Income - Person': remove_dollar_and_convert,
                   'Total Debt': remove_dollar_and_convert}

users_dtypes = {'Birth Year': np.uint16,
                 'FICO Score': np.uint16,
                 'Num Credit Cards': np.uint8}

users_df = pd.read_csv(users_path, 
                       usecols=users_columns_import,
                       converters=user_converters,
                       dtype=users_dtypes
                    )


users_df['User'] = users_df.index

users_df.head()


Unnamed: 0,Birth Year,Zipcode,Per Capita Income - Zipcode,Yearly Income - Person,Total Debt,FICO Score,Num Credit Cards,User
0,1966,91750,29278,59696,127613,787,5,0
1,1966,11363,37891,77254,191349,701,5,1
2,1938,91792,22681,33483,196,698,5,2
3,1957,10069,163145,249925,202328,722,4,3
4,1976,94117,53797,109687,183855,675,1,4


In [5]:
# import cards csv

cards_path = Path.cwd() / 'data/sd254_cards.csv'

cards_columns_import = ['User',	
                        'CARD INDEX',
                        'Has Chip',
                        'Cards Issued',
                        'Year PIN last Changed',
                        'Card on Dark Web'
                        ]

cards_dtypes = {'CARD INDEX': np.uint8,
                'Cards Issued': np.uint8,
                'Year PIN last Changed': np.uint16
                }

cards_conversions = {'Card on Dark Web': convert_yes_no_to_binary,
                     'Has Chip': convert_yes_no_to_binary}

cards_df = pd.read_csv(cards_path,
                       usecols=cards_columns_import,
                       dtype=cards_dtypes,
                       converters=cards_conversions
                       )

display(cards_df.head())

Unnamed: 0,User,CARD INDEX,Has Chip,Cards Issued,Year PIN last Changed,Card on Dark Web
0,0,0,1,2,2008,0
1,0,1,1,2,2014,0
2,0,2,1,2,2004,0
3,0,3,0,1,2012,0
4,0,4,1,1,2009,0


In [6]:
transactions_path = Path.cwd() / 'data/credit_card_transactions-ibm_v2.csv'

transactions_columns_import = ['User',
                               'Card',
                               'Year',
                               'Month',
                               'Day',
                               'Time',
                               'Amount',
                               'Use Chip',
                               'Merchant City',
                               'Merchant State',
                               'Zip',
                               'MCC',
                               'Errors?',
                               'Is Fraud?'
                               ]

transaction_converters = {'Zip': add_leading_zero_to_zipcode,
                           'Amount': remove_dollar_and_convert_float,
                           'Is Fraud?': convert_yes_no_to_binary
                          }


transactions_df = pd.read_csv(transactions_path,
                                usecols=transactions_columns_import,
                                converters=transaction_converters
                                )


In [10]:
transactions_df.head()

Unnamed: 0,User,Card,Year,Month,Day,Time,Amount,Use Chip,Merchant City,Merchant State,Zip,MCC,Errors?,Is Fraud?
0,0,0,2002,9,1,06:21,134.09,Swipe Transaction,La Verne,CA,91750,5300,,0
1,0,0,2002,9,1,06:42,38.48,Swipe Transaction,Monterey Park,CA,91754,5411,,0
2,0,0,2002,9,2,06:22,120.34,Swipe Transaction,Monterey Park,CA,91754,5411,,0
3,0,0,2002,9,2,17:45,128.95,Swipe Transaction,Monterey Park,CA,91754,5651,,0
4,0,0,2002,9,3,06:23,104.71,Swipe Transaction,La Verne,CA,91750,5912,,0


In [11]:
# inconsistent Merchant City ONLINE and Use Chip Online Transaction
transactions_df.loc[((transactions_df['Merchant City'] == 'ONLINE') & (transactions_df['Use Chip'] != 'Online Transaction')), 'Use Chip'].value_counts()

Use Chip
Chip Transaction    7601
Name: count, dtype: int64

In [12]:
transactions_users_500_df = transactions_df.loc[transactions_df['User'] < 500]
transactions_test_df = transactions_df.loc[transactions_df['User'] < 5]

In [13]:
transactions_users_500_df.to_csv('data/transactions_users_500.csv', index=False)

In [22]:
# merge datasets

merge_step_1 = transactions_users_500_df.merge(users_df, how='inner', on='User')
merged_df = merge_step_1.merge(cards_df, how='inner', left_on=['User', 'Card'], right_on=['User', 'CARD INDEX'])
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5927394 entries, 0 to 5927393
Data columns (total 26 columns):
 #   Column                       Dtype  
---  ------                       -----  
 0   User                         int64  
 1   Card                         int64  
 2   Year                         int64  
 3   Month                        int64  
 4   Day                          int64  
 5   Time                         object 
 6   Amount                       float64
 7   Use Chip                     object 
 8   Merchant City                object 
 9   Merchant State               object 
 10  Zip                          object 
 11  MCC                          int64  
 12  Errors?                      object 
 13  Is Fraud?                    int8   
 14  Birth Year                   uint16 
 15  Zipcode                      object 
 16  Per Capita Income - Zipcode  int32  
 17  Yearly Income - Person       int32  
 18  Total Debt                   int32  
 19  

In [23]:
# add column to define whether international
merged_df['International'] = (merged_df['Merchant State'].str.len() > 2).astype(np.int8)

# add column for online transaction
merged_df['Online'] = (transactions_df['Merchant City'] == 'ONLINE').astype(np.int8)

# add column for age at transaction
merged_df['Age_at_transaction'] = merged_df['Year'] - merged_df['Birth Year']

# create income-to-debt
merged_df['income_to_debt'] = merged_df['Yearly Income - Person'] / (merged_df['Total Debt'] + 0.001)

# create a date-time column
merged_df['datetime'] = pd.to_datetime(merged_df['Year'].astype(str) + '-' + 
                                merged_df['Month'].astype(str) + '-' + 
                                merged_df['Day'].astype(str) + ' ' + 
                                merged_df['Time'])

# create day of week column
merged_df['day_of_week'] = merged_df['datetime'].dt.dayofweek

# create attribute PIN_changed_years_ago from year of transaction and year PIN changed
# merged_df['PIN_changed_years_ago'] = merged_df['Year'] - merged_df['Year PIN last Changed'] 

# create is it the weekend
merged_df['is_weekend'] = (merged_df['datetime'].dt.dayofweek >= 5).astype(np.int8)

# Convert datetime to Unix timestamp
merged_df['timestamp'] = merged_df['datetime'].astype(int) / 10**9  # Convert to seconds

In [33]:
# Put times into bins of time-of-day

# Define the bins and their corresponding labels
time_bins = [0, 6, 12, 18, 22, 24]
time_labels = ['Night', 'Morning', 'Afternoon', 'Evening', 'Night']

# Categorize the hours into bins
merged_df['time_of_day'] = pd.cut(merged_df['datetime'].dt.hour, bins=time_bins, labels=time_labels, right=False, include_lowest=True, ordered=False)


In [34]:
# create filters and lists of zipcodes to find distance with
distance_candidates = merged_df['Zip'].str.len() == 5
merchant_zip_list = merged_df.loc[distance_candidates, 'Zip'].to_list()
user_zip_list = merged_df.loc[distance_candidates, 'Zipcode'].to_list()

# use pgeocode to calculate distances
dist = pgeocode.GeoDistance('us')
distances = dist.query_postal_code(user_zip_list, merchant_zip_list)

avg_distance = np.mean(distances)
# avg_distance = 250.0


# initiate distance attribute with average distance
merged_df['distances'] = avg_distance


merged_df.loc[distance_candidates, 'distances'] = distances
# merged_df.loc[distance_candidates, 'problem_dist'] = np.isnan(distances)

print(avg_distance)



178.98201373462226


In [35]:
merged_df.head()

Unnamed: 0,User,Card,Year,Month,Day,Time,Amount,Use Chip,Merchant City,Merchant State,Zip,MCC,Errors?,Is Fraud?,Birth Year,Zipcode,Per Capita Income - Zipcode,Yearly Income - Person,Total Debt,FICO Score,Num Credit Cards,CARD INDEX,Has Chip,Cards Issued,Year PIN last Changed,Card on Dark Web,International,Online,Age_at_transaction,income_to_debt,datetime,day_of_week,PIN_changed_years_ago,is_weekend,timestamp,distances,time_of_day
0,0,0,2002,9,1,06:21,134.09,Swipe Transaction,La Verne,CA,91750,5300,,0,1966,91750,29278,59696,127613,787,5,0,1,2,2008,0,0,0,36,0.467789,2002-09-01 06:21:00,6,-6,1,1030861000.0,0.0,Morning
1,0,0,2002,9,1,06:42,38.48,Swipe Transaction,Monterey Park,CA,91754,5411,,0,1966,91750,29278,59696,127613,787,5,0,1,2,2008,0,0,0,36,0.467789,2002-09-01 06:42:00,6,-6,1,1030863000.0,33.540588,Morning
2,0,0,2002,9,2,06:22,120.34,Swipe Transaction,Monterey Park,CA,91754,5411,,0,1966,91750,29278,59696,127613,787,5,0,1,2,2008,0,0,0,36,0.467789,2002-09-02 06:22:00,0,-6,0,1030948000.0,33.540588,Morning
3,0,0,2002,9,2,17:45,128.95,Swipe Transaction,Monterey Park,CA,91754,5651,,0,1966,91750,29278,59696,127613,787,5,0,1,2,2008,0,0,0,36,0.467789,2002-09-02 17:45:00,0,-6,0,1030989000.0,33.540588,Afternoon
4,0,0,2002,9,3,06:23,104.71,Swipe Transaction,La Verne,CA,91750,5912,,0,1966,91750,29278,59696,127613,787,5,0,1,2,2008,0,0,0,36,0.467789,2002-09-03 06:23:00,1,-6,0,1031034000.0,0.0,Morning


In [38]:
columns_to_drop = ['Card',
                   'User',
                   'Year',
                   'Month',
                   'Birth Year',
                   'Day',
                   'Time',
                   'Merchant State',
                   'Zip',
                   'Zipcode',
                   'CARD INDEX',
                   'Year PIN last Changed',
                   'MCC',
                   'datetime']

merged_and_drop_df = merged_df.drop(columns=columns_to_drop, axis=1)

# Filter out negative amounts
merged_and_drop_df = merged_and_drop_df.loc[merged_and_drop_df['Amount'] > 0.0]
merged_and_drop_df['Errors?'].fillna('No Error', inplace = True)
merged_and_drop_df

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  merged_and_drop_df['Errors?'].fillna('No Error', inplace = True)


Unnamed: 0,Amount,Use Chip,Merchant City,Errors?,Is Fraud?,Per Capita Income - Zipcode,Yearly Income - Person,Total Debt,FICO Score,Num Credit Cards,Has Chip,Cards Issued,Card on Dark Web,International,Online,Age_at_transaction,income_to_debt,day_of_week,PIN_changed_years_ago,is_weekend,timestamp,distances,time_of_day
0,134.09,Swipe Transaction,La Verne,No Error,0,29278,59696,127613,787,5,1,2,0,0,0,36,0.467789,6,-6,1,1.030861e+09,0.000000,Morning
1,38.48,Swipe Transaction,Monterey Park,No Error,0,29278,59696,127613,787,5,1,2,0,0,0,36,0.467789,6,-6,1,1.030863e+09,33.540588,Morning
2,120.34,Swipe Transaction,Monterey Park,No Error,0,29278,59696,127613,787,5,1,2,0,0,0,36,0.467789,0,-6,0,1.030948e+09,33.540588,Morning
3,128.95,Swipe Transaction,Monterey Park,No Error,0,29278,59696,127613,787,5,1,2,0,0,0,36,0.467789,0,-6,0,1.030989e+09,33.540588,Afternoon
4,104.71,Swipe Transaction,La Verne,No Error,0,29278,59696,127613,787,5,1,2,0,0,0,36,0.467789,1,-6,0,1.031034e+09,0.000000,Morning
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5927389,23.87,Chip Transaction,Rockton,No Error,0,18961,38661,88636,765,3,1,2,0,0,0,45,0.436177,6,4,1,1.581878e+09,2753.096563,Evening
5927390,15.62,Chip Transaction,Alsip,No Error,0,18961,38661,88636,765,3,1,2,0,0,0,45,0.436177,1,4,0,1.582028e+09,2880.794597,Afternoon
5927391,32.56,Chip Transaction,Alsip,No Error,0,18961,38661,88636,765,3,1,2,0,0,0,45,0.436177,1,4,0,1.582053e+09,2880.794597,Evening
5927392,3.00,Chip Transaction,Cicero,No Error,0,18961,38661,88636,765,3,1,2,0,0,0,45,0.436177,5,4,1,1.582403e+09,2874.610513,Evening


In [40]:
merged_and_drop_df['Merchant City'].nunique()

10414

In [None]:
# Define categorical and numerical columns
categorical_cols = ['Use Chip', 'Merchant City',
                    'Errors?', 'City', 'Age Group', 'Card Brand', 'Card Type', 'Has Chip',
                    'Card on Dark Web']
numerical_cols = ['MCC', 'Zipcode', 'Per Capita Income - Zipcode', 'Yearly Income - Person',
                  'Total Debt', 'FICO Score', 'Num Credit Cards', 'CARD INDEX', 'Cards Issued',
                  'Credit Limit', 'Year PIN last Changed']

# Preprocessing pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
    ])

# Define the model - option 2
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(n_estimators=100, max_features='auto', max_depth=10, criterion='gini', random_state=42))
])
8:17
# Clean numerical columns by removing '$' and ',' from the 'Per Capita Income - Zipcode' and 'Yearly Income - Person' columns and convert them to float
X['Per Capita Income - Zipcode'] = X['Per Capita Income - Zipcode'].replace('[\$,]', '', regex=True).astype(float)
X['Yearly Income - Person'] = X['Yearly Income - Person'].replace('[\$,]', '', regex=True).astype(float)

In [37]:
merged_and_drop_df['Errors?'].value_counts()

Errors?
Insufficient Balance                               56205
Bad PIN                                            13635
Technical Glitch                                   11354
Bad Card Number                                     3485
Bad Expiration                                      2994
Bad CVV                                             2875
Bad Zipcode                                          398
Insufficient Balance,Technical Glitch                124
Bad PIN,Insufficient Balance                         112
Bad PIN,Technical Glitch                              37
Bad Card Number,Insufficient Balance                  24
Bad CVV,Insufficient Balance                          24
Bad Expiration,Insufficient Balance                   19
Bad Card Number,Bad CVV                               17
Bad Card Number,Bad Expiration                        14
Bad Expiration,Bad CVV                                12
Bad Expiration,Technical Glitch                        8
Bad Card Number,Technic

In [26]:
merged_df.head()

Unnamed: 0,User,Card,Year,Month,Day,Time,Amount,Use Chip,Merchant City,Merchant State,Zip,MCC,Errors?,Is Fraud?,Birth Year,Zipcode,Per Capita Income - Zipcode,Yearly Income - Person,Total Debt,FICO Score,Num Credit Cards,CARD INDEX,Has Chip,Cards Issued,Year PIN last Changed,Card on Dark Web,International,Online,Age_at_transaction,income_to_debt,datetime,day_of_week,PIN_changed_years_ago,is_weekend,timestamp,distances
0,0,0,2002,9,1,06:21,134.09,Swipe Transaction,La Verne,CA,91750,5300,,0,1966,91750,29278,59696,127613,787,5,0,1,2,2008,0,0,0,36,0.467789,2002-09-01 06:21:00,6,-6,1,1030861000.0,0.0
1,0,0,2002,9,1,06:42,38.48,Swipe Transaction,Monterey Park,CA,91754,5411,,0,1966,91750,29278,59696,127613,787,5,0,1,2,2008,0,0,0,36,0.467789,2002-09-01 06:42:00,6,-6,1,1030863000.0,33.540588
2,0,0,2002,9,2,06:22,120.34,Swipe Transaction,Monterey Park,CA,91754,5411,,0,1966,91750,29278,59696,127613,787,5,0,1,2,2008,0,0,0,36,0.467789,2002-09-02 06:22:00,0,-6,0,1030948000.0,33.540588
3,0,0,2002,9,2,17:45,128.95,Swipe Transaction,Monterey Park,CA,91754,5651,,0,1966,91750,29278,59696,127613,787,5,0,1,2,2008,0,0,0,36,0.467789,2002-09-02 17:45:00,0,-6,0,1030989000.0,33.540588
4,0,0,2002,9,3,06:23,104.71,Swipe Transaction,La Verne,CA,91750,5912,,0,1966,91750,29278,59696,127613,787,5,0,1,2,2008,0,0,0,36,0.467789,2002-09-03 06:23:00,1,-6,0,1031034000.0,0.0


In [16]:
merged_df['Is Fraud?'].value_counts()

Is Fraud?
No     99393
Yes      126
Name: count, dtype: int64

In [9]:
transactions_df.loc[transactions_df['Zip'].str.len() == 0, 'Merchant City'].value_counts()

Merchant City
ONLINE          2720821
Cancun            16977
Mexico City        8878
Rome               8730
Toronto            7783
                 ...   
Lome                  2
Nuku Alofa            2
Asuncion              1
Gaborone              1
South Tarawa          1
Name: count, Length: 193, dtype: int64

In [16]:
transactions_df.loc[transactions_df['Merchant State'].str.len() > 2, :]

Unnamed: 0,User,Card,Year,Month,Day,Time,Amount,Use Chip,Merchant City,Merchant State,Zip,MCC,Errors?,Is Fraud?
407,0,0,2003,2,24,13:02,7.52,Swipe Transaction,Zurich,Switzerland,,5411,,No
408,0,0,2003,2,25,06:46,36.97,Swipe Transaction,Zurich,Switzerland,,5411,,No
906,0,0,2004,7,22,06:32,38.26,Swipe Transaction,Tallinn,Estonia,,5912,,No
907,0,0,2004,7,23,20:05,75.38,Swipe Transaction,Tallinn,Estonia,,5812,,No
908,0,0,2004,7,25,13:08,16.20,Swipe Transaction,Tallinn,Estonia,,5541,,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24385846,1999,1,2019,6,26,07:43,54.40,Chip Transaction,Buenos Aires,Argentina,,4121,,No
24385847,1999,1,2019,6,26,11:38,241.45,Swipe Transaction,Buenos Aires,Argentina,,3640,,No
24385848,1999,1,2019,6,26,11:58,-103.00,Swipe Transaction,Buenos Aires,Argentina,,3640,,No
24385849,1999,1,2019,6,26,12:44,118.29,Chip Transaction,Buenos Aires,Argentina,,3359,,No


In [6]:
transactions_df.loc[(transactions_df['User'] < 500, 'Is Fraud?')].value_counts()

Is Fraud?
No     5920547
Yes       6847
Name: count, dtype: int64

In [7]:
toy_transactions = transactions_df.loc[transactions_df['User'] < 100]
toy_transactions['Is Fraud?'].value_counts()

Is Fraud?
No     1256923
Yes       1310
Name: count, dtype: int64

In [8]:
merge_step_1 = toy_transactions.merge(users_df, how='inner', on='User')
merged_df = merge_step_1.merge(cards_df, how='inner', left_on=['User', 'Card'], right_on=['User', 'CARD INDEX'])
merge_step_1.head()

Unnamed: 0,User,Card,Year,Month,Day,Time,Amount,Use Chip,Merchant Name,Merchant City,...,City,State,Zipcode,Latitude,Longitude,Per Capita Income - Zipcode,Yearly Income - Person,Total Debt,FICO Score,Num Credit Cards
0,0,0,2002,9,1,06:21,$134.09,Swipe Transaction,3527213246127876953,La Verne,...,La Verne,CA,91750,34.15,-117.76,$29278,$59696,$127613,787,5
1,0,0,2002,9,1,06:42,$38.48,Swipe Transaction,-727612092139916043,Monterey Park,...,La Verne,CA,91750,34.15,-117.76,$29278,$59696,$127613,787,5
2,0,0,2002,9,2,06:22,$120.34,Swipe Transaction,-727612092139916043,Monterey Park,...,La Verne,CA,91750,34.15,-117.76,$29278,$59696,$127613,787,5
3,0,0,2002,9,2,17:45,$128.95,Swipe Transaction,3414527459579106770,Monterey Park,...,La Verne,CA,91750,34.15,-117.76,$29278,$59696,$127613,787,5
4,0,0,2002,9,3,06:23,$104.71,Swipe Transaction,5817218446178736267,La Verne,...,La Verne,CA,91750,34.15,-117.76,$29278,$59696,$127613,787,5


In [17]:
# create income-to-debt
merged_df['income_to_debt'] = merged_df['Yearly Income - Person'] / (merged_df['Total Debt'] + 0.001)

In [27]:
def calculate_distance(zip1, zip2):
    if zip1:
        try:
            zip1 = str(int(zip1))
            zip2 = str(int(zip2))
        
            dist = pgeocode.GeoDistance('us')
            return dist.query_postal_code(zip1, zip2)
        except:
            return -1
    else:
        return -1

In [34]:
merged_df.loc[merged_df['Zipcode'] == 1532, ['Merchant City', 'City']]

Unnamed: 0,Merchant City,City
468879,Northborough,Northborough
468880,Northborough,Northborough
468881,Northborough,Northborough
468882,Northborough,Northborough
468883,Northborough,Northborough
...,...,...
505844,ONLINE,Northborough
505845,Northborough,Northborough
505846,Northborough,Northborough
505847,West Boylston,Northborough


In [28]:
merged_df['Distance'] = merged_df.apply(lambda row: calculate_distance(row['Zipcode'], row['Zip'], axis=1))

KeyError: 'Zipcode'

In [16]:
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1258233 entries, 0 to 1258232
Data columns (total 47 columns):
 #   Column                       Non-Null Count    Dtype  
---  ------                       --------------    -----  
 0   User                         1258233 non-null  int64  
 1   Card                         1258233 non-null  int64  
 2   Year                         1258233 non-null  int64  
 3   Month                        1258233 non-null  int64  
 4   Day                          1258233 non-null  int64  
 5   Time                         1258233 non-null  object 
 6   Amount                       1258233 non-null  object 
 7   Use Chip                     1258233 non-null  object 
 8   Merchant Name                1258233 non-null  int64  
 9   Merchant City                1258233 non-null  object 
 10  Merchant State               1075669 non-null  object 
 11  Zip                          1067603 non-null  float64
 12  MCC                          1258233 non-n

In [70]:
transactions_df.loc[transactions_df['Merchant State'].str.len() > 2, ['Merchant State', 'Merchant City', 'Zip']]

Unnamed: 0,Merchant State,Merchant City,Zip
407,Switzerland,Zurich,
408,Switzerland,Zurich,
906,Estonia,Tallinn,
907,Estonia,Tallinn,
908,Estonia,Tallinn,
...,...,...,...
24385846,Argentina,Buenos Aires,
24385847,Argentina,Buenos Aires,
24385848,Argentina,Buenos Aires,
24385849,Argentina,Buenos Aires,


In [None]:
transactions_df['Merchant State'].str.len() > 2

## Clean up Columns

Unsure how to deal with State or Zipcode. If we keep Zipcode, how do we deal with this as this number is a not a continuous number series?


An idea would be to calculate the distance between the user zipcode and the merchant zipcode?


### cards_df
Drop:
- Card Number
- Expires
- CVV
- Acct Open Date

### users_df
- Person
- Retirement Age
- Birth Year
- Gender
- Address
- Apartment
- City
- Latitude
- Longitude

### transactions_df
- Year
- Day
- Time
- Merchant State
  

In [9]:
transactions_df.head(5)

Unnamed: 0,User,Card,Year,Month,Day,Time,Amount,Use Chip,Merchant Name,Merchant City,Merchant State,Zip,MCC,Errors?,Is Fraud?
0,0,0,2002,9,1,06:21,$134.09,Swipe Transaction,3527213246127876953,La Verne,CA,91750.0,5300,,No
1,0,0,2002,9,1,06:42,$38.48,Swipe Transaction,-727612092139916043,Monterey Park,CA,91754.0,5411,,No
2,0,0,2002,9,2,06:22,$120.34,Swipe Transaction,-727612092139916043,Monterey Park,CA,91754.0,5411,,No
3,0,0,2002,9,2,17:45,$128.95,Swipe Transaction,3414527459579106770,Monterey Park,CA,91754.0,5651,,No
4,0,0,2002,9,3,06:23,$104.71,Swipe Transaction,5817218446178736267,La Verne,CA,91750.0,5912,,No


In [71]:
transactions_df['MCC'].nunique()

109

In [37]:
cards_df = cards_df.drop(columns=['Card Number',
                       'Expires',
                       'CVV',
                       'Acct Open Date',
                       ], axis=1)

In [39]:
users_df.head()

Unnamed: 0,Person,Current Age,Retirement Age,Birth Year,Birth Month,Gender,Address,Apartment,City,State,Zipcode,Latitude,Longitude,Per Capita Income - Zipcode,Yearly Income - Person,Total Debt,FICO Score,Num Credit Cards,User
0,Hazel Robinson,53,66,1966,11,Female,462 Rose Lane,,La Verne,CA,91750,34.15,-117.76,$29278,$59696,$127613,787,5,0
1,Sasha Sadr,53,68,1966,12,Female,3606 Federal Boulevard,,Little Neck,NY,11363,40.76,-73.74,$37891,$77254,$191349,701,5,1
2,Saanvi Lee,81,67,1938,11,Female,766 Third Drive,,West Covina,CA,91792,34.02,-117.89,$22681,$33483,$196,698,5,2
3,Everlee Clark,63,63,1957,1,Female,3 Madison Street,,New York,NY,10069,40.71,-73.99,$163145,$249925,$202328,722,4,3
4,Kyle Peterson,43,70,1976,9,Male,9620 Valley Stream Drive,,San Francisco,CA,94117,37.76,-122.44,$53797,$109687,$183855,675,1,4


In [28]:
transactions_df.loc[transactions_df['Year'] >= 2017].to_csv('data/test_set.csv')