In [14]:
from pathlib import Path
import pandas as pd
import numpy as np
import pgeocode

In [49]:
# Functions for data import

def add_leading_zero_to_zipcode(item):
    item_str = str(item)  # Ensure the item is a string
    item_str = item_str.replace('.0', '') # remove trailing '.0'
    if len(item_str) == 4:
        return '0' + item_str
    return item_str

def remove_dollar_and_convert(item):
    # Remove the dollar sign and convert to integer
    return np.int32(item.replace('$', ''))


In [54]:
# Import users from csv

users_path = Path.cwd() / 'data/sd254_users.csv'


users_columns_import = ['Birth Year', 
                        'Zipcode', 
                        'Per Capita Income - Zipcode',
                        'Yearly Income - Person', 
                        'Total Debt',
                        'FICO Score',
                        'Num Credit Cards']

user_converters = {'Zipcode': add_leading_zero_to_zipcode,
                   'Per Capita Income - Zipcode': remove_dollar_and_convert,
                   'Yearly Income - Person': remove_dollar_and_convert,
                   'Total Debt': remove_dollar_and_convert}

users_dtypes = {'Birth Year': np.uint16,
                 'FICO Score': np.uint16,
                 'Num Credit Cards': np.uint8}

users_df = pd.read_csv(users_path, 
                       usecols=users_columns_import,
                       converters=user_converters,
                       dtype=users_dtypes
                    )


users_df['User'] = users_df.index

users_df.head()


Unnamed: 0,Birth Year,Zipcode,Per Capita Income - Zipcode,Yearly Income - Person,Total Debt,FICO Score,Num Credit Cards,User
0,1966,91750,29278,59696,127613,787,5,0
1,1966,11363,37891,77254,191349,701,5,1
2,1938,91792,22681,33483,196,698,5,2
3,1957,10069,163145,249925,202328,722,4,3
4,1976,94117,53797,109687,183855,675,1,4


In [58]:
# import cards csv

cards_path = Path.cwd() / 'data/sd254_cards.csv'

cards_columns_import = ['User',	
                        'CARD INDEX',
                        'Has Chip',
                        'Cards Issued',
                        'Year PIN last Changed',
                        'Card on Dark Web'
                        ]

cards_dtypes = {'CARD INDEX': np.uint8,
                'Cards Issued': np.uint8,
                'Year PIN last Changed': np.uint16
                }

cards_df = pd.read_csv(cards_path,
                       usecols=cards_columns_import,
                       dtype=cards_dtypes,
                       )

display(cards_df.head())

Unnamed: 0,User,CARD INDEX,Has Chip,Cards Issued,Year PIN last Changed,Card on Dark Web
0,0,0,YES,2,2008,No
1,0,1,YES,2,2014,No
2,0,2,YES,2,2004,No
3,0,3,NO,1,2012,No
4,0,4,YES,1,2009,No


In [59]:
transactions_path = Path.cwd() / 'data/credit_card_transactions-ibm_v2.csv'

transactions_columns_import = ['User',
                               'Card',
                               ]

transactions_df = pd.read_csv(transactions_path)


In [60]:
transactions_df.head()

Unnamed: 0,User,Card,Year,Month,Day,Time,Amount,Use Chip,Merchant Name,Merchant City,Merchant State,Zip,MCC,Errors?,Is Fraud?
0,0,0,2002,9,1,06:21,$134.09,Swipe Transaction,3527213246127876953,La Verne,CA,91750.0,5300,,No
1,0,0,2002,9,1,06:42,$38.48,Swipe Transaction,-727612092139916043,Monterey Park,CA,91754.0,5411,,No
2,0,0,2002,9,2,06:22,$120.34,Swipe Transaction,-727612092139916043,Monterey Park,CA,91754.0,5411,,No
3,0,0,2002,9,2,17:45,$128.95,Swipe Transaction,3414527459579106770,Monterey Park,CA,91754.0,5651,,No
4,0,0,2002,9,3,06:23,$104.71,Swipe Transaction,5817218446178736267,La Verne,CA,91750.0,5912,,No


In [5]:
transactions_users_500_df = transactions_df.loc[transactions_df['User'] < 500]
transactions_test_df = transactions_df.loc[transactions_df['User'] < 5]

In [6]:
transactions_users_500_df.to_csv('data/transactions_users_500.csv', index=False)

In [12]:

# function to add a leading zero to a zipcode
def add_leading_zero_to_zipcode(item):
    new_zip = '0' + str(item)
    return new_zip


# convert the Zip column to a string then remove the trailing '.0'
transactions_test_df.loc[:, 'Zip'] = transactions_test_df['Zip'].astype(dtype='string').str.replace('.0', '')
users_df.loc[:, 'Zipcode'] = users_df['Zipcode'].astype(dtype='string')

# add a leading zero to zipcode if zipcode is length 4
transactions_test_df.loc[:, 'Zip'] = transactions_test_df.apply(
    lambda x: add_leading_zero_to_zipcode(x['Zip']) if len(str(x['Zip'])) == 4 else x['Zip'], 
    axis=1
)



['91750', '11363', '91792', '10069', '94117', '52803', '40299', '97214',
 '18969', '70510',
 ...
 '71008', '99336', '93618', '29579', '63069', '11520', '41051',  '7201',
 '17011',  '3054']
Length: 2000, dtype: string' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  users_df.loc[:, 'Zipcode'] = users_df['Zipcode'].astype(dtype='string')


In [11]:
users_df.loc[users_df['Zipcode'].str.len() == 4, 'Zipcode'].value_counts()

AttributeError: Can only use .str accessor with string values!

In [45]:
transactions_test_df.loc[transactions_test_df['Zip'].str.len() == 4, 'Zip'].value_counts()

Series([], Name: count, dtype: int64)

In [40]:
transactions_users_500_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5927394 entries, 0 to 5927393
Data columns (total 15 columns):
 #   Column          Dtype  
---  ------          -----  
 0   User            int64  
 1   Card            int64  
 2   Year            int64  
 3   Month           int64  
 4   Day             int64  
 5   Time            object 
 6   Amount          object 
 7   Use Chip        object 
 8   Merchant Name   int64  
 9   Merchant City   object 
 10  Merchant State  object 
 11  Zip             float64
 12  MCC             int64  
 13  Errors?         object 
 14  Is Fraud?       object 
dtypes: float64(1), int64(7), object(7)
memory usage: 723.6+ MB


In [6]:
transactions_df.loc[(transactions_df['User'] < 500, 'Is Fraud?')].value_counts()

Is Fraud?
No     5920547
Yes       6847
Name: count, dtype: int64

In [7]:
toy_transactions = transactions_df.loc[transactions_df['User'] < 100]
toy_transactions['Is Fraud?'].value_counts()

Is Fraud?
No     1256923
Yes       1310
Name: count, dtype: int64

In [8]:
merge_step_1 = toy_transactions.merge(users_df, how='inner', on='User')
merge_step_1.head()

Unnamed: 0,User,Card,Year,Month,Day,Time,Amount,Use Chip,Merchant Name,Merchant City,...,City,State,Zipcode,Latitude,Longitude,Per Capita Income - Zipcode,Yearly Income - Person,Total Debt,FICO Score,Num Credit Cards
0,0,0,2002,9,1,06:21,$134.09,Swipe Transaction,3527213246127876953,La Verne,...,La Verne,CA,91750,34.15,-117.76,$29278,$59696,$127613,787,5
1,0,0,2002,9,1,06:42,$38.48,Swipe Transaction,-727612092139916043,Monterey Park,...,La Verne,CA,91750,34.15,-117.76,$29278,$59696,$127613,787,5
2,0,0,2002,9,2,06:22,$120.34,Swipe Transaction,-727612092139916043,Monterey Park,...,La Verne,CA,91750,34.15,-117.76,$29278,$59696,$127613,787,5
3,0,0,2002,9,2,17:45,$128.95,Swipe Transaction,3414527459579106770,Monterey Park,...,La Verne,CA,91750,34.15,-117.76,$29278,$59696,$127613,787,5
4,0,0,2002,9,3,06:23,$104.71,Swipe Transaction,5817218446178736267,La Verne,...,La Verne,CA,91750,34.15,-117.76,$29278,$59696,$127613,787,5


In [9]:
merged_df = merge_step_1.merge(cards_df, how='inner', left_on=['User', 'Card'], right_on=['User', 'CARD INDEX'])
merged_df.head()

Unnamed: 0,User,Card,Year,Month,Day,Time,Amount,Use Chip,Merchant Name,Merchant City,...,Card Type,Card Number,Expires,CVV,Has Chip,Cards Issued,Credit Limit,Acct Open Date,Year PIN last Changed,Card on Dark Web
0,0,0,2002,9,1,06:21,$134.09,Swipe Transaction,3527213246127876953,La Verne,...,Debit,4344676511950444,12/2022,623,YES,2,$24295,09/2002,2008,No
1,0,0,2002,9,1,06:42,$38.48,Swipe Transaction,-727612092139916043,Monterey Park,...,Debit,4344676511950444,12/2022,623,YES,2,$24295,09/2002,2008,No
2,0,0,2002,9,2,06:22,$120.34,Swipe Transaction,-727612092139916043,Monterey Park,...,Debit,4344676511950444,12/2022,623,YES,2,$24295,09/2002,2008,No
3,0,0,2002,9,2,17:45,$128.95,Swipe Transaction,3414527459579106770,Monterey Park,...,Debit,4344676511950444,12/2022,623,YES,2,$24295,09/2002,2008,No
4,0,0,2002,9,3,06:23,$104.71,Swipe Transaction,5817218446178736267,La Verne,...,Debit,4344676511950444,12/2022,623,YES,2,$24295,09/2002,2008,No


In [15]:
# add column to define whether international
merged_df['International'] = merged_df['Merchant State'].str.len() > 2

# remove $ from columns
merged_df['Per Capita Income - Zipcode'] = merged_df['Per Capita Income - Zipcode'].str.replace('$', '').astype('int64')
merged_df['Yearly Income - Person'] = merged_df['Yearly Income - Person'].str.replace('$', '').astype('int64')
merged_df['Total Debt'] = merged_df['Total Debt'].str.replace('$', '').astype('int64')




                   

In [17]:
# create income-to-debt
merged_df['income_to_debt'] = merged_df['Yearly Income - Person'] / (merged_df['Total Debt'] + 0.001)

In [27]:
def calculate_distance(zip1, zip2):
    if zip1:
        try:
            zip1 = str(int(zip1))
            zip2 = str(int(zip2))
        
            dist = pgeocode.GeoDistance('us')
            return dist.query_postal_code(zip1, zip2)
        except:
            return -1
    else:
        return -1

In [34]:
merged_df.loc[merged_df['Zipcode'] == 1532, ['Merchant City', 'City']]

Unnamed: 0,Merchant City,City
468879,Northborough,Northborough
468880,Northborough,Northborough
468881,Northborough,Northborough
468882,Northborough,Northborough
468883,Northborough,Northborough
...,...,...
505844,ONLINE,Northborough
505845,Northborough,Northborough
505846,Northborough,Northborough
505847,West Boylston,Northborough


In [28]:
merged_df['Distance'] = merged_df.apply(lambda row: calculate_distance(row['Zipcode'], row['Zip'], axis=1))

KeyError: 'Zipcode'

In [16]:
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1258233 entries, 0 to 1258232
Data columns (total 47 columns):
 #   Column                       Non-Null Count    Dtype  
---  ------                       --------------    -----  
 0   User                         1258233 non-null  int64  
 1   Card                         1258233 non-null  int64  
 2   Year                         1258233 non-null  int64  
 3   Month                        1258233 non-null  int64  
 4   Day                          1258233 non-null  int64  
 5   Time                         1258233 non-null  object 
 6   Amount                       1258233 non-null  object 
 7   Use Chip                     1258233 non-null  object 
 8   Merchant Name                1258233 non-null  int64  
 9   Merchant City                1258233 non-null  object 
 10  Merchant State               1075669 non-null  object 
 11  Zip                          1067603 non-null  float64
 12  MCC                          1258233 non-n

In [70]:
transactions_df.loc[transactions_df['Merchant State'].str.len() > 2, ['Merchant State', 'Merchant City', 'Zip']]

Unnamed: 0,Merchant State,Merchant City,Zip
407,Switzerland,Zurich,
408,Switzerland,Zurich,
906,Estonia,Tallinn,
907,Estonia,Tallinn,
908,Estonia,Tallinn,
...,...,...,...
24385846,Argentina,Buenos Aires,
24385847,Argentina,Buenos Aires,
24385848,Argentina,Buenos Aires,
24385849,Argentina,Buenos Aires,


In [None]:
transactions_df['Merchant State'].str.len() > 2

## Clean up Columns

Unsure how to deal with State or Zipcode. If we keep Zipcode, how do we deal with this as this number is a not a continuous number series?


An idea would be to calculate the distance between the user zipcode and the merchant zipcode?


### cards_df
Drop:
- Card Number
- Expires
- CVV
- Acct Open Date

### users_df
- Person
- Retirement Age
- Birth Year
- Gender
- Address
- Apartment
- City
- Latitude
- Longitude

### transactions_df
- Year
- Day
- Time
- Merchant State
  

In [9]:
transactions_df.head(5)

Unnamed: 0,User,Card,Year,Month,Day,Time,Amount,Use Chip,Merchant Name,Merchant City,Merchant State,Zip,MCC,Errors?,Is Fraud?
0,0,0,2002,9,1,06:21,$134.09,Swipe Transaction,3527213246127876953,La Verne,CA,91750.0,5300,,No
1,0,0,2002,9,1,06:42,$38.48,Swipe Transaction,-727612092139916043,Monterey Park,CA,91754.0,5411,,No
2,0,0,2002,9,2,06:22,$120.34,Swipe Transaction,-727612092139916043,Monterey Park,CA,91754.0,5411,,No
3,0,0,2002,9,2,17:45,$128.95,Swipe Transaction,3414527459579106770,Monterey Park,CA,91754.0,5651,,No
4,0,0,2002,9,3,06:23,$104.71,Swipe Transaction,5817218446178736267,La Verne,CA,91750.0,5912,,No


In [71]:
transactions_df['MCC'].nunique()

109

In [37]:
cards_df = cards_df.drop(columns=['Card Number',
                       'Expires',
                       'CVV',
                       'Acct Open Date',
                       ], axis=1)

In [39]:
users_df.head()

Unnamed: 0,Person,Current Age,Retirement Age,Birth Year,Birth Month,Gender,Address,Apartment,City,State,Zipcode,Latitude,Longitude,Per Capita Income - Zipcode,Yearly Income - Person,Total Debt,FICO Score,Num Credit Cards,User
0,Hazel Robinson,53,66,1966,11,Female,462 Rose Lane,,La Verne,CA,91750,34.15,-117.76,$29278,$59696,$127613,787,5,0
1,Sasha Sadr,53,68,1966,12,Female,3606 Federal Boulevard,,Little Neck,NY,11363,40.76,-73.74,$37891,$77254,$191349,701,5,1
2,Saanvi Lee,81,67,1938,11,Female,766 Third Drive,,West Covina,CA,91792,34.02,-117.89,$22681,$33483,$196,698,5,2
3,Everlee Clark,63,63,1957,1,Female,3 Madison Street,,New York,NY,10069,40.71,-73.99,$163145,$249925,$202328,722,4,3
4,Kyle Peterson,43,70,1976,9,Male,9620 Valley Stream Drive,,San Francisco,CA,94117,37.76,-122.44,$53797,$109687,$183855,675,1,4


In [28]:
transactions_df.loc[transactions_df['Year'] >= 2017].to_csv('data/test_set.csv')