## 1. Combine datasets and downsample

In [95]:
import pandas as pd
from collections import Counter

from statistics import mean 
from statistics import pvariance
from statistics import pstdev

In [96]:
# Recreate training and test set
origTrain = pd.read_csv('fraudTrain.csv')
origTrain.drop(origTrain.columns[0], axis=1, inplace=True)
origTest = pd.read_csv('fraudTest.csv')
origTest.drop(origTest.columns[0], axis=1, inplace=True)

combined = pd.concat([origTrain,origTest])

Among all observations, we select the full history of only 50 cards as new sample:

In [97]:
select_card = list(Counter(combined.cc_num).keys())[:50]
sample_df = combined[combined['cc_num'].isin(select_card)]

Number of instances in sample set:

In [98]:
len(sample_df)

119884

Number of positive cases (fraud) in sample set:

In [99]:
len(sample_df[sample_df.is_fraud==1])

504

In [100]:
#sample_df.to_csv('sample.csv')

## 2. Feature Engineering

### 2.1 Personal purchase history related

Two columns are created here: 
    - beyond_1std (whether the amount is higher than average + std)
    - beyond_2std (whether the amount is higher than average + 2std)
Average and std are calculated from all transactions made by this card

In [107]:
def get_hist(input_cc_num):
    hist = list(sample_df[sample_df.cc_num==input_cc_num]['amt'])
    avg = mean(hist)
    std = pstdev(hist)
    one_std = avg + std
    two_std = avg + 2*std
    
    return one_std, two_std

dict_one_std = {}
dict_two_std = {}
for card in select_card:
    one, two = get_hist(card)
    dict_one_std[card] = one
    dict_two_std[card] = two

In [109]:
one_std, two_std = [], []

for i in range(len(sample_df)):
    cc_num = sample_df.iloc[i]['cc_num']
    amount = sample_df.iloc[i]['amt']
    one_std.append(amount >= dict_one_std[cc_num])
    two_std.append(amount >= dict_two_std[cc_num])

sample_df['beyond_1std'] = one_std
sample_df['beyond_2std'] = two_std

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.


### 2.2 Age

One column is created by DOB column. We use transaction year to subtract the year of person's birth year, to get current age

In [129]:
import datetime

def get_age(trans_date_trans_time, dob):
    trans_year = datetime.datetime.strptime(trans_date_trans_time, '%Y-%m-%d %H:%M:%S').year
    birth_year = datetime.datetime.strptime(dob, '%Y-%m-%d').year
    age = trans_year - birth_year
    return age

In [130]:
age_lst = []

for i in range(len(sample_df)):
    year1 = sample_df.iloc[i]['trans_date_trans_time']
    year2 = sample_df.iloc[i]['dob']
    age_lst.append(get_age(year1, year2))
    
sample_df['age'] = age_lst

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


### 2.3 Distance

One column is created by calculating the distance from merchant location and card location

In [133]:
from math import sin, cos, sqrt, atan2, radians

def calculate_distance(lat1, lon1, lat2, lon2):
    # approximate radius of earth in km
    R = 6373.0
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    
    a = sin(dlat / 2)**2 + cos(lat1) * cos(lat2) * sin(dlon / 2)**2
    c = 2 * atan2(sqrt(a), sqrt(1 - a))

    distance = R * c
    return distance

In [134]:
distance_lst = []

for i in range(len(sample_df)):
    lat1 = sample_df.iloc[i]['lat']
    lon1 = sample_df.iloc[i]['long']
    lat2 = sample_df.iloc[i]['merch_lat']
    lon2 = sample_df.iloc[i]['merch_long']
    
    dis = calculate_distance(lat1, lon1, lat2, lon2)
    distance_lst.append(dis)
    
sample_df['distance'] = distance_lst

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if sys.path[0] == '':


### 2.4 Transaction time

A rough estimate of transaction time: 
    - latenight: 12am - 6am
    - morning: 6am - 12pm
    - afternoon: 12pm - 6pm
    - evening: 6pm - 12am
    
4 new columns are created here

In [3]:
def convert_time(trans_date_trans_time):
    hour = datetime.datetime.strptime(trans_date_trans_time, '%Y-%m-%d %H:%M:%S').hour
    if hour <= 5:
        return 'trans_latenight'
    elif hour >5 and hour <= 11:
        return 'trans_morning'
    elif hour>11 and hour <= 17:
        return 'trans_afternoon'
    else:
        return 'trans_evening'

In [148]:
trans_time = []
for i in range(len(sample_df)):
    trans_date_trans_time = sample_df.iloc[i]['trans_date_trans_time']
    trans_time.append(convert_time(trans_date_trans_time))
    
sample_df[['trans_latenight','trans_afternoon','trans_evening', 'trans_morning']] = pd.get_dummies(trans_time)[['trans_latenight','trans_afternoon','trans_evening', 'trans_morning']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


### 2.5 Number of transaction within a day and an hour

In [198]:
def num_trans(given_time, card, time_range):
    # Get all transaction times of this card
    count = 0
    hist = list(sample_df[sample_df.cc_num==card]['trans_date_trans_time'])
    given_time = datetime.datetime.strptime(given_time, '%Y-%m-%d %H:%M:%S')
    
    if time_range == 'day':
        time_gap = 86400 # number of seconds in a day
    if time_range == 'hour':
        time_gap = 3600 # number of seconds in an hour
        
    for temp_time in hist:
        temp_time = datetime.datetime.strptime(temp_time, '%Y-%m-%d %H:%M:%S')
        if (temp_time < given_time) and ((given_time-temp_time).days < 1) and ((given_time-temp_time).seconds < time_gap):
            count += 1
            
    return count

In [200]:
cardTry = 630423337322
num_trans('2019-01-01 22:32:43', cardTry, 'hour')

0

In [202]:
trans_last24hr = []
trans_last1hr = []

for i in range(len(sample_df)):
    given_time = sample_df.iloc[i]['trans_date_trans_time']
    card = sample_df.iloc[i]['cc_num']
    
    trans_last24hr.append(num_trans(given_time, card, 'day'))
    trans_last1hr.append(num_trans(given_time, card, 'hour'))
    
sample_df['trans_last24hr'] = trans_last24hr
sample_df['trans_last1hr'] = trans_last1hr

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if sys.path[0] == '':


In [205]:
#sample_df.to_csv("sample_done.csv")