In [106]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

from sklearn.metrics import make_scorer, accuracy_score, confusion_matrix
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE

%matplotlib inline

# read dataset
df = pd.read_csv("../data_for_student_case.csv")

# set cvc
df['cvcresponsecode'].ix[df['cvcresponsecode']>3] = 3

# change currency
df['amount'].ix[df['currencycode'] == 'MXN'] = df['amount'].ix[df['currencycode'] == 'MXN'] * 0.052131
df['amount'].ix[df['currencycode'] == 'AUD'] = df['amount'].ix[df['currencycode'] == 'AUD'] * 0.7349
df['amount'].ix[df['currencycode'] == 'NZD'] = df['amount'].ix[df['currencycode'] == 'NZD'] * 0.68966
df['amount'].ix[df['currencycode'] == 'GBP'] = df['amount'].ix[df['currencycode'] == 'GBP'] * 1.292625
df['amount'].ix[df['currencycode'] == 'SEK'] = df['amount'].ix[df['currencycode'] == 'SEK'] * 0.112851

# change currency code to make it comparable
df['currencycode'].ix[df['currencycode'] == 'MXN'] = 'MX'
df['currencycode'].ix[df['currencycode'] == 'AUD'] = 'AU'
df['currencycode'].ix[df['currencycode'] == 'NZD'] = 'NZ'
df['currencycode'].ix[df['currencycode'] == 'GBP'] = 'GB'
df['currencycode'].ix[df['currencycode'] == 'SEK'] = 'SE'

# deal with 'nan'
df['issuercountrycode'].fillna('other',inplace=True)
df['shoppercountrycode'].fillna('other',inplace=True)
df['bin'].fillna(1,inplace=True)

# drop 'Refused'
df = df[df['simple_journal'] != 'Refused']
df['simple_journal'].ix[df['simple_journal'] == 'Settled'] = 1
df['simple_journal'].ix[df['simple_journal'] == 'Chargeback'] = 0

# verificationcodesupplied True:1, False:0, unknown:2
df['cardverificationcodesupplied'].ix[df['cardverificationcodesupplied'] == True] = 1
df['cardverificationcodesupplied'].ix[df['cardverificationcodesupplied'] == False] = 0
df['cardverificationcodesupplied'].fillna(2,inplace=True)

# extract weekday and hour from creationdate
df['creationdate'] = pd.to_datetime(df['creationdate'])
df['cweekday'] = df['creationdate'].dt.weekday_name
df['chour'] = df['creationdate'].dt.hour
df['chour'].apply(pd.to_numeric)

# create issue_shopper
df['issue_shopper'] = df['issuercountrycode'] == df['shoppercountrycode']
df['issue_shopper'].ix[df['issue_shopper'] == True] = 1.0
df['issue_shopper'].ix[df['issue_shopper'] == False] = 0.0

# create issue_currency
df['issue_currency'] = df['issuercountrycode'] == df['currencycode']
df['issue_currency'].ix[df['issue_currency'] == True] = 1.0
df['issue_currency'].ix[df['issue_currency'] == False] = 0.0

# create shopper_currency
df['shopper_currency'] = df['shoppercountrycode'] == df['currencycode']
df['shopper_currency'].ix[df['shopper_currency'] == True] = 1.0
df['shopper_currency'].ix[df['shopper_currency'] == False] = 0.0

# drop features
df = df.drop(['txid', 'bookingdate', 'creationdate', 'mail_id', 'ip_id', 'card_id'], axis=1)

df.sample(5)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


Unnamed: 0,issuercountrycode,txvariantcode,bin,amount,currencycode,shoppercountrycode,shopperinteraction,simple_journal,cardverificationcodesupplied,cvcresponsecode,accountcode,cweekday,chour,issue_shopper,issue_currency,shopper_currency
272614,SE,mcdebit,554501.0,7493.3064,SE,SE,Ecommerce,1,1,1,SwedenAccount,Monday,11,1.0,1.0,1.0
83123,GB,visadebit,465942.0,8421.451875,GB,GB,Ecommerce,1,1,1,UKAccount,Tuesday,22,1.0,1.0,1.0
182443,GB,mccredit,543460.0,10205.274375,GB,GB,Ecommerce,1,1,1,UKAccount,Monday,17,1.0,1.0,1.0
21759,MX,visaclassic,491871.0,9894.4638,MX,MX,Ecommerce,1,1,0,MexicoAccount,Wednesday,21,1.0,1.0,1.0
196734,GB,visadebit,465859.0,2837.311875,GB,GB,Ecommerce,1,2,1,UKAccount,Thursday,23,1.0,1.0,1.0


#### empirical bayes method  
We thought 'issuercountrycode', 'shoppercountrycode' and 'bin' are high-cardinality categorical features. They will be transformed with empirical bayes method. 

In [117]:
def sigmoid(n, k, f):
    return 1 / (1 + np.exp((-1) * (n - k) / f))

def empirical_bayes(df, column):
    
    prior_bayes = 1 - sum(df['simple_journal'])/len(df)
    feature_set = set([])
    dic = {}
    
    # parameters
    k = len(df) / df[column].nunique()
    f = 1.0
    
    for feature in df[column]:
        if feature not in feature_set:
            feature_set.add(feature)
            df_pos = df[df[column] == feature]
            n = len(df_pos)
            alpha = 1 / (1 + np.exp((k - n) / f))
            dic[feature] = alpha * len(df_pos[df_pos['simple_journal'] == 0])/len(df_pos) + (1 - alpha) * prior_bayes
            
    return dic

def eb_feature(df, feature):
    dic = empirical_bayes(df, feature)
    li = []
    for i in df[feature]:
        li.append(dic[i])
    
    df[feature + '_eb'] = li
    df = df.drop([feature], axis=1)
    
    return df, dic

#### encoding categorical features with one-hot encoding

In [108]:
dummy_features = ['txvariantcode', 'currencycode', 'shopperinteraction', 'cardverificationcodesupplied', 'cvcresponsecode', 'accountcode', 'cweekday']

for feature in dummy_features:

    just_dummies = pd.get_dummies(df[feature],prefix=feature+'_')

    df = pd.concat([df, just_dummies], axis=1)      
    df.drop([feature], inplace=True, axis=1)

df = df.reset_index(drop=True)
    
df.sample(5)

Unnamed: 0,issuercountrycode,bin,amount,shoppercountrycode,simple_journal,chour,issue_shopper,issue_currency,shopper_currency,txvariantcode__electron,...,accountcode__MexicoAccount,accountcode__SwedenAccount,accountcode__UKAccount,cweekday__Friday,cweekday__Monday,cweekday__Saturday,cweekday__Sunday,cweekday__Thursday,cweekday__Tuesday,cweekday__Wednesday
113644,GB,476367.0,7238.7,GB,1,15,1.0,1.0,1.0,0,...,0,0,1,0,0,0,0,1,0,0
61021,GB,475129.0,7872.08625,GB,1,0,1.0,1.0,1.0,0,...,0,0,1,0,0,0,0,0,1,0
13729,AU,519307.0,2829.365,AU,1,11,1.0,1.0,1.0,0,...,0,0,0,0,0,0,0,1,0,0
123902,GB,446291.0,16416.3375,GB,1,11,1.0,1.0,1.0,0,...,0,0,1,0,0,0,0,1,0,0
67435,GB,476367.0,7102.974375,GB,1,23,1.0,1.0,1.0,0,...,0,0,1,0,0,0,1,0,0,0


#### train classifier

In [109]:
def run_kfold(clf, df, sampling_ratio):
    kf = KFold(n_splits=10, shuffle=True)
    outcomes = np.zeros([2,2])
    fold = 0
    for train_index, test_index in kf.split(df):
        fold += 1
        
        # split train & test set
        train, test = df.iloc[train_index].copy(), df.iloc[test_index].copy()
                
        # empirical bayes for high-cardinality category
        train_prior = 1 - sum(train['simple_journal'])/len(train)
        train, dic1 = eb_feature(train,'issuercountrycode')
        train, dic2 = eb_feature(train,'shoppercountrycode')
        train, dic3 = eb_feature(train,'bin')
                
        # use the same dic to also process test set
        li = []
        for i in test['issuercountrycode']:
            # if not found, then just use prior probability
            try:
                li.append(dic1[i])
            except KeyError:
                li.append(train_prior)
    
        test['issuercountrycode_eb'] = li
        test = test.drop(['issuercountrycode'], axis=1)
        
        li = []
        for i in test['shoppercountrycode']:
            try:
                li.append(dic2[i])
            except KeyError:
                li.append(train_prior)
    
        test['shoppercountrycode_eb'] = li
        test = test.drop(['shoppercountrycode'], axis=1)
        
        li = []
        for i in test['bin']:
            try:
                li.append(dic3[i])
            except KeyError:
                li.append(train_prior)
    
        test['bin_eb'] = li
        test = test.drop(['bin'], axis=1)
                        
        # split x, y
        X_train = train.drop(['simple_journal'], axis=1).values
        X_test = test.drop(['simple_journal'], axis=1).values
        y_train = list(train['simple_journal'].values)
        y_test = list(test['simple_journal'].values)
        
        # undersampling
        rus = RandomUnderSampler(ratio=sampling_ratio)
        X_train_rus, y_train_rus = rus.fit_sample(X_train, y_train)
        
        # train classifier
        clf.fit(X_train_rus, y_train_rus)
        predictions = clf.predict(X_test)
        c_matrix = confusion_matrix(y_test, predictions)
        outcomes = outcomes + c_matrix
        
        print("Fold:",fold) 
        print(np.size(X_train,0), np.size(X_train_rus,0), np.size(X_test,0))
        print(c_matrix)
        print('precision:',c_matrix[0,0]/(c_matrix[0,1]+c_matrix[1,0]),'recall:',c_matrix[0,0]/(c_matrix[0,0]+c_matrix[0,1]))
    print('average:')
    np.set_printoptions(suppress=True)
    print(outcomes / 10)
    print('precision:',outcomes[0,0]/(outcomes[0,1]+outcomes[1,0]),'recall:',outcomes[0,0]/(outcomes[0,0]+outcomes[0,1]))

#### black-box

In [118]:
rf = RandomForestClassifier()
run_kfold(rf, df, 0.7)



Fold: 1
213332 748 23704
[[   35     2]
 [ 2671 20996]]
precision: 0.0130939019828 recall: 0.945945945946




Fold: 2
213332 765 23704
[[   24     6]
 [ 2437 21237]]
precision: 0.00982398690135 recall: 0.8




Fold: 3
213332 757 23704
[[   30     3]
 [ 2144 21527]]
precision: 0.0139729855612 recall: 0.909090909091




Fold: 4
213332 752 23704
[[   32     3]
 [ 2600 21069]]
precision: 0.0122935074914 recall: 0.914285714286




Fold: 5
213332 733 23704
[[   39     4]
 [ 2654 21007]]
precision: 0.0146726862302 recall: 0.906976744186




Fold: 6
213332 735 23704
[[   36     6]
 [ 2767 20895]]
precision: 0.0129823296069 recall: 0.857142857143




Fold: 7
213333 765 23703
[[   28     2]
 [ 2594 21079]]
precision: 0.0107858243451 recall: 0.933333333333




Fold: 8
213333 765 23703
[[   26     4]
 [ 2570 21103]]
precision: 0.010101010101 recall: 0.866666666667




Fold: 9
213333 762 23703
[[   27     4]
 [ 2455 21217]]
precision: 0.0109800732005 recall: 0.870967741935




Fold: 10
213333 755 23703
[[   29     5]
 [ 2256 21413]]
precision: 0.0128261831048 recall: 0.852941176471
average:
[[    30.6      3.9]
 [  2514.8  21154.3]]
precision: 0.0121491245484 recall: 0.886956521739


#### white-box

In [102]:
dt = DecisionTreeClassifier()
run_kfold(dt, df, 0.7)

Fold: 1
213332 735 23704
[[   24    18]
 [ 2124 21538]]
precision: 0.0112044817927 recall: 0.571428571429
Fold: 2
213332 750 23704
[[   26    10]
 [ 2109 21559]]
precision: 0.0122699386503 recall: 0.722222222222
Fold: 3
213332 779 23704
[[   17     7]
 [ 2001 21679]]
precision: 0.00846613545817 recall: 0.708333333333
Fold: 4
213332 769 23704
[[   24     4]
 [ 2044 21632]]
precision: 0.01171875 recall: 0.857142857143
Fold: 5
213332 762 23704
[[   23     8]
 [ 2246 21427]]
precision: 0.0102040816327 recall: 0.741935483871
Fold: 6
213332 762 23704
[[   23     8]
 [ 2533 21140]]
precision: 0.0090515545061 recall: 0.741935483871
Fold: 7
213333 723 23703
[[   33    14]
 [ 1759 21897]]
precision: 0.0186125211506 recall: 0.702127659574
Fold: 8
213333 760 23703
[[   24     8]
 [ 2560 21111]]
precision: 0.00934579439252 recall: 0.75
Fold: 9
213333 752 23703
[[   27     8]
 [ 1836 21832]]
precision: 0.0146420824295 recall: 0.771428571429
Fold: 10
213333 743 23703
[[   33     6]
 [ 2595 21069]]
pr

#### Plot the tree  
Here we train with all the data and there is no testing in this part. 

In [104]:
rf = DecisionTreeClassifier()
rus = RandomUnderSampler(ratio=0.7)
df_all = df.copy()
# eb process
df_all, _ = eb_feature(df_all, 'issuercountrycode')
df_all, _ = eb_feature(df_all, 'shoppercountrycode')
df_all, _ = eb_feature(df_all, 'bin')

y_all = np.asarray(df_all['simple_journal'], dtype="|S6")
X_all = df_all.drop(['simple_journal'], axis=1).values
X_rus, y_rus = rus.fit_sample(X_all, y_all)
rf.fit(X_rus, y_rus)

tree.export_graphviz(rf, out_file = 'tree.dot', feature_names = list(df_all))
df_all.sample(5)

Unnamed: 0,amount,simple_journal,chour,issue_shopper,issue_currency,shopper_currency,txvariantcode__electron,txvariantcode__mc,txvariantcode__mccredit,txvariantcode__mcdebit,...,cweekday__Friday,cweekday__Monday,cweekday__Saturday,cweekday__Sunday,cweekday__Thursday,cweekday__Tuesday,cweekday__Wednesday,issuercountrycode_eb,shoppercountrycode_eb,bin_eb
58225,3742.149375,1,18,1.0,1.0,1.0,0,0,1,0,...,0,0,0,1,0,0,0,0.000186,0.000183,4.330617e-17
14638,12346.32,1,13,1.0,1.0,1.0,0,0,0,0,...,0,1,0,0,0,0,0,0.005112,0.005088,5.59129e-11
9591,8857.0569,1,0,1.0,1.0,1.0,0,0,0,0,...,0,0,0,0,0,0,1,0.015397,0.015419,0.000897465
200405,9558.961875,1,17,1.0,1.0,1.0,0,0,0,0,...,0,0,0,0,1,0,0,0.000186,0.000183,0.0
46588,16804.125,1,1,1.0,1.0,1.0,0,0,0,0,...,0,1,0,0,0,0,0,0.000186,0.000183,8.425055e-11
