In [49]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

from sklearn.metrics import make_scorer, accuracy_score, confusion_matrix
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE

%matplotlib inline

# read dataset
df = pd.read_csv("../data_for_student_case.csv")

# set cvc
df['cvcresponsecode'].ix[df['cvcresponsecode']>3] = 3

# change currency
df['amount'].ix[df['currencycode'] == 'MXN'] = df['amount'].ix[df['currencycode'] == 'MXN'] * 0.052131
df['amount'].ix[df['currencycode'] == 'AUD'] = df['amount'].ix[df['currencycode'] == 'AUD'] * 0.7349
df['amount'].ix[df['currencycode'] == 'NZD'] = df['amount'].ix[df['currencycode'] == 'NZD'] * 0.68966
df['amount'].ix[df['currencycode'] == 'GBP'] = df['amount'].ix[df['currencycode'] == 'GBP'] * 1.292625
df['amount'].ix[df['currencycode'] == 'SEK'] = df['amount'].ix[df['currencycode'] == 'SEK'] * 0.112851

# change currency code to make it comparable
df['currencycode'].ix[df['currencycode'] == 'MXN'] = 'MX'
df['currencycode'].ix[df['currencycode'] == 'AUD'] = 'AU'
df['currencycode'].ix[df['currencycode'] == 'NZD'] = 'NZ'
df['currencycode'].ix[df['currencycode'] == 'GBP'] = 'GB'
df['currencycode'].ix[df['currencycode'] == 'SEK'] = 'SE'

# deal with 'nan'
df['issuercountrycode'].fillna('other',inplace=True)
df['shoppercountrycode'].fillna('other',inplace=True)
df['bin'].fillna(1,inplace=True)

# drop 'Refused'
df = df[df['simple_journal'] != 'Refused']
df['simple_journal'].ix[df['simple_journal'] == 'Settled'] = 1
df['simple_journal'].ix[df['simple_journal'] == 'Chargeback'] = 0

# verificationcodesupplied True:1, False:0, unknown:2
df['cardverificationcodesupplied'].ix[df['cardverificationcodesupplied'] == True] = 1
df['cardverificationcodesupplied'].ix[df['cardverificationcodesupplied'] == False] = 0
df['cardverificationcodesupplied'].fillna(2,inplace=True)

# extract weekday and hour from creationdate
df['creationdate'] = pd.to_datetime(df['creationdate'])
df['cweekday'] = df['creationdate'].dt.weekday_name
df['chour'] = df['creationdate'].dt.hour
df['chour'].apply(pd.to_numeric)

# create issue_shopper
df['issue_shopper'] = df['issuercountrycode'] == df['shoppercountrycode']
df['issue_shopper'].ix[df['issue_shopper'] == True] = 1.0
df['issue_shopper'].ix[df['issue_shopper'] == False] = 0.0

# create issue_currency
df['issue_currency'] = df['issuercountrycode'] == df['currencycode']
df['issue_currency'].ix[df['issue_currency'] == True] = 1.0
df['issue_currency'].ix[df['issue_currency'] == False] = 0.0

# create shopper_currency
df['shopper_currency'] = df['shoppercountrycode'] == df['currencycode']
df['shopper_currency'].ix[df['shopper_currency'] == True] = 1.0
df['shopper_currency'].ix[df['shopper_currency'] == False] = 0.0

# drop features
df = df.drop(['txid', 'bookingdate', 'creationdate', 'ip_id', 'mail_id', 'card_id'], axis=1)
# df = df.drop(['txid', 'bookingdate', 'creationdate'], axis=1)

df.sample(5)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


Unnamed: 0,issuercountrycode,txvariantcode,bin,amount,currencycode,shoppercountrycode,shopperinteraction,simple_journal,cardverificationcodesupplied,cvcresponsecode,accountcode,cweekday,chour,issue_shopper,issue_currency,shopper_currency
190999,GB,visadebit,475129.0,15382.2375,GB,GB,Ecommerce,1,1,1,UKAccount,Monday,12,1.0,1.0,1.0
169485,GB,visadebit,475128.0,8660.5875,GB,GB,Ecommerce,1,1,1,UKAccount,Saturday,0,1.0,1.0,1.0
21991,MX,mccredit,557907.0,6250.5069,MX,MX,Ecommerce,1,1,0,MexicoAccount,Saturday,20,1.0,1.0,1.0
289556,SE,visadebit,453904.0,3938.4999,SE,SE,Ecommerce,1,1,1,SwedenAccount,Friday,20,1.0,1.0,1.0
221024,GB,visadebit,492181.0,7872.08625,GB,GB,Ecommerce,1,1,1,UKAccount,Wednesday,22,1.0,1.0,1.0


#### empirical bayes method  
We thought 'issuercountrycode', 'shoppercountrycode' and 'bin' are high-cardinality categorical features. They will be transformed with empirical bayes method. 

In [71]:
def sigmoid(n, k, f):
    return 1 / (1 + np.exp((-1) * (n - k) / f))

def empirical_bayes(df, column, k, f):
    
    prior_bayes = 1 - sum(df['simple_journal'])/len(df)
    feature_set = set([])
    dic = {}
    
    # parameters
    # k = len(df) / df[column].nunique()  # take average
    # f = 1.0
    
    for feature in df[column]:
        if feature not in feature_set:
            feature_set.add(feature)
            df_pos = df[df[column] == feature]
            n = len(df_pos)
            alpha = 1 / (1 + np.exp((k - n) / f))
            dic[feature] = alpha * len(df_pos[df_pos['simple_journal'] == 0])/len(df_pos) + (1 - alpha) * prior_bayes
            
    return dic

def eb_feature(df, feature, k, f):
    dic = empirical_bayes(df, feature, k, f)
    li = []
    for i in df[feature]:
        li.append(dic[i])
    
    df[feature + '_eb'] = li
    df = df.drop([feature], axis=1)
    
    return df, dic

def eb_testset(test, feature, dic, train_prior):
    li = []
    for i in test[feature]:
        # if not found, then just use prior probability
        try:
            li.append(dic[i])
        except KeyError:
            li.append(train_prior)
    test[feature + '_eb'] = li
    test = test.drop([feature], axis=1)
    return test

#### encoding categorical features with one-hot encoding

In [51]:
dummy_features = ['txvariantcode', 'currencycode', 'shopperinteraction', 'cardverificationcodesupplied', 'cvcresponsecode', 'accountcode', 'cweekday']

for feature in dummy_features:

    just_dummies = pd.get_dummies(df[feature],prefix=feature+'_')

    df = pd.concat([df, just_dummies], axis=1)      
    df.drop([feature], inplace=True, axis=1)

df = df.reset_index(drop=True)
    
df.sample(5)

Unnamed: 0,issuercountrycode,bin,amount,shoppercountrycode,simple_journal,chour,issue_shopper,issue_currency,shopper_currency,txvariantcode__electron,...,accountcode__MexicoAccount,accountcode__SwedenAccount,accountcode__UKAccount,cweekday__Friday,cweekday__Monday,cweekday__Saturday,cweekday__Sunday,cweekday__Thursday,cweekday__Tuesday,cweekday__Wednesday
186277,IT,529911.0,9591.2775,GB,1,12,0.0,0.0,1.0,0,...,0,0,1,0,0,0,0,1,0,0
185107,GB,492913.0,4513.8465,GB,1,9,1.0,1.0,1.0,0,...,0,0,1,0,0,0,0,1,0,0
222700,SE,522660.0,7324.0299,SE,1,20,1.0,1.0,1.0,0,...,0,1,0,0,1,0,0,0,0,0
141713,GB,450875.0,2927.795625,GB,1,12,1.0,1.0,1.0,0,...,0,0,1,0,1,0,0,0,0,0
121330,GB,492181.0,7361.499375,GB,1,17,1.0,1.0,1.0,0,...,0,0,1,0,0,0,0,0,1,0


#### train classifier

In [99]:
def run_kfold(clf, df, sampling_ratio):
    kf = KFold(n_splits=10, shuffle=True)
    outcomes = np.zeros([2,2])
    fold = 0
    for train_index, test_index in kf.split(df):
        fold += 1
        
        # split train & test set
        train, test = df.iloc[train_index].copy(), df.iloc[test_index].copy()
                
        # empirical bayes for high-cardinality category
        train_prior = 1 - sum(train['simple_journal'])/len(train)
        train, dic1 = eb_feature(train,'issuercountrycode', 10, 100)
        train, dic2 = eb_feature(train,'shoppercountrycode', 10, 100)
        train, dic3 = eb_feature(train,'bin', 100, 100)
        #train, dic4 = eb_feature(train,'mail_id', 5)
        #train, dic5 = eb_feature(train,'card_id', 5)
        #train, dic6 = eb_feature(train,'ip_id', 5)
                
        # use the same dic to also process test set
        test = eb_testset(test, 'issuercountrycode', dic1, train_prior)
        test = eb_testset(test, 'shoppercountrycode', dic2, train_prior)
        test = eb_testset(test, 'bin', dic3, train_prior)
        #test = eb_testset(test, 'mail_id', dic4, train_prior)
        #test = eb_testset(test, 'card_id', dic5, train_prior)
        #test = eb_testset(test, 'ip_id', dic6, train_prior)
        
        # split x, y
        X_train = train.drop(['simple_journal'], axis=1).values
        X_test = test.drop(['simple_journal'], axis=1).values
        y_train = list(train['simple_journal'].values)
        y_test = list(test['simple_journal'].values)
        
        # undersampling
        rus = RandomUnderSampler(ratio=sampling_ratio)
        X_train_rus, y_train_rus = rus.fit_sample(X_train, y_train)
        #smote = SMOTE(ratio=sampling_ratio)
        #X_train_rus, y_train_rus = smote.fit_sample(X_train,y_train)
        
        # train classifier
        clf.fit(X_train_rus, y_train_rus)
        predictions = clf.predict(X_test)
        c_matrix = confusion_matrix(y_test, predictions)
        outcomes = outcomes + c_matrix
        
        print("Fold:",fold) 
        print(np.size(X_train,0), np.size(X_train_rus,0), np.size(X_test,0))
        print(c_matrix)
        print('precision:',c_matrix[0,0]/(c_matrix[0,1]+c_matrix[1,0]),'recall:',c_matrix[0,0]/(c_matrix[0,0]+c_matrix[0,1]))
    print('average:')
    np.set_printoptions(suppress=True)
    print(outcomes / 10)
    print('precision:',outcomes[0,0]/(outcomes[0,0]+outcomes[1,0]),'recall:',outcomes[0,0]/(outcomes[0,0]+outcomes[0,1]))

#### black-box

In [100]:
rf = RandomForestClassifier()
run_kfold(rf, df, 0.7)

Fold: 1
213332 733 23704
[[   33    10]
 [ 2127 21534]]
precision: 0.0154422087038 recall: 0.767441860465
Fold: 2
213332 745 23704
[[   30     8]
 [ 2134 21532]]
precision: 0.0140056022409 recall: 0.789473684211
Fold: 3
213332 748 23704
[[   29     8]
 [ 1844 21823]]
precision: 0.0156587473002 recall: 0.783783783784
Fold: 4
213332 750 23704
[[   31     5]
 [ 2093 21575]]
precision: 0.0147759771211 recall: 0.861111111111
Fold: 5
213332 757 23704
[[   25     8]
 [ 1743 21928]]
precision: 0.0142775556825 recall: 0.757575757576
Fold: 6
213332 748 23704
[[   31     6]
 [ 1941 21726]]
precision: 0.0159219311762 recall: 0.837837837838
Fold: 7
213333 767 23703
[[   22     7]
 [ 1931 21743]]
precision: 0.0113519091847 recall: 0.758620689655
Fold: 8
213333 757 23703
[[   30     3]
 [ 2211 21459]]
precision: 0.0135501355014 recall: 0.909090909091
Fold: 9
213333 769 23703
[[   22     6]
 [ 2097 21578]]
precision: 0.0104612458393 recall: 0.785714285714
Fold: 10
213333 762 23703
[[   29     2]
 [ 20

#### white-box

In [90]:
dt = DecisionTreeClassifier()
run_kfold(dt, df, 0.7)

Fold: 1
213332 767 23704
[[   18    11]
 [ 2059 21616]]
precision: 0.00869565217391 recall: 0.620689655172
Fold: 2
213332 752 23704
[[   26     9]
 [ 2234 21435]]
precision: 0.0115916183683 recall: 0.742857142857
Fold: 3
213332 760 23704
[[   25     7]
 [ 2342 21330]]
precision: 0.0106428267348 recall: 0.78125
Fold: 4
213332 769 23704
[[   24     4]
 [ 2303 21373]]
precision: 0.0104031209363 recall: 0.857142857143
Fold: 5
213332 745 23704
[[   27    11]
 [ 2179 21487]]
precision: 0.0123287671233 recall: 0.710526315789
Fold: 6
213332 774 23704
[[   19     7]
 [ 1884 21794]]
precision: 0.0100475938657 recall: 0.730769230769
Fold: 7
213333 760 23703
[[   22    10]
 [ 1999 21672]]
precision: 0.0109507217521 recall: 0.6875
Fold: 8
213333 733 23703
[[   36     7]
 [ 2591 21069]]
precision: 0.013856812933 recall: 0.837209302326
Fold: 9
213333 748 23703
[[   27    10]
 [ 2418 21248]]
precision: 0.0111202635914 recall: 0.72972972973
Fold: 10
213333 728 23703
[[   36     9]
 [ 2012 21646]]
preci

#### Plot the tree  
Here we train with all the data and there is no testing in this part. 

In [93]:
rf = DecisionTreeClassifier()
rus = RandomUnderSampler(ratio=0.7)
df_all = df.copy()
# eb process
df_all, _ = eb_feature(df_all, 'issuercountrycode', 10, 100)
df_all, _ = eb_feature(df_all, 'shoppercountrycode', 10, 100)
df_all, _ = eb_feature(df_all, 'bin', 100, 100)

y_all = np.asarray(df_all['simple_journal'], dtype="|S6")
X_all = df_all.drop(['simple_journal'], axis=1).values
X_rus, y_rus = rus.fit_sample(X_all, y_all)
rf.fit(X_rus, y_rus)

tree.export_graphviz(rf, out_file = 'tree.dot', feature_names = list(df_all.drop(['simple_journal'], axis=1)))