In [66]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

from sklearn.metrics import make_scorer, accuracy_score, confusion_matrix
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE

%matplotlib inline

# read dataset
df = pd.read_csv("../data_for_student_case.csv")

# set cvc
df['cvcresponsecode'].ix[df['cvcresponsecode']>3] = 3

# change currency
df['amount'].ix[df['currencycode'] == 'MXN'] = df['amount'].ix[df['currencycode'] == 'MXN'] * 0.052131
df['amount'].ix[df['currencycode'] == 'AUD'] = df['amount'].ix[df['currencycode'] == 'AUD'] * 0.7349
df['amount'].ix[df['currencycode'] == 'NZD'] = df['amount'].ix[df['currencycode'] == 'NZD'] * 0.68966
df['amount'].ix[df['currencycode'] == 'GBP'] = df['amount'].ix[df['currencycode'] == 'GBP'] * 1.292625
df['amount'].ix[df['currencycode'] == 'SEK'] = df['amount'].ix[df['currencycode'] == 'SEK'] * 0.112851

# deal with 'nan'
df['issuercountrycode'].fillna('other',inplace=True)
df['shoppercountrycode'].fillna('other',inplace=True)
df['bin'].fillna(1,inplace=True)

# drop 'Refused'
df = df[df['simple_journal'] != 'Refused']
df['simple_journal'].ix[df['simple_journal'] == 'Settled'] = 1
df['simple_journal'].ix[df['simple_journal'] == 'Chargeback'] = 0

# verificationcodesupplied True:1, False:0, unknown:2
df['cardverificationcodesupplied'].ix[df['cardverificationcodesupplied'] == True] = 1
df['cardverificationcodesupplied'].ix[df['cardverificationcodesupplied'] == False] = 0
df['cardverificationcodesupplied'].fillna(2,inplace=True)

# extract weekday and hour from creationdate
df['creationdate'] = pd.to_datetime(df['creationdate'])
df['cweekday'] = df['creationdate'].dt.weekday_name
df['chour'] = df['creationdate'].dt.hour
df['chour'].apply(pd.to_numeric)

# drop features
df = df.drop(['txid', 'bookingdate', 'creationdate', 'mail_id', 'ip_id', 'card_id'], axis=1)

df.sample(5)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


Unnamed: 0,issuercountrycode,txvariantcode,bin,amount,currencycode,shoppercountrycode,shopperinteraction,simple_journal,cardverificationcodesupplied,cvcresponsecode,accountcode,cweekday,chour
42083,AU,mccredit,535316.0,7661.3325,AUD,AU,Ecommerce,1,1,0,APACAccount,Sunday,11
272858,SE,visadebit,458109.0,19664.28675,SEK,SE,Ecommerce,1,1,1,SwedenAccount,Tuesday,18
173141,GB,visadebit,476248.0,7652.34,GBP,GB,Ecommerce,1,1,1,UKAccount,Monday,21
71672,GB,visadebit,476230.0,4388.461875,GBP,GB,Ecommerce,1,1,1,UKAccount,Wednesday,17
96368,GB,visadebit,454313.0,4698.691875,GBP,GB,Ecommerce,1,2,1,UKAccount,Saturday,17


#### empirical bayes method  
We thought 'issuercountrycode', 'shoppercountrycode' and 'bin' are high-cardinality categorical features. They will be transformed with empirical bayes method. 

In [52]:
def sigmoid(n, k, f):
    return 1 / (1 + np.exp((-1) * (n - k) / f))

def empirical_bayes(df, column):
    
    prior_bayes = 1 - sum(df['simple_journal'])/len(df)
    feature_set = set([])
    dic = {}
    
    # parameters
    k = len(df) / df[column].nunique()
    f = 100
    
    for feature in df[column]:
        if feature not in feature_set:
            feature_set.add(feature)
            df_pos = df[df[column] == feature]
            n = len(df_pos)
            alpha = 1 / (1 + np.exp((-1) * (n - k) / f))
            dic[feature] = alpha * len(df_pos[df_pos['simple_journal'] == 0])/len(df_pos) + (1 - alpha) * prior_bayes
            
    return dic

def eb_feature(df, feature):
    dic = empirical_bayes(df, feature)
    li = []
    for i in df[feature]:
        li.append(dic[i])
    
    df[feature + '_eb'] = li
    df = df.drop([feature], axis=1)
    
    return df, dic

#### encoding categorical features with one-hot encoding

In [67]:
dummy_features = ['txvariantcode', 'currencycode', 'shopperinteraction', 'cardverificationcodesupplied', 'cvcresponsecode', 'accountcode', 'cweekday']

for feature in dummy_features:

    just_dummies = pd.get_dummies(df[feature],prefix=feature+'_')

    df = pd.concat([df, just_dummies], axis=1)      
    df.drop([feature], inplace=True, axis=1)

df = df.reset_index(drop=True)
    
df.sample(5)

Unnamed: 0,issuercountrycode,bin,amount,shoppercountrycode,simple_journal,chour,txvariantcode__electron,txvariantcode__mc,txvariantcode__mccredit,txvariantcode__mcdebit,...,accountcode__MexicoAccount,accountcode__SwedenAccount,accountcode__UKAccount,cweekday__Friday,cweekday__Monday,cweekday__Saturday,cweekday__Sunday,cweekday__Thursday,cweekday__Tuesday,cweekday__Wednesday
154564,GB,446291.0,8660.5875,GB,1,19,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1
159187,GB,465858.0,8660.5875,GB,1,16,0,0,0,0,...,0,0,1,0,0,0,1,0,0,0
93966,KZ,400303.0,8660.5875,GB,1,12,0,0,0,0,...,0,0,1,1,0,0,0,0,0,0
87754,GB,530127.0,4265.6625,GB,1,16,0,0,1,0,...,0,0,1,0,1,0,0,0,0,0
158026,GB,465859.0,8137.074375,GB,1,16,0,0,0,0,...,0,0,1,0,0,1,0,0,0,0


#### train classifier

In [79]:
def run_kfold(clf, df, sampling_ratio):
    kf = KFold(n_splits=10, shuffle=True)
    outcomes = np.zeros([2,2])
    fold = 0
    for train_index, test_index in kf.split(df):
        fold += 1
        
        # split train & test set
        train, test = df.iloc[train_index].copy(), df.iloc[test_index].copy()
                
        # empirical bayes for high-cardinality category
        train_prior = 1 - sum(train['simple_journal'])/len(train)
        train, dic1 = eb_feature(train,'issuercountrycode')
        train, dic2 = eb_feature(train,'shoppercountrycode')
        train, dic3 = eb_feature(train,'bin')
                
        # use the same dic to also process test set
        li = []
        for i in test['issuercountrycode']:
            # if not found, then just use prior probability
            try:
                li.append(dic1[i])
            except KeyError:
                li.append(train_prior)
    
        test['issuercountrycode_eb'] = li
        test = test.drop(['issuercountrycode'], axis=1)
        
        li = []
        for i in test['shoppercountrycode']:
            try:
                li.append(dic2[i])
            except KeyError:
                li.append(train_prior)
    
        test['shoppercountrycode_eb'] = li
        test = test.drop(['shoppercountrycode'], axis=1)
        
        li = []
        for i in test['bin']:
            try:
                li.append(dic3[i])
            except KeyError:
                li.append(train_prior)
    
        test['bin_eb'] = li
        test = test.drop(['bin'], axis=1)
                        
        # split x, y
        X_train = train.drop(['simple_journal'], axis=1).values
        X_test = test.drop(['simple_journal'], axis=1).values
        y_train = list(train['simple_journal'].values)
        y_test = list(test['simple_journal'].values)
        
        # undersampling
        rus = RandomUnderSampler(ratio=sampling_ratio)
        X_train_rus, y_train_rus = rus.fit_sample(X_train, y_train)
        
        # train classifier
        clf.fit(X_train_rus, y_train_rus)
        predictions = clf.predict(X_test)
        c_matrix = confusion_matrix(y_test, predictions)
        outcomes = outcomes + c_matrix
        
        print("Fold:",fold) 
        print(np.size(X_train,0), np.size(X_train_rus,0), np.size(X_test,0))
        print(c_matrix)
        print('precision:',c_matrix[0,0]/(c_matrix[0,1]+c_matrix[1,0]),'recall:',c_matrix[0,0]/(c_matrix[0,0]+c_matrix[0,1]))
    print('average:')
    np.set_printoptions(suppress=True)
    print(outcomes / 10)
    print('precision:',outcomes[0,0]/(outcomes[0,1]+outcomes[1,0]),'recall:',outcomes[0,0]/(outcomes[0,0]+outcomes[0,1]))

In [80]:
clf = DecisionTreeClassifier()
run_kfold(clf, df, 0.5)

Fold: 1
213332 945 23704
[[   21     9]
 [ 1710 21964]]
precision: 0.0122164048866 recall: 0.7
Fold: 2
213332 930 23704
[[   22    13]
 [ 1845 21824]]
precision: 0.0118406889128 recall: 0.628571428571
Fold: 3
213332 960 23704
[[   17     8]
 [ 1616 22063]]
precision: 0.0104679802956 recall: 0.68
Fold: 4
213332 924 23704
[[   31     6]
 [ 2090 21577]]
precision: 0.0147900763359 recall: 0.837837837838
Fold: 5
213332 918 23704
[[   30     9]
 [ 2086 21579]]
precision: 0.0143198090692 recall: 0.769230769231
Fold: 6
213332 933 23704
[[   27     7]
 [ 2126 21544]]
precision: 0.0126582278481 recall: 0.794117647059
Fold: 7
213333 921 23703
[[   22    16]
 [ 1687 21978]]
precision: 0.0129183793306 recall: 0.578947368421
Fold: 8
213333 921 23703
[[   28    10]
 [ 2012 21653]]
precision: 0.0138476755687 recall: 0.736842105263
Fold: 9
213333 951 23703
[[   18    10]
 [ 1963 21712]]
precision: 0.0091231626964 recall: 0.642857142857
Fold: 10
213333 912 23703
[[   30    11]
 [ 1805 21857]]
precision:

#### Plot the tree  
Here we train with all the data and there is no testing in this part. 

In [64]:
rf = DecisionTreeClassifier()
rus = RandomUnderSampler(ratio=0.5)

# eb process
df, _ = eb_feature(df, 'issuercountrycode')
df, _ = eb_feature(df, 'shoppercountrycode')
df, _ = eb_feature(df, 'bin')

y_all = np.asarray(df['simple_journal'], dtype="|S6")
X_all = df.drop(['simple_journal'], axis=1).values
X_rus, y_rus = rus.fit_sample(X_all, y_all)
rf.fit(X_rus, y_rus)

tree.export_graphviz(rf, out_file = 'tree.dot', feature_names = list(df))
df.sample(5)

Unnamed: 0,amount,simple_journal,chour,txvariantcode__electron,txvariantcode__mc,txvariantcode__mccredit,txvariantcode__mcdebit,txvariantcode__visa,txvariantcode__visabusiness,txvariantcode__visaclassic,...,cweekday__Friday,cweekday__Monday,cweekday__Saturday,cweekday__Sunday,cweekday__Thursday,cweekday__Tuesday,cweekday__Wednesday,issuercountrycode_eb,shoppercountrycode_eb,bin_eb
34623,8862.131,1,6,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0.001317,0.001327,0.01536988
79951,1286.161875,1,17,0,0,0,1,0,0,0,...,0,0,0,0,0,1,0,0.000186,0.000183,0.001282229
157408,8789.85,1,1,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0.000186,0.000183,3.09769e-07
169892,8266.336875,1,14,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0.000186,0.000183,1.361551e-10
130698,8240.484375,1,15,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0.000186,0.000183,0.0
