In [82]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

%matplotlib inline

# read dataset
df = pd.read_csv("../data_for_student_case.csv")

# set cvc
df['cvcresponsecode'].ix[df['cvcresponsecode']>3] = 3

# set currency
df['amount'].ix[df['currencycode'] == 'MXN'] = df['amount'].ix[df['currencycode'] == 'MXN'] * 0.052131
df['amount'].ix[df['currencycode'] == 'AUD'] = df['amount'].ix[df['currencycode'] == 'AUD'] * 0.7349
df['amount'].ix[df['currencycode'] == 'NZD'] = df['amount'].ix[df['currencycode'] == 'NZD'] * 0.68966
df['amount'].ix[df['currencycode'] == 'GBP'] = df['amount'].ix[df['currencycode'] == 'GBP'] * 1.292625
df['amount'].ix[df['currencycode'] == 'SEK'] = df['amount'].ix[df['currencycode'] == 'SEK'] * 0.112851

# simplify issuercountrycode
issuercountrycode_filter = set(['GB', 'AU', 'MX', 'SE', 'NZ', 'US', 'CN'])
df['issuercountrycode'].ix[df['issuercountrycode'].isin(issuercountrycode_filter) == False] = 'other'

# simplify txvariantcode
txvariantcode_filter = set(['visadebit', 'mccredit', 'mcdebit', 'visaclassic', 'visaplatinum', 'visagold', 'visa'])
df['txvariantcode'].ix[df['txvariantcode'].isin(txvariantcode_filter) == False] = 'other'

# simplify shoppercountrycode
shoppercountrycode_filter = set(['GB', 'MX', 'AU', 'SE', 'NZ', 'US'])
df['shoppercountrycode'].ix[df['shoppercountrycode'].isin(shoppercountrycode_filter) == False] = 'other'

# drop 'Refused'
df = df[df['simple_journal'] != 'Refused']
df['simple_journal'].ix[df['simple_journal'] == 'Settled'] = True
df['simple_journal'].ix[df['simple_journal'] == 'Chargeback'] = False

# verificationcodesupplied True:1, False:0, not provided:2
df['cardverificationcodesupplied'].ix[df['cardverificationcodesupplied'] == True] = 1
df['cardverificationcodesupplied'].ix[df['cardverificationcodesupplied'] == False] = 0
df['cardverificationcodesupplied'].fillna(2,inplace=True)

# set daysinterval
df['creationdate'] = pd.to_datetime(df['creationdate'])

df['cweekday'] = df['creationdate'].dt.weekday_name
df['chour'] = df['creationdate'].dt.hour
df['chour'].apply(pd.to_numeric)

# drop features
df = df.drop(['txid', 'bookingdate', 'creationdate', 'mail_id', 'ip_id', 'card_id', 'bin'], axis=1)

df.sample(5)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


Unnamed: 0,issuercountrycode,txvariantcode,amount,currencycode,shoppercountrycode,shopperinteraction,simple_journal,cardverificationcodesupplied,cvcresponsecode,accountcode,cweekday,chour
68348,GB,mcdebit,9558.961875,GBP,GB,Ecommerce,True,1,1,UKAccount,Monday,15
131759,GB,visadebit,21838.899375,GBP,GB,Ecommerce,True,1,1,UKAccount,Tuesday,21
60981,GB,visadebit,8292.189375,GBP,GB,Ecommerce,True,2,1,UKAccount,Thursday,12
173877,GB,visadebit,7794.52875,GBP,GB,Ecommerce,True,1,1,UKAccount,Tuesday,13
62488,GB,visadebit,4065.305625,GBP,GB,Ecommerce,True,1,1,UKAccount,Friday,1


#### encoding categorical features

In [83]:
dummy_features = ['issuercountrycode', 'txvariantcode', 'currencycode', 'shoppercountrycode',
                  'shopperinteraction', 'cardverificationcodesupplied', 'cvcresponsecode', 'accountcode', 'cweekday']

for feature in dummy_features:

    just_dummies = pd.get_dummies(df[feature],prefix=feature+'_')

    df = pd.concat([df, just_dummies], axis=1)      
    df.drop([feature], inplace=True, axis=1)

df.sample(5)

Unnamed: 0,amount,simple_journal,chour,issuercountrycode__AU,issuercountrycode__CN,issuercountrycode__GB,issuercountrycode__MX,issuercountrycode__NZ,issuercountrycode__SE,issuercountrycode__US,...,accountcode__MexicoAccount,accountcode__SwedenAccount,accountcode__UKAccount,cweekday__Friday,cweekday__Monday,cweekday__Saturday,cweekday__Sunday,cweekday__Thursday,cweekday__Tuesday,cweekday__Wednesday
237510,10715.86125,True,15,0,0,1,0,0,0,0,...,0,0,1,1,0,0,0,0,0,0
60919,6844.449375,True,11,0,0,1,0,0,0,0,...,0,0,1,0,0,0,0,1,0,0
233190,9041.911875,True,1,0,0,1,0,0,0,0,...,0,0,1,0,0,0,0,0,1,0
141772,4944.290625,True,1,0,0,1,0,0,0,0,...,0,0,1,0,0,0,0,0,1,0
185391,2126.368125,True,11,0,0,1,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1


#### train classifier

In [96]:
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import make_scorer, accuracy_score, confusion_matrix
from imblearn.over_sampling import SMOTE 

def run_kfold(clf, X_all, y_all):
    kf = KFold(n_splits=10)
    outcomes = []
    fold = 0
    smote = SMOTE(random_state=42, ratio=0.8)
    for train_index, test_index in kf.split(X_all):
        fold += 1
        outcomes = np.zeros([2,2])
        X_train, X_test = X_all[train_index], X_all[test_index]
        y_train, y_test = y_all[train_index], y_all[test_index]
        X_train_smote, y_train_smote = smote.fit_sample(X_train, y_train)
        clf.fit(X_train_smote, y_train_smote)
        predictions = clf.predict(X_test)
        accuracy = accuracy_score(y_test, predictions)
        c_matrix = confusion_matrix(y_test, predictions)
        outcomes = outcomes + c_matrix
        print("Fold:",fold) 
        print(np.size(X_train), np.size(X_train_smote))
        print(c_matrix)
    mean_outcome = outcomes / 10
    print("Mean Accuracy: {0}".format(mean_outcome)) 


In [97]:
rf = RandomForestClassifier()

y_all = np.asarray(df['simple_journal'], dtype="|S6")
X_all = df.drop(['simple_journal'], axis=1).values

run_kfold(rf, X_all, y_all)

Fold: 1
10879932 19577778
[[    0   279]
 [    0 23425]]
Fold: 2
10879932 19555491
[[    0    36]
 [   75 23593]]
Fold: 3
10879932 19552176
[[23704]]
Fold: 4
10879932 19552176
[[    0     0]
 [    4 23700]]
Fold: 5
10879932 19552176
[[    0     0]
 [    2 23702]]
Fold: 6
10879932 19552176
[[    0     0]
 [    2 23702]]
Fold: 7
10879983 19552278
[[23703]]
Fold: 8
10879983 19552278
[[    0     0]
 [    2 23701]]
Fold: 9
10879983 19552278
[[    0     0]
 [    1 23702]]
Fold: 10
10879983 19555032
[[    0    30]
 [   11 23662]]
Mean Accuracy: [[  0.00000000e+00   3.00000000e+00]
 [  1.10000000e+00   2.36620000e+03]]
