In [10]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

%matplotlib inline


#### Read dataset

In [11]:
df = pd.read_csv("../data_for_student_case.csv")

# change cvc
df['cvcresponsecode'].ix[df['cvcresponsecode']>3] = 3

# change currency
df['amount'].ix[df['currencycode'] == 'MXN'] = df['amount'].ix[df['currencycode'] == 'MXN'] * 0.052131
df['amount'].ix[df['currencycode'] == 'AUD'] = df['amount'].ix[df['currencycode'] == 'AUD'] * 0.7349
df['amount'].ix[df['currencycode'] == 'NZD'] = df['amount'].ix[df['currencycode'] == 'NZD'] * 0.68966
df['amount'].ix[df['currencycode'] == 'GBP'] = df['amount'].ix[df['currencycode'] == 'GBP'] * 1.292625
df['amount'].ix[df['currencycode'] == 'SEK'] = df['amount'].ix[df['currencycode'] == 'SEK'] * 0.112851

# simplify issuercountrycode
issuercountrycode_filter = set(['GB', 'AU', 'MX', 'SE', 'NZ', 'US', 'CN'])
df['issuercountrycode'].ix[df['issuercountrycode'].isin(issuercountrycode_filter) == False] = 'other'

# simplify txvariantcode
txvariantcode_filter = set(['visadebit', 'mccredit', 'mcdebit', 'visaclassic', 'visaplatinum', 'visagold', 'visa'])
df['txvariantcode'].ix[df['txvariantcode'].isin(txvariantcode_filter) == False] = 'other'

# simplify shoppercountrycode
shoppercountrycode_filter = set(['GB', 'MX', 'AU', 'SE', 'NZ', 'US'])
df['shoppercountrycode'].ix[df['shoppercountrycode'].isin(shoppercountrycode_filter) == False] = 'other'

#df.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


#### set labels

In [12]:
df = df[df['simple_journal'] != 'Refused']
df['simple_journal'].ix[df['simple_journal'] == 'Settled'] = 1
df['simple_journal'].ix[df['simple_journal'] == 'Chargeback'] = 0

df['simple_journal'].value_counts()

df['cardverificationcodesupplied'].ix[df['cardverificationcodesupplied'] == True] = 1
df['cardverificationcodesupplied'].ix[df['cardverificationcodesupplied'] == False] = 0
df['cardverificationcodesupplied'].fillna(2,inplace=True)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


In [None]:
features = ['issuercountrycode', 'txvariantcode', 'currencycode', 'shoppercountrycode', 'shopperinteraction', 'cardverificationcodesupplied', 'cvcresponsecode', 'accountcode']
for feature in features:
    print(feature,df[feature].unique())

In [13]:
from sklearn.preprocessing import LabelEncoder
def encode_features(df):
    features = ['issuercountrycode', 'txvariantcode', 'currencycode', 'shoppercountrycode', 'shopperinteraction', 'cvcresponsecode', 'accountcode']
    
    for feature in features:
        le = LabelEncoder()
        le = le.fit(df[feature])
        df[feature] = le.transform(df[feature])
    return df
    
df = encode_features(df)
df = df.drop(['txid', 'bookingdate', 'bin', 'creationdate', 'mail_id', 'ip_id', 'card_id'], axis=1)
df.head()

Unnamed: 0,issuercountrycode,txvariantcode,amount,currencycode,shoppercountrycode,shopperinteraction,simple_journal,cardverificationcodesupplied,cvcresponsecode,accountcode
0,3,0,3378.0888,2,2,1,0,1,0,1
1,3,0,2340.6819,2,2,1,0,1,0,1
2,3,0,7814.4369,2,2,1,0,1,0,1
3,3,0,5729.1969,2,2,1,0,1,0,1
4,3,4,4686.5769,2,2,1,0,1,0,1


In [14]:
from imblearn.over_sampling import  SMOTE 
sm = SMOTE(random_state=42)

#### Train classifier

In [15]:
from sklearn.model_selection import train_test_split

features = ['issuercountrycode', 'txvariantcode', 'currencycode', 'shoppercountrycode', 'shopperinteraction', 'cardverificationcodesupplied', 'cvcresponsecode', 'accountcode']
X_all = df[features]
y_all = df['simple_journal']
y_all = np.asarray(y_all, dtype="|S6")
num_test = 0.20
X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, test_size=num_test, random_state=23)
X_smote, y_smote = sm.fit_sample(X_train, y_train)

In [16]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()
rf_smote = RandomForestClassifier()
rf.fit(X_train, y_train)
rf_smote.fit(X_smote, y_smote)

# from sklearn.svm import SVC
# svc = SVC()
# svc_smote = SVC()
# svc.fit(X_train, y_train)
# svc_smote.fit(X_smote, y_smote)


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

In [9]:
from sklearn.metrics import confusion_matrix

print(confusion_matrix(y_test, rf.predict(X_test)))
print(confusion_matrix(y_test, rf_smote.predict(X_test)))
# print(confusion_matrix(y_test, svc.predict(X_test)))
# print(confusion_matrix(y_test, svc_smote.predict(X_test)))


[[47339     0]
 [   69     0]]
[[41211  6128]
 [   18    51]]
