In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import pickle
import numpy as np
import graphviz

In [2]:
from collections import Counter
from imblearn.datasets import fetch_datasets
from sklearn import svm, tree
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, KFold
from sklearn.pipeline import make_pipeline
from imblearn.pipeline import make_pipeline as make_pipeline_imb
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import NearMiss
from imblearn.metrics import classification_report_imbalanced
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, accuracy_score, classification_report

In [3]:
# Load the pickled DataFrame. The file is opened in a `with` block so the
# handle is closed as soon as unpickling finishes.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load 'dataframe.pickle' if it comes from a trusted source.
with open('dataframe.pickle', 'rb') as f:
    df = pickle.load(f)
print(df.dtypes)
df.head()

issuercountrycode                        int64
txvariantcode                           object
bin                                    float64
amount                                 float64
currencycode                            object
shoppercountrycode                      object
shopperinteraction                      object
cardverificationcodesupplied            object
cvcresponsecode                          int64
creationdate                    datetime64[ns]
accountcode                             object
mail_id                                  int64
ip_id                                    int64
card_id                                  int64
label                                    int64
creationdate_timestamp                 float64
amount_distance                        float64
total_cards_used                         int64
dtype: object


Unnamed: 0,issuercountrycode,txvariantcode,bin,amount,currencycode,shoppercountrycode,shopperinteraction,cardverificationcodesupplied,cvcresponsecode,creationdate,accountcode,mail_id,ip_id,card_id,label,creationdate_timestamp,amount_distance,total_cards_used
0,76,mccredit,530056.0,64800.0,MXN,MX,Ecommerce,True,0,2015-07-01,MexicoAccount,68370,111778,184798,1,1435785000.0,0.0,1
1,76,mccredit,547046.0,44900.0,MXN,MX,Ecommerce,True,0,2015-07-02,MexicoAccount,101299,78749,151595,1,1435805000.0,0.0,1
2,76,mccredit,528843.0,149900.0,MXN,MX,Ecommerce,True,0,2015-07-02,MexicoAccount,278604,70594,242142,1,1435840000.0,0.0,1
3,76,mccredit,547146.0,109900.0,MXN,MX,Ecommerce,True,0,2015-07-03,MexicoAccount,47409,113648,181744,1,1435903000.0,0.0,1
4,76,visaclassic,477291.0,89900.0,MXN,MX,Ecommerce,True,0,2015-07-08,MexicoAccount,205501,83553,97271,1,1436373000.0,0.0,1


In [4]:
# Keep only the target (`label`) and the columns used as model features.
df_interesting = df[['label','ip_id','issuercountrycode','amount_distance','total_cards_used']]
# Shuffle all rows once (frac=1) and renumber the index. The shuffle is seeded
# so that the row order -- and therefore every position-based split made from
# this frame -- is identical on each Restart & Run All.
df_interesting = df_interesting.sample(frac=1, random_state=12).reset_index(drop=True)
df_interesting.head()

Unnamed: 0,label,ip_id,issuercountrycode,amount_distance,total_cards_used
0,0,64062,40,0.0,2
1,0,333734,40,0.0,1
2,0,245134,40,0.0,1
3,0,340305,40,0.0,1
4,0,246763,40,0.0,1


In [5]:
# SMOTE oversampler with a fixed seed so the synthetic samples are reproducible.
sm = SMOTE(random_state=12)
# Support-vector classifier with library default hyper-parameters.
svc = svm.SVC()
# `label` is a binary class target, so the tree must be a classifier: a
# DecisionTreeRegressor would predict continuous values and its `.score`
# would report R^2 instead of accuracy (unlike `svc.score`).
dtc = tree.DecisionTreeClassifier(max_depth=5)

In [6]:
# Feature matrix: every column except the target; Y is the target vector.
X = np.array(df_interesting.drop(['label'], axis=1))
Y = np.array(df_interesting['label'])

# Newer imbalanced-learn releases expose the resampling call as
# `fit_resample` and no longer provide `fit_sample`; older releases only have
# `fit_sample`. Use whichever this installation offers.
smote_resample = getattr(sm, 'fit_resample', None) or sm.fit_sample

# One entry per fold: (X_train_oversampled, Y_train_oversampled, X_test, Y_test).
Folds = []

kf = KFold(n_splits=10) #Already shuffled
for train_index, test_index in kf.split(X):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = X[train_index], X[test_index]
    Y_train, Y_test = Y[train_index], Y[test_index]
    # Oversample the training part only; the test part is stored untouched.
    X_train, Y_train = smote_resample(X_train, Y_train)
    Folds.append((X_train, Y_train, X_test, Y_test))

TRAIN: [ 23670  23671  23672 ..., 236695 236696 236697] TEST: [    0     1     2 ..., 23667 23668 23669]
TRAIN: [     0      1      2 ..., 236695 236696 236697] TEST: [23670 23671 23672 ..., 47337 47338 47339]
TRAIN: [     0      1      2 ..., 236695 236696 236697] TEST: [47340 47341 47342 ..., 71007 71008 71009]
TRAIN: [     0      1      2 ..., 236695 236696 236697] TEST: [71010 71011 71012 ..., 94677 94678 94679]
TRAIN: [     0      1      2 ..., 236695 236696 236697] TEST: [ 94680  94681  94682 ..., 118347 118348 118349]
TRAIN: [     0      1      2 ..., 236695 236696 236697] TEST: [118350 118351 118352 ..., 142017 142018 142019]
TRAIN: [     0      1      2 ..., 236695 236696 236697] TEST: [142020 142021 142022 ..., 165687 165688 165689]
TRAIN: [     0      1      2 ..., 236695 236696 236697] TEST: [165690 165691 165692 ..., 189357 189358 189359]
TRAIN: [     0      1      2 ..., 236695 236696 236697] TEST: [189360 189361 189362 ..., 213026 213027 213028]
TRAIN: [     0      1    

In [7]:
# Serialize the complete list of folds into a single pickle file on disk.
with open('folds.pickle', 'wb') as folds_file:
    pickle.dump(Folds, folds_file)

In [None]:
# Baseline without oversampling. Folds[0][0] / Folds[0][1] were already
# SMOTE-resampled when the folds were built, so fitting on them would not be
# a "no SMOTE" baseline. `kf` is an unshuffled KFold, so its first split is
# deterministic and yields the same train/test partition as Folds[0]; the raw
# training rows are taken from it.
raw_train_index, _ = next(kf.split(X))
svc.fit(X[raw_train_index], Y[raw_train_index])

# Persist the fitted baseline model.
with open('fold1.pickle', 'wb') as f:
    pickle.dump(svc, f)
# Score on the first fold's held-out (never resampled) test part.
print("SVC SCORE: " + str(svc.score(Folds[0][2], Folds[0][3])))

In [None]:
# Folds[0] already holds SMOTE-oversampled training data (resampling happens
# while the folds are built), so it is used directly. Running SMOTE a second
# time on the already balanced classes would, with SMOTE's default settings,
# generate no new samples, and the old `fit_sample` method name is not
# available in newer imbalanced-learn releases.
X_train_sm, Y_train_sm = Folds[0][0], Folds[0][1]
svc.fit(X_train_sm, Y_train_sm)
# Persist the model trained on oversampled data.
with open('fold1_sm.pickle', 'wb') as f:
    pickle.dump(svc,f)
# Score on the first fold's held-out (never resampled) test part.
print("SVC SCORE(/W SMOTE): " + str(svc.score(Folds[0][2], Folds[0][3])))