# IEEE-CIS Fraud Detection -- Model [DecisionTree]

In [1]:
import pandas as pd
import numpy as np
import gc
import warnings
warnings.filterwarnings('ignore')

In [2]:
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('classic')
sns.set_palette('deep')
sns.set_style('white')
%matplotlib inline

In [68]:
train_features = pd.read_csv('./Data/train_features.csv' )
test_features = pd.read_csv('./Data/test_features.csv')
train_target = pd.read_csv('./Data/train_target.csv', header=None)

## TimeSeriesSplit

- Since the train and test data were collected in time series manner and there's existing a gap, split the train and validation data based using TimeSeriesSplit. 

In [30]:
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score, auc
from sklearn.tree import DecisionTreeClassifier
from sklearn.externals import joblib
from sklearn.preprocessing import StandardScaler

In [10]:
# dt params
dt_params = {
             'max_depth': 8,
             'min_samples_split': 100,
             'min_samples_leaf': 70,
             'class_weight': 'balanced'}

In [11]:
def model_fn(clf, params):
    print("############## New Run ################")
    print("PARAMETERS: ")
    print(f"params  = {params}")
    
    n_splits = 5
    tscv = TimeSeriesSplit(n_splits=n_splits)
    score_mean = 0
    print("CV SCORE: ")
    for train_idx, validation_idx in tscv.split(train_features, train_target):
        model = clf(**params)

        X_train, X_validation = train_features.iloc[train_idx, :], train_features.iloc[validation_idx, :]
        y_train, y_validation = train_target.iloc[train_idx], train_target.iloc[validation_idx]

        model.fit(X_train, y_train)

        y_pred_train = model.predict_proba(X_validation)[:,1]
        score = roc_auc_score(y_validation, y_pred_train)
        score_mean += score
        print(f'AUC: {score}')
    del X_train, X_validation, y_train, y_validation, clf, y_pred_train    
    gc.collect()
    joblib.dump(model, "./Model/LGB_model.m")
    print(f'Mean AUC: {score_mean / 5} \n')
    return -(score_mean / 5)

In [12]:
model_fn(DecisionTreeClassifier, dt_params)

############## New Run ################
PARAMETERS: 
params  = {'max_depth': 8, 'min_samples_split': 100, 'min_samples_leaf': 70, 'class_weight': 'balanced'}
CV SCORE: 
AUC: 0.7966722596760338
AUC: 0.8174472989634196
AUC: 0.8305594519312686
AUC: 0.8365704699770519
AUC: 0.834451124390845
Mean AUC: 0.8231401209877237 



-0.8231401209877237

### Gridsearch for best params

In [14]:
from hyperopt import fmin, hp, tpe, Trials, space_eval, STATUS_OK, STATUS_RUNNING

In [27]:
dt_space = {
    "max_depth": hp.quniform('max_depth', 7, 150, 3),
    "min_samples_split": hp.choice('min_samples_split', [5, 10, 15, 20, 25, 30, 35, 40, 45, 50]),
    "min_samples_leaf": hp.choice('min_samples_leaf', [5, 10, 15, 20, 25, 30, 35, 40, 45, 50]),
    "criterion": "gini",
    "max_features": 'auto',
    'class_weight': 'balanced',
    'splitter': 'random',
    'random_state': 0}

In [28]:
def dt_grid(params):
    print("############## New Run ################")
    print("PARAMETERS: ")
    print(f"params  = {params}")
    params = {
        'max_depth': int(params['max_depth']),
        'min_samples_split': params['min_samples_split'],
        'min_samples_leaf': params['min_samples_leaf'],
        'criterion': params['criterion'],
        'max_features': params['max_features'],
        'random_state': 0
    }
    
    n_splits = 5
    tscv = TimeSeriesSplit(n_splits=n_splits)
    score_mean = 0
    print("CV SCORE: ")
    for train_idx, validation_idx in tscv.split(train_features, train_target):
        model = DecisionTreeClassifier(**params)

        X_train, X_validation = train_features.iloc[train_idx, :], train_features.iloc[validation_idx, :]
        y_train, y_validation = train_target.iloc[train_idx], train_target.iloc[validation_idx]

        model.fit(X_train, y_train)

        y_pred_train = model.predict_proba(X_validation)[:,1]
        score = roc_auc_score(y_validation, y_pred_train)
        score_mean += score
        print(f'AUC: {score}')
    del X_train, X_validation, y_train, y_validation, y_pred_train    
    gc.collect()
    print(f'Mean AUC: {score_mean / 5} \n')
    return -(score_mean / 5)

In [35]:
dt_best = fmin(fn=dt_grid,
            space=dt_space,
            algo=tpe.suggest,
            max_evals=20)

############## New Run ################             
PARAMETERS:                                         
params  = {'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': 93.0, 'max_features': 'auto', 'min_samples_leaf': 35, 'min_samples_split': 35, 'random_state': 0, 'splitter': 'random'}
CV SCORE:                                           
AUC: 0.7624137568917329                             
AUC: 0.7712018470481414                             
AUC: 0.7764937158667164                             
AUC: 0.7820828145126059                             
AUC: 0.7748564836640683                             
Mean AUC: 0.773409723596653                         

############## New Run ################                                      
PARAMETERS:                                                                  
params  = {'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': 135.0, 'max_features': 'auto', 'min_samples_leaf': 40, 'min_samples_split': 35, 'random_state': 0, '

AUC: 0.7460597982228578                                                    
AUC: 0.7653588183546493                                                    
AUC: 0.7588353031439106                                                    
AUC: 0.7925752311604108                                                    
AUC: 0.783544247416388                                                     
Mean AUC: 0.7692746796596432                                               

############## New Run ################                                     
PARAMETERS:                                                                 
params  = {'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': 99.0, 'max_features': 'auto', 'min_samples_leaf': 45, 'min_samples_split': 5, 'random_state': 0, 'splitter': 'random'}
CV SCORE:                                                                   
AUC: 0.7759223334293397                                                     
AUC: 0.7820862018821716                    

AUC: 0.745268950670175                                                         
AUC: 0.7673616378022305                                                        
AUC: 0.7660558194602322                                                        
AUC: 0.7823614738419882                                                        
AUC: 0.7788731213862786                                                        
Mean AUC: 0.767984200632181                                                    

############## New Run ################                                        
PARAMETERS:                                                                    
params  = {'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': 87.0, 'max_features': 'auto', 'min_samples_leaf': 40, 'min_samples_split': 35, 'random_state': 0, 'splitter': 'random'}
CV SCORE:                                                                      
AUC: 0.7559954564352045                                                        
AUC: 0

## Train_test_split

In [36]:
sample_submission = pd.read_csv('./Data/sample_submission.csv')

In [46]:
%%time
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score

EPOCHS = 5
kf = KFold(n_splits = EPOCHS, shuffle = True)
y_preds = np.zeros(sample_submission.shape[0])
y_oof = np.zeros(train_features.shape[0])
score_mean = 0
print("CV SCORE: ")
for tr_idx, val_idx in kf.split(train_features, train_target):
    clf = DecisionTreeClassifier(**dt_params)
    
    X_tr, X_vl = train_features.iloc[tr_idx, :], train_features.iloc[val_idx, :]
    y_tr, y_vl = train_target.iloc[tr_idx], train_target.iloc[val_idx]
    clf.fit(X_tr, y_tr)
    y_pred_train = clf.predict_proba(X_vl)[:,1]
    y_oof[val_idx] = y_pred_train
    score = roc_auc_score(y_vl, y_pred_train)
    score_mean += score
    print('ROC AUC {}'.format(score))
print(f'Mean AUC: {score_mean / EPOCHS} \n')

CV SCORE: 
ROC AUC 0.863980883554148
ROC AUC 0.864490755976214
ROC AUC 0.8593593771095372
ROC AUC 0.8670200293479176
ROC AUC 0.8681954287742606
Mean AUC: 0.8646092949524155 

CPU times: user 2min 47s, sys: 14.4 s, total: 3min 2s
Wall time: 4min 2s


### GridSearch

In [50]:
def dt_grid(params):
    print("############## New Run ################")
    print("PARAMETERS: ")
    print(f"params  = {params}")
    params = {
        'max_depth': int(params['max_depth']),
        'min_samples_split': params['min_samples_split'],
        'min_samples_leaf': params['min_samples_leaf'],
        'criterion': params['criterion'],
        'max_features': params['max_features'],
        'random_state': 0
    }
    
    EPOCHS = 5
    kf = KFold(n_splits = EPOCHS, shuffle = True)
    y_preds = np.zeros(sample_submission.shape[0])
    y_oof = np.zeros(train_features.shape[0])
    score_mean = 0
    print("CV SCORE: ")
    for tr_idx, val_idx in kf.split(train_features, train_target):
        clf = DecisionTreeClassifier(
            **params
        )

        X_tr, X_vl = train_features.iloc[tr_idx, :], train_features.iloc[val_idx, :]
        y_tr, y_vl = train_target.iloc[tr_idx], train_target.iloc[val_idx]
        clf.fit(X_tr, y_tr)
        y_pred_train = clf.predict_proba(X_vl)[:,1]
        y_oof[val_idx] = y_pred_train
        score = roc_auc_score(y_vl, y_pred_train)
        score_mean += score
        print(f'AUC: {score}')
        
    print(f'Mean AUC: {score_mean / EPOCHS} \n')
    joblib.dump(clf, "./Model/DT_model_cv.m")
    return -(score_mean / EPOCHS)

In [51]:
dt_best = fmin(fn=dt_grid,
            space=dt_space,
            algo=tpe.suggest,
            max_evals=20)

############## New Run ################             
PARAMETERS:                                         
params  = {'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': 75.0, 'max_features': 'auto', 'min_samples_leaf': 15, 'min_samples_split': 45, 'random_state': 0, 'splitter': 'random'}
CV SCORE:                                           
AUC: 0.8368788666646848                             
AUC: 0.8334646661760327                             
AUC: 0.8192552553222118                             
AUC: 0.8255278045373731                             
AUC: 0.8216106853231975                             
Mean AUC: 0.8273474556046999                        

############## New Run ################                                       
PARAMETERS:                                                                   
params  = {'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': 30.0, 'max_features': 'auto', 'min_samples_leaf': 15, 'min_samples_split': 50, 'random_state': 0, 

CV SCORE:                                                                     
AUC: 0.8443385068578738                                                       
AUC: 0.8580041521383095                                                       
AUC: 0.8506730215692623                                                       
AUC: 0.8558459427456867                                                       
AUC: 0.8472477549408789                                                       
Mean AUC: 0.8512218756504023                                                  

############## New Run ################                                        
PARAMETERS:                                                                    
params  = {'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': 129.0, 'max_features': 'auto', 'min_samples_leaf': 35, 'min_samples_split': 5, 'random_state': 0, 'splitter': 'random'}
CV SCORE:                                                                      
AUC: 0.836731

CV SCORE:                                                                      
AUC: 0.8471722072635465                                                        
AUC: 0.848879296660397                                                         
AUC: 0.8464272539233848                                                        
AUC: 0.8352305300255187                                                        
AUC: 0.8500169055014062                                                        
Mean AUC: 0.8455452386748507                                                   

############## New Run ################                                        
PARAMETERS:                                                                    
params  = {'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': 105.0, 'max_features': 'auto', 'min_samples_leaf': 5, 'min_samples_split': 50, 'random_state': 0, 'splitter': 'random'}
CV SCORE:                                                                      
AUC: 0

In [69]:
submission_sample = pd.read_csv('./Data/sample_submission.csv')
clf = joblib.load("./Model/DT_model_cv.m")
test_pred = clf.predict_proba(test_features)[:,1]
submission_sample['isFraud'] = test_pred
submission_sample.to_csv('./Output/DT_cv_1.csv', index=False)

ValueError: Input contains NaN, infinity or a value too large for dtype('float32').

## Reference
- https://www.kaggle.com/smerllo/identify-unique-cards-id