# IEEE-CIS Fraud Detection -- Model [ExtraTrees]

In [1]:
import pandas as pd
import numpy as np
import gc
import warnings
warnings.filterwarnings('ignore')

In [2]:
train_features = pd.read_csv('./Data/train_features.csv' )
test_features = pd.read_csv('./Data/test_features.csv')
train_target = pd.read_csv('./Data/train_target.csv', header=None)

## Model

In [3]:
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score, auc
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.externals import joblib



In [4]:
# etc params
etc_params = {'n_estimators': 100,
             'max_depth': 8,
             'min_samples_split': 100,
             'min_samples_leaf': 70}

In [5]:
def model_fn(clf, params):
    print("############## New Run ################")
    print("PARAMETERS: ")
    print(f"params  = {params}")
    
    n_splits = 5
    tscv = TimeSeriesSplit(n_splits=n_splits)
    score_mean = 0
    print("CV SCORE: ")
    for train_idx, validation_idx in tscv.split(train_features, train_target):
        model = clf(**params)

        X_train, X_validation = train_features.iloc[train_idx, :], train_features.iloc[validation_idx, :]
        y_train, y_validation = train_target.iloc[train_idx], train_target.iloc[validation_idx]

        model.fit(X_train, y_train)

        y_pred_train = model.predict_proba(X_validation)[:,1]
        score = roc_auc_score(y_validation, y_pred_train)
        score_mean += score
        print(f'AUC: {score}')
    print(f'Mean AUC: {score_mean / 5} \n')
    return -(score_mean / 5)

In [6]:
model_fn(ExtraTreesClassifier, etc_params)

############## New Run ################
PARAMETERS: 
params  = {'n_estimators': 100, 'max_depth': 8, 'min_samples_split': 100, 'min_samples_leaf': 70}
CV SCORE: 
AUC: 0.8188898938775161
AUC: 0.8379024086631095
AUC: 0.8195859879752434
AUC: 0.8269533239208633
AUC: 0.819769515878774
Mean AUC: 0.8246202260631014 



-0.8246202260631014

## Gridsearch for best params

In [7]:
from hyperopt import fmin, hp, tpe, Trials, space_eval, STATUS_OK, STATUS_RUNNING

###  ExtraTrees

In [8]:
etc_space = {
    "max_depth": hp.quniform('max_depth', 7, 50, 3),
    "min_samples_split": hp.choice('min_samples_split', [5, 10, 15, 20, 25]),
    "min_samples_leaf": hp.choice('min_samples_leaf', [5, 10, 15, 20, 25]),
    "criterion": "gini",
    "n_estimators": hp.quniform('n_estimators', 10, 500, 10),
    "max_features": 'auto',
    'random_state': 0}

In [9]:
def etc_grid(params):
    print("############## New Run ################")
    print("PARAMETERS: ")
    print(f"params  = {params}")
    params = {
        'max_depth': int(params['max_depth']),
        'min_samples_split': params['min_samples_split'],
        'min_samples_leaf': params['min_samples_leaf'],
        'criterion': params['criterion'],
        'n_estimators': int(params['n_estimators']),
        'max_features': params['max_features'],
        'random_state': 0
    }
    
    EPOCHS = 5
    kf = KFold(n_splits = EPOCHS, shuffle = True)
    score_mean = 0
    print("CV SCORE: ")
    for train_idx, validation_idx in kf.split(train_features, train_target):
        model = ExtraTreesClassifier(**params)

        X_train, X_validation = train_features.iloc[train_idx, :], train_features.iloc[validation_idx, :]
        y_train, y_validation = train_target.iloc[train_idx], train_target.iloc[validation_idx]

        model.fit(X_train, y_train)

        y_pred_train = model.predict_proba(X_validation)[:,1]
        score = roc_auc_score(y_validation, y_pred_train)
        score_mean += score
        print(f'AUC: {score}')
    del X_train, X_validation, y_train, y_validation, y_pred_train    
    gc.collect()

    joblib.dump(model, "./Model/ETC_model.m")
    print(f'Mean AUC: {score_mean / EPOCHS} \n')
    return -(score_mean / EPOCHS)

In [10]:
%%time
etc_best = fmin(fn=etc_grid,
            space=etc_space,
            algo=tpe.suggest, 
            max_evals=20)

############## New Run ################             
PARAMETERS:                                         
params  = {'criterion': 'gini', 'max_depth': 21.0, 'max_features': 'auto', 'min_samples_leaf': 20, 'min_samples_split': 5, 'n_estimators': 100.0, 'random_state': 0}
CV SCORE:                                           
AUC: 0.896736505449048                              
AUC: 0.8965239313755312                             
AUC: 0.8977499756612448                             
AUC: 0.8974609238980089                             
AUC: 0.8962492644860025                             
Mean AUC: 0.896944120173967                         

############## New Run ################                                         
PARAMETERS:                                                                     
params  = {'criterion': 'gini', 'max_depth': 48.0, 'max_features': 'auto', 'min_samples_leaf': 5, 'min_samples_split': 15, 'n_estimators': 370.0, 'random_state': 0}
CV SCORE:                     

PARAMETERS:                                                                         
params  = {'criterion': 'gini', 'max_depth': 21.0, 'max_features': 'auto', 'min_samples_leaf': 10, 'min_samples_split': 5, 'n_estimators': 480.0, 'random_state': 0}
CV SCORE:                                                                           
AUC: 0.9094774260847542                                                             
AUC: 0.90608068291461                                                               
AUC: 0.9051948507263983                                                             
AUC: 0.9027898337564205                                                             
AUC: 0.9080509319347198                                                             
Mean AUC: 0.9063187450833805                                                        

############## New Run ################                                              
PARAMETERS:                                                         

## Reference
- https://www.kaggle.com/smerllo/identify-unique-cards-id