# IEEE-CIS Fraud Detection -- Model [RandomForest]

In [1]:
import pandas as pd
import numpy as np
import gc
import warnings
warnings.filterwarnings('ignore')

In [2]:
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('classic')
sns.set_palette('deep')
sns.set_style('white')
%matplotlib inline

In [3]:
train_features = pd.read_csv('./Data/train_features.csv' )
test_features = pd.read_csv('./Data/test_features.csv')
train_target = pd.read_csv('./Data/train_target.csv', header=None)

## TimeSeriesSplit

- Since the train and test data were collected in time series manner and there's existing a gap, split the train and validation data based using TimeSeriesSplit. 

In [4]:
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score, auc
from sklearn.ensemble import RandomForestClassifier
from sklearn.externals import joblib



In [6]:
# rf params
rf_params = {'n_estimators': 500,
             'max_depth': 8,
             'min_samples_split': 100,
             'min_samples_leaf': 70}

In [7]:
def model_fn(clf, params):
    print("############## New Run ################")
    print("PARAMETERS: ")
    print(f"params  = {params}")
    
    n_splits = 5
    tscv = TimeSeriesSplit(n_splits=n_splits)
    score_mean = 0
    print("CV SCORE: ")
    for train_idx, validation_idx in tscv.split(train_features, train_target):
        model = clf(**params)

        X_train, X_validation = train_features.iloc[train_idx, :], train_features.iloc[validation_idx, :]
        y_train, y_validation = train_target.iloc[train_idx], train_target.iloc[validation_idx]

        model.fit(X_train, y_train)

        y_pred_train = model.predict_proba(X_validation)[:,1]
        score = roc_auc_score(y_validation, y_pred_train)
        score_mean += score
        print(f'AUC: {score}')
    del X_train, X_validation, y_train, y_validation, clf, y_pred_train    
    gc.collect()
    joblib.dump(model, "./Model/LGB_model.m")
    print(f'Mean AUC: {score_mean / 5} \n')
    return -(score_mean / 5)

In [8]:
model_fn(RandomForestClassifier, rf_params)

############## New Run ################
PARAMETERS: 
params  = {'n_estimators': 500, 'max_depth': 8, 'min_samples_split': 100, 'min_samples_leaf': 70}
CV SCORE: 
AUC: 0.8530061277142115
AUC: 0.8710899463214429
AUC: 0.8525384488047617
AUC: 0.8637924793034852
AUC: 0.8578447296772242
Mean AUC: 0.8596543463642252 



-0.8596543463642252

### Gridsearch

In [9]:
from hyperopt import fmin, hp, tpe, Trials, space_eval, STATUS_OK, STATUS_RUNNING

In [10]:
rf_space = {
    "max_depth": hp.quniform('max_depth', 7, 50, 3),
    "min_samples_split": hp.choice('min_samples_split', [5, 10, 15, 20, 25]),
    "min_samples_leaf": hp.choice('min_samples_leaf', [5, 10, 15, 20, 25]),
    "criterion": "gini",
    "n_estimators": hp.quniform('n_estimators', 10, 500, 10),
    "max_features": 'auto',
    'random_state': 0}

In [11]:
def rf_grid(params):
    print("############## New Run ################")
    print("PARAMETERS: ")
    print(f"params  = {params}")
    params = {
        'max_depth': int(params['max_depth']),
        'min_samples_split': params['min_samples_split'],
        'min_samples_leaf': params['min_samples_leaf'],
        'criterion': params['criterion'],
        'n_estimators': int(params['n_estimators']),
        'max_features': params['max_features'],
        'random_state': 0
    }
    
    n_splits = 5
    tscv = TimeSeriesSplit(n_splits=n_splits)
    score_mean = 0
    print("CV SCORE: ")
    for train_idx, validation_idx in tscv.split(train_features, train_target):
        model = RandomForestClassifier(**params)

        X_train, X_validation = train_features.iloc[train_idx, :], train_features.iloc[validation_idx, :]
        y_train, y_validation = train_target.iloc[train_idx], train_target.iloc[validation_idx]

        model.fit(X_train, y_train)

        y_pred_train = model.predict_proba(X_validation)[:,1]
        score = roc_auc_score(y_validation, y_pred_train)
        score_mean += score
        print(f'AUC: {score}')
    del X_train, X_validation, y_train, y_validation, y_pred_train    
    gc.collect()
    print(f'Mean AUC: {score_mean / 5} \n')
    return -(score_mean / 5)

In [None]:
rf_best = fmin(fn=rf_grid,
            space=rf_space,
            algo=tpe.suggest,
            max_evals=20)

############## New Run ################             
PARAMETERS:                                         
params  = {'criterion': 'gini', 'max_depth': 27.0, 'max_features': 'auto', 'min_samples_leaf': 5, 'min_samples_split': 5, 'n_estimators': 180.0, 'random_state': 0}
CV SCORE:                                           
AUC: 0.8766482301311187                             
AUC: 0.8975698631695969                             
AUC: 0.8815804264610305                             
AUC: 0.9003285080435519                             
AUC: 0.8933052708794614                             
Mean AUC: 0.8898864597369519                        

############## New Run ################                                           
PARAMETERS:                                                                       
params  = {'criterion': 'gini', 'max_depth': 30.0, 'max_features': 'auto', 'min_samples_leaf': 15, 'min_samples_split': 15, 'n_estimators': 60.0, 'random_state': 0}
CV SCORE:                  

PARAMETERS:                                                                            
params  = {'criterion': 'gini', 'max_depth': 48.0, 'max_features': 'auto', 'min_samples_leaf': 25, 'min_samples_split': 25, 'n_estimators': 50.0, 'random_state': 0}
CV SCORE:                                                                              
AUC: 0.8738366996831638                                                                
AUC: 0.8923535822190594                                                                
AUC: 0.8785143820292945                                                                
AUC: 0.8978622879271146                                                                
AUC: 0.8878858672340428                                                                
Mean AUC: 0.886090563818535                                                            

############## New Run ################                                                
PARAMETERS:                               

In [19]:
model_fn(RandomForestClassifier, rf_params)

############## New Run ################
PARAMETERS: 
params  = {'n_estimators': 100, 'max_depth': 21, 'oob_score': True, 'min_samples_split': 100, 'min_samples_leaf': 30}
CV SCORE: 
AUC: 0.8728489858985613
AUC: 0.8874333842638913
AUC: 0.8765140087882214
AUC: 0.8904808068160666
AUC: 0.8867049732530324
Mean AUC: 0.8827964318039546 



-0.8827964318039546

In [30]:
rf_best_params = space_eval(rf_space, rf_best)
rf_grid(rf_best_params)

{'bagging_fraction': 0.7,
 'colsample_bytree': 0.4016942936888648,
 'feature_fraction': 0.7,
 'gamma': 0.0027646374511606456,
 'learning_rate': 0.07306088200408586,
 'max_depth': 19.0,
 'min_child_samples': 16,
 'num_leaves': 240,
 'reg_alpha': 0.26096845550603054,
 'reg_lambda': 0.8730362197991705}

############## New Run ################
PARAMETERS: 
params  = {'bagging_fraction': 0.7, 'colsample_bytree': 0.4016942936888648, 'feature_fraction': 0.7, 'gamma': 0.0027646374511606456, 'learning_rate': 0.07306088200408586, 'max_depth': 19.0, 'min_child_samples': 16, 'num_leaves': 240, 'reg_alpha': 0.26096845550603054, 'reg_lambda': 0.8730362197991705}
CV SCORE: 
AUC: 0.8840829092579178
AUC: 0.9104428561255219
AUC: 0.8993004713835214
AUC: 0.9205492045829409
AUC: 0.9244486833378995
Mean AUC: 0.9077648249375603 



-0.9077648249375603

In [None]:
submission_sample = pd.read_csv('./Data/sample_submission.csv')
test_pred = lgb_model.predict_proba(test_features)[:,1]
submission_sample['isFraud'] = test_pred
submission_sample.to_csv('./Output/LGBM_model_3.csv', index=False)

In [None]:
from ml import simple

In [None]:
train_target.columns = ['isFraud']
train = pd.concat([train_features, train_target], axis=1)
submission_sample = pd.read_csv('./Data/sample_submission.csv')
test = pd.concat([submission_sample, test_features], axis=1)
data = simple.Data(train, test, 'TransactionID', 'isFraud')

In [None]:
sub = simple.Model(data, 'LGB', lgb_best_params, 0.2, 4).PRED
sub['isFraud'] = sub['isFraud'].clip(0,1)
sub.to_csv('./Output/LGB_simple.csv', index=False)

## Train_test_split

In [None]:
sample_submission = pd.read_csv('./Data/sample_submission.csv')

In [None]:
%%time
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score

EPOCHS = 5
kf = KFold(n_splits = EPOCHS, shuffle = True)
y_preds = np.zeros(sample_submission.shape[0])
y_oof = np.zeros(train_features.shape[0])
score_mean = 0
print("CV SCORE: ")
for tr_idx, val_idx in kf.split(train_features, train_target):
    clf = lgb.LGBMClassifier(**lgb_params)
    
    X_tr, X_vl = train_features.iloc[tr_idx, :], train_features.iloc[val_idx, :]
    y_tr, y_vl = train_target.iloc[tr_idx], train_target.iloc[val_idx]
    clf.fit(X_tr, y_tr)
    y_pred_train = clf.predict_proba(X_vl)[:,1]
    y_oof[val_idx] = y_pred_train
    print('ROC AUC {}'.format(roc_auc_score(y_vl, y_pred_train)))
    
    y_preds += clf.predict_proba(test_features)[:,1] / EPOCHS

###  GridSearch

In [None]:
def rf_grid(params):
    print("############## New Run ################")
    print("PARAMETERS: ")
    print(f"params  = {params}")
    params = {
        'boosting_type': 'gbdt',
        'max_depth': int(params['max_depth']),
        'gamma': "{:.3f}".format(params['gamma']),
        'reg_alpha': "{:.3f}".format(params['reg_alpha']),
        'learning_rate': "{:.3f}".format(params['learning_rate']),
        'gamma': "{:.3f}".format(params['gamma']),
        'num_leaves': int(format(params['num_leaves'])),
        'min_child_samples': int(format(params['min_child_samples'])),
        'feature_fraction': '{:.3f}'.format(params['feature_fraction']),
        'bagging_fraction': '{:.3f}'.format(params['bagging_fraction']),
        'metric': 'auc',
        'bagging_seed': 11,
        'random_state': 0
    }
    
    EPOCHS = 5
    kf = KFold(n_splits = EPOCHS, shuffle = True)
    y_preds = np.zeros(sample_submission.shape[0])
    y_oof = np.zeros(train_features.shape[0])
    score_mean = 0
    print("CV SCORE: ")
    for tr_idx, val_idx in kf.split(train_features, train_target):
        clf = lgb.LGBMClassifier(
            **params
        )

        X_tr, X_vl = train_features.iloc[tr_idx, :], train_features.iloc[val_idx, :]
        y_tr, y_vl = train_target.iloc[tr_idx], train_target.iloc[val_idx]
        clf.fit(X_tr, y_tr)
        y_pred_train = clf.predict_proba(X_vl)[:,1]
        y_oof[val_idx] = y_pred_train
        score = roc_auc_score(y_vl, y_pred_train)
        score_mean += score
        print(f'AUC: {score}')
        
    print(f'Mean AUC: {score_mean / EPOCHS} \n')
    joblib.dump(clf, "./Model/LGB_model_cv.m")
    return -(score_mean / EPOCHS)

In [None]:
rf_best = fmin(fn=rf_grid,
            space=rf_space,
            algo=tpe.suggest,
            max_evals=20)

## Reference
- https://www.kaggle.com/smerllo/identify-unique-cards-id