# IEEE-CIS Fraud Detection -- Model [ExtraTrees]

In [2]:
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('classic')
sns.set_palette('deep')
sns.set_style('white')
%matplotlib inline

In [3]:
train_features = pd.read_csv('./Data/train_features.csv' )
test_features = pd.read_csv('./Data/test_features.csv')
train_target = pd.read_csv('./Data/train_target.csv', header=None)

## Model

- Since the train and test data were collected in time series manner and there's existing a gap, split the train and validation data based using TimeSeriesSplit. 

In [4]:
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score, auc
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.externals import joblib



In [5]:
# etc params
etc_params = {'n_estimators': 100,
             'max_depth': 8,
             'min_samples_split': 100,
             'min_samples_leaf': 70}

In [6]:
def model_fn(clf, params):
    print("############## New Run ################")
    print("PARAMETERS: ")
    print(f"params  = {params}")
    
    n_splits = 5
    tscv = TimeSeriesSplit(n_splits=n_splits)
    score_mean = 0
    print("CV SCORE: ")
    for train_idx, validation_idx in tscv.split(train_features, train_target):
        model = clf(**params)

        X_train, X_validation = train_features.iloc[train_idx, :], train_features.iloc[validation_idx, :]
        y_train, y_validation = train_target.iloc[train_idx], train_target.iloc[validation_idx]

        model.fit(X_train, y_train)

        y_pred_train = model.predict_proba(X_validation)[:,1]
        score = roc_auc_score(y_validation, y_pred_train)
        score_mean += score
        print(f'AUC: {score}')
    del X_train, X_validation, y_train, y_validation, clf, y_pred_train    
    gc.collect()
    joblib.dump(model, "./Model/LGB_model.m")
    print(f'Mean AUC: {score_mean / 5} \n')
    return -(score_mean / 5)

In [7]:
model_fn(ExtraTreesClassifier, etc_params)

############## New Run ################
PARAMETERS: 
params  = {'n_estimators': 100, 'max_depth': 8, 'min_samples_split': 100, 'min_samples_leaf': 70}
CV SCORE: 
AUC: 0.8188898938775161
AUC: 0.8379024086631095
AUC: 0.8195859879752434
AUC: 0.8269533239208633
AUC: 0.819769515878774
Mean AUC: 0.8246202260631014 



-0.8246202260631014

## Gridsearch for best params

In [8]:
from hyperopt import fmin, hp, tpe, Trials, space_eval, STATUS_OK, STATUS_RUNNING

### Random Forest

In [9]:
etc_space = {
    "max_depth": hp.quniform('max_depth', 7, 50, 3),
    "min_samples_split": hp.choice('min_samples_split', [5, 10, 15, 20, 25]),
    "min_samples_leaf": hp.choice('min_samples_leaf', [5, 10, 15, 20, 25]),
    "criterion": "gini",
    "n_estimators": hp.quniform('n_estimators', 10, 500, 10),
    "max_features": 'auto',
    'random_state': 0}

In [12]:
def etc_grid(params):
    print("############## New Run ################")
    print("PARAMETERS: ")
    print(f"params  = {params}")
    params = {
        'max_depth': int(params['max_depth']),
        'min_samples_split': params['min_samples_split'],
        'min_samples_leaf': params['min_samples_leaf'],
        'criterion': params['criterion'],
        'n_estimators': int(params['n_estimators']),
        'max_features': params['max_features'],
        'random_state': 0
    }
    
    n_splits = 5
    tscv = TimeSeriesSplit(n_splits=n_splits)
    score_mean = 0
    print("CV SCORE: ")
    for train_idx, validation_idx in tscv.split(train_features, train_target):
        model = ExtraTreesClassifier(**params)

        X_train, X_validation = train_features.iloc[train_idx, :], train_features.iloc[validation_idx, :]
        y_train, y_validation = train_target.iloc[train_idx], train_target.iloc[validation_idx]

        model.fit(X_train, y_train)

        y_pred_train = model.predict_proba(X_validation)[:,1]
        score = roc_auc_score(y_validation, y_pred_train)
        score_mean += score
        print(f'AUC: {score}')
    del X_train, X_validation, y_train, y_validation, y_pred_train    
    gc.collect()
    print(f'Mean AUC: {score_mean / 5} \n')
    return -(score_mean / 5)

In [13]:
etc_best = fmin(fn=etc_grid,
            space=etc_space,
            algo=tpe.suggest,
            max_evals=20)

############## New Run ################             
PARAMETERS:                                         
params  = {'criterion': 'gini', 'max_depth': 33.0, 'max_features': 'auto', 'min_samples_leaf': 25, 'min_samples_split': 25, 'n_estimators': 480.0, 'random_state': 0}
CV SCORE:                                           
AUC: 0.860978690314488                              
AUC: 0.8704620077701809                             
AUC: 0.8596058153190592                             
  0%|          | 0/20 [15:28<?, ?it/s, best loss: ?]


KeyboardInterrupt: 

### Decision tree

In [22]:
model_fn(DecisionTreeClassifier)

In [23]:
score_mean = 0
for train_idx, validation_idx in tscv.split(train_features, train_target):
    model = DecisionTreeClassifier()

    X_train, X_validation = train_features.iloc[train_idx, :], train_features.iloc[validation_idx, :]
    y_train, y_validation = train_target.iloc[train_idx], train_target.iloc[validation_idx]

    model.fit(X_train, y_train)

    y_pred_train = model.predict_proba(X_validation)[:,1]
    score = roc_auc_score(y_validation, y_pred_train)
    score_mean += score
    print(f'AUC: {score}')

AUC: 0.6425932869797563
AUC: 0.6699192472670885
AUC: 0.6599168855337131
AUC: 0.6667743082398413
AUC: 0.6452824144108406


### Adaboost

In [24]:
model_fn(AdaBoostClassifier)

In [25]:
score_mean = 0
for train_idx, validation_idx in tscv.split(train_features, train_target):
    model = AdaBoostClassifier()

    X_train, X_validation = train_features.iloc[train_idx, :], train_features.iloc[validation_idx, :]
    y_train, y_validation = train_target.iloc[train_idx], train_target.iloc[validation_idx]

    model.fit(X_train, y_train)

    y_pred_train = model.predict_proba(X_validation)[:,1]
    score = roc_auc_score(y_validation, y_pred_train)
    score_mean += score
    print(f'AUC: {score}')

AUC: 0.837240486694629
AUC: 0.8602790351120537
AUC: 0.8479921977907815
AUC: 0.8541190045897462
AUC: 0.8606400757548586


### LGBM

In [27]:
lgb_space = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'max_depth': hp.quniform('max_depth', 7, 25, 1),
    'learning_rate': hp.uniform('learning_rate', 0.03, 0.2),
    'reg_alpha':  hp.uniform('reg_alpha', 0.01, 0.9),
    'reg_lambda': hp.uniform('reg_lambda', 0.1, 1.0),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.3, 1.0),
    'gamma': hp.uniform('gamma', 0.0, 0.8),
    'num_leaves': hp.choice('num_leaves', list(range(20, 300, 20))),       
    'min_child_samples': hp.choice('min_child_samples', list(range(10, 100, 3))),
    'feature_fraction': hp.choice('feature_fraction', [.5, .6, .7, .8, .9]),
    'bagging_fraction': hp.choice('bagging_fraction', [.5, .6, .7, .8, .9]),
    'metric': 'auc',
    'verbosity': 2, 
    'bagging_seed': 11,
    'random_state': 0
}

In [28]:
def lgb_grid(params):
    print("############## New Run ################")
    print("PARAMETERS: ")
    print(f"params  = {params}")
    params = {
        'max_depth': int(params['max_depth']),
        'gamma': "{:.3f}".format(params['gamma']),
        'reg_alpha': "{:.3f}".format(params['reg_alpha']),
        'learning_rate': "{:.3f}".format(params['learning_rate']),
        'gamma': "{:.3f}".format(params['gamma']),
        'num_leaves': int(format(params['num_leaves'])),
        'min_child_samples': int(format(params['min_child_samples'])),
        'feature_fraction': '{:.3f}'.format(params['feature_fraction']),
        'bagging_fraction': '{:.3f}'.format(params['bagging_fraction'])
    }
    
    n_splits = 5
    tscv = TimeSeriesSplit(n_splits=n_splits)
    score_mean = 0
    print("CV SCORE: ")
    for train_idx, validation_idx in tscv.split(train_features, train_target):
        model = lgb.LGBMClassifier(**params)

        X_train, X_validation = train_features.iloc[train_idx, :], train_features.iloc[validation_idx, :]
        y_train, y_validation = train_target.iloc[train_idx], train_target.iloc[validation_idx]

        model.fit(X_train, y_train)

        y_pred_train = model.predict_proba(X_validation)[:,1]
        score = roc_auc_score(y_validation, y_pred_train)
        score_mean += score
        print(f'AUC: {score}')
    del X_train, X_validation, y_train, y_validation, y_pred_train    
    gc.collect()
    print(f'Mean AUC: {score_mean / 5} \n')
    return -(score_mean / 5)

In [29]:
lgb_best = fmin(fn=lgb_grid,
            space=lgb_space,
            algo=tpe.suggest,
            max_evals=20)

############## New Run ################             
PARAMETERS:                                         
params  = {'bagging_fraction': 0.6, 'bagging_seed': 11, 'boosting_type': 'gbdt', 'colsample_bytree': 0.8268678324284169, 'feature_fraction': 0.5, 'gamma': 0.11884074228478392, 'learning_rate': 0.173267744134551, 'max_depth': 24.0, 'metric': 'auc', 'min_child_samples': 76, 'num_leaves': 60, 'objective': 'binary', 'random_state': 0, 'reg_alpha': 0.6991739187569014, 'reg_lambda': 0.6678428807556946, 'verbosity': 2}
CV SCORE:                                           
AUC: 0.8678263377617448                             
AUC: 0.899476533986335                              
AUC: 0.8908991251337606                             
AUC: 0.9094153270912935                             
AUC: 0.9119449009263911                             
Mean AUC: 0.8959124449799051                        

############## New Run ################                                       
PARAMETERS:                

CV SCORE:                                                                      
AUC: 0.8612958082294765                                                        
AUC: 0.8956835457737965                                                        
AUC: 0.8805415078030998                                                        
AUC: 0.9131724886414996                                                        
AUC: 0.9063641622002886                                                        
Mean AUC: 0.8914115025296322                                                   

############## New Run ################                                        
PARAMETERS:                                                                    
params  = {'bagging_fraction': 0.5, 'bagging_seed': 11, 'boosting_type': 'gbdt', 'colsample_bytree': 0.5112254012045548, 'feature_fraction': 0.5, 'gamma': 0.5106186558870042, 'learning_rate': 0.11565196092669452, 'max_depth': 13.0, 'metric': 'auc', 'min_child_samples': 52, 'num_

In [30]:
lgb_best_params = space_eval(lgb_space, lgb_best)
lgb_best_params

{'bagging_fraction': 0.7,
 'colsample_bytree': 0.4016942936888648,
 'feature_fraction': 0.7,
 'gamma': 0.0027646374511606456,
 'learning_rate': 0.07306088200408586,
 'max_depth': 19.0,
 'min_child_samples': 16,
 'num_leaves': 240,
 'reg_alpha': 0.26096845550603054,
 'reg_lambda': 0.8730362197991705}

In [31]:
lgb_grid(lgb_best_params)

############## New Run ################
PARAMETERS: 
params  = {'bagging_fraction': 0.7, 'colsample_bytree': 0.4016942936888648, 'feature_fraction': 0.7, 'gamma': 0.0027646374511606456, 'learning_rate': 0.07306088200408586, 'max_depth': 19.0, 'min_child_samples': 16, 'num_leaves': 240, 'reg_alpha': 0.26096845550603054, 'reg_lambda': 0.8730362197991705}
CV SCORE: 
AUC: 0.8840829092579178
AUC: 0.9104428561255219
AUC: 0.8993004713835214
AUC: 0.9205492045829409
AUC: 0.9244486833378995
Mean AUC: 0.9077648249375603 



-0.9077648249375603

## Ensemble model -- Stacking

In [35]:
clfs = [lgb.LGBMClassifier(**lgb_params),
        xgb.XGBClassifier(**xgb_params), 
        RandomForestClassifier(**rf_params),
        ExtraTreesClassifier(), 
        DecisionTreeClassifier(), 
        AdaBoostClassifier()]
lr = LogisticRegression()
sclf = StackingClassifier(classifiers=clfs, 
                          meta_classifier=lr)

In [36]:
%%time
scores = []
score = 0
for clf, label in zip(clfs + [sclf], 
                      ['LGBM', 
                       'Xgboost',
                       'Random Forest', 
                       'Extra Trees',
                       'DecisionTree',
                       'Adaboost',
                       'StackingClassifier']):
    
    for i, (train_idx, validation_idx) in enumerate(tscv.split(train_features, train_target)):
        
        print(f'[{label}] Start training {i+1} fold')
    
        X_train, X_validation = train_features.iloc[train_idx, :], train_features.iloc[validation_idx, :]
        y_train, y_validation = train_target.iloc[train_idx], train_target.iloc[validation_idx]
 
        clf.fit(X_train, y_train)  
 
        y_pred_train = clf.predict_proba(X_validation)[:,1]
        score = roc_auc_score(y_validation, y_pred_train)
        scores.append(score)
        print('            Validation AUC: ', score)

    print("[%s] Accuracy: %0.4f (+/- %0.2f)" % (label, np.mean(scores), np.std(scores)))

In [37]:
train_features.head()

Unnamed: 0,TransactionID,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,addr1,...,DeviceInfo,Month,Weekday,Day,Hour,TransactionAmt_to_mean_card1,TransactionAmt_to_mean_card4,D15_to_mean_addr1,P_emaildomain_2,R_emaildomain_2
0,2987000,68.5,4,4248,0,51,2,43,2,216,...,0,12,5,2,0,0.19458,0.257812,0.0,0,0
1,2987001,29.0,4,9979,305,51,3,3,2,226,...,0,12,5,2,0,0.123779,0.218994,0.0,2,0
2,2987002,59.0,4,11850,391,51,4,67,3,231,...,0,12,5,2,0,0.60791,0.443115,1.611328,2,0
3,2987003,50.0,4,8796,468,51,3,18,3,377,...,0,12,5,2,0,0.405029,0.377686,0.686035,2,0
4,2987004,50.0,1,11687,415,51,3,3,2,321,...,1566,12,5,2,0,0.515625,0.377686,-999.0,2,0


## Reference
- https://www.kaggle.com/smerllo/identify-unique-cards-id