In [1]:
### import
import pandas as pd
import numpy as np
import xgboost as xgb
import lightgbm as lgb
import os, sys, gc, warnings, random, datetime
from skopt import BayesSearchCV
from sklearn.metrics import roc_auc_score, f1_score, precision_recall_curve, accuracy_score
from sklearn.model_selection import StratifiedKFold
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe
from sklearn.model_selection import train_test_split

ITERATIONS = 10 

# Load data
df = pd.read_pickle('../input/lending-club-feature-selection/lgb_selected.pkl')

X = df.drop('loan_status', axis=1)

y = df['loan_status']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 2021)

In [2]:
### Deterministic
def seed_everything(seed=0):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    
SEED = 42
seed_everything(SEED)

## Bayesian Optimization

In [3]:
space={'max_depth': hp.quniform("max_depth", 3, 18, 1),
        'gamma': hp.uniform ('gamma', 1,9),
        'reg_alpha' : hp.quniform('reg_alpha', 40,180,1),
        'reg_lambda' : hp.uniform('reg_lambda', 0,1),
        'colsample_bytree' : hp.uniform('colsample_bytree', 0.5,1),
        'min_child_weight' : hp.quniform('min_child_weight', 0, 10, 1),
        'n_estimators': 5000,
        'seed': 2021
    }

In [4]:
def objective(space):
    clf=xgb.XGBClassifier(
                    n_estimators =space['n_estimators'], max_depth = int(space['max_depth']), gamma = space['gamma'],
                    reg_alpha = int(space['reg_alpha']),min_child_weight=int(space['min_child_weight']),
                    colsample_bytree=space['colsample_bytree'])
    
    evaluation = [( X_train, y_train), ( X_test, y_test)]
    
    clf.fit(X_train, y_train,
            eval_set=evaluation, eval_metric="auc",
            early_stopping_rounds=40,verbose=False)
    

    pred = clf.predict(X_test)
    auc = roc_auc_score(y_test, pred)
    print ("SCORE:", auc)
    return {'loss': auc, 'status': STATUS_OK }

In [5]:
trials = Trials()

import warnings
warnings.filterwarnings('ignore')
best_hyperparams = fmin(fn = objective,
                        space = space,
                        algo = tpe.suggest,
                        max_evals = 10,
                        trials = trials)

SCORE:
0.9633003057040391
SCORE:
0.9653330522671065
SCORE:
0.9624029141274085
SCORE:
0.9641747195811513
SCORE:
0.9627261930780876
SCORE:
0.9637333096393436
SCORE:
0.9644319293959316
SCORE:
0.9624432513970727
SCORE:
0.9640412925972864
SCORE:
0.9631904964433778
100%|██████████| 10/10 [3:32:25<00:00, 1274.56s/trial, best loss: 0.9624029141274085]


In [6]:
### print Best Hyperparameters
print("The best hyperparameters are : ","\n")
print(best_hyperparams)

The best hyperparameters are :  

{'colsample_bytree': 0.9452293247804742, 'gamma': 6.138235884890671, 'max_depth': 12.0, 'min_child_weight': 8.0, 'reg_alpha': 151.0, 'reg_lambda': 0.9627205324158953}


## Make Real Prediction (Params from Bayesian Optimization)

In [7]:
df = pd.read_pickle('../input/lending-club-feature-selection/lgb_selected.pkl')
train , test = train_test_split(df,test_size = 0.2, random_state = 2020)
y_train = train.pop('loan_status')
y_test = test.pop('loan_status')
train.reset_index(drop=True, inplace=True)
test.reset_index(drop=True, inplace=True)
y_train.reset_index(drop=True, inplace=True)
y_test.reset_index(drop=True, inplace=True)

In [8]:
remove_features = ['id','loan_status']
features = [col for col in list(train) if col not in remove_features]

In [9]:
def make_xgb_prediction(train, y, test, features, model_params=None, folds=5):
    def xgb_f1_score(y_hat, data):
        y_true = data.get_label()
        y_hat = np.round(y_hat) 
        return 'f1', 1-f1_score(y_true, y_hat, average='micro')
    
    skf = StratifiedKFold(n_splits=folds, random_state=SEED, shuffle=True)
    
    x_train = train[features]
    x_test = test[features]

    y_preds = np.zeros(x_test.shape[0])
    y_oof = np.zeros(x_train.shape[0])
    score = 0

    feature_importance = pd.DataFrame()
    feature_importance['feature'] = features

    for fold, (tr_idx, val_idx) in enumerate(skf.split(x_train, y)):
        print(f'Fold: {fold+1}')

        x_tr, x_val = x_train.loc[tr_idx, features], x_train.loc[val_idx, features]
        y_tr, y_val = y[tr_idx], y[val_idx]

        print(x_tr.shape, x_val.shape)

        dtrain = xgb.DMatrix(x_tr, label=y_tr)
        dvalid = xgb.DMatrix(x_val, label=y_val)

        clf = xgb.train(
            model_params,
            dtrain,
            num_boost_round=10000, 
            evals=[(dtrain, 'train'), (dvalid, 'valid')],
            verbose_eval=50,
            early_stopping_rounds=100,
            feval=xgb_f1_score
        )

        #feature_importance[f'fold_{fold+1}'] = clf.get_score()

        y_pred_val = clf.predict(dvalid)

        y_oof[val_idx] = y_pred_val
        print(f"Fold {fold + 1} | F1 Score: {f1_score(y_val, np.round(y_pred_val), average='micro')}")

        score += f1_score(y_val, np.round(y_pred_val), average='micro') / folds
        y_preds += clf.predict(xgb.DMatrix(x_test)) / folds

        del x_tr, x_val, y_tr, y_val
        gc.collect()

    print(f"\nMean F1 score = {score}")
    print(f"OOF F1 score = {f1_score(y, np.round(y_oof), average='micro')}")
    
    return y_oof, y_preds, feature_importance

In [10]:
# xgb model params
xgb_params = {
    'objective': 'binary:logistic',
    'learning_rate': 0.1,
    'max_depth': int(best_hyperparams['max_depth']),
    'colsample_bytree': best_hyperparams['colsample_bytree'],

    'disable_default_eval_metric': 1,

    'seed': SEED,
    'gamma' : best_hyperparams['gamma'],
    'min_child_weight' : int(best_hyperparams['min_child_weight']) , 
    'reg_alpha' : int(best_hyperparams['reg_alpha']) , 
    'reg_lambda' : best_hyperparams['reg_lambda']
} 



In [11]:
y_oof_xgb, y_preds_xgb, fi_xgb = make_xgb_prediction(train, y_train, test, features, \
                                                     model_params=xgb_params)

Fold: 1
(1446827, 40) (361707, 40)
[0]	train-f1:0.13062	valid-f1:0.13062
[50]	train-f1:0.99515	valid-f1:0.99491
[100]	train-f1:0.99580	valid-f1:0.99559
Fold 1 | F1 Score: 0.9889192080883146
Fold: 2
(1446827, 40) (361707, 40)
[0]	train-f1:0.13062	valid-f1:0.13062
[50]	train-f1:0.99493	valid-f1:0.99475
[99]	train-f1:0.99572	valid-f1:0.99557
Fold 2 | F1 Score: 0.9885763891768752
Fold: 3
(1446827, 40) (361707, 40)
[0]	train-f1:0.13062	valid-f1:0.13062
[50]	train-f1:0.99481	valid-f1:0.99495
[100]	train-f1:0.99568	valid-f1:0.99590
Fold 3 | F1 Score: 0.9887063286029852
Fold: 4
(1446827, 40) (361707, 40)
[0]	train-f1:0.13062	valid-f1:0.13062
[50]	train-f1:0.99470	valid-f1:0.99491
[99]	train-f1:0.99573	valid-f1:0.99585
Fold 4 | F1 Score: 0.9885155664667811
Fold: 5
(1446828, 40) (361706, 40)
[0]	train-f1:0.13062	valid-f1:0.13062
[50]	train-f1:0.99506	valid-f1:0.99482
[99]	train-f1:0.99571	valid-f1:0.99552
Fold 5 | F1 Score: 0.988305419318452

Mean F1 score = 0.9886045823306816
OOF F1 score = 0.9