## Dependencies

In [4]:
# import
import warnings
warnings.filterwarnings('ignore')

import os, sys, gc, warnings, random, datetime

import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_auc_score, f1_score, precision_recall_curve, accuracy_score
from sklearn.model_selection import KFold, StratifiedKFold

import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostClassifier, Pool

from matplotlib import pyplot as plt
import seaborn as sns

from IPython.display import display
from bayes_opt import BayesianOptimization



from sklearn import metrics
from sklearn.model_selection import train_test_split


# Show up to 200 columns when a DataFrame is displayed, so wide frames are not elided.
pd.set_option('display.max_columns', 200)

# Alternative, more permissive display limits (currently disabled):
# pd.options.display.max_rows = 10000
# pd.options.display.max_columns = 1000
# pd.options.display.max_colwidth = 1000

In [5]:
### Deterministic

def seed_everything(seed=0):
    """Seed the random number sources used by this notebook.

    Seeds Python's ``random`` module and NumPy's global generator with
    ``seed``, and stores ``str(seed)`` in the ``PYTHONHASHSEED`` environment
    variable.

    Parameters
    ----------
    seed : int, default 0
        Value passed to both seeding functions.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seed_fn in (random.seed, np.random.seed):
        seed_fn(seed)
    
# Single seed for the notebook: used here for the global RNGs and reused later
# by the Bayesian optimizer, the CatBoost parameters and the K-fold splitter.
SEED = 42
seed_everything(SEED)

In [6]:
### Load Data
# Columns that must never be used as model inputs.
remove_features = ['id','loan_status']

# NOTE(review): read_pickle executes arbitrary code embedded in the file;
# only load pickles that come from a trusted source.
df = pd.read_pickle('../input/lending-club-feature-selection/lgb_selected.pkl')

# Every remaining column is a model feature.
features = [col for col in df.columns if col not in remove_features]

In [7]:
# 80/20 hold-out split of the full frame with a fixed random_state.
# NOTE(review): train_df / test_df are not referenced again in the visible part of the notebook.
train_df , test_df = train_test_split(df,test_size = 0.2, random_state = 2020)

In [8]:
# Load Data with selected features
X = pd.read_pickle('../input/lending-club-feature-selection/lgb_selected.pkl')

# Split the label column off the frame: `pop` returns it and removes it from X in place.
y = X.pop('loan_status')

# Final Data Shapes
for part in (X, y):
    print(part.shape)

(2260668, 40)
(2260668,)


In [9]:
# 80/20 hold-out split of features and labels; same test_size and random_state
# as the earlier split of `df`.
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.2, random_state = 2020)

In [10]:
# LABEL ENCODING
# A column is encoded when it is of object dtype in either split or has fewer
# than 300 distinct values in the training split; 'age' is never encoded.
cat_feat = []
for col in X_train.columns:
    if col == 'age':
        continue

    is_object = 'object' in (X_train[col].dtype.name, X_test[col].dtype.name)
    if not (is_object or X_train[col].nunique() < 300):
        continue

    cat_feat.append(col)

    # Fit on the values of both splits so that transform never meets a label
    # it has not seen.
    le = LabelEncoder()
    le.fit(list(X_train[col].values) + list(X_test[col].values))
    X_train[col] = le.transform(list(X_train[col].values))
    X_test[col] = le.transform(list(X_test[col].values))

print('categorical feature:', cat_feat)

categorical feature: ['last_fico_range_high', 'debt_settlement_flag', 'fico_range_low', 'hardship_flag', 'emp_length', 'term', 'num_rev_accts', 'all_util', 'application_type', 'mths_since_last_record', 'il_util', 'pub_rec_bankruptcies', 'sub_grade', 'total_acc', 'total_cu_tl', 'pymnt_plan', 'addr_state']


## Bayesian Optimization

In [11]:
import catboost as cgb

def cat_hyp(depth, bagging_temperature,  learning_rate):
    """Objective function for the Bayesian optimizer.

    Runs a 3-fold CatBoost cross-validation (100 iterations, Logloss loss,
    AUC eval metric) on the module-level ``X_train`` / ``y_train`` and returns
    the highest mean test AUC reached at any iteration.

    Parameters
    ----------
    depth : float
        Tree depth proposed by the optimizer; rounded to the nearest integer
        before being handed to CatBoost.
    bagging_temperature : float
        Passed through unchanged.
    learning_rate : float
        Passed through unchanged.

    Returns
    -------
    float
        Maximum of the ``'test-AUC-mean'`` column of the CV results.
    """
    params = {
        "iterations": 100,
        'eval_metric': 'AUC',
        'loss_function': 'Logloss',
        "verbose": False,
        "depth": int(round(depth)),
        "bagging_temperature": bagging_temperature,
        "learning_rate": learning_rate,
    }

    # No column is declared categorical to CatBoost here.
    # NOTE(review): presumably because the columns were already label-encoded
    # into integers earlier in the notebook — confirm.
    cat_feat = []
    cv_dataset = cgb.Pool(data=X_train, label=y_train, cat_features=cat_feat)

    scores = cgb.cv(cv_dataset, params, fold_count=3)
    print (scores)
    return np.max(scores['test-AUC-mean'])

In [12]:
# Search space: (lower, upper) bounds explored for each CatBoost hyperparameter.
# 'l2_leaf_reg' with bounds (0.1, 10) is currently left out of the search.
pds = dict(
    depth=(4, 10),
    bagging_temperature=(0.1, 10),
    learning_rate=(0.1, 0.2),
)

In [13]:
# Maximize cat_hyp (best mean CV AUC) over the bounds in `pds`, seeded with SEED.
# init_points / n_iter: 10 initial evaluations followed by 10 optimization steps.
# Each evaluation runs a full 3-fold CatBoost CV, so this cell is expensive.
optimizer = BayesianOptimization(cat_hyp, pds, random_state=SEED)
optimizer.maximize(init_points=10, n_iter=10)

|   iter    |  target   | baggin... |   depth   | learni... |
-------------------------------------------------------------
    iterations  test-AUC-mean  test-AUC-std  test-Logloss-mean  \
0            0       0.982917      0.000895           0.333160   
1            1       0.987546      0.000435           0.184724   
2            2       0.988780      0.000278           0.125980   
3            3       0.990729      0.000311           0.094006   
4            4       0.992448      0.000247           0.073704   
..         ...            ...           ...                ...   
95          95       0.995732      0.000234           0.035708   
96          96       0.995739      0.000245           0.035661   
97          97       0.995752      0.000249           0.035576   
98          98       0.995752      0.000252           0.035577   
99          99       0.995762      0.000233           0.035537   

    test-Logloss-std  train-Logloss-mean  train-Logloss-std  
0           0.000930 

In [14]:
# Hyperparameters of the best evaluation found so far; 'depth' is reported as
# a float because the optimizer searches a continuous range.
optimizer.max['params']

{'bagging_temperature': 10.0, 'depth': 10.0, 'learning_rate': 0.2}

## Final Model Training and Prediction (Parameters from Bayesian Optimization)

In [15]:
# catboost model params
# Best point found by the Bayesian optimizer.
best_params = optimizer.max['params']

cat_params = {
    # Upper bound on boosting rounds; early stopping (od_wait) normally ends training sooner.
    'n_estimators': 10000,
    'learning_rate': best_params['learning_rate'],
    'eval_metric': 'AUC',
    'loss_function': 'Logloss',
    'random_seed': SEED,
    # Evaluate / log the metric every 100 iterations.
    'metric_period': 100,
    # Overfitting detector: iterations to wait after the best metric value before stopping.
    'od_wait': 100,
    # The optimizer searches depth as a continuous value, while the objective
    # scored int(round(depth)). Apply the same rounding here so the final
    # model uses exactly the depth that was evaluated (and always an integer).
    'depth': int(round(best_params['depth'])),
    'bagging_temperature': best_params['bagging_temperature']
}

In [16]:
df = pd.read_pickle('../input/lending-club-feature-selection/lgb_selected.pkl')

# Same 80/20 hold-out split (random_state 2020) as used earlier in the notebook.
train , test = train_test_split(df,test_size = 0.2, random_state = 2020)

# Detach the target column from each split (pop removes it from the frame in place).
y_train = train.pop('loan_status')
y_test = test.pop('loan_status')

# Give every piece a fresh 0..n-1 index so row positions and index labels coincide.
for part in (train, test, y_train, y_test):
    part.reset_index(drop=True, inplace=True)

In [17]:
def make_cat_prediction(train, y, test, features, categorical_features=None, model_params=None, folds=5):
    """Train CatBoost with stratified K-fold CV and predict on a test set.

    For each fold a fresh ``CatBoostClassifier`` is fitted on the training
    part, evaluated on the validation part (``use_best_model=True``), and used
    to predict the test set. Test predictions are averaged over folds.

    Parameters
    ----------
    train : pandas.DataFrame
        Training rows; must contain every column named in ``features``.
    y : pandas.Series or array-like
        Labels aligned row-by-row (by position) with ``train``.
    test : pandas.DataFrame
        Rows to predict; must contain every column named in ``features``.
    features : list of str
        Columns used as model inputs.
    categorical_features : list or None
        Forwarded to ``CatBoostClassifier.fit`` as ``cat_features``.
    model_params : dict or None
        Keyword arguments for ``CatBoostClassifier``; ``None`` means defaults.
    folds : int, default 5
        Number of stratified folds.

    Returns
    -------
    y_oof : numpy.ndarray
        Out-of-fold positive-class probabilities, one per training row.
    y_preds : numpy.ndarray
        Fold-averaged positive-class probabilities, one per test row.
    feature_importance : pandas.DataFrame
        One ``'feature'`` column plus one ``'fold_<k>'`` importance column per fold.

    Notes
    -----
    The printed score is micro-averaged F1 on probabilities rounded to 0/1,
    which for single-label classification is the same number as accuracy.
    """
    skf = StratifiedKFold(n_splits=folds, random_state=SEED, shuffle=True)

    # The declared default (None) cannot be unpacked with **; fall back to an empty dict.
    model_params = model_params or {}

    x_train = train[features]
    x_test = test[features]

    # Fold indices produced by the splitter are row POSITIONS. Index the labels
    # through a plain array (and the features through .iloc below) so the
    # result is correct even when the caller did not reset the pandas index.
    y_values = np.asarray(y)

    y_preds = np.zeros(x_test.shape[0])
    y_oof = np.zeros(x_train.shape[0])
    score = 0

    feature_importance = pd.DataFrame()
    feature_importance['feature'] = features

    for fold, (tr_idx, val_idx) in enumerate(skf.split(x_train, y_values)):
        print(f'Fold: {fold+1}')

        x_tr, x_val = x_train.iloc[tr_idx], x_train.iloc[val_idx]
        y_tr, y_val = y_values[tr_idx], y_values[val_idx]

        print(x_tr.shape, x_val.shape)

        clf = CatBoostClassifier(**model_params)
        clf.fit(x_tr, y_tr, eval_set=(x_val, y_val),
                cat_features=categorical_features,
                use_best_model=True,
                verbose=True)

        feature_importance[f'fold_{fold+1}'] = clf.feature_importances_

        y_pred_val = clf.predict_proba(x_val)[:,1]
        y_oof[val_idx] = y_pred_val

        # Compute the fold metric once and reuse it for the log line and the mean.
        fold_score = f1_score(y_val, np.round(y_pred_val), average='micro')
        print(f"Fold {fold + 1} | F1 Score: {fold_score}")

        score += fold_score / folds
        y_preds += clf.predict_proba(x_test)[:,1] / folds

        # Release per-fold data before the next fold is materialized.
        del x_tr, x_val, y_tr, y_val
        gc.collect()

    print(f"\nMean F1 score = {score}")
    print(f"OOF F1 score = {f1_score(y_values, np.round(y_oof), average='micro')}")

    return y_oof, y_preds, feature_importance

In [20]:
# Overwrite the earlier list: no column is declared categorical for the final model.
cat_feat = []

In [21]:
# Run the K-fold CatBoost training with the tuned parameters; keeps the
# out-of-fold predictions, the fold-averaged test predictions and the
# per-fold feature importances.
y_oof_cat, y_preds_cat, fi_cat = make_cat_prediction(train, y_train, test, features, \
                                                     categorical_features=cat_feat, model_params=cat_params)

Fold: 1
(1446827, 40) (361707, 40)




0:	test: 0.9809463	best: 0.9809463 (0)	total: 845ms	remaining: 2h 20m 48s
100:	test: 0.9959185	best: 0.9959197 (99)	total: 1m 19s	remaining: 2h 9m 17s
200:	test: 0.9960215	best: 0.9960215 (200)	total: 2m 36s	remaining: 2h 7m 12s
300:	test: 0.9960404	best: 0.9960517 (293)	total: 3m 55s	remaining: 2h 6m 33s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.9960516946
bestIteration = 293

Shrink model to first 294 iterations.
Fold 1 | F1 Score: 0.9897181973254595
Fold: 2
(1446827, 40) (361707, 40)




0:	test: 0.9812374	best: 0.9812374 (0)	total: 798ms	remaining: 2h 12m 57s
100:	test: 0.9959130	best: 0.9959161 (99)	total: 1m 19s	remaining: 2h 9m 15s
200:	test: 0.9959644	best: 0.9959719 (156)	total: 2m 37s	remaining: 2h 7m 36s
300:	test: 0.9960091	best: 0.9960457 (270)	total: 3m 55s	remaining: 2h 6m 43s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.9960456758
bestIteration = 270

Shrink model to first 271 iterations.
Fold 2 | F1 Score: 0.9895550818756618
Fold: 3
(1446827, 40) (361707, 40)




0:	test: 0.9799793	best: 0.9799793 (0)	total: 801ms	remaining: 2h 13m 32s
100:	test: 0.9958760	best: 0.9958779 (99)	total: 1m 18s	remaining: 2h 7m 52s
200:	test: 0.9959831	best: 0.9959879 (193)	total: 2m 35s	remaining: 2h 6m 35s
300:	test: 0.9960026	best: 0.9960185 (273)	total: 3m 54s	remaining: 2h 5m 52s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.9960185241
bestIteration = 273

Shrink model to first 274 iterations.
Fold 3 | F1 Score: 0.9895799638934275
Fold: 4
(1446827, 40) (361707, 40)




0:	test: 0.9809740	best: 0.9809740 (0)	total: 774ms	remaining: 2h 8m 58s
100:	test: 0.9959365	best: 0.9959384 (96)	total: 1m 18s	remaining: 2h 7m 43s
200:	test: 0.9960286	best: 0.9960357 (183)	total: 2m 36s	remaining: 2h 6m 55s
300:	test: 0.9960410	best: 0.9960410 (300)	total: 3m 53s	remaining: 2h 5m 28s
400:	test: 0.9960241	best: 0.9960410 (300)	total: 5m 11s	remaining: 2h 4m 14s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.9960410175
bestIteration = 300

Shrink model to first 301 iterations.
Fold 4 | F1 Score: 0.9895937872366307
Fold: 5
(1446828, 40) (361706, 40)




0:	test: 0.9794702	best: 0.9794702 (0)	total: 871ms	remaining: 2h 25m 13s
100:	test: 0.9957474	best: 0.9957581 (95)	total: 1m 19s	remaining: 2h 9m 47s
200:	test: 0.9958964	best: 0.9958986 (195)	total: 2m 37s	remaining: 2h 7m 46s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.9958985859
bestIteration = 195

Shrink model to first 196 iterations.
Fold 5 | F1 Score: 0.9892537032838825

Mean F1 score = 0.9895401467230125
OOF F1 score = 0.9895401468813968
