In [1]:
# Suppress warnings 
import warnings
warnings.filterwarnings('ignore')

import os, sys, gc, warnings, random, datetime

import pandas as pd
import numpy as np

pd.set_option('display.max_columns', 200)

# pd.options.display.max_rows = 10000
# pd.options.display.max_columns = 1000
# pd.options.display.max_colwidth = 1000

In [2]:
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_auc_score, f1_score, precision_recall_curve, accuracy_score
from sklearn.model_selection import KFold, StratifiedKFold

import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostClassifier, Pool

from matplotlib import pyplot as plt
import seaborn as sns

from IPython.display import display
from bayes_opt import BayesianOptimization


from sklearn import metrics
from sklearn.model_selection import train_test_split


In [3]:
### Deterministic

def seed_everything(seed=0):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    
SEED = 42
seed_everything(SEED)

In [4]:
path_read ="/Users/a06411/Documents/data_hub/lending_club/lgb_selected.pkl"

In [5]:
### Load Data
df = pd.read_pickle(path_read)


In [6]:
df.shape

(1382351, 63)

In [7]:
df['issue_d'] = pd.to_datetime(df['issue_d'])
df['issue_year'] = df['issue_d'].dt.year
df.loc[df['issue_year'] == 2007 , 'issue_year'] = 201200
df.loc[df['issue_year'] == 2008 , 'issue_year'] = 201200
df.loc[df['issue_year'] == 2009 , 'issue_year'] = 201200
df.loc[df['issue_year'] == 2010 , 'issue_year'] = 201200
df.loc[df['issue_year'] == 2011 , 'issue_year'] = 201200
df.loc[df['issue_year'] == 2012 , 'issue_year'] = 201200

In [8]:
pd.options.display.max_rows = 10000

In [9]:
train = df[df['issue_year'] != 2018]
test = df[df['issue_year'] == 2018]

In [10]:
answer = test[['id','loan_status']]
# test.drop('loan_status',1, inplace = True)

In [11]:
ttrain = train[train['issue_year'] != 2017]
ttest = train[train['issue_year'] == 2017]

In [12]:
X_train = ttrain.copy()
y_train = ttrain['loan_status']
X_train.drop(['loan_status'], axis = 1, inplace = True)

X_test = ttest.copy()
y_test = ttest['loan_status']
X_test.drop(['loan_status'], axis = 1, inplace = True)

In [13]:
remove_features = ['earliest_cr_line', 'issue_d', 'last_credit_pull_d', 'last_pymnt_d', 'next_pymnt_d',
                  'initial_list_status','out_prncp','out_prncp_inv','total_pymnt','total_pymnt_inv','total_rec_prncp','total_rec_int','total_rec_late_fee',
                   'recoveries','collection_recovery_fee','last_pymnt_amnt','last_fico_range_high','last_fico_range_low' , 'id', 'loan_status' , 'issue_year']
features  = [col for col in list(train) if col not in remove_features]

In [14]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1132562 entries, 0 to 2260697
Data columns (total 63 columns):
 #   Column                       Non-Null Count    Dtype         
---  ------                       --------------    -----         
 0   int_rate                     1132562 non-null  float64       
 1   dti                          1132562 non-null  float64       
 2   annual_inc                   1132562 non-null  float64       
 3   mo_sin_old_rev_tl_op         1132562 non-null  float64       
 4   acc_open_past_24mths         1132562 non-null  float64       
 5   loan_amnt                    1132562 non-null  float64       
 6   emp_length                   1132562 non-null  int64         
 7   addr_state                   1132562 non-null  int64         
 8   revol_bal                    1132562 non-null  float64       
 9   term                         1132562 non-null  int64         
 10  sub_grade                    1132562 non-null  int64         
 11  funded_amnt

In [15]:
cat_feat = [
    'debt_settlement_flag',
    'hardship_flag',
    'term',
    'application_type',
    ## 'pymnt_plan',
        'verification_status',
        'home_ownership',
        'grade',
        'purpose'
]

In [16]:
X_train = X_train[features]
X_test = X_test[features]

In [17]:
import catboost as cgb

def cat_hyp(depth, bagging_temperature,  learning_rate):
    params = {"iterations": 100,
            'eval_metric': 'AUC',
              'loss_function': 'Logloss',
            "verbose": False} 
    params["depth"] = int(round(depth)) 
    params["bagging_temperature"] = bagging_temperature
    params["learning_rate"] = learning_rate
#     params["l2_leaf_reg"] = l2_leaf_reg
  
    cat_feat = [] # Categorical features list, we have nothing in this dataset
    cv_dataset = cgb.Pool(data=X_train,
                        label=y_train,
                        cat_features=cat_feat)

    scores = cgb.cv(cv_dataset,
              params,
              fold_count=3)
    print (scores)
    return np.max(scores['test-AUC-mean']) 

In [18]:
# Search space
pds = {'depth': (4, 10),
       'bagging_temperature': (0.1,10),
#        'l2_leaf_reg': (0.1, 10),
       'learning_rate': (0.1, 0.2)
        }

In [19]:
optimizer = BayesianOptimization(cat_hyp, pds, random_state=SEED)
optimizer.maximize(init_points=10, n_iter=10)

|   iter    |  target   | baggin... |   depth   | learni... |
-------------------------------------------------------------
Training on fold [0/3]

bestTest = 0.7657601867
bestIteration = 99

Training on fold [1/3]

bestTest = 0.7675683765
bestIteration = 99

Training on fold [2/3]

bestTest = 0.7686908555
bestIteration = 99

    iterations  test-AUC-mean  test-AUC-std  test-Logloss-mean  \
0            0       0.740796      0.000126           0.589878   
1            1       0.744900      0.000732           0.528068   
2            2       0.749478      0.001388           0.488845   
3            3       0.751297      0.001205           0.464209   
4            4       0.752115      0.001170           0.448630   
5            5       0.753297      0.001469           0.438575   
6            6       0.753806      0.001184           0.431359   
7            7       0.754895      0.001311           0.426261   
8            8       0.756008      0.001143           0.422582   
9           

In [20]:
optimizer.max['params']

{'bagging_temperature': 3.807947176588889,
 'depth': 9.704285838459498,
 'learning_rate': 0.17319939418114053}

In [21]:
help(CatBoostClassifier)

Help on class CatBoostClassifier in module catboost.core:

class CatBoostClassifier(CatBoost)
 |  CatBoostClassifier(iterations=None, learning_rate=None, depth=None, l2_leaf_reg=None, model_size_reg=None, rsm=None, loss_function=None, border_count=None, feature_border_type=None, per_float_feature_quantization=None, input_borders=None, output_borders=None, fold_permutation_block=None, od_pval=None, od_wait=None, od_type=None, nan_mode=None, counter_calc_method=None, leaf_estimation_iterations=None, leaf_estimation_method=None, thread_count=None, random_seed=None, use_best_model=None, best_model_min_trees=None, verbose=None, silent=None, logging_level=None, metric_period=None, ctr_leaf_count_limit=None, store_all_simple_ctr=None, max_ctr_complexity=None, has_time=None, allow_const_label=None, target_border=None, classes_count=None, class_weights=None, auto_class_weights=None, class_names=None, one_hot_max_size=None, random_strength=None, name=None, ignored_features=None, train_dir=None, 

In [22]:
# catboost model params
cat_params = {
    'n_estimators': 10000,
    'learning_rate': optimizer.max['params']['learning_rate'],
    'eval_metric': 'AUC',
    'loss_function': 'Logloss',
    'random_seed': SEED,
    'metric_period': 100,
     ## 'pymnt_plan' : 
     ## 'disbursement_method' :
    'od_wait': 100,
    'depth': int(optimizer.max['params']['depth']),
    'bagging_temperature': optimizer.max['params']['bagging_temperature']
}

In [23]:
y_train = train.pop('loan_status')
y_test = test.pop('loan_status')
train.reset_index(drop=True, inplace=True)
test.reset_index(drop=True, inplace=True)
y_train.reset_index(drop=True, inplace=True)
y_test.reset_index(drop=True, inplace=True)

In [24]:
def make_cat_prediction(train, y, test, features, categorical_features=None, model_params=None, folds=5):
    skf = StratifiedKFold(n_splits=folds, random_state=SEED, shuffle=True)
    
    x_train = train[features]
    x_test = test[features]

    y_preds = np.zeros(x_test.shape[0])
    y_oof = np.zeros(x_train.shape[0])
    score = 0

    feature_importance = pd.DataFrame()
    feature_importance['feature'] = features

    for fold, (tr_idx, val_idx) in enumerate(skf.split(x_train, y)):
        print(f'Fold: {fold+1}')

        x_tr, x_val = x_train.loc[tr_idx, features], x_train.loc[val_idx, features]
        y_tr, y_val = y[tr_idx], y[val_idx]

        print(x_tr.shape, x_val.shape)
        
        clf = CatBoostClassifier(**model_params)
        clf.fit(x_tr, y_tr, eval_set=(x_val, y_val),
                cat_features=categorical_features,
                use_best_model=True,
                verbose=True)

        feature_importance[f'fold_{fold+1}'] = clf.feature_importances_

        best_iteration = clf.best_iteration_
        y_pred_val = clf.predict_proba(x_val)[:,1]

        y_oof[val_idx] = y_pred_val
        print(f"Fold {fold + 1} | AUC Score: {roc_auc_score(y_val, y_pred_val)}")

        score += roc_auc_score(y_val, y_pred_val) / folds
        y_preds += clf.predict_proba(x_test)[:,1] / folds

        del x_tr, x_val, y_tr, y_val
        gc.collect()

    print(f"\nMean AUC score = {score}")
    print(f"OOF AUC score = {roc_auc_score(y, y_oof)}")
    
    return y_oof, y_preds, feature_importance

In [25]:
y_oof_cat, y_preds_cat, fi_cat = make_cat_prediction(train, y_train, test, features, \
                                                     categorical_features=cat_feat, model_params=cat_params)

Fold: 1
(1051432, 60) (262858, 60)




0:	test: 0.7328113	best: 0.7328113 (0)	total: 382ms	remaining: 1h 3m 38s
100:	test: 0.7657883	best: 0.7657883 (100)	total: 32.9s	remaining: 53m 47s
200:	test: 0.7676141	best: 0.7676141 (200)	total: 1m 6s	remaining: 53m 55s
300:	test: 0.7680037	best: 0.7680098 (298)	total: 1m 39s	remaining: 53m 15s
400:	test: 0.7679161	best: 0.7681480 (326)	total: 2m 11s	remaining: 52m 25s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.7681479764
bestIteration = 326

Shrink model to first 327 iterations.
Fold 1 | AUC Score: 0.7681479764231331
Fold: 2
(1051432, 60) (262858, 60)




0:	test: 0.7355645	best: 0.7355645 (0)	total: 424ms	remaining: 1h 10m 36s
100:	test: 0.7686149	best: 0.7686149 (100)	total: 32.8s	remaining: 53m 33s
200:	test: 0.7706742	best: 0.7706760 (199)	total: 1m 5s	remaining: 53m 29s
300:	test: 0.7711488	best: 0.7712002 (291)	total: 1m 38s	remaining: 52m 47s
400:	test: 0.7710147	best: 0.7712621 (310)	total: 2m 10s	remaining: 51m 56s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.7712621346
bestIteration = 310

Shrink model to first 311 iterations.
Fold 2 | AUC Score: 0.7712621346176636
Fold: 3
(1051432, 60) (262858, 60)




0:	test: 0.7340486	best: 0.7340486 (0)	total: 411ms	remaining: 1h 8m 31s
100:	test: 0.7687331	best: 0.7687331 (100)	total: 33.4s	remaining: 54m 37s
200:	test: 0.7710036	best: 0.7710036 (200)	total: 1m 6s	remaining: 53m 50s
300:	test: 0.7712325	best: 0.7713026 (267)	total: 1m 38s	remaining: 52m 42s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.7713025658
bestIteration = 267

Shrink model to first 268 iterations.
Fold 3 | AUC Score: 0.771302565828486
Fold: 4
(1051432, 60) (262858, 60)




0:	test: 0.7363081	best: 0.7363081 (0)	total: 405ms	remaining: 1h 7m 32s
100:	test: 0.7688365	best: 0.7688365 (100)	total: 32.2s	remaining: 52m 31s
200:	test: 0.7706431	best: 0.7706431 (200)	total: 1m 5s	remaining: 52m 56s
300:	test: 0.7710706	best: 0.7710919 (297)	total: 1m 38s	remaining: 52m 38s
400:	test: 0.7710450	best: 0.7711337 (314)	total: 2m 11s	remaining: 52m 25s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.7711337442
bestIteration = 314

Shrink model to first 315 iterations.
Fold 4 | AUC Score: 0.7711337442039483
Fold: 5
(1051432, 60) (262858, 60)




0:	test: 0.7327484	best: 0.7327484 (0)	total: 395ms	remaining: 1h 5m 49s
100:	test: 0.7669345	best: 0.7669345 (100)	total: 32.5s	remaining: 53m
200:	test: 0.7691875	best: 0.7691875 (200)	total: 1m 5s	remaining: 53m 2s
300:	test: 0.7700351	best: 0.7700448 (292)	total: 1m 38s	remaining: 53m 2s
400:	test: 0.7701860	best: 0.7702004 (395)	total: 2m 11s	remaining: 52m 27s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.7702004186
bestIteration = 395

Shrink model to first 396 iterations.
Fold 5 | AUC Score: 0.7702004186435023

Mean AUC score = 0.7704093679433467
OOF AUC score = 0.7704060281171885


In [26]:
print(f"TEST AUC score = {roc_auc_score(answer['loan_status'], y_preds_cat)}")


TEST AUC score = 0.73955779986333
