In [15]:
import pandas as pd
import numpy as np

from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

from time import time
import pprint
import joblib
import warnings
import sklearn

from catboost import CatBoostClassifier

from sklearn.metrics import roc_auc_score
from sklearn.metrics import mean_squared_error as mse 
from sklearn.metrics import make_scorer

from skopt import BayesSearchCV
from skopt.callbacks import DeadlineStopper, VerboseCallback, DeltaXStopper
from skopt.space import Real, Categorical, Integer

In [16]:
# Load the pre-cleaned loan dataset from the repo-relative datasets folder.
# NOTE(review): the columns listing further down shows an 'Unnamed: 0' column,
# which looks like a saved index; confirm whether index_col=0 is wanted here.
loan_csv_path = '../../datasets/loan_data_clean.csv'
df = pd.read_csv(loan_csv_path)

In [17]:
# Render the loaded frame as the cell output (pandas abbreviates the middle
# rows with an ellipsis row when the frame is long).
df

Unnamed: 0.1,Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,0,1,0,0.0,0,0,376,0.0,146.412162,360.0,1.0,2,1
1,1,1,1,1.0,0,0,306,1508.0,128.000000,360.0,1.0,0,0
2,2,1,1,0.0,0,1,139,0.0,66.000000,360.0,1.0,2,1
3,3,1,1,0.0,1,0,90,2358.0,120.000000,360.0,1.0,2,1
4,4,1,0,0.0,0,0,381,0.0,141.000000,360.0,1.0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
609,609,0,0,0.0,0,0,125,0.0,71.000000,360.0,1.0,0,1
610,610,1,1,3.0,0,0,275,0.0,40.000000,180.0,1.0,0,1
611,611,1,1,1.0,0,0,431,240.0,253.000000,360.0,1.0,2,1
612,612,1,1,2.0,0,0,422,0.0,187.000000,360.0,1.0,2,1


In [18]:
# List the column labels so the feature/target selection can be checked
# against what was actually read from the CSV.
df.columns

Index(['Unnamed: 0', 'Gender', 'Married', 'Dependents', 'Education',
       'Self_Employed', 'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History', 'Property_Area', 'Loan_Status'],
      dtype='object')

In [19]:
# Predictor columns fed to the models; 'Unnamed: 0' and the target
# 'Loan_Status' are deliberately left out of this list.
x_cols = [
    'Gender',
    'Married',
    'Dependents',
    'Education',
    'Self_Employed',
    'ApplicantIncome',
    'CoapplicantIncome',
    'LoanAmount',
    'Loan_Amount_Term',
    'Credit_History',
    'Property_Area',
]

# Feature matrix and target vector.
x = df.loc[:, x_cols]
y = df.loc[:, 'Loan_Status']

In [20]:
# Hold out 30% of the rows for validation; the fixed seed keeps the split
# identical between runs.
x_train, x_validate, y_train, y_validate = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=12,
)

In [21]:
# Baseline model: CatBoost on CPU with no hyperparameters tuned and training
# logs silenced. Its validation score is the reference for the tuned model.
baseline_settings = dict(task_type='CPU', verbose=False)
bs_mdl = CatBoostClassifier(**baseline_settings)

# fit() is the last expression so the fitted estimator is shown as cell output.
bs_mdl.fit(x_train, y_train)

<catboost.core.CatBoostClassifier at 0x20629049670>

In [22]:
# Column 1 of predict_proba is taken as the score for each validation row.
baseline_proba = bs_mdl.predict_proba(x_validate)
y_pred = baseline_proba[:, 1]

# Root of the mean squared error between the validation labels and those
# probabilities; shown as the cell output.
score = np.sqrt(mse(y_validate, y_pred))
score

0.3991951342493785

In [23]:
#Bayes optimizer

def performance_report(optimizer, x, y, title, callbacks=None):
    """Fit a search object, print a one-line summary, and return the best params.

    Parameters
    ----------
    optimizer : search object
        Anything exposing ``fit``, ``cv_results_``, ``best_score_``,
        ``best_index_`` and ``best_params_`` after fitting.
    x, y : training features and target, forwarded unchanged to ``fit``.
    title : str
        Label printed at the start of the summary line. It is treated as
        plain text, so it may safely contain ``%`` characters.
    callbacks : list or None, optional
        When truthy, forwarded as ``fit(x, y, callbacks=callbacks)``.
        Otherwise ``fit`` is called without the keyword, so search objects
        whose ``fit`` does not accept ``callbacks`` still work.

    Returns
    -------
    The optimizer's ``best_params_``.
    """
    start = time()

    if callbacks:
        optimizer.fit(x, y, callbacks=callbacks)
    else:
        optimizer.fit(x, y)

    # Stop the clock right after fitting so reporting overhead is not counted.
    elapsed = time() - start

    results = pd.DataFrame(optimizer.cv_results_)
    best_score = optimizer.best_score_
    best_score_std = results.iloc[optimizer.best_index_].std_test_score
    best_params = optimizer.best_params_
    n_candidates = len(optimizer.cv_results_['params'])

    # The title is interpolated as data rather than concatenated into a
    # %-format template, so a '%' inside it cannot break the formatting.
    print(
        f"{title} took {elapsed:.2f} seconds, "
        f"candidates checked: {n_candidates:d}, "
        f"best CV score: {best_score:.3f}\u00B1{best_score_std:.3f}"
    )
    print('Best Parameters: ')
    pprint.pprint(best_params)

    return best_params

In [24]:
# Shuffled, stratified 5-fold splitter with a fixed seed for repeatable folds.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

# ROC-AUC wrapped as a scorer object (higher is better).
# NOTE(review): `needs_threshold` is deprecated in recent scikit-learn in
# favour of `response_method`; confirm against the installed version.
roc_auc = make_scorer(
    roc_auc_score,
    greater_is_better=True,
    needs_threshold=True,
)

In [25]:
# Estimator handed to the hyperparameter search: log-loss objective,
# iteration-based overfitting detector, two threads, training logs silenced.
clf = CatBoostClassifier(
    loss_function='Logloss',
    od_type='Iter',
    thread_count=2,
    verbose=False,
)

In [26]:
# Search space for the Bayesian optimiser: one skopt dimension per CatBoost
# hyperparameter. Learning rate and random strength are sampled log-uniformly.
baye_space = dict(
    iterations=Integer(10, 1000),
    depth=Integer(1, 8),
    learning_rate=Real(0.01, 1.0, prior='log-uniform'),
    random_strength=Real(1e-9, 10, prior='log-uniform'),
    bagging_temperature=Real(0.0, 1.0),
    border_count=Integer(1, 255),
    # NOTE(review): CatBoost spells this parameter `l2_leaf_reg` (letter "l",
    # not the digit "1"). The dimension stays disabled, as before.
    # l2_leaf_reg=Integer(2, 30),
    scale_pos_weight=Real(0.01, 1.0, prior='uniform'),
)

In [27]:
# Bayesian hyperparameter search over `baye_space` using the stratified folds.
# `scoring=roc_auc` makes the search optimise the ROC-AUC scorer built with
# make_scorer in an earlier cell; without it the search falls back to the
# estimator's default `score` method and that scorer is never used.
opt = BayesSearchCV(
    clf,
    baye_space,
    scoring=roc_auc,
    cv=skf,
    n_iter=100,
    n_jobs=1,
    return_train_score=False,
    refit=True,  # refit the best configuration on all data passed to fit()
    optimizer_kwargs={'base_estimator': 'GP'},  # Gaussian-process surrogate
    random_state=42,
)

In [28]:
# Callbacks forwarded to the search's fit(): a progress printer constructed
# with 100 (the same number as n_iter) and a deadline constructed with
# 60 * 10, i.e. 600.
search_callbacks = [VerboseCallback(100), DeadlineStopper(60 * 10)]

# Run the search on the training split and keep the best hyperparameters.
params = performance_report(
    opt,
    x_train,
    y_train,
    'Bayes CatBoost',
    callbacks=search_callbacks,
)

Iteration No: 1 started. Searching for the next optimal point.


SystemError: <method '_train' of '_catboost._CatBoost' objects> returned a result with an error set

In [None]:
# Tuned model: the hyperparameters returned by the search plus a few fixed
# CatBoost settings, trained on the training split.
tnd_mdl = CatBoostClassifier(
    **params,
    task_type='CPU',
    od_type='Iter',
    one_hot_max_size=10,
)

# fit() is the last expression so the fitted estimator is shown as cell output.
tnd_mdl.fit(x_train, y_train)

In [None]:
# Same evaluation as for the baseline: column 1 of predict_proba on the
# validation split, scored by the root of the mean squared error.
tuned_proba = tnd_mdl.predict_proba(x_validate)
y_pred = tuned_proba[:, 1]

score = np.sqrt(mse(y_validate, y_pred))
score