In [1]:
import pandas as pd
import numpy as np
import catboost as cb
import optuna

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error as mse

In [2]:
df = pd.read_csv('../../datasets/loan_data_clean.csv')

In [3]:
df.head()

Unnamed: 0.1,Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,0,1,0,0.0,0,0,376,0.0,146.412162,360.0,1.0,2,1
1,1,1,1,1.0,0,0,306,1508.0,128.0,360.0,1.0,0,0
2,2,1,1,0.0,0,1,139,0.0,66.0,360.0,1.0,2,1
3,3,1,1,0.0,1,0,90,2358.0,120.0,360.0,1.0,2,1
4,4,1,0,0.0,0,0,381,0.0,141.0,360.0,1.0,2,1


In [4]:
df.columns

Index(['Unnamed: 0', 'Gender', 'Married', 'Dependents', 'Education',
       'Self_Employed', 'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History', 'Property_Area', 'Loan_Status'],
      dtype='object')

In [None]:
x_cols = ['Gender', 'Married', 'Dependents', 'Education',
       'Self_Employed', 'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History', 'Property_Area']

In [5]:
x = df[x_cols]
y = df['Loan_Status']

In [6]:
x

Unnamed: 0.1,Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,0,1,0,0.0,0,0,376,0.0,146.412162,360.0,1.0,2
1,1,1,1,1.0,0,0,306,1508.0,128.000000,360.0,1.0,0
2,2,1,1,0.0,0,1,139,0.0,66.000000,360.0,1.0,2
3,3,1,1,0.0,1,0,90,2358.0,120.000000,360.0,1.0,2
4,4,1,0,0.0,0,0,381,0.0,141.000000,360.0,1.0,2
...,...,...,...,...,...,...,...,...,...,...,...,...
609,609,0,0,0.0,0,0,125,0.0,71.000000,360.0,1.0,0
610,610,1,1,3.0,0,0,275,0.0,40.000000,180.0,1.0,0
611,611,1,1,1.0,0,0,431,240.0,253.000000,360.0,1.0,2
612,612,1,1,2.0,0,0,422,0.0,187.000000,360.0,1.0,2


In [7]:
y

0      1
1      0
2      1
3      1
4      1
      ..
609    1
610    1
611    1
612    1
613    0
Name: Loan_Status, Length: 614, dtype: int64

In [8]:
def objective(trial):
    x_train, x_validate, y_train, y_validate = train_test_split(x, y, test_size=0.3)
    
    param = {
        'objective': trial.suggest_categorical('objective', ['Logloss', 'CrossEntropy']),
        'colsample_bylevel': trial.suggest_float('colsample_bylevel', 0.01, 0.1),
        'depth' : trial.suggest_int('depth', 1, 12),
        'boosting_type' : trial.suggest_categorical('boosting_type', ['Ordered', 'Plain']),
        'bootstrap_type' : trial.suggest_categorical('bootstrap_type', ['Bayesian', 'Bernoulli', 'MVS']),
        'used_ram_limit' : '3gb',
    }
    
    if param['bootstrap_type'] == 'Bayesian': param['bagging_temperature'] = trial.suggest_float('bagging_temperature', 0, 10)
    
    elif param['bootstrap_type'] == 'Bernoulli' : param['subsample'] = trial.suggest_float('subsample', 0.1, 1)
    
    gbm = cb.CatBoostClassifier(**param)
    
    gbm.fit(x_train, y_train, eval_set=[(x_validate, y_validate)], verbose=0, early_stopping_rounds=100)
    
    preds = gbm.predict(x_validate)
    pred_labels = np.rint(preds)
    accuracy = accuracy_score(y_validate, pred_labels)
    
    return accuracy

In [9]:
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50, timeout=600)

[32m[I 2022-04-29 09:43:47,553][0m A new study created in memory with name: no-name-0af92c44-0de9-492b-b16b-ac7673778a00[0m
[32m[I 2022-04-29 09:43:52,141][0m Trial 0 finished with value: 0.8162162162162162 and parameters: {'objective': 'CrossEntropy', 'colsample_bylevel': 0.055208223588188304, 'depth': 5, 'boosting_type': 'Plain', 'bootstrap_type': 'Bernoulli', 'subsample': 0.2408661109451111}. Best is trial 0 with value: 0.8162162162162162.[0m
[32m[I 2022-04-29 09:44:05,177][0m Trial 1 finished with value: 0.8 and parameters: {'objective': 'CrossEntropy', 'colsample_bylevel': 0.08874437205243008, 'depth': 9, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 0.09669101938940439}. Best is trial 0 with value: 0.8162162162162162.[0m
[32m[I 2022-04-29 09:44:07,777][0m Trial 2 finished with value: 0.7837837837837838 and parameters: {'objective': 'CrossEntropy', 'colsample_bylevel': 0.05875379187213092, 'depth': 12, 'boosting_type': 'Plain', 'bootst

In [10]:
print('NUmber of finshed trials: {}'.format(len(study.trials)))
print('Best trial:')

trial = study.best_trial

print(' Value: {}'.format(trial.value))

print(' Params: ')
for key, value in trial.params.items():
    print(' {}: {}'.format(key, value))

NUmber of finshed trials: 22
Best trial:
 Value: 0.8486486486486486
 Params: 
 objective: CrossEntropy
 colsample_bylevel: 0.07354786984147887
 depth: 10
 boosting_type: Ordered
 bootstrap_type: Bernoulli
 subsample: 0.9878200221325684


In [11]:
train_x, test_x, train_y, test_y = train_test_split(x, y, test_size=0.2, random_state=30)


In [32]:
cb_loan = cb.CatBoostClassifier(n_estimators=50,
                                objective='CrossEntropy',
                                colsample_bylevel=0.07,
                                depth=10,
                                boosting_type='Plain',
                                bootstrap_type='Bernoulli',
                                subsample=0.98,
                                loss_function='Logloss',
                                learning_rate=0.1,
                                task_type='CPU',
                                random_state=1,
                                verbose=False)

In [33]:
cb_loan.fit(train_x, train_y)

<catboost.core.CatBoostClassifier at 0x223923a4a30>

In [34]:
loan_predicted = cb_loan.predict(test_x)

In [35]:
model_rmse = np.sqrt(mse(test_y, loan_predicted))

In [36]:
model_rmse

0.4597631061983315