In [1]:
import pandas as pd
import numpy as np
import catboost as cb
import optuna

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error as mse

In [2]:
df = pd.read_csv('../../../datasets/synthesized_diabetes_data.csv')

In [3]:
df.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,1,1,1.015491,0,0,60,2263.426739,120.264574,362.034926,0.999618,2,1
1,1,1,2.79355,1,2,299,207.078743,97.333547,185.855127,0.99445,1,1
2,0,0,1.020575,0,1,497,-222.105153,98.957504,361.411489,0.999132,0,1
3,1,1,0.967628,0,0,35,1874.200797,141.121981,360.427013,0.999273,2,1
4,1,1,1.929354,0,0,340,47.080906,322.936778,360.013915,0.979882,0,1


In [4]:
df.columns

Index(['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed',
       'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History', 'Property_Area', 'Loan_Status'],
      dtype='object')

In [5]:
x = df.drop(columns=['Loan_Status'])
y = df['Loan_Status']

In [6]:
x

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,1,1,1.015491,0,0,60,2263.426739,120.264574,362.034926,0.999618,2
1,1,1,2.793550,1,2,299,207.078743,97.333547,185.855127,0.994450,1
2,0,0,1.020575,0,1,497,-222.105153,98.957504,361.411489,0.999132,0
3,1,1,0.967628,0,0,35,1874.200797,141.121981,360.427013,0.999273,2
4,1,1,1.929354,0,0,340,47.080906,322.936778,360.013915,0.979882,0
...,...,...,...,...,...,...,...,...,...,...,...
3995,1,1,2.118012,0,0,248,2693.049737,176.172212,360.665439,1.007047,1
3996,0,0,-0.002574,0,0,249,1949.728061,117.277610,361.044262,0.986177,1
3997,1,1,0.010152,0,0,300,2882.202092,136.098872,361.722296,0.994603,0
3998,0,1,-0.002625,0,0,385,182.631094,191.733185,360.723093,0.989029,1


In [7]:
y

0       1
1       1
2       1
3       1
4       1
       ..
3995    1
3996    1
3997    1
3998    0
3999    1
Name: Loan_Status, Length: 4000, dtype: int64

In [8]:
def objective(trial):
    x_train, x_validate, y_train, y_validate = train_test_split(x, y, test_size=0.3)
    
    param = {
        'objective': trial.suggest_categorical('objective', ['Logloss', 'CrossEntropy']),
        'colsample_bylevel': trial.suggest_float('colsample_bylevel', 0.01, 0.1),
        'depth' : trial.suggest_int('depth', 1, 12),
        'boosting_type' : trial.suggest_categorical('boosting_type', ['Ordered', 'Plain']),
        'bootstrap_type' : trial.suggest_categorical('bootstrap_type', ['Bayesian', 'Bernoulli', 'MVS']),
        'used_ram_limit' : '3gb',
    }
    
    if param['bootstrap_type'] == 'Bayesian': param['bagging_temperature'] = trial.suggest_float('bagging_temperature', 0, 10)
    
    elif param['bootstrap_type'] == 'Bernoulli' : param['subsample'] = trial.suggest_float('subsample', 0.1, 1)
    
    gbm = cb.CatBoostClassifier(**param)
    
    gbm.fit(x_train, y_train, eval_set=[(x_validate, y_validate)], verbose=0, early_stopping_rounds=100)
    
    preds = gbm.predict(x_validate)
    pred_labels = np.rint(preds)
    accuracy = accuracy_score(y_validate, pred_labels)
    
    return accuracy

In [9]:
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50, timeout=600)

[32m[I 2022-05-04 11:27:06,185][0m A new study created in memory with name: no-name-49ca012b-df35-4c24-9c29-2dea1075882f[0m
[32m[I 2022-05-04 11:27:18,964][0m Trial 0 finished with value: 0.9091666666666667 and parameters: {'objective': 'Logloss', 'colsample_bylevel': 0.0981800722269784, 'depth': 1, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bernoulli', 'subsample': 0.14349183693663986}. Best is trial 0 with value: 0.9091666666666667.[0m
[32m[I 2022-05-04 11:27:29,602][0m Trial 1 finished with value: 0.9183333333333333 and parameters: {'objective': 'Logloss', 'colsample_bylevel': 0.027894970249399735, 'depth': 1, 'boosting_type': 'Ordered', 'bootstrap_type': 'MVS'}. Best is trial 1 with value: 0.9183333333333333.[0m
[32m[I 2022-05-04 11:27:48,996][0m Trial 2 finished with value: 0.9158333333333334 and parameters: {'objective': 'Logloss', 'colsample_bylevel': 0.09666931727361973, 'depth': 12, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bernoulli', 'subsample': 0.313203

In [10]:
print('NUmber of finshed trials: {}'.format(len(study.trials)))
print('Best trial:')

trial = study.best_trial

print(' Value: {}'.format(trial.value))

print(' Params: ')
for key, value in trial.params.items():
    print(' {}: {}'.format(key, value))

NUmber of finshed trials: 50
Best trial:
 Value: 0.94
 Params: 
 objective: CrossEntropy
 colsample_bylevel: 0.03667750306928356
 depth: 4
 boosting_type: Plain
 bootstrap_type: Bayesian
 bagging_temperature: 0.479403316898928


In [11]:
train_x, test_x, train_y, test_y = train_test_split(x, y, test_size=0.2, random_state=30)


In [24]:
cb_loan = cb.CatBoostClassifier(n_estimators=100,
                                objective='CrossEntropy',
                                colsample_bylevel=0.09,
                                depth=4,
                                boosting_type='Plain',
                                bootstrap_type='Bernoulli',
                                subsample=0.34,
                                loss_function='Logloss',
                                learning_rate=0.1,
                                task_type='CPU',
                                random_state=1,
                                verbose=False)

In [25]:
cb_loan.fit(train_x, train_y)

<catboost.core.CatBoostClassifier at 0x1d92e669ca0>

In [26]:
loan_predicted = cb_loan.predict(test_x)

In [27]:
model_rmse = np.sqrt(mse(test_y, loan_predicted))

In [28]:
model_rmse

0.2936835031117683