In [1]:
import pandas as pd
import numpy as np
import catboost as cb
import optuna

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error as mse

In [2]:
# Load the synthesized diabetes dataset from its relative path into a DataFrame.
DATA_PATH = '../../../datasets/synthesized_diabetes_data.csv'
df = pd.read_csv(DATA_PATH)

In [3]:
# Show the first five rows as a quick sanity check of the loaded data.
df.head(n=5)

Unnamed: 0,chol,stab.glu,hdl,ratio,glyhb,location,age,gender,height,weight,...,bp.1d,bp.2s,bp.2d,waist,hip,time.ppn,height_meters,BMI,hip_waist_ratio,diabetic
0,156.700394,31,52.154896,3.314927,4.299748,1,7,1,68.496912,146.234609,...,85.186571,151.973658,92.697071,29.222653,34.012633,761.958003,1.66703,53.028606,0.815154,0
1,205.708528,50,74.558454,2.397619,4.750497,1,50,0,67.514518,155.252176,...,88.848128,152.708004,92.802874,32.88762,40.64047,715.021735,1.68185,42.542497,0.855346,1
2,184.25974,39,50.107744,2.436952,5.033467,1,56,0,63.226992,134.186382,...,94.615906,153.064309,92.212116,32.548504,36.741043,210.965098,1.565997,54.781513,0.863442,0
3,236.859522,40,46.163972,3.794523,5.382256,0,51,0,66.799333,202.246191,...,79.333852,151.949308,92.541618,45.72684,50.127763,801.80059,1.752456,65.815471,0.851003,1
4,189.616025,92,40.045044,3.633247,4.634788,1,10,1,69.283944,186.218617,...,67.017735,151.475941,92.036697,32.192694,41.216135,160.570897,1.779865,58.610103,0.890379,1


In [4]:
# List the column labels of the loaded frame.
df.columns

Index(['chol', 'stab.glu', 'hdl', 'ratio', 'glyhb', 'location', 'age',
       'gender', 'height', 'weight', 'frame', 'bp.1s', 'bp.1d', 'bp.2s',
       'bp.2d', 'waist', 'hip', 'time.ppn', 'height_meters', 'BMI',
       'hip_waist_ratio', 'diabetic'],
      dtype='object')

In [5]:
# Separate the label column ('diabetic') from the remaining predictor columns.
y = df['diabetic']
x = df.drop('diabetic', axis=1)

In [6]:
# Display the feature frame (every column of df except 'diabetic').
x

Unnamed: 0,chol,stab.glu,hdl,ratio,glyhb,location,age,gender,height,weight,...,bp.1s,bp.1d,bp.2s,bp.2d,waist,hip,time.ppn,height_meters,BMI,hip_waist_ratio
0,156.700394,31,52.154896,3.314927,4.299748,1,7,1,68.496912,146.234609,...,121.623985,85.186571,151.973658,92.697071,29.222653,34.012633,761.958003,1.667030,53.028606,0.815154
1,205.708528,50,74.558454,2.397619,4.750497,1,50,0,67.514518,155.252176,...,130.264602,88.848128,152.708004,92.802874,32.887620,40.640470,715.021735,1.681850,42.542497,0.855346
2,184.259740,39,50.107744,2.436952,5.033467,1,56,0,63.226992,134.186382,...,133.617710,94.615906,153.064309,92.212116,32.548504,36.741043,210.965098,1.565997,54.781513,0.863442
3,236.859522,40,46.163972,3.794523,5.382256,0,51,0,66.799333,202.246191,...,206.689504,79.333852,151.949308,92.541618,45.726840,50.127763,801.800590,1.752456,65.815471,0.851003
4,189.616025,92,40.045044,3.633247,4.634788,1,10,1,69.283944,186.218617,...,109.481572,67.017735,151.475941,92.036697,32.192694,41.216135,160.570897,1.779865,58.610103,0.890379
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6495,257.604430,98,35.771984,5.118372,4.753262,0,18,0,61.331854,235.607829,...,202.049036,102.549858,152.100496,83.917625,35.150511,53.406133,88.264917,1.529035,68.481644,0.855944
6496,162.296900,45,35.999784,5.681376,4.227569,1,4,0,65.386967,296.862677,...,121.976799,84.701618,125.647137,82.484329,39.052591,59.843454,443.647170,1.656517,80.007193,0.830202
6497,187.502574,57,33.007135,4.111922,5.364477,1,20,0,66.676238,190.406558,...,114.021723,76.921248,151.736385,93.133092,49.009939,64.824856,187.248003,1.722115,115.404367,0.820453
6498,182.580909,32,52.969260,2.549922,4.153793,1,16,0,61.393461,132.810153,...,106.366161,66.361413,151.984054,92.246495,31.932760,46.220675,895.800788,1.542742,65.397955,0.777446


In [7]:
# Display the label series (the 'diabetic' column of df).
y

0       0
1       1
2       0
3       1
4       1
       ..
6495    1
6496    0
6497    1
6498    0
6499    1
Name: diabetic, Length: 6500, dtype: int64

In [8]:
def objective(trial):
    """Optuna objective: fit one CatBoost classifier and return its validation accuracy.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the CatBoost hyperparameters.

    Returns
    -------
    float
        Accuracy of the fitted model on the 30% validation split.

    Notes
    -----
    Reads the notebook-level ``x`` and ``y``. The search therefore sees every
    row, including rows that a later cell holds out as the final test set, so
    that final test score should be read as optimistic.
    """
    # Use one fixed, stratified split for every trial. With an unseeded split
    # each trial would be scored on different rows, and score differences
    # would mix hyperparameter effects with split noise.
    x_train, x_validate, y_train, y_validate = train_test_split(
        x, y, test_size=0.3, random_state=42, stratify=y
    )

    param = {
        'objective': trial.suggest_categorical('objective', ['Logloss', 'CrossEntropy']),
        'colsample_bylevel': trial.suggest_float('colsample_bylevel', 0.01, 0.1),
        'depth': trial.suggest_int('depth', 1, 12),
        'boosting_type': trial.suggest_categorical('boosting_type', ['Ordered', 'Plain']),
        'bootstrap_type': trial.suggest_categorical('bootstrap_type', ['Bayesian', 'Bernoulli', 'MVS']),
        'used_ram_limit': '3gb',
    }

    # Some options only apply to a specific bootstrap type, so they are
    # sampled conditionally; 'MVS' gets no extra parameter here.
    if param['bootstrap_type'] == 'Bayesian':
        param['bagging_temperature'] = trial.suggest_float('bagging_temperature', 0, 10)
    elif param['bootstrap_type'] == 'Bernoulli':
        param['subsample'] = trial.suggest_float('subsample', 0.1, 1)

    gbm = cb.CatBoostClassifier(**param)

    # NOTE(review): the validation split drives early stopping and is also the
    # set that is scored below, so the returned accuracy is somewhat optimistic.
    gbm.fit(x_train, y_train, eval_set=[(x_validate, y_validate)], verbose=0, early_stopping_rounds=100)

    preds = gbm.predict(x_validate)
    # Round to the nearest integer label before scoring. Presumably a no-op,
    # since predict() is expected to return class labels already; kept as a guard.
    pred_labels = np.rint(preds)
    accuracy = accuracy_score(y_validate, pred_labels)

    return accuracy

In [9]:
# Maximise the accuracy returned by `objective`. The sampler is seeded so its
# random draws are repeatable between runs. The run stops at 50 trials or 600
# seconds, whichever comes first, so the number of completed trials can still
# vary with machine speed.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=50, timeout=600)

[32m[I 2022-05-05 14:38:19,041][0m A new study created in memory with name: no-name-ccdfa368-5036-4d74-8032-696186e94d22[0m
[32m[I 2022-05-05 14:39:11,538][0m Trial 0 finished with value: 0.8861538461538462 and parameters: {'objective': 'Logloss', 'colsample_bylevel': 0.07399087877834012, 'depth': 4, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bernoulli', 'subsample': 0.8285308460230432}. Best is trial 0 with value: 0.8861538461538462.[0m
[32m[I 2022-05-05 14:39:47,250][0m Trial 1 finished with value: 0.8887179487179487 and parameters: {'objective': 'Logloss', 'colsample_bylevel': 0.07741866011978457, 'depth': 9, 'boosting_type': 'Plain', 'bootstrap_type': 'MVS'}. Best is trial 1 with value: 0.8887179487179487.[0m
[32m[I 2022-05-05 14:40:50,200][0m Trial 2 finished with value: 0.8902564102564102 and parameters: {'objective': 'Logloss', 'colsample_bylevel': 0.06703299857887235, 'depth': 10, 'boosting_type': 'Ordered', 'bootstrap_type': 'MVS'}. Best is trial 2 with value: 0

In [10]:
# Report the number of trials recorded in the study, then the best trial's
# score and the hyperparameters that produced it.
print(f'Number of finished trials: {len(study.trials)}')
print('Best trial:')

trial = study.best_trial

print(f' Value: {trial.value}')

print(' Params: ')
for key, value in trial.params.items():
    print(f' {key}: {value}')

NUmber of finshed trials: 14
Best trial:
 Value: 0.9015384615384615
 Params: 
 objective: CrossEntropy
 colsample_bylevel: 0.0855194503666572
 depth: 12
 boosting_type: Ordered
 bootstrap_type: Bernoulli
 subsample: 0.2676694879284146


In [11]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split repeatable.
train_x, test_x, train_y, test_y = train_test_split(
    x,
    y,
    random_state=30,
    test_size=0.2,
)


In [17]:
# Final classifier, configured with approximately the best hyperparameters
# reported by the Optuna search above.
# `objective` and `loss_function` are two names for the same CatBoost setting,
# so only one is passed; 'CrossEntropy' is the value the search selected.
cb_diabetes = cb.CatBoostClassifier(n_estimators=100,
                                objective='CrossEntropy',
                                colsample_bylevel=0.08,
                                depth=12,
                                boosting_type='Ordered',
                                bootstrap_type='Bernoulli',
                                subsample=0.26,
                                learning_rate=0.1,
                                task_type='CPU',
                                random_state=1,
                                verbose=False)

In [18]:
# Train the final classifier on the training split.
cb_diabetes.fit(X=train_x, y=train_y)

<catboost.core.CatBoostClassifier at 0x1a41e3cfc10>

In [19]:
# Predict labels for the held-out test rows.
# NOTE(review): the name says "loan" but this notebook predicts diabetes;
# it is kept unchanged because the next cell reads it.
loan_predicted = cb_diabetes.predict(data=test_x)

In [20]:
# RMSE between true and predicted labels. On hard 0/1 labels this equals the
# square root of the misclassification rate; it is kept because a later cell
# displays `model_rmse`.
model_rmse = np.sqrt(mse(test_y, loan_predicted))

# Accuracy is the metric the hyperparameter search optimised, so report it for
# the final model too, on the same test predictions.
model_accuracy = accuracy_score(test_y, loan_predicted)
model_accuracy

In [21]:
# Display the test-set RMSE computed in the previous cell.
model_rmse

0.31744169664558997