In [4]:
pip install catboost



In [5]:
pip install optuna



In [6]:
import pandas as pd
import numpy as np
import catboost as cb
import optuna
import pickle

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error as mse

In [7]:
# Location of the synthesized loan dataset (CSV) read by this notebook.
DATA_PATH = '/content/sample_data/synthesized_loan_data.csv'

# Load the whole CSV into a DataFrame with pandas' default parsing options.
df = pd.read_csv(DATA_PATH)

In [8]:
# Preview the first rows of the loaded frame (head() defaults to 5 rows).
df.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,1,1,1.015491,0,0,60,2263.426739,120.264574,362.034926,0.999618,2,1
1,1,1,2.79355,1,2,299,207.078743,97.333547,185.855127,0.99445,1,1
2,0,0,1.020575,0,1,497,-222.105153,98.957504,361.411489,0.999132,0,1
3,1,1,0.967628,0,0,35,1874.200797,141.121981,360.427013,0.999273,2,1
4,1,1,1.929354,0,0,340,47.080906,322.936778,360.013915,0.979882,0,1


In [9]:
# List the column labels; 'Loan_Status' is the one used as the target below.
df.columns

Index(['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed',
       'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History', 'Property_Area', 'Loan_Status'],
      dtype='object')

In [10]:
# Separate the label column from the predictors:
#   y -> the 'Loan_Status' column
#   x -> every remaining column of df
y = df['Loan_Status']
x = df.drop(columns=['Loan_Status'])

In [11]:
# Display the feature frame (all columns of df except 'Loan_Status').
x

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,1,1,1.015491,0,0,60,2263.426739,120.264574,362.034926,0.999618,2
1,1,1,2.793550,1,2,299,207.078743,97.333547,185.855127,0.994450,1
2,0,0,1.020575,0,1,497,-222.105153,98.957504,361.411489,0.999132,0
3,1,1,0.967628,0,0,35,1874.200797,141.121981,360.427013,0.999273,2
4,1,1,1.929354,0,0,340,47.080906,322.936778,360.013915,0.979882,0
...,...,...,...,...,...,...,...,...,...,...,...
3995,1,1,2.118012,0,0,248,2693.049737,176.172212,360.665439,1.007047,1
3996,0,0,-0.002574,0,0,249,1949.728061,117.277610,361.044262,0.986177,1
3997,1,1,0.010152,0,0,300,2882.202092,136.098872,361.722296,0.994603,0
3998,0,1,-0.002625,0,0,385,182.631094,191.733185,360.723093,0.989029,1


In [12]:
# Display the target series taken from df['Loan_Status'].
y

0       1
1       1
2       1
3       1
4       1
       ..
3995    1
3996    1
3997    1
3998    0
3999    1
Name: Loan_Status, Length: 4000, dtype: int64

In [13]:
def objective(trial, split_seed=42):
    """Optuna objective: fit a CatBoost classifier and return validation accuracy.

    Reads the notebook-level ``x`` (features) and ``y`` (labels), holds out
    30% of the rows for validation, samples CatBoost hyperparameters from
    ``trial``, trains with early stopping against the validation set, and
    scores the fitted model with ``accuracy_score``.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the hyperparameters.
    split_seed : int, default 42
        Seed for the train/validation split. Keeping it fixed gives every
        trial the same split, so differences between trial scores come from
        the hyperparameters rather than from a different random partition.

    Returns
    -------
    float
        Accuracy on the validation split (the study maximises this value).
    """
    x_train, x_validate, y_train, y_validate = train_test_split(
        x, y, test_size=0.3, random_state=split_seed
    )

    param = {
        'objective': trial.suggest_categorical('objective', ['Logloss', 'CrossEntropy']),
        'colsample_bylevel': trial.suggest_float('colsample_bylevel', 0.01, 0.1),
        'depth': trial.suggest_int('depth', 1, 12),
        'boosting_type': trial.suggest_categorical('boosting_type', ['Ordered', 'Plain']),
        'bootstrap_type': trial.suggest_categorical('bootstrap_type', ['Bayesian', 'Bernoulli', 'MVS']),
        'used_ram_limit': '3gb',
    }

    # Only sample the extra parameter that belongs to the chosen bootstrap
    # type; 'MVS' gets no additional parameter here.
    if param['bootstrap_type'] == 'Bayesian':
        param['bagging_temperature'] = trial.suggest_float('bagging_temperature', 0, 10)
    elif param['bootstrap_type'] == 'Bernoulli':
        param['subsample'] = trial.suggest_float('subsample', 0.1, 1)

    gbm = cb.CatBoostClassifier(**param)

    # The validation split doubles as the early-stopping eval set.
    gbm.fit(
        x_train,
        y_train,
        eval_set=[(x_validate, y_validate)],
        verbose=0,
        early_stopping_rounds=100,
    )

    preds = gbm.predict(x_validate)
    # Round predictions to the nearest integer before comparing with labels.
    pred_labels = np.rint(preds)
    accuracy = accuracy_score(y_validate, pred_labels)

    return accuracy

In [14]:
# Search for the hyperparameters that maximise the accuracy returned by
# `objective`. The sampler is seeded so the sequence of suggested parameters
# is repeatable across fresh kernel runs.
# The search stops after 50 trials or 600 seconds, whichever comes first.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=50, timeout=600)

[32m[I 2022-05-09 10:10:05,419][0m A new study created in memory with name: no-name-88f6238e-d407-4434-b065-24022db9df97[0m
[32m[I 2022-05-09 10:10:06,560][0m Trial 0 finished with value: 0.9275 and parameters: {'objective': 'Logloss', 'colsample_bylevel': 0.07452014983755605, 'depth': 9, 'boosting_type': 'Plain', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 1.9147295344612902}. Best is trial 0 with value: 0.9275.[0m
[32m[I 2022-05-09 10:10:07,380][0m Trial 1 finished with value: 0.9066666666666666 and parameters: {'objective': 'Logloss', 'colsample_bylevel': 0.02083108503874114, 'depth': 9, 'boosting_type': 'Plain', 'bootstrap_type': 'MVS'}. Best is trial 0 with value: 0.9275.[0m
[32m[I 2022-05-09 10:10:08,185][0m Trial 2 finished with value: 0.92 and parameters: {'objective': 'CrossEntropy', 'colsample_bylevel': 0.06109782248623011, 'depth': 9, 'boosting_type': 'Plain', 'bootstrap_type': 'Bernoulli', 'subsample': 0.9178478242359547}. Best is trial 0 with value: 0.9

In [15]:
# Summarise the study: how many trials were recorded, and the value and
# sampled parameters of the best one.
print('Number of finished trials: {}'.format(len(study.trials)))
print('Best trial:')

trial = study.best_trial

print(' Value: {}'.format(trial.value))

print(' Params: ')
for key, value in trial.params.items():
    print(' {}: {}'.format(key, value))

NUmber of finshed trials: 50
Best trial:
 Value: 0.9358333333333333
 Params: 
 objective: Logloss
 colsample_bylevel: 0.051820620919506774
 depth: 9
 boosting_type: Plain
 bootstrap_type: MVS


In [16]:
# Hold out 20% of the rows for the final evaluation; the fixed random_state
# makes this partition repeatable.
train_x, test_x, train_y, test_y = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=30,
)


In [17]:
# Final CatBoost classifier with hand-set hyperparameters.
#
# CatBoost documents `objective` as an alias of `loss_function`, so the loss
# is specified once here. The original cell passed both
# objective='CrossEntropy' and loss_function='Logloss', which is contradictory.
# NOTE(review): 'CrossEntropy' is kept; confirm that is the intended loss.
# NOTE(review): these values are hard-coded rather than read from
# study.best_params — confirm that is deliberate.
cb_loan = cb.CatBoostClassifier(n_estimators=100,
                                objective='CrossEntropy',
                                colsample_bylevel=0.09,
                                depth=4,
                                boosting_type='Plain',
                                bootstrap_type='Bernoulli',
                                subsample=0.34,
                                learning_rate=0.1,
                                task_type='CPU',
                                random_state=1,
                                verbose=False)

In [18]:
# Train the final model on the 80% training partition.
cb_loan.fit(train_x, train_y)

<catboost.core.CatBoostClassifier at 0x7f80a6d05a90>

In [19]:
# Predict on the held-out test features with the fitted model.
loan_predicted = cb_loan.predict(test_x)

In [20]:
# Root-mean-squared error between the held-out labels and the predictions:
# compute the mean squared error first, then take its square root.
test_mse = mse(test_y, loan_predicted)
model_rmse = np.sqrt(test_mse)

In [21]:
# Display the RMSE computed in the previous cell.
model_rmse

0.2936835031117683

In [21]:
# Persist the fitted model to disk with pickle.
# The `with` block closes the file handle (and flushes the write) even if
# pickling raises.
cb_optuna = 'catboost_optuna_saved_model.sav'
with open(cb_optuna, 'wb') as model_file:
    pickle.dump(cb_loan, model_file)

In [None]:
# Reload the pickled model from the path stored in `cb_optuna`.
# The `with` block closes the file handle once loading finishes.
# NOTE: pickle.load can execute arbitrary code; only load files that this
# notebook (or another trusted source) produced.
with open(cb_optuna, 'rb') as model_file:
    cb_loaded = pickle.load(model_file)

In [None]:
# Sanity-check the reloaded model on one held-out row.
# score() expects features and their true labels; the original call passed
# the model itself as the features and used `single_instance`, which is not
# defined anywhere in the notebook.
# NOTE(review): the first test row is used as the example; swap in whichever
# instance was actually intended.
single_instance = test_x.iloc[[0]]  # double brackets keep a one-row DataFrame
single_label = test_y.iloc[[0]]
instance_score = cb_loaded.score(single_instance, single_label)
print(instance_score)