In [1]:
pip install catboost

Collecting catboost
  Downloading catboost-1.0.5-cp37-none-manylinux1_x86_64.whl (76.6 MB)
[K     |████████████████████████████████| 76.6 MB 52 kB/s 
Installing collected packages: catboost
Successfully installed catboost-1.0.5


In [2]:
import pandas as pd
import numpy as np 

from catboost import CatBoostClassifier

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import mean_squared_error as mse

import pickle

In [3]:
# Read the synthesized loan dataset into a DataFrame.
# NOTE(review): absolute path under /content — presumably a Colab runtime; adjust elsewhere.
loan_csv_path = '/content/sample_data/synthesized_loan_data.csv'
df = pd.read_csv(loan_csv_path)

In [4]:
# Show the loaded frame; the rendered output elides the middle rows.
df

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,1,1,1.015491,0,0,60,2263.426739,120.264574,362.034926,0.999618,2,1
1,1,1,2.793550,1,2,299,207.078743,97.333547,185.855127,0.994450,1,1
2,0,0,1.020575,0,1,497,-222.105153,98.957504,361.411489,0.999132,0,1
3,1,1,0.967628,0,0,35,1874.200797,141.121981,360.427013,0.999273,2,1
4,1,1,1.929354,0,0,340,47.080906,322.936778,360.013915,0.979882,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...
3995,1,1,2.118012,0,0,248,2693.049737,176.172212,360.665439,1.007047,1,1
3996,0,0,-0.002574,0,0,249,1949.728061,117.277610,361.044262,0.986177,1,1
3997,1,1,0.010152,0,0,300,2882.202092,136.098872,361.722296,0.994603,0,1
3998,0,1,-0.002625,0,0,385,182.631094,191.733185,360.723093,0.989029,1,0


In [5]:
# List the column labels available in the loaded frame.
df.columns

Index(['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed',
       'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History', 'Property_Area', 'Loan_Status'],
      dtype='object')

In [6]:
# Feature columns: every column of df except the Loan_Status target.
train_cols = [
    'Gender',
    'Married',
    'Dependents',
    'Education',
    'Self_Employed',
    'ApplicantIncome',
    'CoapplicantIncome',
    'LoanAmount',
    'Loan_Amount_Term',
    'Credit_History',
    'Property_Area',
]

# Feature matrix and target series used for training and evaluation.
x, y = df[train_cols], df['Loan_Status']

In [7]:
# Inspect the feature matrix (the 11 columns named in train_cols).
x

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,1,1,1.015491,0,0,60,2263.426739,120.264574,362.034926,0.999618,2
1,1,1,2.793550,1,2,299,207.078743,97.333547,185.855127,0.994450,1
2,0,0,1.020575,0,1,497,-222.105153,98.957504,361.411489,0.999132,0
3,1,1,0.967628,0,0,35,1874.200797,141.121981,360.427013,0.999273,2
4,1,1,1.929354,0,0,340,47.080906,322.936778,360.013915,0.979882,0
...,...,...,...,...,...,...,...,...,...,...,...
3995,1,1,2.118012,0,0,248,2693.049737,176.172212,360.665439,1.007047,1
3996,0,0,-0.002574,0,0,249,1949.728061,117.277610,361.044262,0.986177,1
3997,1,1,0.010152,0,0,300,2882.202092,136.098872,361.722296,0.994603,0
3998,0,1,-0.002625,0,0,385,182.631094,191.733185,360.723093,0.989029,1


In [8]:
# Inspect the target series (the Loan_Status column).
y

0       1
1       1
2       1
3       1
4       1
       ..
3995    1
3996    1
3997    1
3998    0
3999    1
Name: Loan_Status, Length: 4000, dtype: int64

In [9]:
# Hold out 30% of the rows for testing; the fixed random_state keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=20,
)

In [10]:
# Base CatBoost classifier handed to the grid search. The search varies
# n_estimators, max_depth and colsample_bylevel per candidate; everything
# else set here stays fixed.
cb_sy_grid = CatBoostClassifier(
    n_estimators=100,
    # `objective` is an alias of `loss_function` in CatBoost, so the loss must be
    # given only once. Previously both loss_function='Logloss' and
    # objective='CrossEntropy' were passed, leaving the effective loss ambiguous.
    # NOTE(review): Logloss (the usual two-class loss) is kept — confirm this is
    # the intended objective; scores may differ slightly from earlier runs.
    loss_function='Logloss',
    learning_rate=0.1,
    task_type='CPU',
    random_state=1,   # fixed seed for repeatable training
    verbose=False,    # silence per-iteration training logs
)

In [11]:
# Hyper-parameter grid explored by the search: 3 x 3 x 3 = 27 candidates.
# learning_rate and subsample are intentionally not part of this grid.
grid_loan = dict(
    max_depth=[3, 4, 5],
    n_estimators=[100, 200, 300],
    colsample_bylevel=[0.01, 0.05, 0.09],
)

In [12]:
# Exhaustive 5-fold cross-validated search over grid_loan, ranking candidates by accuracy.
grid_cat = GridSearchCV(estimator=cb_sy_grid, param_grid=grid_loan, scoring='accuracy', cv=5)

In [13]:
# Run the cross-validated grid search on the training split
# (27 parameter combinations x 5 folds = 135 cross-validation fits).
grid_cat.fit(x_train, y_train)

GridSearchCV(cv=5,
             estimator=<catboost.core.CatBoostClassifier object at 0x7f5f148fac90>,
             param_grid={'colsample_bylevel': [0.01, 0.05, 0.09],
                         'max_depth': [3, 4, 5],
                         'n_estimators': [100, 200, 300]},
             scoring='accuracy')

In [14]:
# Predict class labels for the held-out test split with the fitted grid search.
loan_predicted = grid_cat.predict(x_test)

In [15]:
# Root-mean-squared error between the true and predicted test labels.
# NOTE(review): if Loan_Status is binary 0/1 (as the displayed rows suggest), this is
# sqrt(misclassification rate), whereas the grid search is scored on accuracy —
# consider reporting accuracy here as well.
model_rmse_grid = np.sqrt(mse(y_test, loan_predicted))

In [16]:
# Display the test-set RMSE of the grid-search model.
model_rmse_grid


0.27988092706244444

In [17]:
# Test-set RMSE values noted for different parameter-grid variants
# (presumably from earlier runs; the grid definitions themselves are not recorded here).
#large grid 0.279888
#small grid 0.292973
#smaller grid_0 0.284312
#smaller grid_1 0.2828427
#smaller grid_2 0.276887
#smaller grid_3 0.28722813
#smaller grid_4 0.28431203
#smaller grid_5 0.279888092


In [18]:
def predict_discrete(grid_cat, single_input):
    """Predict the class of one record and report the model's probability for it.

    Args:
        grid_cat: Fitted estimator exposing ``predict``, ``predict_proba``
            and ``classes_``.
        single_input: Mapping of feature name to value describing one record.

    Returns:
        Tuple ``(label, probability)``, where ``probability`` is the
        ``predict_proba`` entry that belongs to the predicted label.
    """
    # One-row frame built straight from the mapping; no imputation, scaling
    # or encoding is applied before it reaches the model.
    row_frame = pd.DataFrame([single_input])

    label = grid_cat.predict(row_frame)[0]
    class_probs = grid_cat.predict_proba(row_frame)[0]

    # predict_proba columns follow the order of classes_, so look up the
    # position of the predicted label there.
    label_position = list(grid_cat.classes_).index(label)

    return label, class_probs[label_position]

In [19]:
# One hand-written applicant record, keyed by the training feature names.
# NOTE(review): the values resemble the first displayed row of df, except that
# LoanAmount (362.03) matches that row's Loan_Amount_Term rather than its
# LoanAmount (120.26) — confirm this is intended.
single_instance = {
    'Gender': 1,
    'Married': 1,
    'Dependents': 1.02,
    'Education': 0,
    'Self_Employed': 0,
    'ApplicantIncome': 60,
    'CoapplicantIncome': 2263.43,
    'LoanAmount': 362.03,
    'Loan_Amount_Term': 360,
    'Credit_History': 1,
    'Property_Area': 2,
}

In [20]:
# Score the sample record with the fitted grid search: (predicted label, its probability).
predict_discrete(grid_cat, single_instance)

(1, 0.9811287931181352)

In [23]:
# Persist the fitted grid-search object to disk with pickle.
model_load = 'catboost_final_model.sav'
# The context manager guarantees the file handle is flushed and closed after
# writing, instead of leaving an unclosed handle from a bare open().
with open(model_load, 'wb') as model_file:
    pickle.dump(grid_cat, model_file)


In [24]:
# Reload the pickled grid-search object; the context manager closes the file
# handle once loading finishes.
# Only unpickle files you trust — pickle.load can execute arbitrary code.
with open(model_load, 'rb') as model_file:
    loaded_model = pickle.load(model_file)

In [25]:
# Round-trip check: the reloaded model should give the same result as the in-memory one.
predict_discrete(loaded_model, single_instance)

(1, 0.9811287931181352)