[Early stage diabetes risk prediction dataset](http://archive.ics.uci.edu/ml/datasets/Early+stage+diabetes+risk+prediction+dataset.)

In [1]:
import pickle
import numpy as np
import pandas as pd

In [2]:
!pip install -q catboost
from catboost import CatBoostClassifier

In [3]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score

In [4]:
# Random seed shared by the train/test split, the CV folds and the seeded models.
SEED = 42
# Scoring name handed to GridSearchCV ('recall' is the alternative that was considered).
METRIC = 'f1'

In [5]:
def eval_model(model, data, target):
    """Print a short binary-classification report for a fitted model.

    Args:
        model: fitted estimator exposing ``predict`` and ``predict_proba``.
        data: feature rows to score.
        target: true labels aligned with ``data``.

    Prints the transposed confusion matrix (rows are predicted labels, columns
    are true labels) followed by accuracy, ROC AUC, precision, recall and F1,
    each formatted to four decimals. Returns nothing.
    """
    labels = model.predict(data)
    # Column 1 of predict_proba is used as the score for ROC AUC.
    positive_scores = model.predict_proba(data)[:, 1]

    print(confusion_matrix(target, labels).T)

    # (display name, scoring function, model output it consumes); each metric is
    # computed and printed in turn, in this order.
    report = (
        ('accuracy', accuracy_score, labels),
        ('roc_auc', roc_auc_score, positive_scores),
        ('precision', precision_score, labels),
        ('recall', recall_score, labels),
        ('f1', f1_score, labels),
    )
    for name, scorer, estimate in report:
        print(f'{name}: {scorer(target, estimate):.4f}')

In [6]:
# Load the raw CSV and normalise it in one chain: snake_case lower-case column
# names, then lower-case yes/no and gender values wherever they occur.
df = (
    pd.read_csv('diabetes.csv')
    .rename(columns=lambda name: name.replace(' ', '_').lower())
    .replace({'Yes': 'yes', 'No': 'no', 'Male': 'male', 'Female': 'female'})
)
# Encode the target as 0 (Negative) / 1 (Positive).
df['class'] = df['class'].map({'Negative': 0, 'Positive': 1})

In [7]:
# Peek at the first three cleaned rows.
df.head(3)

Unnamed: 0,age,gender,polyuria,polydipsia,sudden_weight_loss,weakness,polyphagia,genital_thrush,visual_blurring,itching,irritability,delayed_healing,partial_paresis,muscle_stiffness,alopecia,obesity,class
0,40,male,no,yes,no,yes,no,no,no,yes,no,yes,no,yes,yes,yes,1
1,58,male,no,no,no,yes,no,no,yes,no,no,no,yes,no,yes,no,1
2,41,male,yes,no,no,yes,yes,no,no,yes,no,yes,no,yes,yes,no,1


In [8]:
target_col = 'class'
# Every column except the last one (the target) is handled as categorical;
# no column is scaled as numeric.
# NOTE(review): this also puts the integer `age` column into cat_cols, so it is
# encoded as a category rather than scaled — confirm that is intended.
num_cols = []
cat_cols = list(df.columns[:-1])
feature_cols = [*num_cols, *cat_cols]

In [9]:
# Hold out 20% of the rows for testing, keeping the class ratio in both parts.
train_data, test_data = train_test_split(
    df,
    test_size=0.2,
    random_state=SEED,
    stratify=df[target_col],
)

In [10]:
# Preprocessing: scale numeric columns, one-hot encode categorical ones
# (categories unseen during fitting are ignored at transform time).
ct = ColumnTransformer(transformers=[
    ('num', StandardScaler(), num_cols),
    ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols),
])

# liblinear is used so that both the l1 and l2 penalties can be searched.
lr = LogisticRegression(solver='liblinear', random_state=SEED)

model = Pipeline(steps=[
    ('ct', ct),
    ('clf', lr),
])

In [11]:
# Search the regularisation strength and penalty type of the logistic
# regression with 10-fold stratified CV repeated twice.
grid = {
    'clf__C': np.arange(0.1, 3.0, 0.1),
    'clf__penalty': ['l1', 'l2'],
}
kfold = RepeatedStratifiedKFold(n_splits=10, n_repeats=2, random_state=SEED)
cv = GridSearchCV(estimator=model, param_grid=grid, scoring=METRIC, cv=kfold)
cv.fit(train_data[feature_cols], train_data[target_col])

# Pipeline refitted on the whole training split with the best parameters.
best_lr = cv.best_estimator_

In [12]:
# Logistic regression on the training split (the data it was fitted on).
eval_model(best_lr, data=train_data[feature_cols], target=train_data[target_col])

[[150  14]
 [ 10 242]]
accuracy: 0.9423
roc_auc: 0.9893
precision: 0.9603
recall: 0.9453
f1: 0.9528


In [13]:
# Logistic regression on the held-out test split.
eval_model(best_lr, data=test_data[feature_cols], target=test_data[target_col])

[[39  6]
 [ 1 58]]
accuracy: 0.9327
roc_auc: 0.9832
precision: 0.9831
recall: 0.9062
f1: 0.9431


In [14]:
# CatBoost consumes the categorical columns directly, so no preprocessing
# pipeline is wrapped around it.
# NOTE(review): unlike the other two models, no seed is passed to CatBoost
# here — confirm its default seeding is acceptable for reproducibility.
grid = {
    'learning_rate': [0.1, 0.2, 0.3],
    'num_trees': [50, 80, 100],
}
model = CatBoostClassifier(silent=True, cat_features=cat_cols)
kfold = RepeatedStratifiedKFold(n_splits=10, n_repeats=2, random_state=SEED)
cv = GridSearchCV(estimator=model, param_grid=grid, scoring=METRIC, cv=kfold)
cv.fit(train_data[feature_cols], train_data[target_col])

# Classifier refitted on the whole training split with the best parameters.
best_cat = cv.best_estimator_

In [15]:
# CatBoost on the training split (the data it was fitted on).
eval_model(best_cat, data=train_data[feature_cols], target=train_data[target_col])

[[159   1]
 [  1 255]]
accuracy: 0.9952
roc_auc: 0.9999
precision: 0.9961
recall: 0.9961
f1: 0.9961


In [16]:
# CatBoost on the held-out test split.
eval_model(best_cat, data=test_data[feature_cols], target=test_data[target_col])

[[40  1]
 [ 0 63]]
accuracy: 0.9904
roc_auc: 0.9973
precision: 1.0000
recall: 0.9844
f1: 0.9921


In [17]:
# Same preprocessing as for the logistic regression: scale numeric columns and
# one-hot encode categorical ones, ignoring categories unseen during fitting.
ct = ColumnTransformer(transformers=[
    ('num', StandardScaler(), num_cols),
    ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols),
])

rf = RandomForestClassifier(random_state=SEED)

model = Pipeline(steps=[
    ('ct', ct),
    ('clf', rf),
])

In [18]:
# Search forest size and tree depth with 10-fold stratified CV repeated twice.
grid = {
    'clf__n_estimators': [50, 80, 100],
    'clf__max_depth': [5, 10],
}
kfold = RepeatedStratifiedKFold(n_splits=10, n_repeats=2, random_state=SEED)
cv = GridSearchCV(estimator=model, param_grid=grid, scoring=METRIC, cv=kfold)
cv.fit(train_data[feature_cols], train_data[target_col])

# Pipeline refitted on the whole training split with the best parameters.
best_rf = cv.best_estimator_

In [19]:
# Random forest on the training split (the data it was fitted on).
eval_model(best_rf, data=train_data[feature_cols], target=train_data[target_col])

[[160   0]
 [  0 256]]
accuracy: 1.0000
roc_auc: 1.0000
precision: 1.0000
recall: 1.0000
f1: 1.0000


In [20]:
# Random forest on the held-out test split.
eval_model(best_rf, data=test_data[feature_cols], target=test_data[target_col])

[[40  1]
 [ 0 63]]
accuracy: 0.9904
roc_auc: 0.9996
precision: 1.0000
recall: 0.9844
f1: 0.9921


In [21]:
# Persist the tuned models. The scikit-learn pipelines are pickled; the files
# are opened in `with` blocks so the handles are flushed and closed right away
# instead of being left to the garbage collector.
with open('lr.pkl', 'wb') as fh:
    pickle.dump(best_lr, fh)
with open('rf.pkl', 'wb') as fh:
    pickle.dump(best_rf, fh)

# CatBoost is saved through its own save_model method.
best_cat.save_model('cat.cb')

In [22]:
# Reload the persisted models. Files are opened in `with` blocks so the handles
# are closed as soon as the objects are read.
# NOTE: pickle.load can execute arbitrary code — only load pickle files from a
# trusted source.
with open('lr.pkl', 'rb') as fh:
    lr = pickle.load(fh)
with open('rf.pkl', 'rb') as fh:
    rf = pickle.load(fh)

# The CatBoost model is loaded with load_model on a fresh classifier.
cat = CatBoostClassifier().load_model('cat.cb')

In [23]:
# Logistic regression reloaded from 'lr.pkl', scored on the test split.
eval_model(lr, data=test_data[feature_cols], target=test_data[target_col])

[[39  6]
 [ 1 58]]
accuracy: 0.9327
roc_auc: 0.9832
precision: 0.9831
recall: 0.9062
f1: 0.9431


In [24]:
# Random forest reloaded from 'rf.pkl', scored on the test split.
eval_model(rf, data=test_data[feature_cols], target=test_data[target_col])

[[40  1]
 [ 0 63]]
accuracy: 0.9904
roc_auc: 0.9996
precision: 1.0000
recall: 0.9844
f1: 0.9921


In [25]:
# CatBoost model reloaded from 'cat.cb', scored on the test split.
eval_model(cat, data=test_data[feature_cols], target=test_data[target_col])

[[40  1]
 [ 0 63]]
accuracy: 0.9904
roc_auc: 0.9973
precision: 1.0000
recall: 0.9844
f1: 0.9921
