# Predicting Chronic Kidney Disease Based on health care records

# PYCARET PACKAGE - Automation Model

In [1]:
# install pycaret package
!pip install pycaret

Collecting pycaret
  Downloading pycaret-3.0.4-py3-none-any.whl (484 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m484.4/484.4 kB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
Collecting pyod>=1.0.8 (from pycaret)
  Downloading pyod-1.1.0.tar.gz (153 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m153.4/153.4 kB[0m [31m17.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting category-encoders>=2.4.0 (from pycaret)
  Downloading category_encoders-2.6.2-py2.py3-none-any.whl (81 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.8/81.8 kB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
Collecting deprecation>=2.1.0 (from pycaret)
  Downloading deprecation-2.1.0-py2.py3-none-any.whl (11 kB)
Collecting xxhash (from pycaret)
  Downloading xxhash-3.3.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 k

In [2]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

# import dataset
dataset = pd.read_csv('/content/kidney_disease.csv')
dataset.head()

Unnamed: 0,id,age,bp,sg,al,su,rbc,pc,pcc,ba,...,pcv,wc,rc,htn,dm,cad,appet,pe,ane,classification
0,0,48.0,80.0,1.02,1.0,0.0,,normal,notpresent,notpresent,...,44,7800,5.2,yes,yes,no,good,no,no,ckd
1,1,7.0,50.0,1.02,4.0,0.0,,normal,notpresent,notpresent,...,38,6000,,no,no,no,good,no,no,ckd
2,2,62.0,80.0,1.01,2.0,3.0,normal,normal,notpresent,notpresent,...,31,7500,,no,yes,no,poor,no,yes,ckd
3,3,48.0,70.0,1.005,4.0,0.0,normal,abnormal,present,notpresent,...,32,6700,3.9,yes,no,no,poor,yes,yes,ckd
4,4,51.0,0.0,1.01,2.0,0.0,normal,normal,notpresent,notpresent,...,35,7300,4.6,no,no,no,good,no,no,ckd


In [3]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 26 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   id              400 non-null    int64  
 1   age             391 non-null    float64
 2   bp              388 non-null    float64
 3   sg              353 non-null    float64
 4   al              354 non-null    float64
 5   su              351 non-null    float64
 6   rbc             248 non-null    object 
 7   pc              335 non-null    object 
 8   pcc             396 non-null    object 
 9   ba              396 non-null    object 
 10  bgr             356 non-null    float64
 11  bu              381 non-null    float64
 12  sc              383 non-null    float64
 13  sod             313 non-null    float64
 14  pot             312 non-null    float64
 15  hemo            348 non-null    float64
 16  pcv             330 non-null    object 
 17  wc              295 non-null    obj

In [4]:
# creating training data
data = dataset.sample(frac=0.90, random_state=None).reset_index(drop=True)

# creating test data
data_unseen = dataset.drop(data.index).reset_index(drop=True)

print("Data for Modelling :" + str(data.shape))
print()
print("Unseen Data for prediction :" + str(data_unseen.shape))

Data for Modelling :(360, 26)

Unseen Data for prediction :(40, 26)


In [6]:
# setting up an environment in pycaret
from pycaret.classification import *

In [12]:
exp_clf1 = setup(data=data, target='classification', pca= True, pca_components=0.95, session_id=124)

Unnamed: 0,Description,Value
0,Session id,124
1,Target,classification
2,Target type,Multiclass
3,Target mapping,"ckd: 0, ckd	: 1, notckd: 2"
4,Original data shape,"(360, 26)"
5,Transformed data shape,"(360, 4)"
6,Transformed train set shape,"(251, 4)"
7,Transformed test set shape,"(109, 4)"
8,Ordinal features,8
9,Numeric features,12


In [13]:
# Check model
compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
rf,Random Forest Classifier,0.9962,0.0,0.9962,0.9925,0.9943,0.9923,0.9926,1.035
xgboost,Extreme Gradient Boosting,0.9962,0.0,0.9962,0.9925,0.9943,0.9923,0.9926,0.755
dt,Decision Tree Classifier,0.9922,0.0,0.9922,0.9925,0.9922,0.984,0.9848,0.491
ada,Ada Boost Classifier,0.9922,0.0,0.9922,0.9925,0.9922,0.984,0.9848,0.664
gbc,Gradient Boosting Classifier,0.9922,0.0,0.9922,0.9925,0.9922,0.984,0.9848,1.281
et,Extra Trees Classifier,0.9922,0.0,0.9922,0.9888,0.9902,0.9834,0.9841,0.839
lightgbm,Light Gradient Boosting Machine,0.9922,0.0,0.9922,0.9925,0.9922,0.984,0.9848,0.73
lr,Logistic Regression,0.9842,0.0,0.9842,0.9853,0.9842,0.968,0.9697,1.156
knn,K Neighbors Classifier,0.9842,0.0,0.9842,0.9817,0.9824,0.9671,0.9685,0.718
nb,Naive Bayes,0.9722,0.0,0.9722,0.9707,0.9704,0.9418,0.9441,0.703


Processing:   0%|          | 0/65 [00:00<?, ?it/s]

In [14]:
# create a model
rf = create_model('rf')

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.9615,0.0,0.9615,0.9255,0.9429,0.9226,0.9263
1,1.0,0.0,1.0,1.0,1.0,1.0,1.0
2,1.0,0.0,1.0,1.0,1.0,1.0,1.0
3,1.0,0.0,1.0,1.0,1.0,1.0,1.0
4,1.0,0.0,1.0,1.0,1.0,1.0,1.0
5,1.0,0.0,1.0,1.0,1.0,1.0,1.0
6,1.0,0.0,1.0,1.0,1.0,1.0,1.0
7,1.0,0.0,1.0,1.0,1.0,1.0,1.0
8,1.0,0.0,1.0,1.0,1.0,1.0,1.0
9,1.0,0.0,1.0,1.0,1.0,1.0,1.0


Processing:   0%|          | 0/4 [00:00<?, ?it/s]

In [15]:
# Hyper Parameter tuning
tuned_rf = tune_model(rf)

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.9615,0.0,0.9615,0.9255,0.9429,0.9226,0.9263
1,1.0,0.0,1.0,1.0,1.0,1.0,1.0
2,1.0,0.0,1.0,1.0,1.0,1.0,1.0
3,1.0,0.0,1.0,1.0,1.0,1.0,1.0
4,1.0,0.0,1.0,1.0,1.0,1.0,1.0
5,1.0,0.0,1.0,1.0,1.0,1.0,1.0
6,1.0,0.0,1.0,1.0,1.0,1.0,1.0
7,1.0,0.0,1.0,1.0,1.0,1.0,1.0
8,1.0,0.0,1.0,1.0,1.0,1.0,1.0
9,1.0,0.0,1.0,1.0,1.0,1.0,1.0


Processing:   0%|          | 0/7 [00:00<?, ?it/s]

Fitting 10 folds for each of 10 candidates, totalling 100 fits


Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).


In [17]:
# Evaluate the model
evaluate_model(tuned_rf)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

In [18]:
predict_model(tuned_rf)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Random Forest Classifier,0.9817,0,0.9817,0.973,0.9771,0.9608,0.9615


Unnamed: 0,id,age,bp,sg,al,su,rbc,pc,pcc,ba,...,rc,htn,dm,cad,appet,pe,ane,classification,prediction_label,prediction_score
273,179,72.0,90.0,1.010,2.0,0.0,,abnormal,present,notpresent,...,,no,no,no,good,no,no,0,ckd,0.93
307,12,68.0,70.0,1.015,3.0,1.0,,normal,present,notpresent,...,3.4,yes,yes,yes,poor,yes,no,0,ckd,0.98
329,18,60.0,100.0,1.025,0.0,3.0,,normal,notpresent,notpresent,...,4.3,yes,yes,yes,good,no,no,0,ckd,0.99
263,191,,70.0,1.010,3.0,0.0,normal,normal,notpresent,notpresent,...,3.4,yes,yes,no,poor,no,no,0,ckd,1.00
169,396,42.0,70.0,1.025,0.0,0.0,normal,normal,notpresent,notpresent,...,6.2,no,no,no,good,no,no,2,notckd,1.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
123,200,90.0,90.0,1.025,1.0,0.0,,normal,notpresent,notpresent,...,3.9,yes,yes,no,good,no,no,0,ckd,0.98
215,161,62.0,,1.015,3.0,0.0,abnormal,,notpresent,notpresent,...,4.8,yes,yes,no,good,no,no,0,ckd,0.98
51,83,48.0,70.0,1.015,1.0,0.0,normal,normal,notpresent,notpresent,...,,yes,yes,no,good,no,no,0,ckd,1.00
201,162,59.0,70.0,,,,,,notpresent,notpresent,...,\t?,no,yes,no,good,no,no,0,ckd,0.94


In [19]:
# Predict the unseen test data
unseen_predictions = predict_model(tuned_rf, data=data_unseen)
unseen_predictions

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Random Forest Classifier,1.0,0,1.0,1.0,1.0,,0.0


Unnamed: 0,id,age,bp,sg,al,su,rbc,pc,pcc,ba,...,rc,htn,dm,cad,appet,pe,ane,classification,prediction_label,prediction_score
0,360,35.0,60.0,1.025,0.0,0.0,normal,normal,notpresent,notpresent,...,6.2,no,no,no,good,no,no,2,notckd,1.0
1,361,29.0,80.0,1.02,0.0,0.0,normal,normal,notpresent,notpresent,...,5.8,no,no,no,good,no,no,2,notckd,0.99
2,362,33.0,80.0,1.025,0.0,0.0,normal,normal,notpresent,notpresent,...,4.8,no,no,no,good,no,no,2,notckd,1.0
3,363,67.0,80.0,1.025,0.0,0.0,normal,normal,notpresent,notpresent,...,5.2,no,no,no,good,no,no,2,notckd,1.0
4,364,73.0,80.0,1.025,0.0,0.0,normal,normal,notpresent,notpresent,...,4.7,no,no,no,good,no,no,2,notckd,1.0
5,365,24.0,80.0,1.02,0.0,0.0,normal,normal,notpresent,notpresent,...,6.3,no,no,no,good,no,no,2,notckd,1.0
6,366,60.0,80.0,1.025,0.0,0.0,normal,normal,notpresent,notpresent,...,5.3,no,no,no,good,no,no,2,notckd,1.0
7,367,68.0,60.0,1.025,0.0,0.0,normal,normal,notpresent,notpresent,...,6.1,no,no,no,good,no,no,2,notckd,1.0
8,368,30.0,80.0,1.025,0.0,0.0,normal,normal,notpresent,notpresent,...,5.9,no,no,no,good,no,no,2,notckd,1.0
9,369,75.0,70.0,1.02,0.0,0.0,normal,normal,notpresent,notpresent,...,4.8,no,no,no,good,no,no,2,notckd,1.0


In [20]:
# Save model for deployment
save_model(tuned_rf, "tuned_rf_model")

Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=FastMemory(location=/tmp/joblib),
          steps=[('label_encoding',
                  TransformerWrapperWithInverse(exclude=None, include=None,
                                                transformer=LabelEncoder())),
                 ('numerical_imputer',
                  TransformerWrapper(exclude=None,
                                     include=['id', 'age', 'bp', 'sg', 'al',
                                              'su', 'bgr', 'bu', 'sc', 'sod',
                                              'pot', 'hemo'],
                                     transformer=SimpleImputer(add_indicator=False,
                                                               copy=True,
                                                               fill_v...
                  RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                         class_weight=None, criterion='gini',
                                         max_depth=None, max_features='