In [1]:
from utils import DataLoader
# Simple Models
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

# Ensemble Model
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
import warnings
# Installs a blanket 'ignore' filter: no Python warning of any category is shown
# after this line.
# NOTE(review): this also hides deprecation and convergence warnings from the
# models fitted below — consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

In [2]:
# %% Load and prepare data
# Every step below goes through the project-local DataLoader helper from utils;
# the calls are order-dependent (load -> prepare -> save preprocessor).
data_loader = DataLoader()
data_loader.load_dataset()
# prepare_data() returns three values, unpacked here as df, X and y.
# NOTE(review): presumably df is the cleaned frame and X / y are the feature
# matrix and the `diabetes` target — confirm against utils.DataLoader.
df,X,y = data_loader.prepare_data()
# NOTE(review): presumably persists the fitted preprocessor so it can be reused
# outside this notebook — confirm where it is written.
data_loader.save_preprocessor()
# Bare last expression so the notebook renders df as a table.
df

number of duplicate rows:  (3854, 9)
Other gender: 0.018721527676658415%


Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,Female,80.0,0,1,non-smoker,25.19,6.6,140,0
1,Female,54.0,0,0,non-smoker,27.32,6.6,80,0
2,Male,28.0,0,0,non-smoker,27.32,5.7,158,0
3,Female,36.0,0,0,current,23.45,5.0,155,0
4,Male,76.0,1,1,current,20.14,4.8,155,0
...,...,...,...,...,...,...,...,...,...
99994,Female,36.0,0,0,non-smoker,24.60,4.8,145,0
99996,Female,2.0,0,0,non-smoker,17.37,6.5,100,0
99997,Male,66.0,0,0,past_smoker,27.83,5.7,155,0
99998,Female,24.0,0,0,non-smoker,35.42,4.0,100,0


Decision Tree

In [3]:
# Build a pipeline around DecisionTreeClassifier. The empty model_params dict
# means this notebook supplies no hyperparameter overrides.
# NOTE(review): the class itself (not an instance) is passed, so
# create_model_pipeline presumably instantiates it — confirm in utils.
# NOTE(review): no random_state is set here; unless utils fixes a seed, the
# scores printed below may differ between runs.
dt_pipeline = data_loader.create_model_pipeline(
    model=DecisionTreeClassifier, model_params={}
)
# Fit first, then evaluate; the evaluation step prints ROC AUC and accuracy.
data_loader.fit_model_pipeline(dt_pipeline)
data_loader.evaluate_model_pipeline(dt_pipeline)

Model ROC Area Under Curve:  0.8646415520526014
Model Accuracy:  0.9017476334130864


KNN

In [4]:
# Build a pipeline around KNeighborsClassifier. The empty model_params dict
# means this notebook supplies no hyperparameter overrides.
knn_pipeline = data_loader.create_model_pipeline(
    model=KNeighborsClassifier, model_params={}
)
# Fit first, then evaluate; the evaluation step prints ROC AUC and accuracy.
data_loader.fit_model_pipeline(knn_pipeline)
data_loader.evaluate_model_pipeline(knn_pipeline)

Model ROC Area Under Curve:  0.9446594793340816
Model Accuracy:  0.9195360449391449


Logistic Regression

In [5]:
# Build a pipeline around LogisticRegression. The empty model_params dict
# means this notebook supplies no hyperparameter overrides.
# NOTE(review): warnings are globally silenced in the first cell, so a solver
# convergence warning would not be visible here — worth checking once.
lgr_pipeline = data_loader.create_model_pipeline(
    model=LogisticRegression, model_params={}
)
# Fit first, then evaluate; the evaluation step prints ROC AUC and accuracy.
data_loader.fit_model_pipeline(lgr_pipeline)
data_loader.evaluate_model_pipeline(lgr_pipeline)

Model ROC Area Under Curve:  0.9597282243161399
Model Accuracy:  0.921980651201498


Ensemble: Random Forest

In [6]:
# Hyperparameters for the random forest, handed to the pipeline factory as
# model_params.
rf_params = dict(
    n_estimators=50,
    max_depth=10,
    min_samples_split=2,
    min_samples_leaf=2,
)
rf_pipeline = data_loader.create_model_pipeline(
    model=RandomForestClassifier,
    model_params=rf_params,
)

In [7]:
# Fit the random-forest pipeline built in the previous cell, then evaluate it;
# the evaluation step prints ROC AUC and accuracy.
# NOTE(review): rf_params contains no random_state; unless utils fixes a seed,
# the scores printed below may differ between runs.
data_loader.fit_model_pipeline(rf_pipeline)
data_loader.evaluate_model_pipeline(rf_pipeline)

Model ROC Area Under Curve:  0.973748083594118
Model Accuracy:  0.9476230105066057


Saving the Random Forest pipeline for the Streamlit app, since it has the highest ROC AUC score (0.974) of the four models.

In [8]:
# Persist the fitted random-forest pipeline under the name 'random_forest'.
# NOTE(review): the output location and file format are decided inside
# utils.DataLoader.save_model_pipeline — confirm the path the app loads from.
data_loader.save_model_pipeline('random_forest', rf_pipeline)

In [9]:
from interpret.blackbox import LimeTabular
from interpret import show

In [10]:
# One hand-written record covering the eight feature columns of the training
# frame (every column except the `diabetes` target), used as the instance to
# explain further down.
new_data = dict(
    gender=["Male"],
    age=[52.0],
    hypertension=[0],
    heart_disease=[0],
    smoking_history=["non-smoker"],
    bmi=[26.67],
    HbA1c_level=[6.2],
    blood_glucose_level=[100],
)
new_df = pd.DataFrame(new_data)
new_df

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level
0,Male,52.0,0,0,non-smoker,26.67,6.2,100


In [11]:
# Run the single-row frame through data_loader.transform_data; for this row the
# result is a 1 x 11 array (see the output below).
# NOTE(review): presumably scaled numeric columns followed by one-hot encoded
# categoricals — confirm the column order in utils.
new_df_transformed = data_loader.transform_data(new_df)
new_df_transformed

array([[ 0.45422634, -0.09625766,  0.62183704, -0.93417473, -0.29007982,
        -0.20626803,  0.        ,  1.        ,  0.        ,  1.        ,
         0.        ]])

In [12]:
# Build a LIME explainer (interpret's LimeTabular) around the random-forest
# pipeline, with data_loader.X_train as the reference data.
# NOTE(review): confirm that X_train is in the same feature space that
# rf_pipeline expects as input (raw columns vs. already-transformed array).
lime = LimeTabular(
    model=rf_pipeline,
    # Leftover alternative: pass the probability function instead of the model object.
    # predict_fn=rf_pipeline.predict_proba,
    data=data_loader.X_train,
    random_state=1,  # fixed seed; presumably forwarded to LIME's sampler — confirm
)

In [13]:
# Get local explanations
# Explains the single hand-written record; name='LIME' labels the explanation object.
# NOTE(review): the input here is new_df_transformed (the 11-value output of
# transform_data), while the explainer was built from data_loader.X_train and
# rf_pipeline — confirm all three use the same feature space; if rf_pipeline
# already contains the preprocessor, the raw new_df may be the right input.
lime_local = lime.explain_local(new_df_transformed,
                                name='LIME')

# Rendering is disabled; uncomment to open the interactive explanation view.
# show(lime_local)