In [1]:
from utils import DataLoader
# Simple Models
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

# Ensemble Model
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

In [2]:
# %% Load and prepare data
data_loader = DataLoader()
data_loader.load_dataset()
df,X,y = data_loader.prepare_data()
data_loader.save_preprocessor()
df

number of duplicate rows:  (3854, 9)
Other gender: 0.018721527676658415%


Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,Female,80.0,0,1,non-smoker,25.19,6.6,140,0
1,Female,54.0,0,0,non-smoker,27.32,6.6,80,0
2,Male,28.0,0,0,non-smoker,27.32,5.7,158,0
3,Female,36.0,0,0,current,23.45,5.0,155,0
4,Male,76.0,1,1,current,20.14,4.8,155,0
...,...,...,...,...,...,...,...,...,...
99994,Female,36.0,0,0,non-smoker,24.60,4.8,145,0
99996,Female,2.0,0,0,non-smoker,17.37,6.5,100,0
99997,Male,66.0,0,0,past_smoker,27.83,5.7,155,0
99998,Female,24.0,0,0,non-smoker,35.42,4.0,100,0


Decision Tree

In [3]:
dt_pipeline = data_loader.create_model_pipeline(
    model=DecisionTreeClassifier, model_params={}
)
data_loader.fit_model_pipeline(dt_pipeline)
data_loader.evaluate_model_pipeline(dt_pipeline)

Model ROC Area Under Curve:  0.8638725434559308
Model Accuracy:  0.9042442525746385


KNN

In [4]:
knn_pipeline = data_loader.create_model_pipeline(
    model=KNeighborsClassifier, model_params={}
)
data_loader.fit_model_pipeline(knn_pipeline)
data_loader.evaluate_model_pipeline(knn_pipeline)

Model ROC Area Under Curve:  0.9457113977103626
Model Accuracy:  0.9169353999791948


Logistic Regression

In [5]:
lgr_pipeline = data_loader.create_model_pipeline(
    model=LogisticRegression, model_params={}
)
data_loader.fit_model_pipeline(lgr_pipeline)
data_loader.evaluate_model_pipeline(lgr_pipeline)

Model ROC Area Under Curve:  0.9596175401583604
Model Accuracy:  0.9254655154478311


Ensemble: Random Forest

In [6]:
rf_params = {
    'n_estimators': 50,
    'max_depth': 10,
    'min_samples_split': 2,
    'min_samples_leaf': 2
}
rf_pipeline = data_loader.create_model_pipeline(model=RandomForestClassifier, model_params=rf_params)

In [7]:
data_loader.fit_model_pipeline(rf_pipeline)
data_loader.evaluate_model_pipeline(rf_pipeline)

Model ROC Area Under Curve:  0.973731194458326
Model Accuracy:  0.9461146364298346


Saving Random Forest for streamlit since it has the highest roc score

In [8]:
data_loader.save_model_pipeline('random_forest', rf_pipeline)

Explanable AI

In this section three sample values will be picked from test split of the data and explanations will be provided on those predictions using LIME and SHAP. Finally CounterFactual will be used to display the changed required for opposite prediction.

In [198]:
df_test = pd.merge(data_loader.X_test, data_loader.y_test, left_index=True, right_index=True)
positive_examples = df_test[df_test['diabetes'] == 1]
positive_examples.head()

Unnamed: 0,age,bmi,HbA1c_level,blood_glucose_level,hypertension,heart_disease,gender_Female,gender_Male,smoking_history_current,smoking_history_non-smoker,smoking_history_past_smoker,diabetes
95309,1.255536,-0.406552,1.833146,0.532424,3.447327,-0.206268,1.0,0.0,1.0,0.0,0.0,1
9769,-0.925808,-0.000214,0.249127,0.410208,-0.29008,-0.206268,0.0,1.0,0.0,1.0,0.0,1
1391,0.365192,-0.839486,0.155949,3.465622,-0.29008,-0.206268,1.0,0.0,0.0,0.0,1.0,1
7645,-0.124498,-0.897112,0.621837,1.510157,-0.29008,-0.206268,1.0,0.0,0.0,1.0,0.0,1
35060,-1.281946,-0.765606,0.994547,0.043558,-0.29008,-0.206268,1.0,0.0,0.0,1.0,0.0,1


In [199]:
negative_examples = df_test[df_test['diabetes'] == 0]
negative_examples.head()

Unnamed: 0,age,bmi,HbA1c_level,blood_glucose_level,hypertension,heart_disease,gender_Female,gender_Male,smoking_history_current,smoking_history_non-smoker,smoking_history_past_smoker,diabetes
41458,-0.035463,-0.298688,-0.682649,0.507981,-0.29008,-0.206268,1.0,0.0,1.0,0.0,0.0,0
56059,1.700709,-0.125809,0.621837,-1.300824,-0.29008,-0.206268,0.0,1.0,0.0,1.0,0.0,0
84971,-0.124498,-0.000214,-0.496294,0.483538,-0.29008,-0.206268,0.0,1.0,0.0,1.0,0.0,0
767,-1.638084,-1.536908,-0.496294,-0.298648,-0.29008,-0.206268,0.0,1.0,0.0,1.0,0.0,0
53955,-1.593566,-0.272091,0.155949,-1.178608,-0.29008,-0.206268,0.0,1.0,0.0,1.0,0.0,0


In [259]:
from interpret.blackbox import LimeTabular
from interpret.blackbox import ShapKernel
from interpret import show
import numpy as np

In [246]:
sample_one = positive_examples[10:11]
sample_two = negative_examples[10:11]
sample_three = negative_examples[20:21]
sample_one_input = sample_one.drop("diabetes", axis=1)
sample_two_input = sample_two.drop("diabetes", axis=1)
sample_three_input = sample_three.drop("diabetes", axis=1)
sample_inputs = [sample_one_input, sample_two_input, sample_three_input]

In [247]:
sample_one_input

Unnamed: 0,age,bmi,HbA1c_level,blood_glucose_level,hypertension,heart_disease,gender_Female,gender_Male,smoking_history_current,smoking_history_non-smoker,smoking_history_past_smoker
90082,1.567157,0.533196,0.528659,0.532424,-0.29008,-0.206268,1.0,0.0,0.0,0.0,1.0


In [248]:
sample_two_input

Unnamed: 0,age,bmi,HbA1c_level,blood_glucose_level,hypertension,heart_disease,gender_Female,gender_Male,smoking_history_current,smoking_history_non-smoker,smoking_history_past_smoker
41291,1.700709,-0.142063,0.435482,0.507981,-0.29008,-0.206268,1.0,0.0,0.0,1.0,0.0


In [249]:
sample_three_input

Unnamed: 0,age,bmi,HbA1c_level,blood_glucose_level,hypertension,heart_disease,gender_Female,gender_Male,smoking_history_current,smoking_history_non-smoker,smoking_history_past_smoker
19176,-0.07998,-0.731622,0.90137,-1.178608,-0.29008,-0.206268,1.0,0.0,0.0,0.0,1.0


Ensemble Model Predictions on each Sample

In [250]:
rf_pipeline.predict(sample_one_input)[0] # Correct prediction

1

In [251]:
rf_pipeline.predict(sample_two_input)[0] # Correct prediction

0

In [252]:
rf_pipeline.predict(sample_three_input)[0] # Correct prediction

0

LIME

In [253]:
lime = LimeTabular(
    model=rf_pipeline,
    # predict_fn=rf_pipeline.predict_proba,
    data=data_loader.X_train,
    random_state=1,
)

In [258]:
# Get local explanations
lime_local = lime.explain_local(
    sample_inputs,
    [1, 0, 0],
    name="LIME",
)

show(lime_local)

SHAP

In [260]:
columns = list(
            map(lambda x: x[5:], list(data_loader.preprocessor.get_feature_names_out()))
        )
background_val = pd.DataFrame(
            np.median(data_loader.X_train, axis=0).reshape(1, -1), columns=columns
        )
shap = ShapKernel(rf_pipeline, background_val)

In [261]:
shap_local = shap.explain_local(sample_inputs, [1, 0, 0], name="SHAP")
show(shap_local)

  0%|          | 0/3 [00:00<?, ?it/s]

CounterFactual

In [205]:
import dice_ml

In [206]:
merged_df = pd.merge(data_loader.X, data_loader.y, left_index=True, right_index=True)

In [208]:
# Dataset
continuous_features = [
    "age",
    "bmi",
    "HbA1c_level",
    "blood_glucose_level",
    "hypertension",
    "heart_disease",
]
data_dice = dice_ml.Data(
    dataframe=merged_df,
    continuous_features=continuous_features,
    outcome_name="diabetes",
)

In [209]:
rf_dice = dice_ml.Model(
    model=rf_pipeline,
    backend="sklearn",
)
explainer = dice_ml.Dice(
    data_dice,
    rf_dice,
    method="random",
    # method="genetic",
    # method="kdtree",
)

In [262]:
inverse_transform_numeric = data_loader.preprocessor.named_transformers_.num.inverse_transform

In [264]:
features_to_vary = [
    "age",
    "bmi",
    "HbA1c_level",
    "blood_glucose_level",
    "hypertension",
    "heart_disease",
]
features_to_vary = continuous_features
permitted_range = {
    "age": [0, 200],
    "bmi": [10, 40],
    "HbA1c_level": [3, 9],
    "blood_glucose_level": [70, 300],
    "hypertension": [0, 1],
    "heart_disease": [0, 1],
}

In [265]:
# Now generating explanations using the new feature weights
cf = explainer.generate_counterfactuals(
    sample_one_input,
    total_CFs=1,
    desired_class="opposite",
    permitted_range=permitted_range,
    features_to_vary=features_to_vary,
)

# Visualize it
output = cf.visualize_as_dataframe(show_only_changes=True)
output

100%|██████████| 1/1 [03:00<00:00, 180.37s/it]

Query instance (original outcome : 1)





Unnamed: 0,age,bmi,HbA1c_level,blood_glucose_level,hypertension,heart_disease,gender_Female,gender_Male,smoking_history_current,smoking_history_non-smoker,smoking_history_past_smoker,diabetes
0,1.567157,0.533196,0.528659,0.532424,-0.29008,-0.206268,1.0,0.0,0.0,0.0,1.0,1



Diverse Counterfactual set (new outcome: 0.0)


Unnamed: 0,age,bmi,HbA1c_level,blood_glucose_level,hypertension,heart_disease,gender_Female,gender_Male,smoking_history_current,smoking_history_non-smoker,smoking_history_past_smoker,diabetes
0,0.718785,0.533195846511487,0.5286594290458336,0.5324243186915422,-0.2910798169502588,-0.2063680286605339,1.0,0.0,0.0,0.0,1.0,0.0


In [274]:
res = cf.cf_examples_list[0].final_cfs_df[continuous_features]
v1 = inverse_transform_numeric(sample_one_input[continuous_features])
v2 = inverse_transform_numeric(res)
changes_required = pd.DataFrame(data=v2-v1, columns=[
    "age",
    "bmi",
    "HbA1c_level",
    "blood_glucose_level",
    "hypertension",
    "heart_disease",
])
changes_required

Unnamed: 0,age,bmi,HbA1c_level,blood_glucose_level,hypertension,heart_disease
0,-19.057163,0.0,0.0,0.0,0.0,0.0


According to this analysis reducing the Age by 19 years would result in a non-diabetic prediction