In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the stroke dataset CSV from the Google Drive folder.
DATA_PATH = '/content/drive/My Drive/XAI/CF/healthcare-dataset-stroke-data.csv'
df = pd.read_csv(DATA_PATH)

In [None]:
# Preview the raw frame as loaded from the CSV.
df

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
...,...,...,...,...,...,...,...,...,...,...,...,...
5105,18234,Female,80.0,1,0,Yes,Private,Urban,83.75,,never smoked,0
5106,44873,Female,81.0,0,0,Yes,Self-employed,Urban,125.20,40.0,never smoked,0
5107,19723,Female,35.0,0,0,Yes,Self-employed,Rural,82.99,30.6,never smoked,0
5108,37544,Male,51.0,0,0,Yes,Private,Rural,166.29,25.6,formerly smoked,0


In [None]:
# Remove the row identifier and every row with a missing value
# (e.g. `bmi` is empty for some rows of the raw data).
# The calls are non-mutating and tolerate an already-dropped 'id', so this cell
# can be re-run safely; with inplace=True a second run raised KeyError.
df = df.drop(columns='id', errors='ignore').dropna()
df

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
2,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
5,Male,81.0,0,0,Yes,Private,Urban,186.21,29.0,formerly smoked,1
...,...,...,...,...,...,...,...,...,...,...,...
5104,Female,13.0,0,0,No,children,Rural,103.08,18.6,Unknown,0
5106,Female,81.0,0,0,Yes,Self-employed,Urban,125.20,40.0,never smoked,0
5107,Female,35.0,0,0,Yes,Self-employed,Rural,82.99,30.6,never smoked,0
5108,Male,51.0,0,0,Yes,Private,Rural,166.29,25.6,formerly smoked,0


In [None]:
!pip install dice-ml



In [None]:
import dice_ml
from dice_ml.utils import helpers

In [None]:
# Describe the dataset to DiCE: the continuous columns and the outcome column.
d = dice_ml.Data(
    dataframe=df,
    continuous_features=['age', 'avg_glucose_level', 'bmi'],
    outcome_name='stroke',
)

In [None]:

# Split data into train and test
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
import tensorflow as tf

# Separate features and target
target = df["stroke"]
datasetX = df.drop("stroke", axis=1)

# Stratified 80/20 split with a fixed seed so the split is reproducible
x_train, x_test, y_train, y_test = train_test_split(datasetX, target, test_size=0.2, random_state=0, stratify=target)

# Separate numerical and categorical features
numerical = ["age", "avg_glucose_level", "bmi"]
categorical = x_train.columns.difference(numerical)

# Preprocessing for both feature groups.
# The numerical branch is required: ColumnTransformer drops every column that
# is not listed in a transformer (remainder='drop' is the default), so with
# only the 'cat' branch the classifier never saw age, glucose level or BMI.
# Standardising keeps the three features on a comparable scale for the network.
numerical_transformer = Pipeline(steps=[('scale', StandardScaler())])
categorical_transformer = Pipeline(steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))])
transformations = ColumnTransformer(transformers=[
    ('num', numerical_transformer, numerical),
    ('cat', categorical_transformer, categorical),
])

def create_model():
  """Build and compile the binary classifier used inside the sklearn pipeline.

  Returns:
    A compiled Keras ``Sequential`` model with a 12-unit ReLU hidden layer
    and a single sigmoid output unit, using binary cross-entropy loss, the
    Adam optimizer and accuracy as the reported metric.
  """
  layers = [
      Dense(12, activation=tf.nn.relu),
      Dense(1, activation=tf.nn.sigmoid),
  ]
  network = Sequential(layers)
  network.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
  return network

# Wrap the Keras model as a scikit-learn estimator and chain it after the
# preprocessing step.
# `epochs` is the keyword Keras' fit() accepts; the legacy `nb_epoch` spelling
# is tolerated by the wrapper but never forwarded, so training silently ran
# for the default single epoch instead of 100.
clf = KerasClassifier(build_fn=create_model, epochs=100, batch_size=100, verbose=False)
model = Pipeline(steps=[('preprocess', transformations), ('classifier', clf)])
model = model.fit(x_train, y_train)


In [None]:

# Expose the fitted pipeline to DiCE through its scikit-learn backend.
m = dice_ml.Model(
    model=model,
    backend="sklearn",
)
# Build the explainer with the "random" counterfactual generation method.
exp = dice_ml.Dice(
    d,
    m,
    method="random",
)

In [None]:
# Explain one training row (position 28); the slice keeps it a one-row DataFrame.
e1 = exp.generate_counterfactuals(
    x_train.iloc[28:29],
    total_CFs=2,
    desired_class="opposite",
)
# Display the query and its counterfactuals, showing only the changed values.
e1.visualize_as_dataframe(show_only_changes=True)



Query instance (original outcome : 0)




Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Female,70.0,0,0,Yes,Private,Urban,221.580002,47.5,never smoked,0



Diverse Counterfactual set (new outcome: 1.0)


Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Other,69.99999999999976,-,-,-,-,-,-,-,-,1
1,Male,-,-,-,-,-,-,91.42,-,-,1


# Let's try using TensorFlow directly, without the scikit-learn wrapper

In [None]:
# Show the list of continuous feature names.
numerical

['age', 'avg_glucose_level', 'bmi']

In [None]:
# Show the remaining feature names, which are treated as categorical.
categorical

Index(['Residence_type', 'ever_married', 'gender', 'heart_disease',
       'hypertension', 'smoking_status', 'work_type'],
      dtype='object')

In [None]:
# One-hot encode the categorical features for the plain TensorFlow model.
# * Columns are taken in the DataFrame's own order, not the alphabetical order
#   of `categorical`, so the resulting layout lines up with DiCE's
#   `ohe_encoded_feature_names` (gender, hypertension, heart_disease, ...).
#   A network trained on a different column order receives misaligned inputs
#   when DiCE queries it.
# * `.astype(str)` makes the 0/1 flags (hypertension, heart_disease) get dummy
#   columns whatever their dtype; get_dummies leaves numeric columns untouched.
categorical_in_df_order = [col for col in df.columns if col in categorical]
cat = pd.get_dummies(df[categorical_in_df_order].astype(str), prefix_sep='_')

In [None]:
# Assemble the network input: continuous features first, then the dummies.
# The continuous columns are min-max scaled to [0, 1] because DiCE's
# gradient-based TensorFlow explainer queries the model with min-max normalised
# continuous features; a network trained on raw values would be evaluated on
# inputs from a different scale.
# NOTE(review): the scaling uses the min/max of the full `df`; confirm this
# matches the range the installed dice-ml version uses.
num_min = df[numerical].min()
num_range = df[numerical].max() - num_min
new_df = pd.concat([(df[numerical] - num_min) / num_range, cat], axis=1)
new_df.shape

(4909, 23)

In [None]:
# Stratified 80/20 split of the encoded feature matrix, with a fixed seed.
# This rebinds x_train / x_test / y_train / y_test from the earlier split.
x_train, x_test, y_train, y_test = train_test_split(
    new_df,
    target,
    test_size=0.2,
    random_state=0,
    stratify=target,
)

In [None]:
# Feed-forward binary classifier on the encoded features:
# a 12-unit ReLU hidden layer followed by a single sigmoid output.
# The input width is read from the training matrix instead of hard-coding 23,
# so the model stays valid if the encoding gains or loses columns.
ann = Sequential([
    Dense(12, input_shape=(x_train.shape[1],), activation=tf.nn.relu),
    Dense(1, activation=tf.nn.sigmoid),
])
ann.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
# Train for 100 epochs in mini-batches of 32 without progress output.
ann.fit(
    x_train,
    y_train,
    epochs=100,
    batch_size=32,
    verbose=0,
)

<tensorflow.python.keras.callbacks.History at 0x7f558d8b0250>

In [None]:
# Backend tag for DiCE: 'TF' plus the first character of the TensorFlow
# version string.
backend = f"TF{tf.__version__[0]}"
# Wrap the trained Keras network for DiCE.
mmm = dice_ml.Model(
    model=ann,
    backend=backend,
)

In [None]:
# Open an interactive TF1-compat session.
# NOTE(review): `sess` is not referenced again in the visible cells; confirm
# whether it is still needed for the installed TensorFlow version.
sess = tf.compat.v1.InteractiveSession()
# DiCE data description with the same settings as `d`.
new_d = dice_ml.Data(
    dataframe=df,
    continuous_features=['age', 'avg_glucose_level', 'bmi'],
    outcome_name='stroke',
)
# Rebinds `exp`: from here on it explains the TensorFlow model, not the pipeline.
exp = dice_ml.Dice(new_d, mmm)



In [None]:
# Query instance: the second row of the cleaned data as a feature dict.
# The outcome column is left out so the query holds features only; the label
# is not an input to the explainer.
# (The variable name keeps its original spelling in case other cells use it.)
query_intance = df.drop(columns='stroke').iloc[1].to_dict()
e2 = exp.generate_counterfactuals(query_intance, total_CFs=2, desired_class='opposite')

No Counterfactuals found for the given configuation, perhaps try with different values of proximity (or diversity) weights or learning rate... ; total time taken: 02 min 16 sec


In [None]:
# Inspect the one-hot encoded feature names, and their order, as recorded by DiCE.
new_d.ohe_encoded_feature_names

['age',
 'avg_glucose_level',
 'bmi',
 'gender_Female',
 'gender_Male',
 'gender_Other',
 'hypertension_0',
 'hypertension_1',
 'heart_disease_0',
 'heart_disease_1',
 'ever_married_No',
 'ever_married_Yes',
 'work_type_Govt_job',
 'work_type_Never_worked',
 'work_type_Private',
 'work_type_Self-employed',
 'work_type_children',
 'Residence_type_Rural',
 'Residence_type_Urban',
 'smoking_status_Unknown',
 'smoking_status_formerly smoked',
 'smoking_status_never smoked',
 'smoking_status_smokes']

In [None]:
# Display the query instance and any counterfactuals found for it.
e2.visualize_as_dataframe()

Query instance (original outcome : 0)


Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,0.016



No counterfactuals found!
