<center><font color="lightbeige" size=+1.0><b>Data feature selection / extraction using SCI-XAI pipeline</b></font><center>

<font color="bluegrey" size=+1.0><b>Load and Read dataset</b></font>

In [10]:
import pandas as pd

# Load the dataset
f_path = "healthcare-dataset-stroke-data.csv"
df = pd.read_csv(f_path)

In [11]:
# Relevant features for post-stroke treatment plans
selected_features = ['age', 'hypertension', 'heart_disease', 'avg_glucose_level', 'bmi', 'smoking_status']

# Relevant columns
df_selected = df[selected_features + ['stroke']]

<font color="bluegrey" size=+1.0><b>Handling missing data</b></font>

In [12]:
# Replace 'N/A' with pd.NA for proper handling
df_selected['bmi'].replace('N/A', pd.NA, inplace=True)  
# Convert 'bmi' to numeric 
df_selected['bmi'] = pd.to_numeric(df_selected['bmi'], errors='coerce')  

print("Summary Statistics of Selected Features:")
print(df_selected[selected_features].describe())

Summary Statistics of Selected Features:
               age  hypertension  heart_disease  avg_glucose_level  \
count  5110.000000   5110.000000    5110.000000        5110.000000   
mean     43.226614      0.097456       0.054012         106.147677   
std      22.612647      0.296607       0.226063          45.283560   
min       0.080000      0.000000       0.000000          55.120000   
25%      25.000000      0.000000       0.000000          77.245000   
50%      45.000000      0.000000       0.000000          91.885000   
75%      61.000000      0.000000       0.000000         114.090000   
max      82.000000      1.000000       1.000000         271.740000   

               bmi  
count  4909.000000  
mean     28.893237  
std       7.854067  
min      10.300000  
25%      23.500000  
50%      28.100000  
75%      33.100000  
max      97.600000  


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_selected['bmi'].replace('N/A', pd.NA, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_selected['bmi'] = pd.to_numeric(df_selected['bmi'], errors='coerce')


<font color="bluegrey" size=+1.0><b>SCI-XAI pipeline and feature selection</b></font>

In [16]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.impute import SimpleImputer


X = pd.get_dummies(df_selected[selected_features], columns=['smoking_status'])  # One-hot encode 'smoking_status'

# Impute missing values
imputer = SimpleImputer(strategy='mean') 
X_imputed = pd.DataFrame(imputer.fit_transform(X), columns=X.columns)

y = df_selected['stroke']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_imputed, y, test_size=0.2, random_state=42)

# Model Selection using Gradient Boosting for stroke prediction
model_gradient_boosting = GradientBoostingClassifier(random_state=42)
model_gradient_boosting.fit(X_train, y_train)

<font color="bluegrey" size=+1.0><b>Data exploration</b></font>

In [17]:
print("Selected Features:")
print(selected_features)

# Display summary statistics of the selected features
print(df[selected_features].describe())

Selected Features:
['age', 'hypertension', 'heart_disease', 'avg_glucose_level', 'bmi', 'smoking_status']
               age  hypertension  heart_disease  avg_glucose_level  \
count  5110.000000   5110.000000    5110.000000        5110.000000   
mean     43.226614      0.097456       0.054012         106.147677   
std      22.612647      0.296607       0.226063          45.283560   
min       0.080000      0.000000       0.000000          55.120000   
25%      25.000000      0.000000       0.000000          77.245000   
50%      45.000000      0.000000       0.000000          91.885000   
75%      61.000000      0.000000       0.000000         114.090000   
max      82.000000      1.000000       1.000000         271.740000   

               bmi  
count  4909.000000  
mean     28.893237  
std       7.854067  
min      10.300000  
25%      23.500000  
50%      28.100000  
75%      33.100000  
max      97.600000  


<font color="bluegrey" size=+1.0><b>XAI techniques - Eli5, LIME, and SHAP</b></font>

In [19]:
import eli5
from lime import lime_tabular
import shap
import matplotlib.pyplot as plt

# Gradient Boosting model trained on dataset
model_stroke = model_gradient_boosting

# X_train_stroke and y_train_stroke as stroke prediction dataset
X_train_stroke, X_test_stroke, y_train_stroke, y_test_stroke = train_test_split(X, y, test_size=0.2, random_state=42)

# Apply Eli5 for stroke prediction
explainability_stroke = eli5.show_weights(model_stroke, feature_names=selected_features)

# Apply LIME for stroke prediction
lime_explainer = lime_tabular.LimeTabularExplainer(X_train_stroke.values, mode='classification', feature_names=selected_features)
instance_to_explain = X_test_stroke.iloc[0]
lime_explanation = lime_explainer.explain_instance(instance_to_explain.values, model_stroke.predict_proba)

# Explores advanced local surrogate methods i.e SHAP for feature importance analysis
explainer = shap.TreeExplainer(model_stroke)
shap_values = explainer.shap_values(X_test_stroke)

# Feature importance plot using SHAP
shap.summary_plot(shap_values, X_test_stroke, feature_names=selected_features)

# Visualize specific instance using SHAP
shap.initjs()
shap.force_plot(explainer.expected_value, shap_values[0, :], instance_to_explain, feature_names=selected_features)

ImportError: cannot import name 'if_delegate_has_method' from 'sklearn.utils.metaestimators' (c:\Users\zen\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\utils\metaestimators.py)