In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler
from imblearn.over_sampling import SMOTE
import shap

In [None]:
# Read the stroke dataset (expected next to the notebook) into `data`
# and preview its first five rows as the cell output.
data = pd.read_csv(filepath_or_buffer='healthcare-dataset-stroke-data.csv')
data.head(n=5)

## Data Preprocessing

In [None]:
# Preprocess the raw stroke table and build the training / evaluation sets.
#
# Produces (names used by the model cells below):
#   X_resampled, y_resampled : SMOTE-balanced TRAINING data (models are fit on this)
#   X, y                     : held-out evaluation rows (models are scored on this)
#   X_train, y_train, X_test, y_test : the underlying stratified split
#
# Every statistic that is learned from data (BMI median, scaler mean/std,
# SMOTE neighbours) is fitted on the training split only, so the evaluation
# rows never influence the models they are used to score.

# Check for missing values
print(data.isnull().sum())

# Drop the row identifier: it is not a predictor, and SMOTE would otherwise
# interpolate it like a real feature. `errors='ignore'` keeps the cell
# re-runnable once the column is gone.
data = data.drop(columns=['id'], errors='ignore')

# Encode categorical variables
categorical_cols = ['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']
label_encoders = {col: LabelEncoder() for col in categorical_cols}
for col, encoder in label_encoders.items():
    data[col] = encoder.fit_transform(data[col])

# Hold out a stratified 20% evaluation split BEFORE fitting anything else.
# Positions (not labels) are split so the same rows can be selected with .iloc.
train_pos, test_pos = train_test_split(
    np.arange(len(data)),
    test_size=0.2,
    stratify=data['stroke'],
    random_state=42,
)

# Handle missing values (fill BMI with the median of the training rows)
bmi_median = data['bmi'].iloc[train_pos].median()
data['bmi'] = data['bmi'].fillna(bmi_median)

# Normalize numerical features (scaler fitted on training rows, applied to all)
numerical_cols = ['age', 'avg_glucose_level', 'bmi']
scaler = StandardScaler()
scaler.fit(data.iloc[train_pos][numerical_cols])
data[numerical_cols] = scaler.transform(data[numerical_cols])

# class distribution
print(data['stroke'].value_counts())

features = data.drop('stroke', axis=1)
target = data['stroke']

X_train, y_train = features.iloc[train_pos], target.iloc[train_pos]
X_test, y_test = features.iloc[test_pos], target.iloc[test_pos]

# The evaluation code in the model cells predicts on `X` and compares with `y`,
# so bind those names to the held-out rows.
X, y = X_test, y_test

# Oversample the minority class on the training split only; synthetic rows
# must never be derived from (or leak into) the evaluation split.
# NOTE(review): SMOTE also interpolates the integer-coded categorical columns,
# yielding non-integer category codes -- consider SMOTENC if that matters.
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

## Decision Tree

In [None]:
# Fit a decision tree on the SMOTE-balanced data, report its classification
# metrics on X / y, and explain it with SHAP (global bar plot + one local plot).
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X_resampled, y_resampled)

# Evaluate Decision Tree
# NOTE(review): this report is only an unbiased estimate if the rows in X / y
# were not used to build X_resampled -- confirm where those are created.
y_pred = dt_model.predict(X)
print("Decision Tree Classification Report:")
print(classification_report(y, y_pred))

# XAI with SHAP
# A small random background is enough for the explainer and avoids the slow
# path triggered by passing the full resampled frame as background data.
background = X_resampled.sample(n=min(100, len(X_resampled)), random_state=42)
explainer = shap.Explainer(dt_model, background)
shap_values = explainer(X_resampled)
# For a classifier the explanation can carry a trailing per-class axis;
# keep only the contributions towards the positive class (stroke = 1).
if shap_values.values.ndim == 3:
    shap_values = shap_values[:, :, 1]

# Global Feature Importance
shap.summary_plot(shap_values.values, X_resampled, plot_type="bar")

# Local Explanation for a specific instance
instance = X.iloc[0]
local_values = explainer(X.iloc[[0]])
if local_values.values.ndim == 3:
    local_values = local_values[:, :, 1]
# Pass the base value, the SHAP row and the feature values of the SAME
# instance; matplotlib=True renders without needing shap.initjs().
shap.force_plot(
    np.ravel(local_values.base_values)[0],
    local_values.values[0],
    instance.round(2),
    feature_names=list(X.columns),
    matplotlib=True,
)

## Logistic Regression

In [None]:
# Fit a logistic regression on the SMOTE-balanced data, report its
# classification metrics on X / y, and explain it with SHAP.
lr_model = LogisticRegression(max_iter=500, random_state=42)
lr_model.fit(X_resampled, y_resampled)

# Evaluate Logistic Regression
# NOTE(review): this report is only an unbiased estimate if the rows in X / y
# were not used to build X_resampled -- confirm where those are created.
y_pred = lr_model.predict(X)
print("Logistic Regression Classification Report:")
print(classification_report(y, y_pred))

# XAI with SHAP
explainer = shap.Explainer(lr_model, X_resampled)
shap_values = explainer(X_resampled)

# Global Feature Importance
shap.summary_plot(shap_values.values, X_resampled, plot_type="bar")

# Local Explanation for a specific instance
instance = X.iloc[0]
local_values = explainer(X.iloc[[0]])
# A binary linear model has a single scalar expected value, so indexing
# `explainer.expected_value[1]` fails; read the base value from the
# explanation itself instead. matplotlib=True renders without shap.initjs().
shap.force_plot(
    np.ravel(local_values.base_values)[0],
    local_values.values[0],
    instance.round(2),
    feature_names=list(X.columns),
    matplotlib=True,
)

## Naive Bayes

In [None]:
# Fit Gaussian Naive Bayes on the SMOTE-balanced data, report its
# classification metrics on X / y, and explain it with SHAP.
nb_model = GaussianNB()
nb_model.fit(X_resampled, y_resampled)

# Evaluate Naive Bayes
# NOTE(review): this report is only an unbiased estimate if the rows in X / y
# were not used to build X_resampled -- confirm where those are created.
y_pred = nb_model.predict(X)
print("Naive Bayes Classification Report:")
print(classification_report(y, y_pred))

# XAI with SHAP
# GaussianNB is not callable and has no dedicated SHAP explainer, so
# shap.Explainer(nb_model, ...) raises. Explain the probability function
# with a model-agnostic explainer instead. That approach evaluates the model
# many times per explained row, so use a small background sample and explain
# a random subset of rows rather than the full resampled frame.
background = X_resampled.sample(n=min(100, len(X_resampled)), random_state=42)
explain_rows = X_resampled.sample(n=min(200, len(X_resampled)), random_state=0)
explainer = shap.Explainer(nb_model.predict_proba, background)
shap_values = explainer(explain_rows)
# predict_proba has one output per class; keep the positive class (stroke = 1).
if shap_values.values.ndim == 3:
    shap_values = shap_values[:, :, 1]

# Global Feature Importance
shap.summary_plot(shap_values.values, explain_rows, plot_type="bar")

# Local Explanation for a specific instance
instance = X.iloc[0]
local_values = explainer(X.iloc[[0]])
if local_values.values.ndim == 3:
    local_values = local_values[:, :, 1]
# Model-agnostic explainers expose no `expected_value`; the base value lives
# on the explanation. matplotlib=True renders without shap.initjs().
shap.force_plot(
    np.ravel(local_values.base_values)[0],
    local_values.values[0],
    instance.round(2),
    feature_names=list(X.columns),
    matplotlib=True,
)

## K-Nearest Neighbours

In [None]:
# Fit a 5-nearest-neighbours classifier on the SMOTE-balanced data, report its
# classification metrics on X / y, and explain it with SHAP.
knn_model = KNeighborsClassifier(n_neighbors=5)
knn_model.fit(X_resampled, y_resampled)

# Evaluate KNN
# NOTE(review): this report is only an unbiased estimate if the rows in X / y
# were not used to build X_resampled -- confirm where those are created.
y_pred = knn_model.predict(X)
print("KNN Classification Report:")
print(classification_report(y, y_pred))

# XAI with SHAP
# KNeighborsClassifier is not callable and has no dedicated SHAP explainer, so
# shap.Explainer(knn_model, ...) raises. Explain the probability function
# with a model-agnostic explainer instead. Each explained row costs many KNN
# predictions, so keep both the background and the explained subset small.
background = X_resampled.sample(n=min(50, len(X_resampled)), random_state=42)
explain_rows = X_resampled.sample(n=min(100, len(X_resampled)), random_state=0)
explainer = shap.Explainer(knn_model.predict_proba, background)
shap_values = explainer(explain_rows)
# predict_proba has one output per class; keep the positive class (stroke = 1).
if shap_values.values.ndim == 3:
    shap_values = shap_values[:, :, 1]

# Global Feature Importance
shap.summary_plot(shap_values.values, explain_rows, plot_type="bar")

# Local Explanation for a specific instance
instance = X.iloc[0]
local_values = explainer(X.iloc[[0]])
if local_values.values.ndim == 3:
    local_values = local_values[:, :, 1]
# Model-agnostic explainers expose no `expected_value`; the base value lives
# on the explanation. matplotlib=True renders without shap.initjs().
shap.force_plot(
    np.ravel(local_values.base_values)[0],
    local_values.values[0],
    instance.round(2),
    feature_names=list(X.columns),
    matplotlib=True,
)