In [None]:
# `plt` must be the pyplot interface (plt.figure / plt.subplots / plt.show);
# the top-level `matplotlib` package does not expose those as callables.
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import shap
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import make_pipeline as make_imb_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import StratifiedKFold, cross_val_score, train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Read the stroke dataset from the CSV file (path is relative to the working
# directory) and preview its first five rows as the cell output.
data = pd.read_csv(filepath_or_buffer='healthcare-dataset-stroke-data.csv')
data.head(n=5)

## Data Preprocessing

In [None]:
# Drop the row identifier column. Re-binding `data` (instead of dropping in
# place) together with errors="ignore" keeps this cell re-runnable: the
# in-place version raised KeyError when executed a second time.
data = data.drop(columns=["id"], errors="ignore")

# Encode categorical features as integer codes (one fresh encoder per column).
categorical_columns = ["gender", "ever_married", "work_type", "Residence_type", "smoking_status"]
for col in categorical_columns:
    data[col] = LabelEncoder().fit_transform(data[col])

# Handle missing values: impute BMI with the column mean.
# The result is assigned back explicitly; `data["bmi"].fillna(..., inplace=True)`
# is chained in-place assignment, which warns on pandas 2.x and leaves `data`
# unchanged once Copy-on-Write is enabled.
data["bmi"] = data["bmi"].fillna(data["bmi"].mean())

# Split data into features and target
X = data.drop(columns=["stroke"])
y = data["stroke"]

# Train-test split FIRST, stratified on the target so both parts keep the
# original class ratio. The held-out test set contains only real rows.
X_train_raw, X_test, y_train_raw, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Balance ONLY the training portion with SMOTE. Oversampling before the split
# leaks information: synthetic samples interpolated from test rows end up in
# the training data, and the test set itself contains synthetic rows.
# NOTE(review): SMOTE interpolates the integer category codes as if they were
# continuous values; SMOTENC may be a better fit -- confirm before relying on it.
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train_raw, y_train_raw)

# Names consumed by the model cells: the balanced training set.
X_train, y_train = X_resampled, y_resampled

## Decision Tree

In [None]:
# Fit a decision tree on the training split.
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X_train, y_train)

# Cross-validation
# SMOTE is placed inside an imbalanced-learn pipeline and the CV runs on the
# original (un-resampled) data, so oversampling is re-fitted on the training
# part of every fold only. Scoring a model on data that was oversampled before
# the folds were cut gives optimistic results. cross_val_score clones the
# estimator, so the fitted `dt_model` above is left untouched.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_pipeline = make_imb_pipeline(SMOTE(random_state=42), dt_model)
cv_scores = cross_val_score(cv_pipeline, X, y, scoring='accuracy', cv=skf)
print("Decision Tree Cross-Validation Scores:", cv_scores)
print("Mean CV Accuracy:", cv_scores.mean())

# Evaluate the model on the held-out split
class_names = ["No Stroke", "Stroke"]
y_pred = dt_model.predict(X_test)
print("Decision Tree Classification Report:")
print(classification_report(y_test, y_pred, target_names=class_names))

# Confusion Matrix (explicit figure/axes instead of the pyplot state machine)
conf_matrix = confusion_matrix(y_test, y_pred)
fig, ax = plt.subplots(figsize=(6, 4))
sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="Blues",
            xticklabels=class_names, yticklabels=class_names, ax=ax)
ax.set_title("Confusion Matrix")
ax.set_ylabel("Actual")
ax.set_xlabel("Predicted")
plt.show()

# Feature Importance
feature_names = list(X_test.columns)
importance_df = pd.DataFrame(
    {"Feature": feature_names, "Importance": dt_model.feature_importances_}
).sort_values(by="Importance", ascending=False)
print("Feature Importances:\n", importance_df)

# SHAP Explanation
explainer = shap.TreeExplainer(dt_model)
shap_values = explainer.shap_values(X_test)

# Select the contributions towards the positive ("Stroke") class.
# Depending on the SHAP version, a classifier yields either a list with one
# (n_samples, n_features) array per class or a single
# (n_samples, n_features, n_classes) array. Indexing the latter with [1]
# would silently pick sample 1 instead of class 1.
if isinstance(shap_values, list):
    stroke_shap_values = shap_values[1]
elif shap_values.ndim == 3:
    stroke_shap_values = shap_values[:, :, 1]
else:
    stroke_shap_values = shap_values

base_values = explainer.expected_value
stroke_base_value = base_values[1] if hasattr(base_values, "__len__") else base_values

# Global SHAP Summary Plot
shap.summary_plot(stroke_shap_values, X_test, feature_names=feature_names)

# SHAP Force Plot for a Specific Instance
shap.initjs()  # load the JS library the interactive force plot needs
instance_idx = 0  # Adjust to view a specific test sample
shap.force_plot(stroke_base_value, stroke_shap_values[instance_idx],
                X_test.iloc[instance_idx], feature_names=feature_names)

## Logistic Regression

In [None]:
lr_model = LogisticRegression(max_iter=500, random_state=42)

# Cross-validation
# SMOTE runs inside the pipeline on the training part of each fold only, and
# the folds are cut from the original data; cross-validating on data that was
# oversampled beforehand gives optimistic scores.
cv_pipeline = make_imb_pipeline(SMOTE(random_state=42), lr_model)
cv_scores = cross_val_score(cv_pipeline, X, y, scoring='accuracy', cv=skf)
print("Logistic Regression Cross-Validation Scores:", cv_scores)
print("Mean CV Accuracy:", cv_scores.mean())

# Train the model on the training split
lr_model.fit(X_train, y_train)

# Evaluate the model on the held-out split (same protocol as the Decision Tree
# cell). Predicting on `X` after fitting on data derived from `X` would score
# the model on rows it was trained on.
y_pred = lr_model.predict(X_test)
print("Logistic Regression Classification Report:")
print(classification_report(y_test, y_pred, target_names=["No Stroke", "Stroke"]))

# XAI with SHAP: the training data serves as background, the test rows are explained
explainer = shap.Explainer(lr_model, X_train)
shap_values = explainer(X_test)

# Global Feature Importance
shap.summary_plot(shap_values, X_test, plot_type="bar")

# Local Explanation for a specific instance
# For a binary linear model the explainer's expected value is a scalar, so
# `explainer.expected_value[1]` cannot be indexed. The Explanation row already
# carries its base value, SHAP values, feature values and feature names, so it
# is passed to the force plot on its own.
shap.initjs()  # load the JS library the interactive force plot needs
shap.force_plot(shap_values[0])

## Naive Bayes

In [None]:
nb_model = GaussianNB()

# Cross-validation
# SMOTE runs inside the pipeline on the training part of each fold only, and
# the folds are cut from the original data; cross-validating on data that was
# oversampled beforehand gives optimistic scores.
cv_pipeline = make_imb_pipeline(SMOTE(random_state=42), nb_model)
cv_scores = cross_val_score(cv_pipeline, X, y, scoring='accuracy', cv=skf)
print("Naive Bayes Cross-Validation Scores:", cv_scores)
print("Mean CV Accuracy:", cv_scores.mean())

# Train the model on the training split
nb_model.fit(X_train, y_train)

# Evaluate the model on the held-out split (same protocol as the Decision Tree
# cell) rather than on rows the model was fitted on.
y_pred = nb_model.predict(X_test)
print("Naive Bayes Classification Report:")
print(classification_report(y_test, y_pred, target_names=["No Stroke", "Stroke"]))


def _nb_stroke_probability(rows):
    """Return the predicted probability of the "Stroke" class for each row.

    `rows` may be an array or a DataFrame in the column order of `X_train`.
    It is re-wrapped with the training column names so the fitted model
    receives the same feature names it was trained with.
    """
    return nb_model.predict_proba(pd.DataFrame(rows, columns=X_train.columns))[:, 1]


# XAI with SHAP
# A GaussianNB instance is neither callable nor a linear/tree model, so
# shap.Explainer cannot analyse it directly; a prediction function is explained
# instead. Model-agnostic explainers evaluate the model many times per
# explained row, so small random samples are used for the background and for
# the rows to explain.
N_BACKGROUND = 50
N_EXPLAINED = 50
background_rows = X_train.sample(n=min(N_BACKGROUND, len(X_train)), random_state=42)
explained_rows = X_test.sample(n=min(N_EXPLAINED, len(X_test)), random_state=42)

explainer = shap.Explainer(_nb_stroke_probability, background_rows)
shap_values = explainer(explained_rows)

# Global Feature Importance
shap.summary_plot(shap_values, explained_rows, plot_type="bar")

# Local Explanation for a specific instance
# Model-agnostic explainers do not expose `expected_value`; the Explanation
# row carries its own base value, feature values and feature names.
shap.initjs()  # load the JS library the interactive force plot needs
shap.force_plot(shap_values[0])

## K-Nearest Neighbours

In [None]:
knn_model = KNeighborsClassifier(n_neighbors=5)

# Cross-validation
# SMOTE runs inside the pipeline on the training part of each fold only, and
# the folds are cut from the original data; cross-validating on data that was
# oversampled beforehand gives optimistic scores.
cv_pipeline = make_imb_pipeline(SMOTE(random_state=42), knn_model)
cv_scores = cross_val_score(cv_pipeline, X, y, scoring='accuracy', cv=skf)
print("KNN Cross-Validation Scores:", cv_scores)
print("Mean CV Accuracy:", cv_scores.mean())

# Train the model on the training split
knn_model.fit(X_train, y_train)

# Evaluate the model on the held-out split (same protocol as the Decision Tree
# cell) rather than on rows the model was fitted on.
y_pred = knn_model.predict(X_test)
print("KNN Classification Report:")
print(classification_report(y_test, y_pred, target_names=["No Stroke", "Stroke"]))


def _knn_stroke_probability(rows):
    """Return the predicted probability of the "Stroke" class for each row.

    `rows` may be an array or a DataFrame in the column order of `X_train`.
    It is re-wrapped with the training column names so the fitted model
    receives the same feature names it was trained with.
    """
    return knn_model.predict_proba(pd.DataFrame(rows, columns=X_train.columns))[:, 1]


# XAI with SHAP
# A KNeighborsClassifier instance is neither callable nor a linear/tree model,
# so shap.Explainer cannot analyse it directly; a prediction function is
# explained instead. Model-agnostic explainers evaluate the model many times
# per explained row (and every KNN prediction is a neighbour search), so small
# random samples are used for the background and for the rows to explain.
N_BACKGROUND = 50
N_EXPLAINED = 50
background_rows = X_train.sample(n=min(N_BACKGROUND, len(X_train)), random_state=42)
explained_rows = X_test.sample(n=min(N_EXPLAINED, len(X_test)), random_state=42)

explainer = shap.Explainer(_knn_stroke_probability, background_rows)
shap_values = explainer(explained_rows)

# Global Feature Importance
shap.summary_plot(shap_values, explained_rows, plot_type="bar")

# Local Explanation for a specific instance
# Model-agnostic explainers do not expose `expected_value`; the Explanation
# row carries its own base value, feature values and feature names.
shap.initjs()  # load the JS library the interactive force plot needs
shap.force_plot(shap_values[0])