In [15]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from preprocessing import *
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, roc_auc_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from imblearn.over_sampling import SMOTE
import shap
from lime.lime_tabular import LimeTabularExplainer
from anchor import anchor_tabular

In [16]:
filepath = 'healthcare-dataset-stroke-data.csv'
data = pd.read_csv(filepath)
data.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


<font color="bluegrey" size=+1.0><b>Data Preprocessing</b></font>

In [17]:
X, y = preprocess_data(filepath, encoding_strategy="label", scale=False)

sample_sizes = [25, 50, 75, 100, 250, 500, 750, 1000, 2500, 5000]

training_sample_sizes = [max(size - size // 5, 1) for size in sample_sizes]  
testing_sample_sizes = [size - train_size for size, train_size in zip(sample_sizes, training_sample_sizes)]  

print("Training sample sizes:", training_sample_sizes)
print("Testing sample sizes:", testing_sample_sizes)

Training sample sizes: [20, 40, 60, 80, 200, 400, 600, 800, 2000, 4000]
Testing sample sizes: [5, 10, 15, 20, 50, 100, 150, 200, 500, 1000]


In [20]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

# Check if SMOTE worked by displaying the class distribution after resampling
print(f"Before SMOTE: {y_train.value_counts()}")
print(f"After SMOTE: {y_train_res.value_counts()}")

Before SMOTE: stroke
0    3901
1     187
Name: count, dtype: int64
After SMOTE: stroke
0    3901
1    3901
Name: count, dtype: int64


<font color="bluegrey" size=+1.0><b>Decision Tree</b></font>

In [21]:
for train_size, test_size in zip(training_sample_sizes, testing_sample_sizes):
    print(f"\nTraining with {train_size} samples and testing with {test_size} samples")
    
    # Split the dataset into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=train_size, test_size=test_size, stratify=y
    )

    # Train the model
    dt_model = DecisionTreeClassifier(random_state=42, class_weight="balanced")
    dt_model.fit(X_train, y_train)
    
    # Make predictions
    y_pred = dt_model.predict(X_test)
    
    # Print metrics
    print("Decision Tree Metrics:")
    print(f"Accuracy: {accuracy_score(y_test, y_pred):.2f}")
    
    # Flatten classification report
    report = classification_report(y_test, y_pred, output_dict=True)
    report_df = pd.DataFrame(report).transpose()  # Convert to DataFrame for tabular display
    
    # Ensure support values are integers
    report_df['support'] = report_df['support'].fillna(0).astype(int)
    
    print("Classification Report (Flattened):")
    print(report_df.round(2))  # Round other values for better readability
    
    # Feature importance
    feature_importance = pd.Series(dt_model.feature_importances_, index=X.columns)
    print("Feature importance:")
    print(feature_importance.sort_values(ascending=False))
    
    print("=" * 50)


Training with 20 samples and testing with 5 samples
Decision Tree Metrics:
Accuracy: 1.00
Classification Report (Flattened):
              precision  recall  f1-score  support
0                   1.0     1.0       1.0        5
accuracy            1.0     1.0       1.0        1
macro avg           1.0     1.0       1.0        5
weighted avg        1.0     1.0       1.0        5
Feature importance:
stroke_risk_score            8.095238e-01
age                          1.904762e-01
smoking_stroke_risk          1.887379e-15
gender                       0.000000e+00
bmi_stroke_risk              0.000000e+00
glucose_stroke_risk          0.000000e+00
residence_stroke_risk        0.000000e+00
work_type_stroke_risk        0.000000e+00
ever_married_stroke_risk     0.000000e+00
heart_disease_stroke_risk    0.000000e+00
hypertension_stroke_risk     0.000000e+00
age_stroke_risk              0.000000e+00
gender_stroke_risk           0.000000e+00
smoking_status               0.000000e+00
bmi        

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
