# Stroke Prediction Classifier Models

According to the World Health Organization (WHO) stroke is the 2nd leading cause of death globally, responsible for approximately 11% of total deaths.
This project uses the [Stroke Prediction Dataset](https://www.kaggle.com/datasets/fedesoriano/stroke-prediction-dataset/data) in Kaggle to to predict whether a patient is likely to get stroke based on personal information that are used as input features such as gender, age, past diseases, smoking status and the like. This project trains and tests the following 6 models:
- Logistic Regression
- Naive Bayes
- Gaussian Discriminant Analysis (GDA)
- Support Vector Machine
- Decision Tree
- Random Forest

# Preprocessing

#### Importing libraries

In [1]:
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis as GDA
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

#### Reading and loading the dataset

In [2]:
file_path = './dataset/healthcare-dataset-stroke-data.csv' 
dataset = pd.read_csv(file_path)
data = dataset.copy()
data.shape

(5110, 12)

#### Cleaning the data 

In [3]:
data.isnull().sum()

id                     0
gender                 0
age                    0
hypertension           0
heart_disease          0
ever_married           0
work_type              0
Residence_type         0
avg_glucose_level      0
bmi                  201
smoking_status         0
stroke                 0
dtype: int64

In [4]:
data.describe()

Unnamed: 0,id,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke
count,5110.0,5110.0,5110.0,5110.0,5110.0,4909.0,5110.0
mean,36517.829354,43.226614,0.097456,0.054012,106.147677,28.893237,0.048728
std,21161.721625,22.612647,0.296607,0.226063,45.28356,7.854067,0.21532
min,67.0,0.08,0.0,0.0,55.12,10.3,0.0
25%,17741.25,25.0,0.0,0.0,77.245,23.5,0.0
50%,36932.0,45.0,0.0,0.0,91.885,28.1,0.0
75%,54682.0,61.0,0.0,0.0,114.09,33.1,0.0
max,72940.0,82.0,1.0,1.0,271.74,97.6,1.0


In [5]:
data['bmi'] = data['bmi'].fillna(data['bmi'].mean())
data.isnull().sum()

id                   0
gender               0
age                  0
hypertension         0
heart_disease        0
ever_married         0
work_type            0
Residence_type       0
avg_glucose_level    0
bmi                  0
smoking_status       0
stroke               0
dtype: int64

In [6]:
data = data.dropna()
data.shape

(5110, 12)

#### Encoding Categorical Variables

In [7]:
label_encoders = {}
categorical_columns = ['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']

for column in categorical_columns:
    le = LabelEncoder()
    data[column] = le.fit_transform(data[column])
    label_encoders[column] = le 

#### Scaling Numerical Features

In [8]:
scaler = StandardScaler()
numerical_columns = ['age', 'avg_glucose_level', 'bmi']
data[numerical_columns] = scaler.fit_transform(data[numerical_columns])

In [9]:
X = data.drop(columns=['id', 'stroke'])
Y = data['stroke']


# Split into train (70%), test (20%), and validation (10%) sets
X_train, X_temp, y_train, y_temp = train_test_split(X, Y, test_size=0.3, random_state=42, stratify=Y)
X_test, X_val, y_test, y_val = train_test_split(X_temp, y_temp, test_size=0.33, random_state=42, stratify=y_temp)

# Handle class imbalance with SMOTE
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

#### Visualize the Data

# Training the Models

### 1. Logistic Regression

In [11]:
# Training with Logistic Regression
lr_model = LogisticRegression(random_state=42, max_iter=1000)
lr_model.fit(X_resampled, y_resampled)

# Evaluating on Test Set
y_pred_test = lr_model.predict(X_test)
print("Logistic Regression")
print(f"Accuracy: {accuracy_score(y_test, y_pred_test):.2f}")
print("Classification Report:")
print(classification_report(y_test, y_pred_test, zero_division=0))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_test))

Logistic Regression
Accuracy: 0.75
Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.75      0.85       977
           1       0.13      0.72      0.22        50

    accuracy                           0.75      1027
   macro avg       0.56      0.74      0.53      1027
weighted avg       0.94      0.75      0.82      1027

Confusion Matrix:
[[734 243]
 [ 14  36]]


### 2. Gaussian Discriminant Analysis GDA

In [12]:
# Train Gaussian Discriminant Analysis (GDA)
gda_model = GDA()  
gda_model.fit(X_resampled, y_resampled) 

# Evaluate on Test Set
y_pred_test = gda_model.predict(X_test)  
print("Gaussian Discriminant Analysis (GDA)")
print(f"Accuracy: {accuracy_score(y_test, y_pred_test):.2f}")
print("Classification Report:")
print(classification_report(y_test, y_pred_test, zero_division=0))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_test))

Gaussian Discriminant Analysis (GDA)
Accuracy: 0.73
Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.73      0.83       977
           1       0.12      0.72      0.20        50

    accuracy                           0.73      1027
   macro avg       0.55      0.72      0.52      1027
weighted avg       0.94      0.73      0.80      1027

Confusion Matrix:
[[709 268]
 [ 14  36]]


### 3. Naive Bayes

In [13]:
# Train Naive Bayes
nb_model = GaussianNB()  
nb_model.fit(X_resampled, y_resampled)  

# Evaluate on Test Set
y_pred_test = nb_model.predict(X_test) 
print("Naive Bayes")
print(f"Accuracy: {accuracy_score(y_test, y_pred_test):.2f}")
print("Classification Report:")
print(classification_report(y_test, y_pred_test, zero_division=0))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_test))

Naive Bayes
Accuracy: 0.69
Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.69      0.81       977
           1       0.11      0.76      0.19        50

    accuracy                           0.69      1027
   macro avg       0.55      0.72      0.50      1027
weighted avg       0.94      0.69      0.78      1027

Confusion Matrix:
[[672 305]
 [ 12  38]]


### 4. Support Vector Machine 

In [14]:
# Training with SVM
svm_model = SVC(random_state=42, kernel='linear', probability=True)
svm_model.fit(X_resampled, y_resampled)

# Evaluating on Test Set
y_pred_test = svm_model.predict(X_test)
print("Support Vector Machine")
print(f"Accuracy: {accuracy_score(y_test, y_pred_test):.2f}")
print("Classification Report:")
print(classification_report(y_test, y_pred_test, zero_division=0))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_test))

Support Vector Machine
Accuracy: 0.74
Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.74      0.84       977
           1       0.13      0.76      0.22        50

    accuracy                           0.74      1027
   macro avg       0.56      0.75      0.53      1027
weighted avg       0.94      0.74      0.81      1027

Confusion Matrix:
[[719 258]
 [ 12  38]]


### 5. Decision Tree

In [15]:
# Training with Decision Tree
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X_resampled, y_resampled)

# Evaluating with Test Set
y_pred_test = dt_model.predict(X_test)
print("Decision Tree")
print(f"Accuracy: {accuracy_score(y_test, y_pred_test):.2f}")
print("Classification Report:")
print(classification_report(y_test, y_pred_test, zero_division=0))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_test))

Decision Tree
Accuracy: 0.85
Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.88      0.92       977
           1       0.10      0.24      0.14        50

    accuracy                           0.85      1027
   macro avg       0.53      0.56      0.53      1027
weighted avg       0.92      0.85      0.88      1027

Confusion Matrix:
[[863 114]
 [ 38  12]]


### 6. Random Forest

In [16]:
# Training with Random Forest
rf_model = RandomForestClassifier(random_state=42, n_estimators=100)
rf_model.fit(X_resampled, y_resampled)

# Evaluating on Test Set
y_pred_test = rf_model.predict(X_test)
print("Random Forest")
print(f"Accuracy: {accuracy_score(y_test, y_pred_test):.2f}")
print("Classification Report:")
print(classification_report(y_test, y_pred_test, zero_division=0))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_test))

Random Forest
Accuracy: 0.88
Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.91      0.94       977
           1       0.12      0.22      0.15        50

    accuracy                           0.88      1027
   macro avg       0.54      0.57      0.54      1027
weighted avg       0.92      0.88      0.90      1027

Confusion Matrix:
[[893  84]
 [ 39  11]]


## Comparing the algorithms used

In [17]:
results = {
    "Algorithm": [],
    "Accuracy": [],
    "Precision (macro)": [],
    "Recall (macro)": [],
    "F1-score (macro)": []
}


def evaluate_model(model, algorithm_name, X_test, y_test):
    y_pred = model.predict(X_test)
    report = classification_report(y_test, y_pred, output_dict=True, zero_division=0)
    
    results["Algorithm"].append(algorithm_name)
    results["Accuracy"].append(accuracy_score(y_test, y_pred))
    results["Precision (macro)"].append(report["macro avg"]["precision"])
    results["Recall (macro)"].append(report["macro avg"]["recall"])
    results["F1-score (macro)"].append(report["macro avg"]["f1-score"])

# Evaluate pre-trained models
evaluate_model(lr_model, "Logistic Regression", X_test, y_test)
evaluate_model(nb_model, "Naive Bayes", X_test, y_test)
evaluate_model(gda_model, "Gaussian Discriminant Analysis(GDA)", X_test, y_test)
evaluate_model(svm_model, "Support Vector Machine", X_test, y_test)
evaluate_model(dt_model, "Decision Tree", X_test, y_test)
evaluate_model(rf_model, "Random Forest", X_test, y_test)

# Tabulate Results
results_df = pd.DataFrame(results)
results_df

Unnamed: 0,Algorithm,Accuracy,Precision (macro),Recall (macro),F1-score (macro)
0,Logistic Regression,0.749757,0.555158,0.73564,0.53493
1,Naive Bayes,0.691334,0.546622,0.72391,0.501268
2,Gaussian Discriminant Analysis(GDA),0.725414,0.549529,0.722845,0.518754
3,Support Vector Machine,0.737098,0.555981,0.747963,0.530787
4,Decision Tree,0.851996,0.526531,0.561658,0.527713
5,Random Forest,0.880234,0.536972,0.567011,0.543646
