In [15]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this is a blanket filter, so deprecation and convergence
# warnings from pandas / scikit-learn are hidden too -- consider narrowing it
# to the specific category that was noisy.
warnings.filterwarnings("ignore")

In [16]:
# Load the raw stroke dataset from the working directory and show a preview.
csv_path = "healthcare-dataset-stroke-data.csv"
data = pd.read_csv(csv_path)
data

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
...,...,...,...,...,...,...,...,...,...,...,...,...
5105,18234,Female,80.0,1,0,Yes,Private,Urban,83.75,,never smoked,0
5106,44873,Female,81.0,0,0,Yes,Self-employed,Urban,125.20,40.0,never smoked,0
5107,19723,Female,35.0,0,0,Yes,Self-employed,Rural,82.99,30.6,never smoked,0
5108,37544,Male,51.0,0,0,Yes,Private,Rural,166.29,25.6,formerly smoked,0


In [17]:
# Inspect the dtype pandas inferred for each column.
data.dtypes

id                     int64
gender                object
age                  float64
hypertension           int64
heart_disease          int64
ever_married          object
work_type             object
Residence_type        object
avg_glucose_level    float64
bmi                  float64
smoking_status        object
stroke                 int64
dtype: object

In [18]:
# Summarise column dtypes and non-null counts to spot missing data.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5110 entries, 0 to 5109
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 5110 non-null   int64  
 1   gender             5110 non-null   object 
 2   age                5110 non-null   float64
 3   hypertension       5110 non-null   int64  
 4   heart_disease      5110 non-null   int64  
 5   ever_married       5110 non-null   object 
 6   work_type          5110 non-null   object 
 7   Residence_type     5110 non-null   object 
 8   avg_glucose_level  5110 non-null   float64
 9   bmi                4909 non-null   float64
 10  smoking_status     5110 non-null   object 
 11  stroke             5110 non-null   int64  
dtypes: float64(3), int64(4), object(5)
memory usage: 479.2+ KB


In [19]:
# Count the missing entries in every column.
missing_values = data.isna().sum()
missing_values

id                     0
gender                 0
age                    0
hypertension           0
heart_disease          0
ever_married           0
work_type              0
Residence_type         0
avg_glucose_level      0
bmi                  201
smoking_status         0
stroke                 0
dtype: int64

In [20]:
# Remove the row identifier and the only column with missing values (bmi).
data = data.drop(columns=['id', 'bmi'])




---


---
# ***Encoding***
---

---



In [21]:
# Number of distinct values per column; used below to decide how each
# column is encoded.
df_uniques = data.nunique()
df_uniques

gender                  3
age                   104
hypertension            2
heart_disease           2
ever_married            2
work_type               5
Residence_type          2
avg_glucose_level    3979
smoking_status          4
stroke                  2
dtype: int64

In [22]:
# Columns that take exactly two distinct values.
is_binary = df_uniques == 2
binary_vals = df_uniques.index[is_binary].tolist()
binary_vals

['hypertension', 'heart_disease', 'ever_married', 'Residence_type', 'stroke']

In [23]:
# Check which of the two-valued columns are still stored as strings.
data[binary_vals].dtypes

hypertension       int64
heart_disease      int64
ever_married      object
Residence_type    object
stroke             int64
dtype: object

In [26]:
# Columns with between 3 and 6 distinct values (inclusive) are treated as
# categorical.
is_categorical = df_uniques.between(3, 6)
categorical_vals = df_uniques[is_categorical].index.tolist()
categorical_vals

['gender', 'work_type', 'smoking_status']

In [27]:
# Numeric columns are whatever is neither categorical nor binary.
# Built by filtering data.columns (rather than set arithmetic) so the result
# keeps the frame's column order and is identical on every run; iterating a
# set of strings gives an arbitrary order that can change between sessions.
excluded_cols = set(categorical_vals) | set(binary_vals)
numaric_vals = [col for col in data.columns if col not in excluded_cols]
numaric_vals

['avg_glucose_level', 'age']

In [28]:
from sklearn.preprocessing import LabelBinarizer, LabelEncoder, OrdinalEncoder
# One shared instance of each encoder used by the cells below.
lb = LabelBinarizer()
le = LabelEncoder()
lo = OrdinalEncoder()

In [30]:
# Replace each categorical column with integer codes. The shared encoder is
# refitted per column, so afterwards it only remembers the last column.
for column_name in categorical_vals:
    encoded_codes = le.fit_transform(data[column_name])
    data[column_name] = encoded_codes

In [31]:
# Map every two-valued column (this list includes the 'stroke' target) to 0/1.
for column_name in binary_vals:
    binarized = lb.fit_transform(data[column_name])
    data[column_name] = binarized

In [32]:
# Class balance of the target: how many rows fall in each stroke class.
data.stroke.value_counts()

0    4861
1     249
Name: stroke, dtype: int64

In [33]:
# Every column except the target is a feature.
# Compare with != rather than `not in 'stroke'`: the latter is a substring
# test against the string "stroke", so a column named e.g. "s" or "stroke"
# prefix/suffix fragments would be dropped from the features by accident.
features_col = [x for x in data.columns if x != 'stroke']
features_col

['gender',
 'age',
 'hypertension',
 'heart_disease',
 'ever_married',
 'work_type',
 'Residence_type',
 'avg_glucose_level',
 'smoking_status']

In [34]:
# Preview the encoded feature matrix (target column excluded).
data[features_col]

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,smoking_status
0,1,67.0,0,1,1,2,1,228.69,1
1,0,61.0,0,0,1,3,0,202.21,2
2,1,80.0,0,1,1,2,0,105.92,2
3,0,49.0,0,0,1,2,1,171.23,3
4,0,79.0,1,0,1,3,0,174.12,2
...,...,...,...,...,...,...,...,...,...
5105,0,80.0,1,0,1,2,1,83.75,2
5106,0,81.0,0,0,1,3,1,125.20,2
5107,0,35.0,0,0,1,3,0,82.99,2
5108,1,51.0,0,0,1,2,0,166.29,1




---


---
# ***Train test split***
---

---



In [35]:
from sklearn.model_selection import StratifiedShuffleSplit


In [36]:
# A single stratified shuffle split that holds out 1000 rows for testing;
# random_state is fixed so the same split is produced on every run.
strat_shuff_split = StratifiedShuffleSplit(n_splits=1, test_size=1000, random_state=42)

In [37]:
# Take the first (and only) split, stratified on the stroke target.
split_iter = strat_shuff_split.split(data[features_col], data['stroke'])
train_index, test_index = next(split_iter)

In [38]:
# Materialise the train / test frames from the split indices.
# The splitter yields *positional* row indices, so select with .iloc.
# Using .loc only happens to work while the frame has the default
# 0..n-1 RangeIndex and would pick wrong rows (or raise) otherwise.
x_train = data[features_col].iloc[train_index]
y_train = data['stroke'].iloc[train_index]
x_test = data[features_col].iloc[test_index]
y_test = data['stroke'].iloc[test_index]

In [39]:
# Class proportions in the test split, ordered by label, to confirm the
# stratification preserved the target balance.
y_test.value_counts(normalize=True).sort_index()

0    0.951
1    0.049
Name: stroke, dtype: float64

In [40]:
# Summary statistics for the encoded frame.
data.describe()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,smoking_status,stroke
count,5110.0,5110.0,5110.0,5110.0,5110.0,5110.0,5110.0,5110.0,5110.0,5110.0
mean,0.414286,43.226614,0.097456,0.054012,0.656164,2.16771,0.508023,106.147677,1.376908,0.048728
std,0.493044,22.612647,0.296607,0.226063,0.475034,1.090293,0.499985,45.28356,1.071534,0.21532
min,0.0,0.08,0.0,0.0,0.0,0.0,0.0,55.12,0.0,0.0
25%,0.0,25.0,0.0,0.0,0.0,2.0,0.0,77.245,0.0,0.0
50%,0.0,45.0,0.0,0.0,1.0,2.0,1.0,91.885,2.0,0.0
75%,1.0,61.0,0.0,0.0,1.0,3.0,1.0,114.09,2.0,0.0
max,2.0,82.0,1.0,1.0,1.0,4.0,1.0,271.74,3.0,1.0


In [41]:
# Display the fully encoded frame (features plus the stroke target).
data

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,smoking_status,stroke
0,1,67.0,0,1,1,2,1,228.69,1,1
1,0,61.0,0,0,1,3,0,202.21,2,1
2,1,80.0,0,1,1,2,0,105.92,2,1
3,0,49.0,0,0,1,2,1,171.23,3,1
4,0,79.0,1,0,1,3,0,174.12,2,1
...,...,...,...,...,...,...,...,...,...,...
5105,0,80.0,1,0,1,2,1,83.75,2,0
5106,0,81.0,0,0,1,3,1,125.20,2,0
5107,0,35.0,0,0,1,3,0,82.99,2,0
5108,1,51.0,0,0,1,2,0,166.29,1,0


In [42]:
from sklearn.impute import SimpleImputer
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, classification_report

# Impute missing feature values with the column mean. The imputer is fitted
# on the training split only, then applied unchanged to the test split, so no
# test-set statistics leak into training.
imputer = SimpleImputer(strategy='mean')
x_train_imputed = imputer.fit_transform(x_train)
x_test_imputed = imputer.transform(x_test)

# Rows are intentionally NOT dropped afterwards: imputing and then discarding
# the same rows is contradictory, and dropping rows from the test split would
# silently change the evaluation set. The models below are trained and scored
# on the imputed arrays, which were previously computed but never used.

# Model training and evaluation
models = [
    RandomForestClassifier(n_estimators=100, random_state=42),
    # Seeded like the other estimators so the printed report is reproducible.
    DecisionTreeClassifier(random_state=42),
    KNeighborsClassifier(n_neighbors=3),
    HistGradientBoostingClassifier(random_state=42)
]

for model in models:
    # Fit on the training split and predict the held-out test split.
    model.fit(x_train_imputed, y_train)
    y_pred = model.predict(x_test_imputed)

    # Accuracy is dominated by the majority class here, so report the
    # confusion matrix and per-class precision / recall as well.
    print(f"Model: {type(model).__name__}")
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred))
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))
    print("--------------------------------------------------------")

Model: RandomForestClassifier
Confusion Matrix:
[[947   4]
 [ 47   2]]

Classification Report:
              precision    recall  f1-score   support

           0       0.95      1.00      0.97       951
           1       0.33      0.04      0.07        49

    accuracy                           0.95      1000
   macro avg       0.64      0.52      0.52      1000
weighted avg       0.92      0.95      0.93      1000

--------------------------------------------------------
Model: DecisionTreeClassifier
Confusion Matrix:
[[905  46]
 [ 41   8]]

Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.95      0.95       951
           1       0.15      0.16      0.16        49

    accuracy                           0.91      1000
   macro avg       0.55      0.56      0.55      1000
weighted avg       0.92      0.91      0.91      1000

--------------------------------------------------------
Model: KNeighborsClassifier
Confusion Matr