In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import numpy as pd
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

In [3]:
df = pd.read_csv("/content/drive/MyDrive/ML/clg_project_1/healthcare-dataset-stroke-data.csv")
df

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
...,...,...,...,...,...,...,...,...,...,...,...,...
5105,18234,Female,80.0,1,0,Yes,Private,Urban,83.75,,never smoked,0
5106,44873,Female,81.0,0,0,Yes,Self-employed,Urban,125.20,40.0,never smoked,0
5107,19723,Female,35.0,0,0,Yes,Self-employed,Rural,82.99,30.6,never smoked,0
5108,37544,Male,51.0,0,0,Yes,Private,Rural,166.29,25.6,formerly smoked,0


#### Attribute Information
1. id: unique identifier
2. gender: "Male", "Female" or "Other"
3. age: age of the patient
4. hypertension: 0 if the patient doesn't have hypertension, 1 if the patient has hypertension
5. heart_disease: 0 if the patient doesn't have any heart diseases, 1 if the patient has a heart disease
6. ever_married: "No" or "Yes"
7. work_type: "children", "Govt_jov", "Never_worked", "Private" or "Self-employed"
8. Residence_type: "Rural" or "Urban"
9. avg_glucose_level: average glucose level in blood
10. bmi: body mass index
11. smoking_status: "formerly smoked", "never smoked", "smokes" or "Unknown"*
12. stroke: 1 if the patient had a stroke or 0 if not
*Note: "Unknown" in smoking_status means that the information is unavailable for this patient

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5110 entries, 0 to 5109
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 5110 non-null   int64  
 1   gender             5110 non-null   object 
 2   age                5110 non-null   float64
 3   hypertension       5110 non-null   int64  
 4   heart_disease      5110 non-null   int64  
 5   ever_married       5110 non-null   object 
 6   work_type          5110 non-null   object 
 7   Residence_type     5110 non-null   object 
 8   avg_glucose_level  5110 non-null   float64
 9   bmi                4909 non-null   float64
 10  smoking_status     5110 non-null   object 
 11  stroke             5110 non-null   int64  
dtypes: float64(3), int64(4), object(5)
memory usage: 479.2+ KB


In [5]:
df['stroke'].value_counts()

Unnamed: 0_level_0,count
stroke,Unnamed: 1_level_1
0,4861
1,249


In [6]:
df.isnull().sum()

Unnamed: 0,0
id,0
gender,0
age,0
hypertension,0
heart_disease,0
ever_married,0
work_type,0
Residence_type,0
avg_glucose_level,0
bmi,201


In [7]:
X = df.drop(columns=['id','stroke'])
y = df['stroke']

In [8]:
# custom transformer for BMI imputation based on gender
from sklearn.base import BaseEstimator, TransformerMixin

class GenderBasedImputer(BaseEstimator, TransformerMixin):
  def fit(self,X,y=None):
    # Access 'gender' and 'bmi' from the original DataFrame X
    self.avg_bmi_male = X.loc[X['gender'] == 'Male', 'bmi'].mean()
    self.avg_bmi_female = X.loc[X['gender'] == 'Female', 'bmi'].mean()
    self.avg_bmi_overall = X['bmi'].mean()
    return self

  def transform(self,X):
    # Ensure X is a DataFrame to access columns by name
    X = pd.DataFrame(X, columns=['bmi'])

    # Now you can apply the imputation logic
    X['bmi'] = X.apply(lambda row: self._fill_bmi(row), axis=1)

    return X.values  # Return as numpy array to be compatible with ColumnTransformer

  def _fill_bmi(self,row):
    # Add 'gender' column to row if it's not there
    if 'gender' not in row:
        row['gender'] = X.loc[X.index[row.name], 'gender'] if isinstance(row.name, int) else None

    if pd.isnull(row['bmi']):
      if row['gender'] == 'Male':
        return self.avg_bmi_male
      elif row['gender'] == 'Female':
        return self.avg_bmi_female
      else:
        return self.avg_bmi_overall
    else:
      return row['bmi']

In [9]:
numerical_cols = ['age','avg_glucose_level']
bmi_cols = ['bmi']
binary_cols = ['hypertension','heart_disease']
categorical_cols = ['gender','ever_married','work_type','Residence_type','smoking_status']

In [10]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer

In [11]:
# preprocessing for bmi feature
bmi_transformer = Pipeline(steps=[
    ('bmi_imputer', GenderBasedImputer()),
    ('scaler',StandardScaler())
])

In [12]:
# Preprocessing for numerical features
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

In [13]:
# Preprocessing for binary features (No scaling needed)
binary_transformer = 'passthrough'

In [14]:
# Preprocessing for categorical features
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),  # Fill missing values with mode
    ('onehot', OneHotEncoder(handle_unknown='ignore'))  # One-hot encoding
])

In [15]:
# combine preprocessors
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('bmi', bmi_transformer, bmi_cols + ['gender']),
        ('binary', binary_transformer, binary_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

In [19]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,stratify=y, test_size=0.2, random_state=42)

In [37]:
# importing all 8 classification algoritham
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score
from sklearn.model_selection import RandomizedSearchCV

In [21]:
pip install imbalanced-learn



In [22]:
from imblearn.over_sampling import SMOTE

In [29]:
# Apply preprocessing to the training data
X_train_processed = preprocessor.fit_transform(X_train)

# Apply preprocessing to the test data
X_test_processed = preprocessor.transform(X_test)

smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_processed, y_train)

# Check the class distribution after SMOTE
print("Class distribution before SMOTE:", y_train.value_counts())
print("Class distribution after SMOTE:", y_train_resampled.value_counts())


Class distribution before SMOTE: stroke
0    3889
1     199
Name: count, dtype: int64
Class distribution after SMOTE: stroke
0    3889
1    3889
Name: count, dtype: int64


### Dictionary Basics
1. Dictionaries in Python are mutable: This means you can modify the content of a dictionary after it is created.
2. You can add, remove, or update key-value pairs in the dictionary.
3. Dictionaries are unordered collections of key-value pairs (prior to Python 3.7), meaning there’s no guarantee of order, but from Python 3.7 onwards, they preserve insertion order.

In [30]:
models = {
    'Logistic Regression': LogisticRegression(),
    'Support Vector Machine': SVC(),
    'K-Nearest Neighbors': KNeighborsClassifier(),
    'Decision Tree': DecisionTreeClassifier(),
    'Random Forest': RandomForestClassifier(),
    'Naive Bayes': GaussianNB(),
    'Gradient Boosting': GradientBoostingClassifier(),
    'AdaBoost': AdaBoostClassifier()
}

In [34]:
# evaluate models
for i in range(len(list(models))):
  name = list(models.keys())[i]
  model = list(models.values())[i]

  pipeline = Pipeline(steps=[
      ('model', model)
  ])

  pipeline.fit(X_train_resampled, y_train_resampled)

  y_train_pred = pipeline.predict(X_train_resampled)
  y_test_pred = pipeline.predict(X_test_processed)

  print(f"{name} - model performance for training set")
  print("- Accuracy: {:.4f}".format(accuracy_score(y_train_resampled, y_train_pred)))
  print("- F1 score: {:.4f}".format(f1_score(y_train_resampled, y_train_pred, average='weighted')))
  print("- Precision: {:.4f}".format(precision_score(y_train_resampled, y_train_pred, average='weighted')))
  print("- Recall: {:.4f}".format(recall_score(y_train_resampled, y_train_pred, average='weighted')))
  print("- ROC AUC Score: {:.4f}".format(roc_auc_score(y_train_resampled, y_train_pred)))

  print('-' * 35)

  print(f"{name} - Model performance for Test set")
  print("- Accuracy: {:.4f}".format(accuracy_score(y_test, y_test_pred)))
  print("- F1 score: {:.4f}".format(f1_score(y_test, y_test_pred, average='weighted')))
  print("- Precision: {:.4f}".format(precision_score(y_test, y_test_pred, average='weighted')))
  print("- Recall: {:.4f}".format(recall_score(y_test, y_test_pred, average='weighted')))
  print("- ROC AUC Score: {:.4f}".format(roc_auc_score(y_test, y_test_pred)))
  print('=' * 40, '\n')


Logistic Regression - model performance for training set
- Accuracy: 0.7882
- F1 score: 0.7878
- Precision: 0.7908
- Recall: 0.7882
- ROC AUC Score: 0.7882
-----------------------------------
Logistic Regression - Model performance for Test set
- Accuracy: 0.7534
- F1 score: 0.8229
- Precision: 0.9452
- Recall: 0.7534
- ROC AUC Score: 0.7755

Support Vector Machine - model performance for training set
- Accuracy: 0.9031
- F1 score: 0.9027
- Precision: 0.9093
- Recall: 0.9031
- ROC AUC Score: 0.9031
-----------------------------------
Support Vector Machine - Model performance for Test set
- Accuracy: 0.8072
- F1 score: 0.8563
- Precision: 0.9259
- Recall: 0.8072
- ROC AUC Score: 0.6426

K-Nearest Neighbors - model performance for training set
- Accuracy: 0.9447
- F1 score: 0.9446
- Precision: 0.9498
- Recall: 0.9447
- ROC AUC Score: 0.9447
-----------------------------------
K-Nearest Neighbors - Model performance for Test set
- Accuracy: 0.8209
- F1 score: 0.8620
- Precision: 0.9130
-

In [39]:
random_forest_params = {
    "model__n_estimators": [100, 200, 300],
    "model__max_depth": [None, 10, 20],
    "model__min_samples_split": [2, 10],
    "model__bootstrap": [True, False]
}
knn_params = {
    "model__n_neighbors": [5, 10, 15],
    "model__weights": ['uniform', 'distance'],
    "model__algorithm": ['auto', 'ball_tree'],
    "model__leaf_size": [30, 40]
}
svm_params = {
    "model__C": [0.1, 1, 10],
    "model__kernel": ['linear', 'rbf'],
    "model__gamma": ['scale', 'auto'],
    "model__degree": [3, 4]
}
gradient_params = {
    "model__loss": ['log_loss', 'deviance'],
    "model__criterion": ['friedman_mse', 'squared_error'],
    "model__min_samples_split": [2, 15],
    "model__n_estimators": [100, 200]
}

In [40]:
randomcv_models = [
    ("gb", GradientBoostingClassifier(), gradient_params),
    ("svm", SVC(), svm_params),
    ("rf", RandomForestClassifier(), random_forest_params),
    ("knn", KNeighborsClassifier(), knn_params)
]

In [41]:
# Hyperparameter tuning with Pipelines
model_param = {}
for name, model, params in randomcv_models:
    pipeline = Pipeline([
        ('model', model)
    ])

    random = RandomizedSearchCV(
        estimator=pipeline,
        param_distributions=params,
        n_iter=100,
        cv=3,
        verbose=2,
        n_jobs=-1
    )
    random.fit(X_train_resampled, y_train_resampled)
    model_param[name] = random.best_params_

# Print best parameters
for model_name in model_param:
    print(f"---------------- Best Params for {model_name} -------------------")
    print(model_param[model_name])

Fitting 3 folds for each of 16 candidates, totalling 48 fits
Fitting 3 folds for each of 24 candidates, totalling 72 fits
Fitting 3 folds for each of 36 candidates, totalling 108 fits
Fitting 3 folds for each of 24 candidates, totalling 72 fits
---------------- Best Params for gb -------------------
{'model__n_estimators': 200, 'model__min_samples_split': 15, 'model__loss': 'log_loss', 'model__criterion': 'friedman_mse'}
---------------- Best Params for svm -------------------
{'model__kernel': 'rbf', 'model__gamma': 'scale', 'model__degree': 3, 'model__C': 10}
---------------- Best Params for rf -------------------
{'model__n_estimators': 100, 'model__min_samples_split': 2, 'model__max_depth': None, 'model__bootstrap': False}
---------------- Best Params for knn -------------------
{'model__weights': 'distance', 'model__n_neighbors': 5, 'model__leaf_size': 30, 'model__algorithm': 'auto'}


In [42]:
models = {"Gradient_boosting": GradientBoostingClassifier(n_estimators=200, min_samples_split=15, loss='log_loss', criterion='friedman_mse'),
          "SVM": SVC(kernel='rbf', gamma='scale', degree=3, C=10),
          "RandomForest": RandomForestClassifier(n_estimators=100, min_samples_split=2, max_depth=None, bootstrap=False),
          "KNN": KNeighborsClassifier(weights='distance', n_neighbors=5, leaf_size=30, algorithm='auto')
}

In [43]:
# evaluate models
for i in range(len(list(models))):
  name = list(models.keys())[i]
  model = list(models.values())[i]

  pipeline = Pipeline(steps=[
      ('model', model)
  ])

  pipeline.fit(X_train_resampled, y_train_resampled)

  y_train_pred = pipeline.predict(X_train_resampled)
  y_test_pred = pipeline.predict(X_test_processed)

  print(f"{name} - model performance for training set")
  print("- Accuracy: {:.4f}".format(accuracy_score(y_train_resampled, y_train_pred)))
  print("- F1 score: {:.4f}".format(f1_score(y_train_resampled, y_train_pred, average='weighted')))
  print("- Precision: {:.4f}".format(precision_score(y_train_resampled, y_train_pred, average='weighted')))
  print("- Recall: {:.4f}".format(recall_score(y_train_resampled, y_train_pred, average='weighted')))
  print("- ROC AUC Score: {:.4f}".format(roc_auc_score(y_train_resampled, y_train_pred)))

  print('-' * 35)

  print(f"{name} - Model performance for Test set")
  print("- Accuracy: {:.4f}".format(accuracy_score(y_test, y_test_pred)))
  print("- F1 score: {:.4f}".format(f1_score(y_test, y_test_pred, average='weighted')))
  print("- Precision: {:.4f}".format(precision_score(y_test, y_test_pred, average='weighted')))
  print("- Recall: {:.4f}".format(recall_score(y_test, y_test_pred, average='weighted')))
  print("- ROC AUC Score: {:.4f}".format(roc_auc_score(y_test, y_test_pred)))
  print('=' * 40, '\n')


Gradient_boosting - model performance for training set
- Accuracy: 0.9508
- F1 score: 0.9508
- Precision: 0.9508
- Recall: 0.9508
- ROC AUC Score: 0.9508
-----------------------------------
Gradient_boosting - Model performance for Test set
- Accuracy: 0.9119
- F1 score: 0.9176
- Precision: 0.9239
- Recall: 0.9119
- ROC AUC Score: 0.6027

SVM - model performance for training set
- Accuracy: 0.9523
- F1 score: 0.9522
- Precision: 0.9543
- Recall: 0.9523
- ROC AUC Score: 0.9523
-----------------------------------
SVM - Model performance for Test set
- Accuracy: 0.8630
- F1 score: 0.8887
- Precision: 0.9203
- Recall: 0.8630
- ROC AUC Score: 0.5960

RandomForest - model performance for training set
- Accuracy: 1.0000
- F1 score: 1.0000
- Precision: 1.0000
- Recall: 1.0000
- ROC AUC Score: 1.0000
-----------------------------------
RandomForest - Model performance for Test set
- Accuracy: 0.9247
- F1 score: 0.9183
- Precision: 0.9123
- Recall: 0.9247
- ROC AUC Score: 0.5241

KNN - model per

## Final model is Gradient_boosting
Gradient_boosting - model performance for training set
- Accuracy: 0.9508
- F1 score: 0.9508
- Precision: 0.9508
- Recall: 0.9508
- ROC AUC Score: 0.9508
-----------------------------------
Gradient_boosting - Model performance for Test set
- Accuracy: 0.9119
- F1 score: 0.9176
- Precision: 0.9239
- Recall: 0.9119
- ROC AUC Score: 0.6027