In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
from matplotlib.pylab import rcParams
rcParams['figure.figsize'] = 10,8
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

import math

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder

from sklearn.model_selection import train_test_split,KFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
import xgboost as x
from sklearn.tree import DecisionTreeClassifier 
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

In [2]:
df = pd.read_csv("DATA/healthcare-dataset-stroke-data.csv")

---
# Stroke Prediction Dataset

This dataset is used to predict the likelihood of a patient experiencing a stroke based on various health and demographic factors. Each row contains relevant information about a patient, including their health history, lifestyle, and other attributes.

## Attribute Information

1. **id**: A unique identifier for each patient.
2. **gender**: The gender of the patient. Possible values are:
   - "Male"
   - "Female"
   - "Other"
3. **age**: The age of the patient (in years).
4. **hypertension**: Indicates whether the patient has hypertension (high blood pressure):
   - 0: The patient does not have hypertension.
   - 1: The patient has hypertension.
5. **heart_disease**: Indicates whether the patient has a heart disease:
   - 0: The patient does not have heart disease.
   - 1: The patient has heart disease.
6. **ever_married**: Indicates if the patient has ever been married:
   - "No": The patient has never been married.
   - "Yes": The patient has been married.
7. **work_type**: The type of work the patient engages in. Possible values are:
   - "children"
   - "Govt_job"
   - "Never_worked"
   - "Private"
   - "Self-employed"
8. **Residence_type**: The type of residence the patient lives in:
   - "Rural"
   - "Urban"
9. **avg_glucose_level**: The average glucose level in the patient’s blood.
10. **bmi**: The body mass index (BMI) of the patient.
11. **smoking_status**: The smoking habits of the patient. Possible values are:
    - "formerly smoked"
    - "never smoked"
    - "smokes"
    - "Unknown" (Indicates that smoking status information is unavailable for this patient)
12. **stroke**: Indicates if the patient has experienced a stroke:
    - 0: The patient has not had a stroke.
    - 1: The patient has had a stroke.

---

In [3]:
df

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
...,...,...,...,...,...,...,...,...,...,...,...,...
5105,18234,Female,80.0,1,0,Yes,Private,Urban,83.75,,never smoked,0
5106,44873,Female,81.0,0,0,Yes,Self-employed,Urban,125.20,40.0,never smoked,0
5107,19723,Female,35.0,0,0,Yes,Self-employed,Rural,82.99,30.6,never smoked,0
5108,37544,Male,51.0,0,0,Yes,Private,Rural,166.29,25.6,formerly smoked,0


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5110 entries, 0 to 5109
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 5110 non-null   int64  
 1   gender             5110 non-null   object 
 2   age                5110 non-null   float64
 3   hypertension       5110 non-null   int64  
 4   heart_disease      5110 non-null   int64  
 5   ever_married       5110 non-null   object 
 6   work_type          5110 non-null   object 
 7   Residence_type     5110 non-null   object 
 8   avg_glucose_level  5110 non-null   float64
 9   bmi                4909 non-null   float64
 10  smoking_status     5110 non-null   object 
 11  stroke             5110 non-null   int64  
dtypes: float64(3), int64(4), object(5)
memory usage: 479.2+ KB


In [6]:
df.describe()

Unnamed: 0,id,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke
count,5110.0,5110.0,5110.0,5110.0,5110.0,4909.0,5110.0
mean,36517.829354,43.226614,0.097456,0.054012,106.147677,28.893237,0.048728
std,21161.721625,22.612647,0.296607,0.226063,45.28356,7.854067,0.21532
min,67.0,0.08,0.0,0.0,55.12,10.3,0.0
25%,17741.25,25.0,0.0,0.0,77.245,23.5,0.0
50%,36932.0,45.0,0.0,0.0,91.885,28.1,0.0
75%,54682.0,61.0,0.0,0.0,114.09,33.1,0.0
max,72940.0,82.0,1.0,1.0,271.74,97.6,1.0


In [7]:
df['stroke'].value_counts()

stroke
0    4861
1     249
Name: count, dtype: int64

## Label Encoding

In [8]:
"Suraj".lower()

'suraj'

In [9]:
df.sample(2)

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
3677,29816,Male,64.0,1,0,Yes,Private,Rural,91.85,31.8,formerly smoked,0
4576,66786,Female,53.0,0,0,Yes,Private,Rural,94.14,27.7,smokes,0


In [10]:
le_gender = LabelEncoder()
df['gender'] = le_gender.fit_transform(df['gender'])
gender_mapping = dict(zip(le_gender.classes_, le_gender.transform(le_gender.classes_)))
print("Gender Mapping:", gender_mapping)

le_ever_married = LabelEncoder()
df['ever_married'] = le_ever_married.fit_transform(df['ever_married'])
ever_married_mapping = dict(zip(le_ever_married.classes_, le_ever_married.transform(le_ever_married.classes_)))
print("Ever Married Mapping:", ever_married_mapping)

le_work_type = LabelEncoder()
df['work_type'] = le_work_type.fit_transform(df['work_type'])
work_type_mapping = dict(zip(le_work_type.classes_, le_work_type.transform(le_work_type.classes_)))
print("Work Type Mapping:", work_type_mapping)

le_residence_type = LabelEncoder()
df['Residence_type'] = le_residence_type.fit_transform(df['Residence_type'])
residence_type_mapping = dict(zip(le_residence_type.classes_, le_residence_type.transform(le_residence_type.classes_)))
print("Residence Type Mapping:", residence_type_mapping)

le_smoking_status = LabelEncoder()
df['smoking_status'] = le_smoking_status.fit_transform(df['smoking_status'])
smoking_status_mapping = dict(zip(le_smoking_status.classes_, le_smoking_status.transform(le_smoking_status.classes_)))
print("Smoking Status Mapping:", smoking_status_mapping)

Gender Mapping: {'Female': 0, 'Male': 1, 'Other': 2}
Ever Married Mapping: {'No': 0, 'Yes': 1}
Work Type Mapping: {'Govt_job': 0, 'Never_worked': 1, 'Private': 2, 'Self-employed': 3, 'children': 4}
Residence Type Mapping: {'Rural': 0, 'Urban': 1}
Smoking Status Mapping: {'Unknown': 0, 'formerly smoked': 1, 'never smoked': 2, 'smokes': 3}


## Handling missing values

In [11]:
df.isnull().sum()

id                     0
gender                 0
age                    0
hypertension           0
heart_disease          0
ever_married           0
work_type              0
Residence_type         0
avg_glucose_level      0
bmi                  201
smoking_status         0
stroke                 0
dtype: int64

In [12]:
bmi_mean = df['bmi'].mean()
df['bmi'].fillna(bmi_mean, inplace=True)

In [13]:
df.isnull().sum()

id                   0
gender               0
age                  0
hypertension         0
heart_disease        0
ever_married         0
work_type            0
Residence_type       0
avg_glucose_level    0
bmi                  0
smoking_status       0
stroke               0
dtype: int64

In [14]:
df.corr()['stroke']

id                   0.006388
gender               0.008929
age                  0.245257
hypertension         0.127904
heart_disease        0.134914
ever_married         0.108340
work_type           -0.032316
Residence_type       0.015458
avg_glucose_level    0.131945
bmi                  0.038947
smoking_status       0.028123
stroke               1.000000
Name: stroke, dtype: float64

In [15]:
X = df.drop(columns=['id', 'stroke']) 
y = df['stroke']  # Target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

## SMOTE 

In [16]:
from imblearn.over_sampling import SMOTE

# Apply SMOTE to the training data
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

print(f"Before SMOTE: {y_train.value_counts()}")
print(f"After SMOTE: {y_train_smote.value_counts()}")

Before SMOTE: stroke
0    3417
1     160
Name: count, dtype: int64
After SMOTE: stroke
0    3417
1    3417
Name: count, dtype: int64


## Scaling

In [17]:
scaler = StandardScaler()

# Fit the scaler on training data and transform both training and test data
X_train_scaled = scaler.fit_transform(X_train_smote)
X_test_scaled = scaler.transform(X_test)

## Model Training

In [18]:
def demo_models(Model, X_TRAIN, X_TEST, Y_TRAIN, Y_TEST):
    model = Model
    model.fit(X_TRAIN, Y_TRAIN)

    train_pred = model.predict(X_TRAIN)
    test_pred = model.predict(X_TEST)
    
    train_accuracy = accuracy_score(Y_TRAIN, train_pred)
    train_precision = precision_score(Y_TRAIN, train_pred)
    train_recall = recall_score(Y_TRAIN, train_pred)
    train_f1 = f1_score(Y_TRAIN, train_pred)
    
    test_accuracy = accuracy_score(Y_TEST, test_pred)
    test_precision = precision_score(Y_TEST, test_pred)
    test_recall = recall_score(Y_TEST, test_pred)
    test_f1 = f1_score(Y_TEST, test_pred)

    # Printing the summary
    print(f"• The Summary for 👉 {Model.__class__.__name__} is:")
    print(f"→ For Training data: ")
    print(f"    ‣ Accuracy  = {'%.4f' % train_accuracy}")
    print(f"    ‣ Precision = {'%.4f' % train_precision}")
    print(f"    ‣ Recall    = {'%.4f' % train_recall}")
    print(f"    ‣ F1 Score  = {'%.4f' % train_f1}")
    
    print(f"→ For Test data: ")
    print(f"    ‣ Accuracy  = {'%.4f' % test_accuracy}")
    print(f"    ‣ Precision = {'%.4f' % test_precision}")
    print(f"    ‣ Recall    = {'%.4f' % test_recall}")
    print(f"    ‣ F1 Score  = {'%.4f' % test_f1}")
    return model

In [26]:
# Logistic Regression
lr_model = LogisticRegression(random_state=42)
lr_model = demo_models(lr_model, X_train_scaled, X_test_scaled, y_train_smote, y_test)

• The Summary for 👉 LogisticRegression is:
→ For Training data: 
    ‣ Accuracy  = 0.9550
    ‣ Precision = 0.4000
    ‣ Recall    = 0.0125
    ‣ F1 Score  = 0.0242
→ For Test data: 
    ‣ Accuracy  = 0.9419
    ‣ Precision = 0.5000
    ‣ Recall    = 0.0112
    ‣ F1 Score  = 0.0220


In [27]:
rf_model = RandomForestClassifier(random_state=42)
best_model = demo_models(rf_model,  X_train_scaled, X_test_scaled, y_train_smote, y_test)

• The Summary for 👉 RandomForestClassifier is:
→ For Training data: 
    ‣ Accuracy  = 1.0000
    ‣ Precision = 1.0000
    ‣ Recall    = 1.0000
    ‣ F1 Score  = 1.0000
→ For Test data: 
    ‣ Accuracy  = 0.9406
    ‣ Precision = 0.0000
    ‣ Recall    = 0.0000
    ‣ F1 Score  = 0.0000


In [28]:
new_lr_model = LogisticRegression(C= 0.01, max_iter= 100, solver= 'liblinear')
new_lr_model = demo_models(rf_model,  X_train_scaled, X_test_scaled, y_train_smote, y_test)

• The Summary for 👉 RandomForestClassifier is:
→ For Training data: 
    ‣ Accuracy  = 1.0000
    ‣ Precision = 1.0000
    ‣ Recall    = 1.0000
    ‣ F1 Score  = 1.0000
→ For Test data: 
    ‣ Accuracy  = 0.9406
    ‣ Precision = 0.0000
    ‣ Recall    = 0.0000
    ‣ F1 Score  = 0.0000


## optimizing

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.datasets import make_classification
from sklearn.metrics import classification_report

In [None]:
# Logistic Regression hyperparameter grid
param_grid_lr = {
    'C': [0.01, 0.1, 1, 10, 100],  # Regularization strength
    'solver': ['newton-cg', 'lbfgs', 'liblinear'],  # Optimization solvers
    'max_iter': [100, 200, 300]  # Maximum iterations
}

# Random Forest hyperparameter grid
param_grid_rf = {
    'n_estimators': [50, 100, 200],  # Number of trees in the forest
    'max_depth': [None, 10, 20, 30],  # Maximum depth of the tree
    'min_samples_split': [2, 5, 10],  # Minimum number of samples to split a node
    'min_samples_leaf': [1, 2, 4],  # Minimum number of samples at each leaf node
    'bootstrap': [True, False]  # Whether bootstrap samples are used
}


# Logistic Regression with GridSearchCV
lr = LogisticRegression(random_state=42)
grid_search_lr = GridSearchCV(estimator=lr, param_grid=param_grid_lr, cv=5, n_jobs=-1, verbose=2)
grid_search_lr.fit(X_train_scaled, y_train_smote)

# Random Forest with GridSearchCV
rf = RandomForestClassifier(random_state=42)
grid_search_rf = GridSearchCV(estimator=rf, param_grid=param_grid_rf, cv=5, n_jobs=-1, verbose=2)
grid_search_rf.fit(X_train_scaled, y_train_smote)

# Best parameters and score for Logistic Regression
print("Best parameters for Logistic Regression: ", grid_search_lr.best_params_)
print("Best score for Logistic Regression: ", grid_search_lr.best_score_)

# Best parameters and score for Random Forest
print("Best parameters for Random Forest: ", grid_search_rf.best_params_)
print("Best score for Random Forest: ", grid_search_rf.best_score_)

In [None]:
# Evaluate best Logistic Regression model on test data
best_lr = grid_search_lr.best_estimator_
y_pred_lr = best_lr.predict(X_test)
print("Classification Report for Logistic Regression:")
print(classification_report(y_test, y_pred_lr))

# Evaluate best Random Forest model on test data
best_rf = grid_search_rf.best_estimator_
y_pred_rf = best_rf.predict(X_test)
print("Classification Report for Random Forest:")
print(classification_report(y_test, y_pred_rf))

In [22]:
import joblib

joblib.dump(best_model, 'MODELS/stroke_prediction_model.pkl')

['MODELS/stroke_prediction_model.pkl']

In [23]:
joblib.dump(scaler, 'MODELS/scaler.pkl')

['MODELS/scaler.pkl']

In [24]:
joblib.dump(lr_model, "MODELS/lr_model.pkl")

['MODELS/lr_model.pkl']

In [25]:
joblib.dump(new_lr_model, "MODELS/new_lr_model.pkl")

['MODELS/new_lr_model.pkl']