In [1]:
# Importing necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [2]:
# Step 1: Load the dataset
data = pd.read_csv('healthcare-dataset-stroke-data.csv')

In [3]:
# Step 2: Exploratory Data Analysis
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5110 entries, 0 to 5109
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 5110 non-null   int64  
 1   gender             5110 non-null   object 
 2   age                5110 non-null   float64
 3   hypertension       5110 non-null   int64  
 4   heart_disease      5110 non-null   int64  
 5   ever_married       5110 non-null   object 
 6   work_type          5110 non-null   object 
 7   Residence_type     5110 non-null   object 
 8   avg_glucose_level  5110 non-null   float64
 9   bmi                4909 non-null   float64
 10  smoking_status     5110 non-null   object 
 11  stroke             5110 non-null   int64  
dtypes: float64(3), int64(4), object(5)
memory usage: 479.2+ KB


In [4]:
data.describe()

Unnamed: 0,id,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke
count,5110.0,5110.0,5110.0,5110.0,5110.0,4909.0,5110.0
mean,36517.829354,43.226614,0.097456,0.054012,106.147677,28.893237,0.048728
std,21161.721625,22.612647,0.296607,0.226063,45.28356,7.854067,0.21532
min,67.0,0.08,0.0,0.0,55.12,10.3,0.0
25%,17741.25,25.0,0.0,0.0,77.245,23.5,0.0
50%,36932.0,45.0,0.0,0.0,91.885,28.1,0.0
75%,54682.0,61.0,0.0,0.0,114.09,33.1,0.0
max,72940.0,82.0,1.0,1.0,271.74,97.6,1.0


In [5]:
data.isnull().sum()

id                     0
gender                 0
age                    0
hypertension           0
heart_disease          0
ever_married           0
work_type              0
Residence_type         0
avg_glucose_level      0
bmi                  201
smoking_status         0
stroke                 0
dtype: int64

In [6]:
# Step 3: Data Preprocessing
# Drop the 'id' column as it's not useful for prediction
data = data.drop(['id'], axis=1)

In [7]:
# Encoding categorical variables
encoder = LabelEncoder()
data['gender'] = encoder.fit_transform(data['gender'])
data['ever_married'] = encoder.fit_transform(data['ever_married'])
data['work_type'] = encoder.fit_transform(data['work_type'])
data['Residence_type'] = encoder.fit_transform(data['Residence_type'])
data['smoking_status'] = encoder.fit_transform(data['smoking_status'])

In [8]:
# Handle missing values in 'bmi' by filling with the mean
data['bmi'].fillna(data['bmi'].mean(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['bmi'].fillna(data['bmi'].mean(), inplace=True)


In [9]:
# Step 4: Feature Scaling
scaler = StandardScaler()
numerical_features = ['age', 'avg_glucose_level', 'bmi']
data[numerical_features] = scaler.fit_transform(data[numerical_features])

In [10]:
data.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,1,1.051434,0,1,1,2,1,2.706375,1.001234,1,1
1,0,0.78607,0,0,1,3,0,2.121559,4.615554e-16,2,1
2,1,1.62639,0,1,1,2,0,-0.005028,0.4685773,2,1
3,0,0.255342,0,0,1,2,1,1.437358,0.7154182,3,1
4,0,1.582163,1,0,1,3,0,1.501184,-0.6357112,2,1


In [11]:
data.dtypes

gender                 int32
age                  float64
hypertension           int64
heart_disease          int64
ever_married           int32
work_type              int32
Residence_type         int32
avg_glucose_level    float64
bmi                  float64
smoking_status         int32
stroke                 int64
dtype: object

In [12]:
data['stroke'].value_counts()

stroke
0    4861
1     249
Name: count, dtype: int64

In [13]:
# Step 5: Splitting data into training and testing sets
X = data.drop(['stroke'], axis=1)  # Features
y = data['stroke']  # Target variable

In [14]:
from imblearn.over_sampling import SMOTE

sm = SMOTE(random_state=42)

In [15]:
X_resampled, y_resampled = sm.fit_resample(X, y)

data_resampled = pd.concat([X_resampled, y_resampled], axis=1)

In [16]:
from collections import Counter

print("Original dataset shape {}".format(Counter(y)))
print("Resampled dataset shape {}".format(Counter(y_resampled)))

Original dataset shape Counter({0: 4861, 1: 249})
Resampled dataset shape Counter({1: 4861, 0: 4861})


In [17]:
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42, stratify = y_resampled)

In [23]:
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression(random_state=42)
clf.fit(X_train, y_train)

In [25]:
# Step 7: Model Evaluation
y_pred = clf.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.7856041131105398
Confusion Matrix:
 [[736 237]
 [180 792]]
Classification Report:
               precision    recall  f1-score   support

           0       0.80      0.76      0.78       973
           1       0.77      0.81      0.79       972

    accuracy                           0.79      1945
   macro avg       0.79      0.79      0.79      1945
weighted avg       0.79      0.79      0.79      1945



In [26]:
from sklearn.tree import DecisionTreeClassifier
dtclf = DecisionTreeClassifier(random_state=42)
dtclf.fit(X_train, y_train)

In [27]:
y_pred = dtclf.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.9059125964010283
Confusion Matrix:
 [[868 105]
 [ 78 894]]
Classification Report:
               precision    recall  f1-score   support

           0       0.92      0.89      0.90       973
           1       0.89      0.92      0.91       972

    accuracy                           0.91      1945
   macro avg       0.91      0.91      0.91      1945
weighted avg       0.91      0.91      0.91      1945



In [18]:
rfclf = RandomForestClassifier(random_state=42)
rfclf.fit(X_train, y_train)

In [19]:
y_pred = rfclf.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.9398457583547558
Confusion Matrix:
 [[886  87]
 [ 30 942]]
Classification Report:
               precision    recall  f1-score   support

           0       0.97      0.91      0.94       973
           1       0.92      0.97      0.94       972

    accuracy                           0.94      1945
   macro avg       0.94      0.94      0.94      1945
weighted avg       0.94      0.94      0.94      1945



In [20]:
def predict_stroke(input_data):
    input_data = pd.DataFrame([input_data], columns=X.columns)
    input_data[numerical_features] = scaler.transform(input_data[numerical_features])

    for col in ['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']:
        known_classes = encoder.classes_ if hasattr(encoder, 'classes_') else []

        if input_data[col].iloc[0] not in known_classes:
            if hasattr(encoder, 'classes_'):
                encoder.classes_ = np.append(encoder.classes_, input_data[col].iloc[0])

    input_data['gender'] = encoder.transform(input_data['gender'].astype(str))
    input_data['ever_married'] = encoder.transform(input_data['ever_married'].astype(str))
    input_data['work_type'] = encoder.transform(input_data['work_type'].astype(str))
    input_data['Residence_type'] = encoder.transform(input_data['Residence_type'].astype(str))
    input_data['smoking_status'] = encoder.transform(input_data['smoking_status'].astype(str))

    # Prediction
    prediction = rfclf.predict(input_data)
    return "Stroke" if prediction[0] == 1 else "No Stroke"

In [21]:
example_data = {
    'gender': 'Male',
    'age': 67,
    'hypertension': 0,
    'heart_disease': 1,
    'ever_married': 'Yes',
    'work_type': 'Private',
    'Residence_type': 'Urban',
    'avg_glucose_level': 120.0,
    'bmi': 25.0,
    'smoking_status': 'formerly smoked'
}

print("Prediction:", predict_stroke(example_data))

Prediction: No Stroke


In [22]:
import pickle

with open('stroke_prediction_model.pkl', 'wb') as file:
    pickle.dump(rfclf, file)