In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
df = pd.read_csv("/content/drive/MyDrive/healthcare-dataset-stroke-data.csv")


In [None]:
# Check for missing values
missing = df.isnull().sum()

# Check for duplicates
duplicates = df.duplicated().sum()

# Remove duplicates
df = df.drop_duplicates()

# Convert categorical columns to category dtype
categorical_cols = ['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']
for col in categorical_cols:
    df[col] = df[col].astype('category')


In [None]:
missing

id                     0
gender                 0
age                    0
hypertension           0
heart_disease          0
ever_married           0
work_type              0
Residence_type         0
avg_glucose_level      0
bmi                  201
smoking_status         0
stroke                 0
dtype: int64

In [None]:
# Using dataframe df: check percentage of missing values

missing_values = df.isnull().sum().div(df.shape[0])
missing_values_percent = missing_values * 100
print(missing_values_percent)


id                   0.000000
gender               0.000000
age                  0.000000
hypertension         0.000000
heart_disease        0.000000
ever_married         0.000000
work_type            0.000000
Residence_type       0.000000
avg_glucose_level    0.000000
bmi                  3.933464
smoking_status       0.000000
stroke               0.000000
dtype: float64


In [None]:
# Drop rows with missing values since a little percentage is missing

df = df.dropna()


In [None]:
# Check unique values along with percentage that each unique value takes for every categorical column

for col in df.select_dtypes(include='category'):
    unique_values = df[col].value_counts()
    total_values = len(df[col])
    percentage = (unique_values / total_values) * 100
    print(f"Unique values for column '{col}':")
    for value, percent in zip(unique_values.index, percentage):
        print(f"- {value}: {percent:.2f}%")


Unique values for column 'gender':
- Female: 59.01%
- Male: 40.97%
- Other: 0.02%
Unique values for column 'ever_married':
- Yes: 65.27%
- No: 34.73%
Unique values for column 'work_type':
- Private: 57.26%
- Self-employed: 15.79%
- children: 13.67%
- Govt_job: 12.83%
- Never_worked: 0.45%
Unique values for column 'Residence_type':
- Urban: 50.72%
- Rural: 49.28%
Unique values for column 'smoking_status':
- never smoked: 37.73%
- Unknown: 30.21%
- formerly smoked: 17.05%
- smokes: 15.01%


In [None]:
# Drop rows that hold the value "Other" in the "gender" column

df = df[df['gender'] != 'Other']


In [None]:
# check non categorical columns and display description

df.select_dtypes(exclude='category').describe()


Unnamed: 0,id,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke
count,4908.0,4908.0,4908.0,4908.0,4908.0,4908.0,4908.0
mean,37060.423594,42.86881,0.091891,0.049511,105.297402,28.89456,0.042584
std,20995.468407,22.556128,0.288901,0.216954,44.42555,7.85432,0.201937
min,77.0,0.08,0.0,0.0,55.12,10.3,0.0
25%,18602.5,25.0,0.0,0.0,77.0675,23.5,0.0
50%,37580.5,44.0,0.0,0.0,91.68,28.1,0.0
75%,55181.75,60.0,0.0,0.0,113.495,33.1,0.0
max,72940.0,82.0,1.0,1.0,271.74,97.6,1.0


In [None]:
# convert the values in the age column to int type

df['age'] = df['age'].astype(int)


Strokes in children from 29 days to 18 years old are often associated with existing conditions, most commonly congenital heart disease and sickle cell disease. Other risk factors include infectious diseases, trauma to the head or neck, vascular problems and blood disorders.

https://www.stroke.org.uk/childhood-stroke/about-childhood-stroke#:~:text=Strokes%20in%20children%20from%2029,vascular%20problems%20and%20blood%20disorders.

In [None]:
# check the rows that have the age column below 10 and check percentage of those that have heart disease value as 0.

young_df = df[df['age'] < 18]
heart_disease_zeros = young_df[young_df['heart_disease'] == 0]
percentage = (len(heart_disease_zeros) / len(young_df)) * 100
print(f"{percentage:.2f}% of young people (age < 18) have no heart disease.")

stroke_zeros = young_df[young_df['stroke'] == 0]
percentage_stroke = (len(stroke_zeros) / len(young_df)) * 100
print(f"{percentage_stroke:.2f}% of young people (age < 18) have no stroke.")

99.88% of young people (age < 18) have no heart disease.
99.88% of young people (age < 18) have no stroke.


Apparently some kids (below 18) also work according to this dataset

In [None]:
# check the rows that have the age column below 18 and display the percentage that DON'T have 'work_type' value as children.

not_children_work_type = young_df[young_df['work_type'] != 'children']
percentage = (len(not_children_work_type) / len(young_df)) * 100
print(f"{percentage:.2f}% of young people (age < 18) don't have 'work_type' value as children.")


19.74% of young people (age < 18) don't have 'work_type' value as children.


In [None]:
# check if the target column is skewed

df['stroke'].value_counts(normalize=True) * 100


stroke
0    95.741646
1     4.258354
Name: proportion, dtype: float64

In [None]:
df.shape

(4908, 12)

In [None]:
# percentage comparison of the gender of people who had stroke and those who didn't

stroke_yes = df[df['stroke'] == 1]
stroke_no = df[df['stroke'] == 0]

gender_stroke_yes = stroke_yes['gender'].value_counts(normalize=True) * 100
gender_stroke_no = stroke_no['gender'].value_counts(normalize=True) * 100

print("Percentage of gender for people who had stroke:")
print(gender_stroke_yes)

print("\nPercentage of gender for people who didn't have stroke:")
print(gender_stroke_no)

Percentage of gender for people who had stroke:
gender
0    57.416268
1    42.583732
Name: proportion, dtype: float64

Percentage of gender for people who didn't have stroke:
gender
0    59.09768
1    40.90232
Name: proportion, dtype: float64


In [None]:
# encode categorical values

df['gender'] = df['gender'].astype('category').cat.codes
df['ever_married'] = df['ever_married'].astype('category').cat.codes
df['work_type'] = df['work_type'].astype('category').cat.codes
df['Residence_type'] = df['Residence_type'].astype('category').cat.codes
df['smoking_status'] = df['smoking_status'].astype('category').cat.codes


In [None]:
df

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,1,67,0,1,1,2,1,228.69,36.6,1,1
2,31112,1,80,0,1,1,2,0,105.92,32.5,2,1
3,60182,0,49,0,0,1,2,1,171.23,34.4,3,1
4,1665,0,79,1,0,1,3,0,174.12,24.0,2,1
5,56669,1,81,0,0,1,2,1,186.21,29.0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...
5104,14180,0,13,0,0,0,4,0,103.08,18.6,0,0
5106,44873,0,81,0,0,1,3,1,125.20,40.0,2,0
5107,19723,0,35,0,0,1,3,0,82.99,30.6,2,0
5108,37544,1,51,0,0,1,2,0,166.29,25.6,1,0


In [None]:
df.stroke.value_counts()

stroke
0    4699
1     209
Name: count, dtype: int64

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

X = df.drop(['stroke',"id"], axis=1)
y = df['stroke']



# Apply SMOTE to the training data
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)
# Split your data into training and testing sets
X_train_resampled, X_test_resampled, y_train_resampled, y_test_resampled = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)


In [None]:

# Train your model (example using RandomForestClassifier)
clf_re = RandomForestClassifier(random_state=42)
clf_re.fit(X_train_resampled, y_train_resampled)

# Make predictions on the test set
y_pred = clf_re.predict(X_test_resampled)

# Evaluate your model
print(classification_report(y_test_resampled, y_pred))


              precision    recall  f1-score   support

           0       0.97      0.94      0.96       922
           1       0.95      0.97      0.96       958

    accuracy                           0.96      1880
   macro avg       0.96      0.96      0.96      1880
weighted avg       0.96      0.96      0.96      1880



In [None]:
X_train, X_test, y_train, y_test= train_test_split(X, y, test_size=0.2, random_state=42)

# Train your model (example using RandomForestClassifier)
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train, y_train)

# Make predictions on the test set
y_pred = clf.predict(X_test)

# Evaluate your model
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.95      1.00      0.97       929
           1       0.00      0.00      0.00        53

    accuracy                           0.95       982
   macro avg       0.47      0.50      0.49       982
weighted avg       0.89      0.95      0.92       982



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
import pickle

# Open the file in binary mode
with open("random_forest.pkl", 'wb') as file:
    # Serialize and write the variable to the file
    pickle.dump(clf_re, file)