In [39]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
import xgboost as xgb
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [40]:
data = pd.read_csv("stroke_prediction_data.csv")
data.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [41]:
data.describe()

Unnamed: 0,id,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke
count,5110.0,5110.0,5110.0,5110.0,5110.0,4909.0,5110.0
mean,36517.829354,43.226614,0.097456,0.054012,106.147677,28.893237,0.048728
std,21161.721625,22.612647,0.296607,0.226063,45.28356,7.854067,0.21532
min,67.0,0.08,0.0,0.0,55.12,10.3,0.0
25%,17741.25,25.0,0.0,0.0,77.245,23.5,0.0
50%,36932.0,45.0,0.0,0.0,91.885,28.1,0.0
75%,54682.0,61.0,0.0,0.0,114.09,33.1,0.0
max,72940.0,82.0,1.0,1.0,271.74,97.6,1.0


In [42]:
data.isnull().sum()

id                     0
gender                 0
age                    0
hypertension           0
heart_disease          0
ever_married           0
work_type              0
Residence_type         0
avg_glucose_level      0
bmi                  201
smoking_status         0
stroke                 0
dtype: int64

DATA-PREPROCESSING

In [43]:
#. Dropping the columns that are not needed
data = data.drop('id', axis=1)
data.columns

Index(['gender', 'age', 'hypertension', 'heart_disease', 'ever_married',
       'work_type', 'Residence_type', 'avg_glucose_level', 'bmi',
       'smoking_status', 'stroke'],
      dtype='object')

In [44]:
#. Handle null values in "bmi" column
data['bmi'].fillna(round(data['bmi'].mean(), 2), inplace=True)
data.isnull().sum()

gender               0
age                  0
hypertension         0
heart_disease        0
ever_married         0
work_type            0
Residence_type       0
avg_glucose_level    0
bmi                  0
smoking_status       0
stroke               0
dtype: int64

In [45]:
#. Convert categorical values to numeric values
categorical_columns = ["gender", "ever_married", "work_type", "Residence_type", "smoking_status"]
le = LabelEncoder()
for col in categorical_columns:
    data[col] = le.fit_transform(data[col])
data.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,1,67.0,0,1,1,2,1,228.69,36.6,1,1
1,0,61.0,0,0,1,3,0,202.21,28.89,2,1
2,1,80.0,0,1,1,2,0,105.92,32.5,2,1
3,0,49.0,0,0,1,2,1,171.23,34.4,3,1
4,0,79.0,1,0,1,3,0,174.12,24.0,2,1


In [46]:
#. Normalize the "avg_glucose_level" column
scaler = StandardScaler()
data['avg_glucose_level'] = scaler.fit_transform(data[['avg_glucose_level']])
data.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,1,67.0,0,1,1,2,1,2.706375,36.6,1,1
1,0,61.0,0,0,1,3,0,2.121559,28.89,2,1
2,1,80.0,0,1,1,2,0,-0.005028,32.5,2,1
3,0,49.0,0,0,1,2,1,1.437358,34.4,3,1
4,0,79.0,1,0,1,3,0,1.501184,24.0,2,1


DIVIDING DATA INTO 3 COMBINATIONS

In [47]:
correlation_matrix = data.corr()

# Focus on correlations with the 'stroke' column
stroke_correlation = correlation_matrix['stroke'].sort_values(ascending=False)

# Print correlations for inspection
print("Correlations with 'stroke':\n", stroke_correlation)

Correlations with 'stroke':
 stroke               1.000000
age                  0.245257
heart_disease        0.134914
avg_glucose_level    0.131945
hypertension         0.127904
ever_married         0.108340
bmi                  0.038935
smoking_status       0.028123
Residence_type       0.015458
gender               0.008929
work_type           -0.032316
Name: stroke, dtype: float64


In [49]:
# Convert the data to train and test sets
X_high = data[['age', 'heart_disease', 'avg_glucose_level', 'hypertension', 'ever_married']]
X_low = data[['bmi', 'smoking_status', 'Residence_type', 'gender', 'work_type']]
X_mixed = data[['age', 'bmi', 'heart_disease', 'smoking_status', 'hypertension']]

#. Apply SMOTE to oversample the minority class
sm = SMOTE(random_state=42)
X1, y1 = sm.fit_resample(X_high, y)
X2, y2 = sm.fit_resample(X_low, y)
X3, y3 = sm.fit_resample(X_mixed, y)

MODEL IMPLEMENTATION

In [50]:
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X1, y1, test_size=0.2, random_state=42)

#. Random Forest Implementation
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
rf_classifier.fit(X_train, y_train)

y_pred = rf_classifier.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

Accuracy: 0.8735218508997429
              precision    recall  f1-score   support

           0       0.87      0.88      0.87       975
           1       0.87      0.87      0.87       970

    accuracy                           0.87      1945
   macro avg       0.87      0.87      0.87      1945
weighted avg       0.87      0.87      0.87      1945

[[854 121]
 [125 845]]


In [51]:
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X2, y2, test_size=0.2, random_state=42)

#. Random Forest Implementation
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
rf_classifier.fit(X_train, y_train)

y_pred = rf_classifier.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

Accuracy: 0.8524421593830335
              precision    recall  f1-score   support

           0       0.86      0.85      0.85       975
           1       0.85      0.86      0.85       970

    accuracy                           0.85      1945
   macro avg       0.85      0.85      0.85      1945
weighted avg       0.85      0.85      0.85      1945

[[828 147]
 [140 830]]


In [52]:
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X3, y3, test_size=0.2, random_state=42)

#. Random Forest Implementation
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
rf_classifier.fit(X_train, y_train)

y_pred = rf_classifier.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

Accuracy: 0.9275064267352185
              precision    recall  f1-score   support

           0       0.93      0.92      0.93       975
           1       0.92      0.94      0.93       970

    accuracy                           0.93      1945
   macro avg       0.93      0.93      0.93      1945
weighted avg       0.93      0.93      0.93      1945

[[897  78]
 [ 63 907]]
