# Brain Stroke Prediction

Predict if patients will suffer from brain stroke by implementing machine learning algorithms.



In [2]:
# Import the necessary libraries
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

## Data Preprocessing

### Data Cleaning

In [9]:
# Load the data
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
        

df = pd.read_csv('/kaggle/input/brain-stroke-dataset/brain_stroke.csv')

/kaggle/input/brain-stroke-dataset/brain_stroke.csv


In [11]:
# See available columns
for columns in df.columns:
    print(columns)

gender
age
hypertension
heart_disease
ever_married
work_type
Residence_type
avg_glucose_level
bmi
smoking_status
stroke


In [22]:
# Check if there are any duplicate rows
#df[df.duplicated() == True]
df.duplicated().any()

False

No duplicates were found

In [21]:
# Check if there are null values
df.isnull().any()

gender               False
age                  False
hypertension         False
heart_disease        False
ever_married         False
work_type            False
Residence_type       False
avg_glucose_level    False
bmi                  False
smoking_status       False
stroke               False
dtype: bool

No null values were found

In [25]:
# View the first few rows of data
df.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
2,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
3,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
4,Male,81.0,0,0,Yes,Private,Urban,186.21,29.0,formerly smoked,1


### Data Transformation

In [23]:
# Check data types
df.dtypes

gender                object
age                  float64
hypertension           int64
heart_disease          int64
ever_married          object
work_type             object
Residence_type        object
avg_glucose_level    float64
bmi                  float64
smoking_status        object
stroke                 int64
dtype: object

In [29]:
# Print unique values for each column
for col in df.columns:
    print(df[col].unique())

['Male' 'Female']
[6.70e+01 8.00e+01 4.90e+01 7.90e+01 8.10e+01 7.40e+01 6.90e+01 7.80e+01
 6.10e+01 5.40e+01 5.00e+01 6.40e+01 7.50e+01 6.00e+01 7.10e+01 5.20e+01
 8.20e+01 6.50e+01 5.70e+01 4.20e+01 4.80e+01 7.20e+01 5.80e+01 7.60e+01
 3.90e+01 7.70e+01 6.30e+01 7.30e+01 5.60e+01 4.50e+01 7.00e+01 5.90e+01
 6.60e+01 4.30e+01 6.80e+01 4.70e+01 5.30e+01 3.80e+01 5.50e+01 4.60e+01
 3.20e+01 5.10e+01 1.40e+01 3.00e+00 8.00e+00 3.70e+01 4.00e+01 3.50e+01
 2.00e+01 4.40e+01 2.50e+01 2.70e+01 2.30e+01 1.70e+01 1.30e+01 4.00e+00
 1.60e+01 2.20e+01 3.00e+01 2.90e+01 1.10e+01 2.10e+01 1.80e+01 3.30e+01
 2.40e+01 3.60e+01 6.40e-01 3.40e+01 4.10e+01 8.80e-01 5.00e+00 2.60e+01
 3.10e+01 7.00e+00 1.20e+01 6.20e+01 2.00e+00 9.00e+00 1.50e+01 2.80e+01
 1.00e+01 1.80e+00 3.20e-01 1.08e+00 1.90e+01 6.00e+00 1.16e+00 1.00e+00
 1.40e+00 1.72e+00 2.40e-01 1.64e+00 1.56e+00 7.20e-01 1.88e+00 1.24e+00
 8.00e-01 4.00e-01 8.00e-02 1.48e+00 5.60e-01 1.32e+00 1.60e-01 4.80e-01]
[0 1]
[1 0]
['Yes' 'No']
['Priva

#### Convert string data into numerical data for machine learning algorithms 

Replace "Female", "No", "Private", "Urban", "never smoked", Urban with "0"

Replace "Male", "Yes", "Self-employed", "Rural", "formerly smoked", "Unknown" with "1"

Replace "Govt_job", "smokes" with "2"

Replace "Children" with "3"


Update the corresponding columns.

In [32]:
def n(k):
    if k=="Female" or k=="No" or k=="Private" or k=="never smoked" or k=="Urban":
        return 0
    if k=="Male" or k=="Yes" or k=="Self-Employed" or k=="Rural" or k=="formerly smoked" or k=="Unknown": 
        return 1
    if k=="Govt_job" or k=="smokes": 
        return 2
    if k=="Children":
        return 3

In [33]:
df['gender']=df['gender'].apply(n)
df['ever_married']=df['ever_married'].apply(n)
df['work_type']=df['work_type'].apply(n)
df['Residence_type']=df['Residence_type'].apply(n)
df['smoking_status']=df['smoking_status'].apply(n)

In [34]:
df.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,1,67.0,0,1,1,0.0,0,228.69,36.6,1,1
1,1,80.0,0,1,1,0.0,1,105.92,32.5,0,1
2,0,49.0,0,0,1,0.0,0,171.23,34.4,2,1
3,0,79.0,1,0,1,,1,174.12,24.0,0,1
4,1,81.0,0,0,1,0.0,0,186.21,29.0,1,1


In [35]:
# Check for null values again
df.isnull().any()

gender               False
age                  False
hypertension         False
heart_disease        False
ever_married         False
work_type             True
Residence_type       False
avg_glucose_level    False
bmi                  False
smoking_status       False
stroke               False
dtype: bool

In [38]:
# Replace missing values with 0
df = df.fillna(value = 0)

In [39]:
df.isnull().any()

gender               False
age                  False
hypertension         False
heart_disease        False
ever_married         False
work_type            False
Residence_type       False
avg_glucose_level    False
bmi                  False
smoking_status       False
stroke               False
dtype: bool

In [51]:
# Split the data into train and test sets
from sklearn.model_selection import train_test_split
x = df.drop(['stroke'],axis =1)
y = df['stroke']
x_train, x_test, y_train, y_test = train_test_split(x,y,train_size=0.8) 

## Supervised learning with machine learning alghorithms
Build machine learning models using Support Vector Machine (SVM), Naive Bayes (NB), and Random Forest (RF). Then, compare the accuracies of each model.

### SVM

In [56]:
# Build a supervised machine learning model using Support Vector Machine 
from sklearn.svm import SVC
clf = SVC(kernel='rbf', C=1).fit(x_train, y_train)
clf.score(x_test,y_test)

0.9478435305917753

### Naive Bayes

In [58]:
from sklearn.naive_bayes import CategoricalNB

nb = CategoricalNB()
nb.fit(x_train,y_train)
nb.score(x_test,y_test)

0.9307923771313942

### Random Forest

In [59]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier()
rf.fit(x_train,y_train)
rf.score(x_test,y_test)

0.9458375125376128

#### Create predictions using each model

In [62]:
pred_svm = clf.predict(x_test)
pred_nb = nb.predict(x_test)
pred_rf = rf.predict(x_test)

#### Compare metrics of predictions produced by different models, such as Mean Squared Error (MSE), Root Mean Squared Error (RMSE), and Classification Report

In [63]:
from sklearn import metrics

In [66]:
# MSE
print("MSE of SVM: ",metrics.mean_absolute_error(y_test, pred_svm))
print("MSE of NB: ",metrics.mean_absolute_error(y_test, pred_nb))
print("MSE of RF: ",metrics.mean_absolute_error(y_test, pred_rf))

MSE of SVM:  0.05215646940822467
MSE of NB:  0.06920762286860582
MSE of RF:  0.05416248746238716


Results show that SVM has the lowest MSE

In [70]:
# RMSE
print("RMSE of SVM: ",np.sqrt(metrics.mean_squared_error(y_test, pred_svm)))
print("RMSE of NB: ",np.sqrt(metrics.mean_squared_error(y_test, pred_nb)))
print("RMSE of RF: ",np.sqrt(metrics.mean_squared_error(y_test, pred_rf)))

RMSE of SVM:  0.2283779091948796
RMSE of NB:  0.2630734172595282
RMSE of RF:  0.23272835551858986


The MSE and RMSE of SVM is the lowest, hence it it the best performing model based on these metrics.

In [80]:
# Classification report
from sklearn.metrics import classification_report

In [87]:
# SVM
# mat_svm = confusion_matrix(y_test, pred_svm)
print(mat_svm)

print(classification_report(y_test, pred_svm))

[[945   0]
 [ 52   0]]
              precision    recall  f1-score   support

           0       0.95      1.00      0.97       945
           1       0.00      0.00      0.00        52

    accuracy                           0.95       997
   macro avg       0.47      0.50      0.49       997
weighted avg       0.90      0.95      0.92       997



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [88]:
# NB
# mat_nb = confusion_matrix(y_test, pred_nb)
print(mat_nb)
print(classification_report(y_test, pred_nb))

[[921  24]
 [ 45   7]]
              precision    recall  f1-score   support

           0       0.95      0.97      0.96       945
           1       0.23      0.13      0.17        52

    accuracy                           0.93       997
   macro avg       0.59      0.55      0.57       997
weighted avg       0.92      0.93      0.92       997



In [89]:
# RF
# mat_rf = confusion_matrix(y_test, pred_rf)
print(mat_rf)

print(classification_report(y_test, pred_rf))

[[943   2]
 [ 52   0]]
              precision    recall  f1-score   support

           0       0.95      1.00      0.97       945
           1       0.00      0.00      0.00        52

    accuracy                           0.95       997
   macro avg       0.47      0.50      0.49       997
weighted avg       0.90      0.95      0.92       997



In [97]:
# Check the number of people with and without stroke, where 0 means no stroke and 1 indicates otherwise
print(y.value_counts()[0])
print(y.value_counts()[1])

4733
248


#### Conclusion
- It is found that although all 3 models have high accuracies, the precision for those suffer from stroke is very low. 

- This is because the dataset is unbalanced, where the number of people suffering from stroke is significantly lower than those who do not.

- Techniques such as undersampling and oversampling can be used to make the dataset balanced.