# Stroke Prediction

## Menyiapkan Pustaka

In [279]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB


from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import MinMaxScaler

## Data Collection

In [280]:
!unzip "/content/stroke.zip"

Archive:  /content/stroke.zip
replace healthcare-dataset-stroke-data.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: healthcare-dataset-stroke-data.csv  


In [281]:
df = pd.read_csv('healthcare-dataset-stroke-data.csv')
df.tail()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
5105,18234,Female,80.0,1,0,Yes,Private,Urban,83.75,,never smoked,0
5106,44873,Female,81.0,0,0,Yes,Self-employed,Urban,125.2,40.0,never smoked,0
5107,19723,Female,35.0,0,0,Yes,Self-employed,Rural,82.99,30.6,never smoked,0
5108,37544,Male,51.0,0,0,Yes,Private,Rural,166.29,25.6,formerly smoked,0
5109,44679,Female,44.0,0,0,Yes,Govt_job,Urban,85.28,26.2,Unknown,0


In [282]:
baskom = df.shape
print('Jumlah Baris :',baskom[0])
print('Jumlah Kolom :',baskom[1])

Jumlah Baris : 5110
Jumlah Kolom : 12


## Preprocessing Data

### Assesing Data

In [283]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5110 entries, 0 to 5109
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 5110 non-null   int64  
 1   gender             5110 non-null   object 
 2   age                5110 non-null   float64
 3   hypertension       5110 non-null   int64  
 4   heart_disease      5110 non-null   int64  
 5   ever_married       5110 non-null   object 
 6   work_type          5110 non-null   object 
 7   Residence_type     5110 non-null   object 
 8   avg_glucose_level  5110 non-null   float64
 9   bmi                4909 non-null   float64
 10  smoking_status     5110 non-null   object 
 11  stroke             5110 non-null   int64  
dtypes: float64(3), int64(4), object(5)
memory usage: 479.2+ KB


In [284]:
df['work_type'].value_counts()

Private          2925
Self-employed     819
children          687
Govt_job          657
Never_worked       22
Name: work_type, dtype: int64

In [285]:
df['Residence_type'].value_counts()

Urban    2596
Rural    2514
Name: Residence_type, dtype: int64

In [286]:
df['smoking_status'].value_counts()

never smoked       1892
Unknown            1544
formerly smoked     885
smokes              789
Name: smoking_status, dtype: int64

In [287]:
df.isna().sum()

id                     0
gender                 0
age                    0
hypertension           0
heart_disease          0
ever_married           0
work_type              0
Residence_type         0
avg_glucose_level      0
bmi                  201
smoking_status         0
stroke                 0
dtype: int64

In [288]:
df.duplicated().sum()

0

### Cleaning Data

In [289]:
mean_bmi = df['bmi'].mean()
bmi = round(mean_bmi,1)
bmi

28.9

In [290]:
df['bmi'] = df['bmi'].fillna(value=bmi)
df.dropna()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,28.9,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
...,...,...,...,...,...,...,...,...,...,...,...,...
5105,18234,Female,80.0,1,0,Yes,Private,Urban,83.75,28.9,never smoked,0
5106,44873,Female,81.0,0,0,Yes,Self-employed,Urban,125.20,40.0,never smoked,0
5107,19723,Female,35.0,0,0,Yes,Self-employed,Rural,82.99,30.6,never smoked,0
5108,37544,Male,51.0,0,0,Yes,Private,Rural,166.29,25.6,formerly smoked,0


In [291]:
df.isna().sum()

id                   0
gender               0
age                  0
hypertension         0
heart_disease        0
ever_married         0
work_type            0
Residence_type       0
avg_glucose_level    0
bmi                  0
smoking_status       0
stroke               0
dtype: int64

In [292]:
df.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,28.9,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [293]:
sex_mapping = {"Male" : 1, "Female" : 0}
df['gender'] = df['gender'].map(sex_mapping)

In [294]:
residence_mapping = {"Urban" : 1, "Rural" : 0}
df['Residence_type'] = df['Residence_type'].map(residence_mapping)

In [295]:
status_mapping = {"Yes":1,"No":0}
df['ever_married'] = df['ever_married'].map(status_mapping)

In [296]:
df['work_type'].unique()

array(['Private', 'Self-employed', 'Govt_job', 'children', 'Never_worked'],
      dtype=object)

In [297]:
work = df['work_type']
work_dummies = pd.get_dummies(work)
work_dummies.head()

Unnamed: 0,Govt_job,Never_worked,Private,Self-employed,children
0,0,0,1,0,0
1,0,0,0,1,0
2,0,0,1,0,0
3,0,0,1,0,0
4,0,0,0,1,0


In [298]:
df['smoking_status'].unique()

array(['formerly smoked', 'never smoked', 'smokes', 'Unknown'],
      dtype=object)

In [299]:
smok = df['smoking_status']
smok_dummies = pd.get_dummies(smok)
smok_dummies.head()

Unnamed: 0,Unknown,formerly smoked,never smoked,smokes
0,0,1,0,0
1,0,0,1,0
2,0,0,1,0
3,0,0,0,1
4,0,0,1,0


In [300]:
df = pd.merge(df.reset_index(), smok_dummies.reset_index())
df = pd.merge(df.reset_index(),work_dummies.reset_index())
df.head()

Unnamed: 0,level_0,index,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,...,stroke,Unknown,formerly smoked,never smoked,smokes,Govt_job,Never_worked,Private,Self-employed,children
0,0,0,9046,1.0,67.0,0,1,1,Private,1,...,1,0,1,0,0,0,0,1,0,0
1,1,1,51676,0.0,61.0,0,0,1,Self-employed,0,...,1,0,0,1,0,0,0,0,1,0
2,2,2,31112,1.0,80.0,0,1,1,Private,0,...,1,0,0,1,0,0,0,1,0,0
3,3,3,60182,0.0,49.0,0,0,1,Private,1,...,1,0,0,0,1,0,0,1,0,0
4,4,4,1665,0.0,79.0,1,0,1,Self-employed,0,...,1,0,0,1,0,0,0,0,1,0


In [301]:
df = df.drop(['level_0','index','id','work_type','smoking_status'], axis = 1)
df = df.dropna()
df.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,Residence_type,avg_glucose_level,bmi,stroke,Unknown,formerly smoked,never smoked,smokes,Govt_job,Never_worked,Private,Self-employed,children
0,1.0,67.0,0,1,1,1,228.69,36.6,1,0,1,0,0,0,0,1,0,0
1,0.0,61.0,0,0,1,0,202.21,28.9,1,0,0,1,0,0,0,0,1,0
2,1.0,80.0,0,1,1,0,105.92,32.5,1,0,0,1,0,0,0,1,0,0
3,0.0,49.0,0,0,1,1,171.23,34.4,1,0,0,0,1,0,0,1,0,0
4,0.0,79.0,1,0,1,0,174.12,24.0,1,0,0,1,0,0,0,0,1,0


## Membagi Dataset

In [302]:
x = df.drop(['stroke'], axis= 1)
y = df['stroke']

In [303]:
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.4, random_state=42)

In [304]:
x_train.shape

(3065, 17)

In [305]:
x_test.shape

(2044, 17)

## Modeling

### KNN

In [306]:
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(x_train,y_train)
knn_predictions = knn.predict(x_test)
knn_accuracy = accuracy_score(y_test, knn_predictions)
print("K-Nearest Neighbors Accuracy: {:.2%}".format(knn_accuracy))

K-Nearest Neighbors Accuracy: 94.03%


### Decision Tree

In [307]:
tree = DecisionTreeClassifier(random_state=1)
tree.fit(x_train, y_train)
tree_predictions = tree.predict(x_test)
tree_accuracy = accuracy_score(y_test, tree_predictions)
print("Decision Tree Accuracy : {:.2%}:".format(tree_accuracy))

Decision Tree Accuracy : 90.07%:


### Random Forest


In [308]:
rf_model = RandomForestClassifier(n_estimators=150, random_state=1)
rf_model.fit(x_train, y_train)
rf_predictions = rf_model.predict(x_test)
rf_accuracy = accuracy_score(y_test, rf_predictions)
print("Random Forest Accuracy: {:.2%}:".format(rf_accuracy))

Random Forest Accuracy: 94.42%:


### SVM

In [309]:
svm_model = SVC(C=1.0, kernel='rbf', gamma='scale')
svm_model.fit(x_train, y_train)
svm_predictions = svm_model.predict(x_test)
svm_accuracy = accuracy_score(y_test,svm_predictions)
print("SVM Acuuracy : {:.2%}:".format(svm_accuracy))

SVM Acuuracy : 94.42%:


### Naive Bayes

In [310]:
naive_bayes_model = GaussianNB()
naive_bayes_model.fit(x_train, y_train)
bayes_predictions = naive_bayes_model.predict(x_test)
bayes_accuracy = accuracy_score(y_test,bayes_predictions)
print("Naive Bayes Accuracy : {:.2%}:".format(bayes_accuracy))

Naive Bayes Accuracy : 53.91%:


### Logistic Regression

In [311]:
logistic_model = LogisticRegression(max_iter=1000)
logistic_model.fit(x_train, y_train)
logistic_predictions = logistic_model.predict(x_test)
logistic_accuracy = accuracy_score(y_test,logistic_predictions)
print("Logistic Regression Accuracy : {:.2%}:".format(logistic_accuracy))

Logistic Regression Accuracy : 94.42%:


## Test Model

In [336]:
test = x_test.sample(1)
test

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,Residence_type,avg_glucose_level,bmi,Unknown,formerly smoked,never smoked,smokes,Govt_job,Never_worked,Private,Self-employed,children
4471,1.0,56.0,0,0,1,0,93.72,31.4,0,0,1,0,0,0,1,0,0


In [337]:
test = x_test.sample(1)
predict = logistic_model.predict(test)
if predict[0] == 1:
  print('Stroke')
else:
  print('Not Stroke')

Not Stroke
