In [1]:
import joblib
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

In [2]:
# !wget -O dataset.csv "https://storage.googleapis.com/kagglesdsdata/datasets/1120859/1882037/healthcare-dataset-stroke-data.csv?X-Goog-Algorithm=GOOG4-RSA-SHA256&X-Goog-Credential=gcp-kaggle-com%40kaggle-161607.iam.gserviceaccount.com%2F20221021%2Fauto%2Fstorage%2Fgoog4_request&X-Goog-Date=20221021T050208Z&X-Goog-Expires=259200&X-Goog-SignedHeaders=host&X-Goog-Signature=4c895a20665b62be3b446cc54fc0c2bfe03fcc45439ca1690e9de130ee37382445c3296dfb729a36d860f51c5c70b275baf13126840f199f42a338819dfb56098efbb97d721b6d13c9379543925f99db6b1958a7c7aef6c92609c32e56723a9f81657b30a772536d90bf15ce6edd02a9077faee3d9c0848297a72c52b37e847be21c174a0f94816243549035fd93138ecbb9291f133fb72fcc2e408be3fcd9875fc681a38d45afedb5c962f614cc99bac9031b8e59f285d6339bbbd548a48f52ec37732a02d3fd7cbf15212efb423801f9ec0fb5f6c0f7dc80f33e5c318f7d4ef3a65d5cd0fa36ff28980ded27bfcf333f259253112f1624950e1a4937ccfba3"

In [3]:
# Load the stroke dataset from dataset.csv in the working directory
# and preview the first five rows.
df = pd.read_csv("dataset.csv")
df.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [4]:
# (rows, columns) of the frame as loaded.
df.shape

(5110, 12)

In [5]:
# Normalise every header to lower case (the CSV ships "Residence_type"
# with a capital R), then show dtypes and non-null counts.
df.columns = df.columns.str.lower()
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5110 entries, 0 to 5109
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 5110 non-null   int64  
 1   gender             5110 non-null   object 
 2   age                5110 non-null   float64
 3   hypertension       5110 non-null   int64  
 4   heart_disease      5110 non-null   int64  
 5   ever_married       5110 non-null   object 
 6   work_type          5110 non-null   object 
 7   residence_type     5110 non-null   object 
 8   avg_glucose_level  5110 non-null   float64
 9   bmi                4909 non-null   float64
 10  smoking_status     5110 non-null   object 
 11  stroke             5110 non-null   int64  
dtypes: float64(3), int64(4), object(5)
memory usage: 479.2+ KB


In [6]:
# Number of missing values in each column of the raw frame.
df.isna().sum()

id                     0
gender                 0
age                    0
hypertension           0
heart_disease          0
ever_married           0
work_type              0
residence_type         0
avg_glucose_level      0
bmi                  201
smoking_status         0
stroke                 0
dtype: int64

In [7]:
# Quick look at the BMI distribution: (min, max, mode(s), mean).
# pandas skips missing values in these reductions by default.
bmi = df['bmi']
bmi.min(), bmi.max(), bmi.mode(), bmi.mean()

(10.3,
 97.6,
 0    28.7
 Name: bmi, dtype: float64,
 28.893236911794666)

In [8]:
# Remove the id column so it is not used as a model feature
# (presumably a per-record identifier — confirm against the data source).
df = df.drop("id", axis="columns")

In [9]:
# Keep only the rows whose smoking status is known.
# `.copy()` makes the filtered result an independent frame, so the column
# assignments made in later cells do not operate on a slice of the original
# frame (the situation pandas flags with SettingWithCopyWarning).
df = df[df['smoking_status'] != 'Unknown'].copy()
df.shape

(3566, 11)

In [10]:
# Missing values per column after the 'Unknown' smoking_status rows
# were filtered out above.
df.isna().sum()

gender                 0
age                    0
hypertension           0
heart_disease          0
ever_married           0
work_type              0
residence_type         0
avg_glucose_level      0
bmi                  140
smoking_status         0
stroke                 0
dtype: int64

In [11]:
# Drop the rows that have no BMI value.
# dropna() removes exactly the NaN rows and returns a new frame, so the column
# keeps its float64 dtype and the original index labels without converting
# the values to strings and back, and without assigning into a filtered slice.
df = df.dropna(subset=['bmi'])


In [12]:
# Collect the object-dtype (text) columns and print the sorted distinct
# labels found in each, to see what has to be encoded.
categorical_columns = list(df.select_dtypes(include="object").columns)
for column in categorical_columns:
    print(f"Kolom {column}: {np.unique(df[column])}")

Kolom gender: ['Female' 'Male' 'Other']
Kolom ever_married: ['No' 'Yes']
Kolom work_type: ['Govt_job' 'Never_worked' 'Private' 'Self-employed' 'children']
Kolom residence_type: ['Rural' 'Urban']
Kolom smoking_status: ['formerly smoked' 'never smoked' 'smokes']


In [13]:
# Integer code for every label of each categorical feature.
# The codes are the ones the saved model is trained on, so they must stay
# in sync with whatever encodes inputs at prediction time.
category_codes = {
    'gender': {'Female': 0, 'Male': 1, 'Other': 2},
    'ever_married': {'No': 0, 'Yes': 1},
    'work_type': {
        'Govt_job': 0,
        'Never_worked': 1,
        'Private': 2,
        'Self-employed': 3,
        'children': 4,
    },
    'residence_type': {'Rural': 0, 'Urban': 1},
    'smoking_status': {'never smoked': 0, 'formerly smoked': 1, 'smokes': 2},
}

for name, codes in category_codes.items():
    # Only encode columns that still hold text labels. Re-running this cell on
    # already-encoded integers would look the integers up in the string-keyed
    # dicts and silently replace every value with NaN.
    if not pd.api.types.is_numeric_dtype(df[name]):
        df[name] = df[name].map(codes)

# Series.map yields NaN for any label that is missing from its dict;
# fail loudly here instead of passing NaN on to the model.
unmapped = df[list(category_codes)].isna().sum()
assert not unmapped.any(), f"unmapped labels in: {list(unmapped[unmapped > 0].index)}"

df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3426 entries, 0 to 5108
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   gender             3426 non-null   int64  
 1   age                3426 non-null   float64
 2   hypertension       3426 non-null   int64  
 3   heart_disease      3426 non-null   int64  
 4   ever_married       3426 non-null   int64  
 5   work_type          3426 non-null   int64  
 6   residence_type     3426 non-null   int64  
 7   avg_glucose_level  3426 non-null   float64
 8   bmi                3426 non-null   float64
 9   smoking_status     3426 non-null   int64  
 10  stroke             3426 non-null   int64  
dtypes: float64(3), int64(8)
memory usage: 321.2 KB


In [14]:
# Target is the stroke flag; every remaining column is a feature.
y = df["stroke"]
x = df.drop("stroke", axis="columns")

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# identical between runs.
# NOTE(review): the split is not stratified on the target — confirm that is
# intended if the stroke classes are imbalanced.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=9,
)

In [15]:
# Check that no missing values remain in any column before the model is fitted.
df.isna().sum()

gender               0
age                  0
hypertension         0
heart_disease        0
ever_married         0
work_type            0
residence_type       0
avg_glucose_level    0
bmi                  0
smoking_status       0
stroke               0
dtype: int64

In [16]:
# Fit a random forest with default hyper-parameters on the training split.
# random_state pins the forest's bootstrap sampling and feature selection so
# the score below and the saved model are reproducible on Restart & Run All
# (same seed as the train/test split).
rf = RandomForestClassifier(random_state=9)
rf.fit(x_train, y_train)

# Mean accuracy on the held-out 20%.
# NOTE(review): if stroke cases are a small minority, accuracy alone can look
# high for a model that rarely predicts a stroke — confirm with per-class metrics.
rf.score(x_test, y_test)

0.9489795918367347

In [17]:
# Persist the fitted forest to brain_stroke.model in the working directory.
# The estimator is saved wrapped in a one-element list, so joblib.load()
# returns a list and the model itself is element [0].
joblib.dump([rf], "brain_stroke.model")

['brain_stroke.model']

In [18]:
# Record the installed package versions for reproducibility.
# NOTE(review): `!pip` runs whichever pip is first on PATH, which may not be
# the environment this kernel uses — confirm, or switch to `%pip list`.
!pip list

Package                        Version




------------------------------ ------------
absl-py                        1.2.0
aiohttp                        3.8.3
aiosignal                      1.2.0
ale-py                         0.7.4
altgraph                       0.17.2
anyio                          3.6.1
appdirs                        1.4.4
argon2-cffi                    20.1.0
asgiref                        3.5.0
astroid                        2.9.3
asttokens                      2.0.8
astunparse                     1.6.3
async-generator                1.10
async-timeout                  4.0.2
atari-py                       0.2.9
attrs                          21.2.0
auto-py-to-exe                 2.21.0
AutoROM                        0.4.2
AutoROM.accept-rom-license     0.4.2
backcall                       0.2.0
baselines                      0.1.5
bcrypt                         4.0.0
bleach                         3.3.0
bottle                         0.12.21
bottle-websocket               0.2.9
box2d-py                 