In [1]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
from tensorflow.keras.utils import to_categorical
import numpy as np
import pickle

In [2]:
# Load the stroke dataset from the CSV in the working directory and
# preview its first five rows.
stroke_df = pd.read_csv(filepath_or_buffer="healthcare-dataset-stroke-data.csv")
stroke_df.head(5)

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [3]:
# Remove the 'id' column and every row that contains a missing value
# (the preview above shows a blank bmi, for example).
# The frame is reassigned rather than mutated in place, and a missing 'id'
# column is tolerated, so this cell can be re-run without raising KeyError.
stroke_df = stroke_df.drop(columns=['id'], errors='ignore').dropna()


In [4]:
# How many rows fall into each work_type category.
stroke_df['work_type'].value_counts()

Private          2811
Self-employed     775
children          671
Govt_job          630
Never_worked       22
Name: work_type, dtype: int64

In [5]:
# Keep only the rows whose gender is not 'Other', then confirm which
# gender categories remain and how many rows each has.
stroke_df = stroke_df.loc[stroke_df['gender'] != 'Other']
stroke_df['gender'].value_counts()

Female    2897
Male      2011
Name: gender, dtype: int64

In [6]:
# Integer-encode the categorical columns so the frame is purely numeric.
#   gender:         Male -> 0, Female -> 1
#   ever_married:   No -> 0, Yes -> 1
#   Residence_type: Urban -> 0, Rural -> 1
#   smoking_status: smokes -> 0, formerly smoked -> 1, never smoked -> 2, Unknown -> 3
#   work_type:      Private -> 0, Self-employed -> 1, children -> 2,
#                   Govt_job -> 3, Never_worked -> 4
CATEGORY_CODES = {
    'gender': {'Male': 0, 'Female': 1},
    'ever_married': {'Yes': 1, 'No': 0},
    'Residence_type': {'Urban': 0, 'Rural': 1},
    'smoking_status': {'smokes': 0, 'formerly smoked': 1,
                       'never smoked': 2, 'Unknown': 3},
    'work_type': {'Private': 0, 'Self-employed': 1,
                  'children': 2, 'Govt_job': 3, 'Never_worked': 4},
}

# The previous form, stroke_df[col].replace(..., inplace=True), is a chained
# in-place call on a filtered frame: it raises SettingWithCopyWarning and has
# no effect at all when pandas Copy-on-Write is enabled. Assigning the mapped
# column back onto an explicit copy always updates the frame.
encoded_df = stroke_df.copy()
for column, codes in CATEGORY_CODES.items():
    encoded_df[column] = (
        encoded_df[column]
        # Values without a code pass through unchanged, so re-running this
        # cell on already-encoded integers is a no-op.
        .map(lambda value: codes.get(value, value))
        # Fails loudly (ValueError) if an unexpected category is still present.
        .astype('int64')
    )
stroke_df = encoded_df

stroke_df.head(50)

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,0,67.0,0,1,1,0,0,228.69,36.6,1,1
2,0,80.0,0,1,1,0,1,105.92,32.5,2,1
3,1,49.0,0,0,1,0,0,171.23,34.4,0,1
4,1,79.0,1,0,1,1,1,174.12,24.0,2,1
5,0,81.0,0,0,1,0,0,186.21,29.0,1,1
6,0,74.0,1,1,1,0,1,70.09,27.4,2,1
7,1,69.0,0,0,0,0,0,94.39,22.8,2,1
9,1,78.0,0,0,1,0,0,58.57,24.2,3,1
10,1,81.0,1,0,1,0,1,80.43,29.7,2,1
11,1,61.0,0,1,1,3,1,120.46,36.8,0,1


In [7]:
# Preview the first five rows of the encoded frame.
stroke_df.head(5)

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,0,67.0,0,1,1,0,0,228.69,36.6,1,1
2,0,80.0,0,1,1,0,1,105.92,32.5,2,1
3,1,49.0,0,0,1,0,0,171.23,34.4,0,1
4,1,79.0,1,0,1,1,1,174.12,24.0,2,1
5,0,81.0,0,0,1,0,0,186.21,29.0,1,1


In [8]:
# One-hot encode any remaining categorical columns and give the resulting
# indicator columns shorter names.
# NOTE(review): the output below has the same columns as stroke_df, i.e. no
# indicator columns were created and none of the rename keys matched. The
# modelling cells further down read stroke_df, not dummies_stroke_df.
dummies_stroke_df = pd.get_dummies(stroke_df).rename(
    columns={
        # gender
        "gender_Female": "female",
        "gender_Male": "male",
        # marital status
        "ever_married_No": "unmarried",
        "ever_married_Yes": "married",
        # residence
        "Residence_type_Urban": "Urban",
        "Residence_type_Rural": "Rural",
        # smoking status
        "smoking_status_smokes": "smokes",
        "smoking_status_never smoked": "never_smoked",
        "smoking_status_formerly smoked": "formerly smoked",
        "smoking_status_Unknown": "unknown_smoker",
        # work type
        "work_type_Private": "private_worker",
        "work_type_Self-employed": "self_employed",
        "work_type_children": "child_non_worker",
        "work_type_Govt_job": "govt_job",
        "work_type_Never_worked": "never_worked",
    }
)
dummies_stroke_df.head(130)

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,0,67.0,0,1,1,0,0,228.69,36.6,1,1
2,0,80.0,0,1,1,0,1,105.92,32.5,2,1
3,1,49.0,0,0,1,0,0,171.23,34.4,0,1
4,1,79.0,1,0,1,1,1,174.12,24.0,2,1
5,0,81.0,0,0,1,0,0,186.21,29.0,1,1
...,...,...,...,...,...,...,...,...,...,...,...
149,1,70.0,0,1,1,0,1,239.07,26.1,2,1
151,0,68.0,0,1,1,0,0,223.83,31.9,1,1
152,1,80.0,0,0,1,1,0,76.57,34.1,2,1
153,0,68.0,0,0,1,1,0,77.82,27.5,0,1


In [9]:
# Convert the frame to a plain array: every column except the last is a
# feature, and the last column ('stroke') is the target.
data = stroke_df.values
X, y = data[:, :-1], data[:, -1]

In [10]:
# Inspect the feature vector of the first row.
X[0, :]

array([  0.  ,  67.  ,   0.  ,   1.  ,   1.  ,   0.  ,   0.  , 228.69,
        36.6 ,   1.  ])

In [11]:
# Hand-built single-sample input (shape 1 x 10) in the same column order as X.
# NOTE(review): bmi is 0 here, which is not a realistic value; the sample
# used with the reloaded model further down uses 23.5.
individual_test = np.array([[
    0.0,     # gender (0 = Male)
    90.0,    # age
    0.0,     # hypertension
    0.0,     # heart_disease
    0.0,     # ever_married (0 = No)
    1.0,     # work_type (1 = Self-employed)
    0.0,     # Residence_type (0 = Urban)
    380.12,  # avg_glucose_level
    0.0,     # bmi
    1.0,     # smoking_status (1 = formerly smoked)
]])

In [125]:
# Hold out a test set using the splitter's default proportions; the fixed
# seed makes the split itself repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
)

In [126]:
# Train a 200-tree random forest and report its accuracy on the held-out set.
# - random_state pins the forest's randomness so the score, importances and
#   predicted probabilities are the same on every run (the outputs recorded
#   in this notebook were produced without a seed and may differ slightly).
# - fit() receives the array itself; the earlier `X_train.data` passed the
#   ndarray's raw buffer object, which only worked through implicit conversion.
rf = RandomForestClassifier(n_estimators=200, random_state=42)
rf = rf.fit(X_train, y_train)
rf.score(X_test, y_test)

0.9494702526487367

In [127]:
# Impurity-based importance of each input column, as learned by the fitted
# forest (one value per feature column of X).
importances = rf.feature_importances_
importances

array([0.03237867, 0.22580486, 0.0246157 , 0.02220219, 0.01561057,
       0.0497975 , 0.03385858, 0.29616164, 0.23490092, 0.06466936])

In [128]:
# Column labels of the frame as a plain Python list (features first, target last).
column_names = list(stroke_df.columns)


In [129]:
# Pair each importance with its feature name and list them from most to
# least important; the final column name is the target, so it is left out.
sorted(
    zip(rf.feature_importances_, column_names[:-1]),
    reverse=True,
)

[(0.2961616446432603, 'avg_glucose_level'),
 (0.23490091647507844, 'bmi'),
 (0.22580486343746284, 'age'),
 (0.06466935513144235, 'smoking_status'),
 (0.04979750066703505, 'work_type'),
 (0.03385857778460384, 'Residence_type'),
 (0.032378673298254945, 'gender'),
 (0.02461570311004764, 'hypertension'),
 (0.022202190927100584, 'heart_disease'),
 (0.015610574525713922, 'ever_married')]

In [139]:
# Predicted class probabilities for the hand-built sample: one row per
# sample, one column per class (column order follows rf.classes_).
rf.predict_proba(individual_test)

array([[0.86, 0.14]])

In [145]:
# Persist the fitted forest to disk. The context manager guarantees the file
# is flushed and closed even if pickling fails; the bare open() used before
# left the handle to be closed whenever the interpreter got around to it.
with open('stroke_machine_learning.sav', 'wb') as model_file:
    pickle.dump(rf, model_file)

In [12]:
# Reload the persisted forest and score a hand-built sample with it.
# NOTE: pickle.load can execute arbitrary code embedded in the file; only
# load a .sav file that this notebook (or another trusted source) wrote.
# The context manager closes the file handle as soon as loading finishes.
with open('stroke_machine_learning.sav', 'rb') as model_file:
    loaded_model = pickle.load(model_file)

# Single sample in the same column order the model was trained on:
# gender, age, hypertension, heart_disease, ever_married, work_type,
# Residence_type, avg_glucose_level, bmi, smoking_status.
individual_test = np.array([[  0.  ,  60.  ,   0.  ,   0.  ,   1.  ,   1.  ,   0.  , 380.12,
        23.5 ,   1.  ]])
prediction = loaded_model.predict_proba(individual_test)

In [13]:
# Probability assigned to the first class for the single sample.
prediction[0, 0]

0.9