In [49]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
from tensorflow.keras.utils import to_categorical
import numpy as np
import pickle

In [50]:
# Read the raw stroke dataset from the CSV that sits next to the notebook and
# preview its first rows.
csv_path = "healthcare-dataset-stroke-data.csv"
stroke_df = pd.read_csv(csv_path)
stroke_df.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [51]:
# Remove the `id` column and discard every row that still has a missing value
# (the preview above shows rows with an empty `bmi`).
# Rebinding instead of mutating in place keeps this cell safe to re-run:
# `errors='ignore'` makes the drop a no-op once `id` is already gone, where the
# in-place version raised KeyError on a second execution.
stroke_df = stroke_df.drop(columns=['id'], errors='ignore').dropna()


In [52]:
# How many rows fall into each work_type category.
stroke_df['work_type'].value_counts()

Private          2811
Self-employed     775
children          671
Govt_job          630
Never_worked       22
Name: work_type, dtype: int64

In [53]:
# Keep only rows whose gender is not 'Other'.
# The explicit .copy() makes the filtered frame an independent object, so
# later column modifications cannot trigger SettingWithCopyWarning or silently
# act on a temporary.
stroke_df = stroke_df.loc[stroke_df['gender'] != 'Other'].copy()
stroke_df['gender'].value_counts()

Female    2897
Male      2011
Name: gender, dtype: int64

In [54]:
#replace male with 0, female with 1
stroke_df['gender'].replace({'Male': 0, 'Female':1}, inplace=True)

#replace married with 1, not married with 0
stroke_df['ever_married'].replace({'Yes': 1, 'No':0}, inplace=True)

#Replace Urban with 0, Rural with 1
stroke_df['Residence_type'].replace({'Urban': 0, 'Rural':1}, inplace=True)

#Replace Smoker : 0, Former Smoker : 1, Never smoked : 2, Unknown : 3
stroke_df['smoking_status'].replace({'smokes': 0, 'formerly smoked':1, 
                                    'never smoked': 2, 'Unknown':3}, inplace=True)

#Replace Work type. Private :0, Self-employed : 1, child : 2, Govt-job :3, Never worked : 4
stroke_df['work_type'].replace({'Private': 0, 'Self-employed':1, 
                                    'children': 2, 'Govt_job':3, 'Never_worked' : 4}, inplace=True)


stroke_df.head(20)

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,0,67.0,0,1,1,0,0,228.69,36.6,1,1
2,0,80.0,0,1,1,0,1,105.92,32.5,2,1
3,1,49.0,0,0,1,0,0,171.23,34.4,0,1
4,1,79.0,1,0,1,1,1,174.12,24.0,2,1
5,0,81.0,0,0,1,0,0,186.21,29.0,1,1
6,0,74.0,1,1,1,0,1,70.09,27.4,2,1
7,1,69.0,0,0,0,0,0,94.39,22.8,2,1
9,1,78.0,0,0,1,0,0,58.57,24.2,3,1
10,1,81.0,1,0,1,0,1,80.43,29.7,2,1
11,1,61.0,0,1,1,3,1,120.46,36.8,0,1


In [55]:
# Preview the first five rows of the encoded frame.
stroke_df.head(n=5)

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,0,67.0,0,1,1,0,0,228.69,36.6,1,1
2,0,80.0,0,1,1,0,1,105.92,32.5,2,1
3,1,49.0,0,0,1,0,0,171.23,34.4,0,1
4,1,79.0,1,0,1,1,1,174.12,24.0,2,1
5,0,81.0,0,0,1,0,0,186.21,29.0,1,1


In [56]:
# Convert the frame to a NumPy matrix, then separate it into the feature
# columns (everything except the last column) and the target (the last column).
data = stroke_df.to_numpy()

X, y = data[:, :-1], data[:, -1]

In [57]:
# Feature vector of the first row.
X[0, :]

array([  0.  ,  67.  ,   0.  ,   1.  ,   1.  ,   0.  ,   0.  , 228.69,
        36.6 ,   1.  ])

In [58]:
# A single hand-written sample, shaped (1, 10), used further down to probe the
# trained model. Values are listed in the same order as the feature columns of
# stroke_df (every column except the trailing `stroke`).
# NOTE(review): the bmi entry is 0, far below the values visible in the data
# preview — confirm this is the intended probe value.
individual_test = np.array([[
    0.0,     # gender
    90.0,    # age
    0.0,     # hypertension
    0.0,     # heart_disease
    0.0,     # ever_married
    1.0,     # work_type
    0.0,     # Residence_type
    380.12,  # avg_glucose_level
    0.0,     # bmi
    1.0,     # smoking_status
]])

In [59]:
# Hold out a test set with a fixed seed so the split is repeatable.
# test_size=0.25 is what train_test_split uses when no size is given; it is
# spelled out here only for readability.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

In [60]:
# Train a 200-tree random forest on the training split and report its mean
# accuracy on the held-out split.
# - random_state is fixed so the fitted model, its score and its feature
#   importances are reproducible across kernel restarts.
# - The training matrix is passed directly; the earlier `X_train.data` handed
#   scikit-learn the array's raw memoryview buffer instead of the array itself.
# NOTE(review): score() is plain accuracy. If the positive class is rare in this
# dataset, accuracy alone can look high — consider checking per-class metrics.
rf = RandomForestClassifier(n_estimators=200, random_state=42)
rf = rf.fit(X_train, y_train)
rf.score(X_test, y_test)

0.9494702526487367

In [61]:
# Per-feature importance scores of the fitted forest, one entry per column of X
# (same order as the feature columns).
importances = rf.feature_importances_
importances

array([0.03178978, 0.22414677, 0.02587363, 0.02223227, 0.01591293,
       0.0484985 , 0.03316731, 0.29092963, 0.24122718, 0.06622201])

In [62]:
# Plain Python list of every column label in the frame (target included).
column_names = stroke_df.columns.tolist()


In [63]:
# Pair each importance score with its feature name (the last column name is the
# target, so it is left out) and list the pairs from most to least important.
feature_names = column_names[:-1]
ranked_importances = list(zip(rf.feature_importances_, feature_names))
ranked_importances.sort(reverse=True)
ranked_importances

[(0.2909296261954029, 'avg_glucose_level'),
 (0.2412271778078351, 'bmi'),
 (0.22414676940963166, 'age'),
 (0.06622200867966423, 'smoking_status'),
 (0.04849850288065196, 'work_type'),
 (0.03316731141355066, 'Residence_type'),
 (0.03178978248991089, 'gender'),
 (0.02587362816563511, 'hypertension'),
 (0.022232267092293753, 'heart_disease'),
 (0.01591292586542387, 'ever_married')]

In [64]:
# Class-probability estimates for the single hand-built sample; the result has
# one row for that sample and one column per class.
rf.predict_proba(individual_test)

array([[0.775, 0.225]])

In [65]:
# Persist the fitted model to disk. The context manager guarantees the file is
# flushed and closed even if pickling fails; the previous bare open() left the
# handle to be closed whenever the garbage collector got to it.
with open('stroke_machine_learning.sav', 'wb') as model_file:
    pickle.dump(rf, model_file)

In [66]:
# Reload the saved model and check that it still produces probabilities for the
# hand-built sample. The file is opened in a context manager so the handle is
# closed right after loading.
# NOTE: unpickling executes arbitrary code from the file — only load model
# files that come from a trusted source (here, the file written by this notebook).
with open('stroke_machine_learning.sav', 'rb') as model_file:
    loaded_model = pickle.load(model_file)
prediction = loaded_model.predict_proba(individual_test)

In [67]:
# Probability in the second class column for the first (only) sample.
# Presumably the stroke-positive class — verify against loaded_model.classes_.
prediction[0, 1]

0.225