# Victor Jong Soon Peng

## Import Libraries

In [1]:
import os # access directory
import numpy as np # computing with array objects
import pandas as pd # data structures and data analysis 

from sklearn.model_selection import train_test_split, GridSearchCV

from sklearn.preprocessing import StandardScaler # Feature Scaling
from sklearn.preprocessing import OneHotEncoder # e.g. {1,0,0,0}, {0,1,0,0}
from sklearn.preprocessing import LabelEncoder # e.g. {"no" : 0, "yes" : 1}

from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.metrics import accuracy_score

from sklearn.metrics import classification_report, confusion_matrix 

## Import data

In [2]:
# location of the dataset
data = pd.read_csv("..\healthcare-dataset-stroke-data.csv") 

In [3]:
# look at the first 5 lines of data
data.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


## Data Preprocessing

### Handling Missing Values

In [4]:
# check how many values are missing (NaN)
count_nan = data.isnull().sum() # the number of missing values for every column
print(count_nan[count_nan > 0])

bmi    201
dtype: int64


In [5]:
# filling the missing values with mean value
data['bmi'].fillna(data['bmi'].mean(), inplace = True)
data.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,28.893237,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [6]:
# check how many values are missing (NaN)
count_nan = data.isnull().sum() # the number of missing values for every column
print(count_nan[count_nan > 0])

Series([], dtype: int64)


### Convert Categorical Data into Numbers

In [7]:
# checking for unique values in each columns
print("gender: ", data['gender'].unique())
print("ever_married: ", data['ever_married'].unique())
print("work_type: ", data['work_type'].unique())
print("Residence_type: ", data['Residence_type'].unique())
print("smoking_status: ", data['smoking_status'].unique())

gender:  ['Male' 'Female' 'Other']
ever_married:  ['Yes' 'No']
work_type:  ['Private' 'Self-employed' 'Govt_job' 'children' 'Never_worked']
Residence_type:  ['Urban' 'Rural']
smoking_status:  ['formerly smoked' 'never smoked' 'smokes' 'Unknown']


#### pandas get_dummies: maps each category to 0 (cold) or 1 (hot) = one hot encoder

In [8]:
# Gender
gender = data["gender"] # series 
gender_encoded = pd.get_dummies(gender, prefix='')

print(gender[:10]) # original version 
print(gender_encoded[:10]) # encoded numbers for categories 

0      Male
1    Female
2      Male
3    Female
4    Female
5      Male
6      Male
7    Female
8    Female
9    Female
Name: gender, dtype: object
   _Female  _Male  _Other
0        0      1       0
1        1      0       0
2        0      1       0
3        1      0       0
4        1      0       0
5        0      1       0
6        0      1       0
7        1      0       0
8        1      0       0
9        1      0       0


In [9]:
# Work type
workType = data["work_type"]
workType_encoded = pd.get_dummies(workType, prefix='')

print(workType[:10]) 
print(workType_encoded[:10])

0          Private
1    Self-employed
2          Private
3          Private
4    Self-employed
5          Private
6          Private
7          Private
8          Private
9          Private
Name: work_type, dtype: object
   _Govt_job  _Never_worked  _Private  _Self-employed  _children
0          0              0         1               0          0
1          0              0         0               1          0
2          0              0         1               0          0
3          0              0         1               0          0
4          0              0         0               1          0
5          0              0         1               0          0
6          0              0         1               0          0
7          0              0         1               0          0
8          0              0         1               0          0
9          0              0         1               0          0


In [10]:
# Smoking status
smokingStatus = data["smoking_status"]
smokingStatus_encoded = pd.get_dummies(smokingStatus, prefix='')

print(smokingStatus[:10])
print(smokingStatus_encoded[:10]) 

0    formerly smoked
1       never smoked
2       never smoked
3             smokes
4       never smoked
5    formerly smoked
6       never smoked
7       never smoked
8            Unknown
9            Unknown
Name: smoking_status, dtype: object
   _Unknown  _formerly smoked  _never smoked  _smokes
0         0                 1              0        0
1         0                 0              1        0
2         0                 0              1        0
3         0                 0              0        1
4         0                 0              1        0
5         0                 1              0        0
6         0                 0              1        0
7         0                 0              1        0
8         1                 0              0        0
9         1                 0              0        0


In [11]:
# Ever Married
everMarried = data["ever_married"]
everMarried_encoded = pd.get_dummies(everMarried, prefix='')

print(everMarried[:10])
print(everMarried_encoded[:10]) 

0    Yes
1    Yes
2    Yes
3    Yes
4    Yes
5    Yes
6    Yes
7     No
8    Yes
9    Yes
Name: ever_married, dtype: object
   _No  _Yes
0    0     1
1    0     1
2    0     1
3    0     1
4    0     1
5    0     1
6    0     1
7    1     0
8    0     1
9    0     1


In [12]:
# Residence type
residenceType = data["Residence_type"]
residenceType_encoded = pd.get_dummies(residenceType, prefix='')

print(residenceType[:10])
print(residenceType_encoded[:10]) 

0    Urban
1    Rural
2    Rural
3    Urban
4    Rural
5    Urban
6    Rural
7    Urban
8    Rural
9    Urban
Name: Residence_type, dtype: object
   _Rural  _Urban
0       0       1
1       1       0
2       1       0
3       0       1
4       1       0
5       0       1
6       1       0
7       0       1
8       1       0
9       0       1


### Dividing the Data into Test and Train

In [13]:
# Take the numerical data from the original data
X_num = data[['age', 'hypertension', 'heart_disease','avg_glucose_level','bmi']].copy()

In [14]:
X_num.head()

Unnamed: 0,age,hypertension,heart_disease,avg_glucose_level,bmi
0,67.0,0,1,228.69,36.6
1,61.0,0,0,202.21,28.893237
2,80.0,0,1,105.92,32.5
3,49.0,0,0,171.23,34.4
4,79.0,1,0,174.12,24.0


In [15]:
# Take the encoded data and add to numerical data
X_final = pd.concat([X_num, gender_encoded, workType_encoded, smokingStatus_encoded, 
                     everMarried_encoded, residenceType_encoded], axis = 1)

In [16]:
X_final.head()

Unnamed: 0,age,hypertension,heart_disease,avg_glucose_level,bmi,_Female,_Male,_Other,_Govt_job,_Never_worked,...,_Self-employed,_children,_Unknown,_formerly smoked,_never smoked,_smokes,_No,_Yes,_Rural,_Urban
0,67.0,0,1,228.69,36.6,0,1,0,0,0,...,0,0,0,1,0,0,0,1,0,1
1,61.0,0,0,202.21,28.893237,1,0,0,0,0,...,1,0,0,0,1,0,0,1,1,0
2,80.0,0,1,105.92,32.5,0,1,0,0,0,...,0,0,0,0,1,0,0,1,1,0
3,49.0,0,0,171.23,34.4,1,0,0,0,0,...,0,0,0,0,0,1,0,1,0,1
4,79.0,1,0,174.12,24.0,1,0,0,0,0,...,1,0,0,0,1,0,0,1,1,0


In [17]:
# define y as being the "stroke column" from the original dataset
y_final = data[['stroke']].copy()

In [18]:
y_final.head()

Unnamed: 0,stroke
0,1
1,1
2,1
3,1
4,1


In [19]:
#Test train split
X_train, X_test, y_train, y_test = train_test_split(X_final, y_final, test_size = 0.33, random_state = 0 )

### Feature Scaling

In [20]:
s_scaler = StandardScaler()

In [21]:
X_train = s_scaler.fit_transform(X_train.astype(np.float))

In [22]:
X_test= s_scaler.transform(X_test.astype(np.float))

## Model Building

### #1 - Support Vector Classifier

In [23]:
svc = SVC()

#fit model
svc = svc.fit(X_train,y_train.values.ravel())

#print score
print('svc train score %.3f, svc test score: %.3f' % (
svc.score(X_train,y_train),
svc.score(X_test, y_test)))

svc train score 0.950, svc test score: 0.954


### #2 - Random Forest Classifier

In [24]:
rfc = RandomForestClassifier()

#fit model
rfc = rfc.fit(X_train,y_train.values.ravel())

#print score
print('rfc train score %.3f, rfc test score: %.3f' % (
rfc.score(X_train,y_train),
rfc.score(X_test, y_test)))

rfc train score 1.000, rfc test score: 0.953


## Model Tuning

### Support Vector Classifier Hyperparameter Tuning

In [25]:
# defining parameter range 
param_grid = {'C': [0.1, 1, 10, 100, 1000],  
              'gamma': [1, 0.1, 0.01, 0.001, 0.0001], 
              'kernel': ['rbf']}  

gs = GridSearchCV(SVC(), param_grid, refit = True, verbose = 3) 

# fitting the model for grid search 
gs.fit(X_train,y_train.values.ravel()) 

# print best parameter after tuning 
print(gs.best_params_) 

Fitting 5 folds for each of 25 candidates, totalling 125 fits
[CV] C=0.1, gamma=1, kernel=rbf ......................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] .......... C=0.1, gamma=1, kernel=rbf, score=0.950, total=   0.2s
[CV] C=0.1, gamma=1, kernel=rbf ......................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s remaining:    0.0s


[CV] .......... C=0.1, gamma=1, kernel=rbf, score=0.949, total=   0.2s
[CV] C=0.1, gamma=1, kernel=rbf ......................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.3s remaining:    0.0s


[CV] .......... C=0.1, gamma=1, kernel=rbf, score=0.949, total=   0.2s
[CV] C=0.1, gamma=1, kernel=rbf ......................................
[CV] .......... C=0.1, gamma=1, kernel=rbf, score=0.950, total=   0.2s
[CV] C=0.1, gamma=1, kernel=rbf ......................................
[CV] .......... C=0.1, gamma=1, kernel=rbf, score=0.950, total=   0.2s
[CV] C=0.1, gamma=0.1, kernel=rbf ....................................
[CV] ........ C=0.1, gamma=0.1, kernel=rbf, score=0.950, total=   0.1s
[CV] C=0.1, gamma=0.1, kernel=rbf ....................................
[CV] ........ C=0.1, gamma=0.1, kernel=rbf, score=0.949, total=   0.1s
[CV] C=0.1, gamma=0.1, kernel=rbf ....................................
[CV] ........ C=0.1, gamma=0.1, kernel=rbf, score=0.949, total=   0.1s
[CV] C=0.1, gamma=0.1, kernel=rbf ....................................
[CV] ........ C=0.1, gamma=0.1, kernel=rbf, score=0.950, total=   0.1s
[CV] C=0.1, gamma=0.1, kernel=rbf ....................................
[CV] .

[CV] ........ C=10, gamma=0.01, kernel=rbf, score=0.949, total=   0.1s
[CV] C=10, gamma=0.01, kernel=rbf ....................................
[CV] ........ C=10, gamma=0.01, kernel=rbf, score=0.949, total=   0.1s
[CV] C=10, gamma=0.01, kernel=rbf ....................................
[CV] ........ C=10, gamma=0.01, kernel=rbf, score=0.950, total=   0.1s
[CV] C=10, gamma=0.01, kernel=rbf ....................................
[CV] ........ C=10, gamma=0.01, kernel=rbf, score=0.950, total=   0.1s
[CV] C=10, gamma=0.001, kernel=rbf ...................................
[CV] ....... C=10, gamma=0.001, kernel=rbf, score=0.950, total=   0.1s
[CV] C=10, gamma=0.001, kernel=rbf ...................................
[CV] ....... C=10, gamma=0.001, kernel=rbf, score=0.949, total=   0.1s
[CV] C=10, gamma=0.001, kernel=rbf ...................................
[CV] ....... C=10, gamma=0.001, kernel=rbf, score=0.949, total=   0.1s
[CV] C=10, gamma=0.001, kernel=rbf ...................................
[CV] .

[CV] ..... C=1000, gamma=0.001, kernel=rbf, score=0.950, total=   1.1s
[CV] C=1000, gamma=0.0001, kernel=rbf ................................
[CV] .... C=1000, gamma=0.0001, kernel=rbf, score=0.950, total=   0.5s
[CV] C=1000, gamma=0.0001, kernel=rbf ................................
[CV] .... C=1000, gamma=0.0001, kernel=rbf, score=0.949, total=   0.5s
[CV] C=1000, gamma=0.0001, kernel=rbf ................................
[CV] .... C=1000, gamma=0.0001, kernel=rbf, score=0.949, total=   0.6s
[CV] C=1000, gamma=0.0001, kernel=rbf ................................
[CV] .... C=1000, gamma=0.0001, kernel=rbf, score=0.950, total=   0.5s
[CV] C=1000, gamma=0.0001, kernel=rbf ................................


[Parallel(n_jobs=1)]: Done 125 out of 125 | elapsed:   24.3s finished


[CV] .... C=1000, gamma=0.0001, kernel=rbf, score=0.950, total=   0.5s
{'C': 0.1, 'gamma': 1, 'kernel': 'rbf'}


In [26]:
svc = SVC(kernel= 'rbf', C=0.1, gamma=1)

#fit model
svc = svc.fit(X_train,y_train.values.ravel())

#print score
print('svc train score %.3f, svc test score: %.3f' % (
svc.score(X_train,y_train),
svc.score(X_test, y_test)))

svc train score 0.950, svc test score: 0.954


### Random Forest Classifier Hyperparameter Tuning

In [27]:
# Best parametars 
param_grid = { "criterion" : ["gini", "entropy"], 
              "n_estimators": [20, 30, 40, 50, 100, 150],
              "max_depth": [2, 4, 10, 12, 14, 16], 
              "min_samples_leaf" : [2, 4, 6, 8, 10],
                "max_features": ['auto', 'sqrt', 'log2']}

gs = GridSearchCV(RandomForestClassifier(), param_grid=param_grid, scoring='accuracy', cv=5, n_jobs=-1)

gs = gs.fit(X_train,y_train.values.ravel())

print(gs.best_score_)
print(gs.best_params_)

0.9500439663635977
{'criterion': 'entropy', 'max_depth': 14, 'max_features': 'auto', 'min_samples_leaf': 2, 'n_estimators': 30}


In [30]:
rfc = RandomForestClassifier(criterion='entropy', 
                             n_estimators=30,
                             max_depth=14,
                             min_samples_leaf=2,
                             max_features='auto',                           
                            )

#fit model
rfc = rfc.fit(X_train,y_train.values.ravel())

#print score
print('rfc train score %.3f, rfc test score: %.3f' % (
rfc.score(X_train,y_train),
rfc.score(X_test, y_test)))

rfc train score 0.961, rfc test score: 0.953
