In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

from sklearn import preprocessing
from sklearn.pipeline import Pipeline
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler,LabelEncoder
from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.linear_model import LinearRegression,LogisticRegression
from sklearn.tree import DecisionTreeRegressor,DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [2]:
df = pd.read_csv('https://github.com/tzekiattok/strokedata/blob/main/healthcare-dataset-stroke-data.csv?raw=true')

In [3]:
df.info()
#let stroke the the label, (0,1)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5110 entries, 0 to 5109
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 5110 non-null   int64  
 1   gender             5110 non-null   object 
 2   age                5110 non-null   float64
 3   hypertension       5110 non-null   int64  
 4   heart_disease      5110 non-null   int64  
 5   ever_married       5110 non-null   object 
 6   work_type          5110 non-null   object 
 7   Residence_type     5110 non-null   object 
 8   avg_glucose_level  5110 non-null   float64
 9   bmi                4909 non-null   float64
 10  smoking_status     5110 non-null   object 
 11  stroke             5110 non-null   int64  
dtypes: float64(3), int64(4), object(5)
memory usage: 479.2+ KB


In [4]:
df.drop('id', axis=1, inplace=True)
df.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [5]:
DT_bmi_pipe = Pipeline( steps=[ 
                               ('scale',StandardScaler()),
                               ('lr',DecisionTreeRegressor(random_state=0))
                              ])
X = df[['age','gender','bmi']].copy()
X.gender = X.gender.replace({'Male':0,'Female':1,'Other':-1}).astype(np.uint8)

Missing = X[X.bmi.isna()]
X = X[~X.bmi.isna()]
Y = X.pop('bmi')
DT_bmi_pipe.fit(X,Y)
predicted_bmi = pd.Series(DT_bmi_pipe.predict(Missing[['age','gender']]),index=Missing.index)
df.loc[Missing.index,'bmi'] = predicted_bmi

In [6]:
if df['ever_married'].dtype == type(object):
    le = preprocessing.LabelEncoder()
    df['ever_married'] = le.fit_transform(df['ever_married'])
    
if df['Residence_type'].dtype == type(object):
    le = preprocessing.LabelEncoder()
    df['Residence_type'] = le.fit_transform(df['Residence_type'])
        
df.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,67.0,0,1,1,Private,1,228.69,36.6,formerly smoked,1
1,Female,61.0,0,0,1,Self-employed,0,202.21,29.879487,never smoked,1
2,Male,80.0,0,1,1,Private,0,105.92,32.5,never smoked,1
3,Female,49.0,0,0,1,Private,1,171.23,34.4,smokes,1
4,Female,79.0,1,0,1,Self-employed,0,174.12,24.0,never smoked,1


In [7]:
#Encode values
df_encoded = df
df_encoded= pd.concat([df_encoded, pd.get_dummies(df_encoded['gender'], prefix = 'gender=')], axis = 1).drop(['gender'], axis = 1)
df_encoded= pd.concat([df_encoded, pd.get_dummies(df_encoded['work_type'], prefix = 'work_type=')], axis = 1).drop(['work_type'], axis = 1)
df_encoded= pd.concat([df_encoded, pd.get_dummies(df_encoded['smoking_status'], prefix = 'smoking_status=')], axis = 1).drop(['smoking_status'], axis = 1)
df1 = df_encoded
df_encoded.head()

Unnamed: 0,age,hypertension,heart_disease,ever_married,Residence_type,avg_glucose_level,bmi,stroke,gender=_Female,gender=_Male,gender=_Other,work_type=_Govt_job,work_type=_Never_worked,work_type=_Private,work_type=_Self-employed,work_type=_children,smoking_status=_Unknown,smoking_status=_formerly smoked,smoking_status=_never smoked,smoking_status=_smokes
0,67.0,0,1,1,1,228.69,36.6,1,0,1,0,0,0,1,0,0,0,1,0,0
1,61.0,0,0,1,0,202.21,29.879487,1,1,0,0,0,0,0,1,0,0,0,1,0
2,80.0,0,1,1,0,105.92,32.5,1,0,1,0,0,0,1,0,0,0,0,1,0
3,49.0,0,0,1,1,171.23,34.4,1,1,0,0,0,0,1,0,0,0,0,0,1
4,79.0,1,0,1,0,174.12,24.0,1,1,0,0,0,0,0,1,0,0,0,1,0


In [8]:
df1['stroke'].value_counts()

0    4861
1     249
Name: stroke, dtype: int64

# Method 1: imblearn Random Undersampling

In [9]:
X = df1[['age', 'hypertension', 'heart_disease', 'avg_glucose_level', 'bmi', 'ever_married', 'work_type=_Govt_job','work_type=_Never_worked','work_type=_Private','work_type=_Self-employed','work_type=_children','Residence_type','smoking_status=_Unknown','smoking_status=_formerly smoked','smoking_status=_never smoked','smoking_status=_smokes','gender=_Female','gender=_Male', 'gender=_Other']]
y = df1['stroke']

In [21]:
# imblearn undersampling

# import library
from imblearn.under_sampling import RandomUnderSampler
from collections import Counter

rus = RandomUnderSampler(random_state=424, replacement=True)# fit predictor and target variable
X_rus, y_rus = rus.fit_resample(X, y)

print('original dataset shape:', Counter(y))
print('Resample dataset shape', Counter(y_rus))

original dataset shape: Counter({0: 4861, 1: 249})
Resample dataset shape Counter({0: 249, 1: 249})


In [22]:
# split into training and test set
X_train_rus, X_test_rus, y_train_rus, y_test_rus = train_test_split(X_rus, y_rus, test_size=0.4, random_state=424)

from sklearn.ensemble import RandomForestClassifier

rforest = RandomForestClassifier(n_estimators=100, random_state=424)

rforest.fit(X_train_rus,y_train_rus)
y_pred_rf = rforest.predict(X_test_rus)

from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import accuracy_score

conf_matrix_rf = confusion_matrix(y_pred_rf, y_test_rus)
accuracy = accuracy_score(y_pred_rf, y_test_rus)
precision = metrics.precision_score(y_pred_rf, y_test_rus) 
recall = metrics.recall_score(y_pred_rf, y_test_rus) 
f1_score = metrics.f1_score(y_pred_rf, y_test_rus) 
print(conf_matrix_rf, '\n Accuracy:',
      accuracy,'\n Precision:',  precision,'\n Recall:',  recall,'\n f1_score:',  f1_score)


[[70 19]
 [31 80]] 
 Accuracy: 0.75 
 Precision: 0.8080808080808081 
 Recall: 0.7207207207207207 
 f1_score: 0.7619047619047618


# Method 2: imblearn Random Oversampling

In [23]:
# imblearn oversampling

# import library
from imblearn.over_sampling import RandomOverSampler

ros = RandomOverSampler(random_state=424)
X_ros, y_ros = ros.fit_resample(X, y)

print('Original dataset shape', Counter(y))
print('Resample dataset shape', Counter(y_ros))

Original dataset shape Counter({0: 4861, 1: 249})
Resample dataset shape Counter({1: 4861, 0: 4861})


In [24]:
# split into training and test set
X_train_ros, X_test_ros, y_train_ros, y_test_ros = train_test_split(X_ros, y_ros, test_size=0.4, random_state=424)

from sklearn.ensemble import RandomForestClassifier

rforest = RandomForestClassifier(n_estimators=100, random_state=424)

rforest.fit(X_train_ros,y_train_ros)
y_pred_rf = rforest.predict(X_test_ros)

from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import accuracy_score

conf_matrix_rf = confusion_matrix(y_pred_rf, y_test_ros)
accuracy = accuracy_score(y_pred_rf, y_test_ros)
precision = metrics.precision_score(y_pred_rf, y_test_ros) 
recall = metrics.recall_score(y_pred_rf, y_test_ros) 
f1_score = metrics.f1_score(y_pred_rf, y_test_ros) 
print(conf_matrix_rf, '\n Accuracy:',
      accuracy,'\n Precision:',  precision,'\n Recall:',  recall,'\n f1_score:',  f1_score)


[[1898    0]
 [  45 1946]] 
 Accuracy: 0.9884289020313706 
 Precision: 1.0 
 Recall: 0.9773982923154194 
 f1_score: 0.9885699771399543


# Method 3: imblearn undersampling: tomeklinks

In [28]:
# imblearn undersampling: tomeklinks

# import library
from imblearn.under_sampling import TomekLinks

tl = TomekLinks()

# fit predictor and target variable
X_tl, y_tl = tl.fit_resample(X, y)

print('Original dataset shape', Counter(y))
print('Resample dataset shape', Counter(y_tl))

Original dataset shape Counter({0: 4861, 1: 249})
Resample dataset shape Counter({0: 4750, 1: 249})


In [30]:
# split into training and test set
X_train_tl, X_test_tl, y_train_tl, y_test_tl = train_test_split(X_tl, y_tl, test_size=0.4, random_state=424)

from sklearn.ensemble import RandomForestClassifier

rforest = RandomForestClassifier(n_estimators=100, random_state=424)

rforest.fit(X_train_tl,y_train_tl)
y_pred_rf = rforest.predict(X_test_tl)

from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import accuracy_score

conf_matrix_rf = confusion_matrix(y_pred_rf, y_test_tl)
accuracy = accuracy_score(y_pred_rf, y_test_tl)
precision = metrics.precision_score(y_pred_rf, y_test_tl) 
recall = metrics.recall_score(y_pred_rf, y_test_tl) 
f1_score = metrics.f1_score(y_pred_rf, y_test_tl) 
print(conf_matrix_rf, '\n Accuracy:',
      accuracy,'\n Precision:',  precision,'\n Recall:',  recall,'\n f1_score:',  f1_score)


[[1895  100]
 [   5    0]] 
 Accuracy: 0.9475 
 Precision: 0.0 
 Recall: 0.0 
 f1_score: 0.0


# Method 4: imblearn over-sampling - SMOTE

In [33]:
# imblearn over-sampling - SMOTE 

# import library
from imblearn.over_sampling import SMOTE

smote = SMOTE(random_state=424)

# fit predictor and target variable
X_smote, y_smote = smote.fit_resample(X, y)

print('Original dataset shape', Counter(y))
print('Resample dataset shape', Counter(y_ros))

Original dataset shape Counter({0: 4861, 1: 249})
Resample dataset shape Counter({1: 4861, 0: 4861})


In [34]:
# split into training and test set
X_train_smote, X_test_smote, y_train_smote, y_test_smote = train_test_split(X_smote, y_smote, test_size=0.4, random_state=424)

from sklearn.ensemble import RandomForestClassifier

rforest = RandomForestClassifier(n_estimators=100, random_state=424)

rforest.fit(X_train_smote,y_train_smote)
y_pred_rf = rforest.predict(X_test_smote)

from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import accuracy_score

conf_matrix_rf = confusion_matrix(y_pred_rf, y_test_smote)
accuracy = accuracy_score(y_pred_rf, y_test_smote)
precision = metrics.precision_score(y_pred_rf, y_test_smote) 
recall = metrics.recall_score(y_pred_rf, y_test_smote) 
f1_score = metrics.f1_score(y_pred_rf, y_test_smote) 
print(conf_matrix_rf, '\n Accuracy:',
      accuracy,'\n Precision:',  precision,'\n Recall:',  recall,'\n f1_score:',  f1_score)


[[1911   81]
 [  32 1865]] 
 Accuracy: 0.9709436873232193 
 Precision: 0.9583761562178829 
 Recall: 0.9831312598840274 
 f1_score: 0.9705958886286754


# Method 5: imblearn under-sampling - NearMiss 

In [36]:
# imblearn under-sampling - NearMiss 

from imblearn.under_sampling import NearMiss

nm = NearMiss()

X_nm, y_nm = nm.fit_resample(X, y)

print('Original dataset shape:', Counter(y))
print('Resample dataset shape:', Counter(y_nm))

Original dataset shape: Counter({0: 4861, 1: 249})
Resample dataset shape: Counter({0: 249, 1: 249})


In [37]:
# split into training and test set
X_train_nm, X_test_nm, y_train_nm, y_test_nm = train_test_split(X_nm, y_nm, test_size=0.4, random_state=424)

from sklearn.ensemble import RandomForestClassifier

rforest = RandomForestClassifier(n_estimators=100, random_state=424)

rforest.fit(X_train_nm,y_train_nm)
y_pred_rf = rforest.predict(X_test_nm)

from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import accuracy_score

conf_matrix_rf = confusion_matrix(y_pred_rf, y_test_nm)
accuracy = accuracy_score(y_pred_rf, y_test_nm)
precision = metrics.precision_score(y_pred_rf, y_test_nm) 
recall = metrics.recall_score(y_pred_rf, y_test_nm) 
f1_score = metrics.f1_score(y_pred_rf, y_test_nm) 
print(conf_matrix_rf, '\n Accuracy:',
      accuracy,'\n Precision:',  precision,'\n Recall:',  recall,'\n f1_score:',  f1_score)


[[84 44]
 [17 55]] 
 Accuracy: 0.695 
 Precision: 0.5555555555555556 
 Recall: 0.7638888888888888 
 f1_score: 0.6432748538011696
