In [54]:
import numpy as np
import pandas as pd

# Import tools needed for visualization
from IPython.display import Image
from sklearn.tree import export_graphviz
import pydot
import pydotplus
from sklearn.tree import export_graphviz
import seaborn as sns
import matplotlib.pyplot as plt

# Models & Processing
from sklearn import preprocessing
from sklearn.pipeline import Pipeline
from collections import Counter
from sklearn.preprocessing import StandardScaler,LabelEncoder
from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.linear_model import LinearRegression,LogisticRegression
from sklearn.tree import DecisionTreeRegressor,DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import IsolationForest
from sklearn.decomposition import PCA
from sklearn.ensemble import StackingClassifier

# Evaluation Metrics
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score, ConfusionMatrixDisplay, precision_score, recall_score, f1_score, classification_report, roc_curve, plot_roc_curve, auc, precision_recall_curve, plot_precision_recall_curve, average_precision_score
from sklearn.model_selection import cross_val_score
from sklearn.metrics import balanced_accuracy_score
from xgboost import XGBClassifier
from sklearn.metrics import matthews_corrcoef
# Tuning of Model
from sklearn import model_selection
from sklearn.model_selection import GridSearchCV

# Imblearn
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import RandomOverSampler

In [55]:
df = pd.read_csv('https://github.com/tzekiattok/strokedata/blob/main/healthcare-dataset-stroke-data.csv?raw=true')

In [56]:
df.info()
#let stroke the the label, (0,1)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5110 entries, 0 to 5109
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 5110 non-null   int64  
 1   gender             5110 non-null   object 
 2   age                5110 non-null   float64
 3   hypertension       5110 non-null   int64  
 4   heart_disease      5110 non-null   int64  
 5   ever_married       5110 non-null   object 
 6   work_type          5110 non-null   object 
 7   Residence_type     5110 non-null   object 
 8   avg_glucose_level  5110 non-null   float64
 9   bmi                4909 non-null   float64
 10  smoking_status     5110 non-null   object 
 11  stroke             5110 non-null   int64  
dtypes: float64(3), int64(4), object(5)
memory usage: 479.2+ KB


In [57]:
df.drop('id', axis=1, inplace=True)
df.head()
df['stroke'].value_counts()

0    4861
1     249
Name: stroke, dtype: int64

In [58]:
df[df['stroke']].count()

KeyError: ignored

### 1. Impute missing values (BMI) with Decision Tree Regressor

In [None]:
DT_bmi_pipe = Pipeline( steps=[ 
                               ('scale',StandardScaler()),
                               ('lr',DecisionTreeRegressor(random_state=0))
                              ])
X = df[['age','gender','bmi']].copy()
X.gender = X.gender.replace({'Male':0,'Female':1,'Other':-1}).astype(np.uint8)

Missing = X[X.bmi.isna()]
X = X[~X.bmi.isna()]
Y = X.pop('bmi')
DT_bmi_pipe.fit(X,Y)
predicted_bmi = pd.Series(DT_bmi_pipe.predict(Missing[['age','gender']]),index=Missing.index)
df.loc[Missing.index,'bmi'] = predicted_bmi

In [None]:
residence_type = df['Residence_type'].unique()
ever_married = df['ever_married'].unique()
le = LabelEncoder()

le.fit(residence_type)
df['Residence_type'] = le.transform(df['Residence_type'])

le.fit(ever_married)
df['ever_married'] = le.transform(df['ever_married'])

### 2. OneHotEncode Categorical values

In [None]:
#Encode values
df_encoded = df
df_encoded= pd.concat([df_encoded, pd.get_dummies(df_encoded['work_type'], prefix = 'work_type=')], axis = 1).drop(['work_type'], axis = 1)
df_encoded= pd.concat([df_encoded, pd.get_dummies(df_encoded['smoking_status'], prefix = 'smoking_status=')], axis = 1).drop(['smoking_status'], axis = 1)
df_encoded= pd.concat([df_encoded, pd.get_dummies(df_encoded['gender'], prefix = 'gender=')], axis = 1).drop(['gender'], axis = 1)
df1 = df_encoded
df_encoded.head()

### 3. Define X & y

In [None]:
X = df1[['age', 'hypertension', 'heart_disease','ever_married','Residence_type', 'avg_glucose_level', 'bmi', 'work_type=_Govt_job','work_type=_Never_worked','work_type=_Private','work_type=_Self-employed','work_type=_children','Residence_type','smoking_status=_Unknown','smoking_status=_formerly smoked','smoking_status=_never smoked','smoking_status=_smokes','gender=_Female','gender=_Male','gender=_Other']]
y = df1['stroke']
Columns = ['age', 'hypertension', 'heart_disease','ever_married','Residence_type', 'avg_glucose_level', 'bmi', 'work_type=_Govt_job','work_type=_Never_worked','work_type=_Private','work_type=_Self-employed','work_type=_children','Residence_type','smoking_status=_Unknown','smoking_status=_formerly smoked','smoking_status=_never smoked','smoking_status=_smokes','gender=_Female','gender=_Male','gender=_Other']
X.info()

### 4. Train Test Split

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=424)

ss = StandardScaler()
X_train_scaled = ss.fit_transform(X_train)
X_test = ss.transform(X_test)

### 5. Outlier Removal

In [None]:
iso = IsolationForest(contamination=0.1)
yhat = iso.fit_predict(X_train_scaled)

In [None]:
# select all rows that are not outliers
mask = yhat != -1
X_train, y_train = X_train_scaled[mask], y_train[mask]

In [None]:
print(X_train.shape, y_train.shape)

### 6. Apply Oversampling

In [None]:
rforest = RandomForestClassifier(random_state=424)

ros = RandomOverSampler(random_state = 424)

# fit predictor and target variable
X_train_ros, y_train_ros =  ros.fit_resample(X_train, y_train.ravel())

In [None]:
print('Original dataset shape', Counter(y_train))
print('Resample dataset shape', Counter(y_train_ros))

### Random Forest with best params

In [None]:
# Random Forest with best hyper params
rforest = RandomForestClassifier(criterion='gini', max_depth=8, max_features=8, n_estimators=200, random_state=424)
rforest.fit(X_train_ros,y_train_ros)

y_pred = rforest.predict(X_test)
y_prob = rforest.predict_proba(X_test)[:,1]
cm = confusion_matrix(y_test, y_pred)

accuracy = accuracy_score(y_test, y_pred)
precision = metrics.precision_score(y_test, y_pred) 
recall = metrics.recall_score(y_test, y_pred) 
f1_score = metrics.f1_score(y_test, y_pred) 
print('Accuracy:',
      accuracy,'\n Precision:',  precision,'\n Recall:',  recall,'\n f1_score:',  f1_score, '\n')


print(classification_report(y_test, y_pred))
print(f'ROC AUC score: {roc_auc_score(y_test, y_prob)}')
print('Accuracy Score: ',accuracy_score(y_test, y_pred))
print('Balanced Accuracy Score: ',balanced_accuracy_score(y_test, y_pred))

In [None]:
# XGBoost with best params

In [None]:
params = {'colsample_bytree' : 0.7, 'learning_rate' : 0.1, 'max_depth' : 10, 'subsample': 0.8, 'eval_metric' : 'error', 'random_state': 424}
classifier = XGBClassifier(**params)

classifier.fit(X_train_ros, y_train_ros)
y_pred = classifier.predict(X_test)
y_prob = classifier.predict_proba(X_test)[:,1]
cm = confusion_matrix(y_test, y_pred)

accuracy = accuracy_score(y_test, y_pred)
precision = metrics.precision_score(y_test, y_pred) 
recall = metrics.recall_score(y_test, y_pred) 
f1_score = metrics.f1_score(y_test, y_pred) 
print('Accuracy:',
      accuracy,'\n Precision:',  precision,'\n Recall:',  recall,'\n f1_score:',  f1_score, '\n')


print(classification_report(y_test, y_pred))
print(f'ROC AUC score: {roc_auc_score(y_test, y_prob)}')
print('Accuracy Score: ',accuracy_score(y_test, y_pred))
print('Balanced Accuracy Score: ',balanced_accuracy_score(y_test, y_pred))

In [None]:
#Logistic Regression

In [None]:
lr = LogisticRegression(random_state =424,max_iter=5000, C = 0.5, solver = 'sag')
lr.fit(X_train_ros,y_train_ros)
y_pred_lr = lr.predict(X_test)
y_prob = lr.predict_proba(X_test)[:,1]
cm = confusion_matrix(y_test, y_pred_lr)

accuracy = accuracy_score(y_test, y_pred_lr)
precision = metrics.precision_score(y_test, y_pred_lr) 
recall = metrics.recall_score(y_test, y_pred_lr) 
f1_score = metrics.f1_score(y_test, y_pred_lr) 
print('Accuracy:',
      accuracy,'\n Precision:',  precision,'\n Recall:',  recall,'\n f1_score:',  f1_score, '\n')



print(classification_report(y_test, y_pred_lr))
print(f'ROC AUC score: {roc_auc_score(y_test, y_prob)}')
print('Accuracy Score: ',accuracy_score(y_test, y_pred_lr))
print('Balanced Accuracy Score: ',balanced_accuracy_score(y_test, y_pred_lr))


# Perform Stacking

In [None]:
# Random Forest + Logistic Regression

# Stacking to find the best combination of models

In [None]:
# random forest and logistic regression

In [None]:

estimators = [
    ('rforest',rforest),
    ('logr',lr)
    
]

#Build the stack model
stack_model = StackingClassifier(
estimators = estimators, final_estimator = LogisticRegression()
)

stack_model.fit(X_train_ros,y_train_ros)

#Predict
y_train_pred = stack_model.predict(X_train_ros)
y_test_pred = stack_model.predict(X_test)

# Training set model performance
stack_model_train_accuracy = accuracy_score(y_train_ros, y_train_pred) # Calculate Accuracy
stack_model_train_mcc = matthews_corrcoef(y_train_ros, y_train_pred) # Calculate MCC
stack_model_train_f1 = metrics.f1_score(y_train_ros, y_train_pred, average='weighted') # Calculate F1-score

# Test set model performance
stack_model_test_accuracy = accuracy_score(y_test, y_test_pred) # Calculate Accuracy
stack_model_test_mcc = matthews_corrcoef(y_test, y_test_pred) # Calculate MCC
stack_model_test_f1 = metrics.f1_score(y_test, y_test_pred, average='weighted') # Calculate F1-score

print('Model performance for Training set')
print('- Accuracy: %s' % stack_model_train_accuracy)
print('- MCC: %s' % stack_model_train_mcc)
print('- F1 score: %s' % stack_model_train_f1)
print('----------------------------------')
print('Model performance for Test set')
print('- Accuracy: %s' % stack_model_test_accuracy)
print('- MCC: %s' % stack_model_test_mcc)
print('- F1 score: %s' % stack_model_test_f1)

In [None]:
y_pred = stack_model.predict(X_test)
y_prob = stack_model.predict_proba(X_test)[:,1]
cm = confusion_matrix(y_test, y_pred)

accuracy = accuracy_score(y_test, y_pred)
precision = metrics.precision_score(y_test, y_pred) 
recall = metrics.recall_score(y_test, y_pred) 
f1_score = metrics.f1_score(y_test, y_pred) 
print('Accuracy:',
      accuracy,'\n Precision:',  precision,'\n Recall:',  recall,'\n f1_score:',  f1_score, '\n')



print(classification_report(y_test, y_pred))
print(f'ROC AUC score: {roc_auc_score(y_test, y_prob)}')
print('Accuracy Score: ',accuracy_score(y_test, y_pred))
print('Balanced Accuracy Score: ',balanced_accuracy_score(y_test, y_pred))

In [None]:
# XGBoost + Logistic Regression

In [None]:
estimators = [
    ('xgb',classifier),
    ('logr',lr)
]

#Build the stack model
stack_model = StackingClassifier(
estimators = estimators, final_estimator = LogisticRegression()
)

stack_model.fit(X_train_ros,y_train_ros)

#Predict
y_train_pred = stack_model.predict(X_train_ros)
y_test_pred = stack_model.predict(X_test)

# Training set model performance
stack_model_train_accuracy = accuracy_score(y_train_ros, y_train_pred) # Calculate Accuracy
stack_model_train_mcc = matthews_corrcoef(y_train_ros, y_train_pred) # Calculate MCC
stack_model_train_f1 = metrics.f1_score(y_train_ros, y_train_pred, average='weighted') # Calculate F1-score

# Test set model performance
stack_model_test_accuracy = accuracy_score(y_test, y_test_pred) # Calculate Accuracy
stack_model_test_mcc = matthews_corrcoef(y_test, y_test_pred) # Calculate MCC
stack_model_test_f1 = metrics.f1_score(y_test, y_test_pred, average='weighted') # Calculate F1-score

print('Model performance for Training set')
print('- Accuracy: %s' % stack_model_train_accuracy)
print('- MCC: %s' % stack_model_train_mcc)
print('- F1 score: %s' % stack_model_train_f1)
print('----------------------------------')
print('Model performance for Test set')
print('- Accuracy: %s' % stack_model_test_accuracy)
print('- MCC: %s' % stack_model_test_mcc)
print('- F1 score: %s' % stack_model_test_f1)

In [None]:
y_pred = stack_model.predict(X_test)
y_prob = stack_model.predict_proba(X_test)[:,1]
cm = confusion_matrix(y_test, y_pred)

accuracy = accuracy_score(y_test, y_pred)
precision = metrics.precision_score(y_test, y_pred) 
recall = metrics.recall_score(y_test, y_pred) 
f1_score = metrics.f1_score(y_test, y_pred) 
print('Accuracy:',
      accuracy,'\n Precision:',  precision,'\n Recall:',  recall,'\n f1_score:',  f1_score, '\n')



print(classification_report(y_test, y_pred))
print(f'ROC AUC score: {roc_auc_score(y_test, y_prob)}')
print('Accuracy Score: ',accuracy_score(y_test, y_pred))
print('Balanced Accuracy Score: ',balanced_accuracy_score(y_test, y_pred))

In [None]:
# XGBoost + Random Forest

In [None]:
estimators = [
    ('xgb',classifier),
    ('rforest',rforest)
    
]

#Build the stack model
stack_model = StackingClassifier(
estimators = estimators, final_estimator = LogisticRegression()
)

stack_model.fit(X_train_ros,y_train_ros)

#Predict
y_train_pred = stack_model.predict(X_train_ros)
y_test_pred = stack_model.predict(X_test)

# Training set model performance
stack_model_train_accuracy = accuracy_score(y_train_ros, y_train_pred) # Calculate Accuracy
stack_model_train_mcc = matthews_corrcoef(y_train_ros, y_train_pred) # Calculate MCC
stack_model_train_f1 = metrics.f1_score(y_train_ros, y_train_pred, average='weighted') # Calculate F1-score

# Test set model performance
stack_model_test_accuracy = accuracy_score(y_test, y_test_pred) # Calculate Accuracy
stack_model_test_mcc = matthews_corrcoef(y_test, y_test_pred) # Calculate MCC
stack_model_test_f1 = metrics.f1_score(y_test, y_test_pred, average='weighted') # Calculate F1-score

print('Model performance for Training set')
print('- Accuracy: %s' % stack_model_train_accuracy)
print('- MCC: %s' % stack_model_train_mcc)
print('- F1 score: %s' % stack_model_train_f1)
print('----------------------------------')
print('Model performance for Test set')
print('- Accuracy: %s' % stack_model_test_accuracy)
print('- MCC: %s' % stack_model_test_mcc)
print('- F1 score: %s' % stack_model_test_f1)

In [None]:

y_pred = stack_model.predict(X_test)
y_prob = stack_model.predict_proba(X_test)[:,1]
cm = confusion_matrix(y_test, y_pred)

accuracy = accuracy_score(y_test, y_pred)
precision = metrics.precision_score(y_test, y_pred) 
recall = metrics.recall_score(y_test, y_pred) 
f1_score = metrics.f1_score(y_test, y_pred) 
print('Accuracy:',
      accuracy,'\n Precision:',  precision,'\n Recall:',  recall,'\n f1_score:',  f1_score, '\n')



print(classification_report(y_test, y_pred))
print(f'ROC AUC score: {roc_auc_score(y_test, y_prob)}')
print('Accuracy Score: ',accuracy_score(y_test, y_pred))
print('Balanced Accuracy Score: ',balanced_accuracy_score(y_test, y_pred))

In [None]:
# XGBoost + Logistic Regression + Random Forest

In [None]:
estimators = [
    ('xgb',classifier),
    ('logr',lr),
    ('rforest',rforest)
    
]

#Build the stack model
stack_model = StackingClassifier(
estimators = estimators, final_estimator = LogisticRegression()
)

stack_model.fit(X_train_ros,y_train_ros)

#Predict
y_train_pred = stack_model.predict(X_train_ros)
y_test_pred = stack_model.predict(X_test)

# Training set model performance
stack_model_train_accuracy = accuracy_score(y_train_ros, y_train_pred) # Calculate Accuracy
stack_model_train_mcc = matthews_corrcoef(y_train_ros, y_train_pred) # Calculate MCC
stack_model_train_f1 = metrics.f1_score(y_train_ros, y_train_pred, average='weighted') # Calculate F1-score

# Test set model performance
stack_model_test_accuracy = accuracy_score(y_test, y_test_pred) # Calculate Accuracy
stack_model_test_mcc = matthews_corrcoef(y_test, y_test_pred) # Calculate MCC
stack_model_test_f1 = metrics.f1_score(y_test, y_test_pred, average='weighted') # Calculate F1-score

print('Model performance for Training set')
print('- Accuracy: %s' % stack_model_train_accuracy)
print('- MCC: %s' % stack_model_train_mcc)
print('- F1 score: %s' % stack_model_train_f1)
print('----------------------------------')
print('Model performance for Test set')
print('- Accuracy: %s' % stack_model_test_accuracy)
print('- MCC: %s' % stack_model_test_mcc)
print('- F1 score: %s' % stack_model_test_f1)

In [None]:

y_pred = stack_model.predict(X_test)
y_prob = stack_model.predict_proba(X_test)[:,1]
cm = confusion_matrix(y_test, y_pred)

accuracy = accuracy_score(y_test, y_pred)
precision = metrics.precision_score(y_test, y_pred) 
recall = metrics.recall_score(y_test, y_pred) 
f1_score = metrics.f1_score(y_test, y_pred) 
print('Accuracy:',
      accuracy,'\n Precision:',  precision,'\n Recall:',  recall,'\n f1_score:',  f1_score, '\n')



print(classification_report(y_test, y_pred))
print(f'ROC AUC score: {roc_auc_score(y_test, y_prob)}')
print('Accuracy Score: ',accuracy_score(y_test, y_pred))
print('Balanced Accuracy Score: ',balanced_accuracy_score(y_test, y_pred))