In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import average_precision_score
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import make_scorer

raw_data = pd.read_csv('healthcare-dataset-stroke-data.csv')

In [2]:
raw_data

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
...,...,...,...,...,...,...,...,...,...,...,...,...
5105,18234,Female,80.0,1,0,Yes,Private,Urban,83.75,,never smoked,0
5106,44873,Female,81.0,0,0,Yes,Self-employed,Urban,125.20,40.0,never smoked,0
5107,19723,Female,35.0,0,0,Yes,Self-employed,Rural,82.99,30.6,never smoked,0
5108,37544,Male,51.0,0,0,Yes,Private,Rural,166.29,25.6,formerly smoked,0


In [3]:
raw_data.describe()

Unnamed: 0,id,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke
count,5110.0,5110.0,5110.0,5110.0,5110.0,4909.0,5110.0
mean,36517.829354,43.226614,0.097456,0.054012,106.147677,28.893237,0.048728
std,21161.721625,22.612647,0.296607,0.226063,45.28356,7.854067,0.21532
min,67.0,0.08,0.0,0.0,55.12,10.3,0.0
25%,17741.25,25.0,0.0,0.0,77.245,23.5,0.0
50%,36932.0,45.0,0.0,0.0,91.885,28.1,0.0
75%,54682.0,61.0,0.0,0.0,114.09,33.1,0.0
max,72940.0,82.0,1.0,1.0,271.74,97.6,1.0


In [4]:
raw_data.isnull().sum()

id                     0
gender                 0
age                    0
hypertension           0
heart_disease          0
ever_married           0
work_type              0
Residence_type         0
avg_glucose_level      0
bmi                  201
smoking_status         0
stroke                 0
dtype: int64

In [5]:
raw_data = raw_data.dropna(axis=0).reset_index(drop=True)
raw_data

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
2,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
3,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
4,56669,Male,81.0,0,0,Yes,Private,Urban,186.21,29.0,formerly smoked,1
...,...,...,...,...,...,...,...,...,...,...,...,...
4904,14180,Female,13.0,0,0,No,children,Rural,103.08,18.6,Unknown,0
4905,44873,Female,81.0,0,0,Yes,Self-employed,Urban,125.20,40.0,never smoked,0
4906,19723,Female,35.0,0,0,Yes,Self-employed,Rural,82.99,30.6,never smoked,0
4907,37544,Male,51.0,0,0,Yes,Private,Rural,166.29,25.6,formerly smoked,0


In [6]:
raw_data.describe(include='all')

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
count,4909.0,4909,4909.0,4909.0,4909.0,4909,4909,4909,4909.0,4909.0,4909,4909.0
unique,,3,,,,2,5,2,,,4,
top,,Female,,,,Yes,Private,Urban,,,never smoked,
freq,,2897,,,,3204,2811,2490,,,1852,
mean,37064.313506,,42.865374,0.091872,0.049501,,,,105.30515,28.893237,,0.042575
std,20995.098457,,22.555115,0.288875,0.216934,,,,44.424341,7.854067,,0.201917
min,77.0,,0.08,0.0,0.0,,,,55.12,10.3,,0.0
25%,18605.0,,25.0,0.0,0.0,,,,77.07,23.5,,0.0
50%,37608.0,,44.0,0.0,0.0,,,,91.68,28.1,,0.0
75%,55220.0,,60.0,0.0,0.0,,,,113.57,33.1,,0.0


In [7]:
raw_data['gender'].unique()

array(['Male', 'Female', 'Other'], dtype=object)

In [8]:
raw_data = raw_data.drop(raw_data[raw_data['gender'] == 'Other'].index, axis=0).reset_index(drop=True)
raw_data

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
2,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
3,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
4,56669,Male,81.0,0,0,Yes,Private,Urban,186.21,29.0,formerly smoked,1
...,...,...,...,...,...,...,...,...,...,...,...,...
4903,14180,Female,13.0,0,0,No,children,Rural,103.08,18.6,Unknown,0
4904,44873,Female,81.0,0,0,Yes,Self-employed,Urban,125.20,40.0,never smoked,0
4905,19723,Female,35.0,0,0,Yes,Self-employed,Rural,82.99,30.6,never smoked,0
4906,37544,Male,51.0,0,0,Yes,Private,Rural,166.29,25.6,formerly smoked,0


In [9]:
raw_data.drop(['id'],inplace=True,axis=1)

In [10]:
raw_data

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
2,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
3,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
4,Male,81.0,0,0,Yes,Private,Urban,186.21,29.0,formerly smoked,1
...,...,...,...,...,...,...,...,...,...,...,...
4903,Female,13.0,0,0,No,children,Rural,103.08,18.6,Unknown,0
4904,Female,81.0,0,0,Yes,Self-employed,Urban,125.20,40.0,never smoked,0
4905,Female,35.0,0,0,Yes,Self-employed,Rural,82.99,30.6,never smoked,0
4906,Male,51.0,0,0,Yes,Private,Rural,166.29,25.6,formerly smoked,0


In [11]:
raw_data = raw_data.rename(columns={'Residence_type': 'residence_type'})

In [12]:
print(raw_data['gender'].unique())
print(raw_data['ever_married'].unique())
print(raw_data['residence_type'].unique())
print(raw_data['work_type'].unique())
print(raw_data['smoking_status'].unique())

['Male' 'Female']
['Yes' 'No']
['Urban' 'Rural']
['Private' 'Self-employed' 'Govt_job' 'children' 'Never_worked']
['formerly smoked' 'never smoked' 'smokes' 'Unknown']


In [13]:
raw_data['gender'] = raw_data['gender'].replace({'Male':0,'Female':1})
raw_data['ever_married'] = raw_data['ever_married'].replace({'No':0,'Yes':1})
raw_data['residence_type'] = raw_data['residence_type'].replace({'Rural':0,'Urban':1})
raw_data['work_type'] = raw_data['work_type'].replace({'Private':0,'Self-employed':1,'Govt_job':2,'children':3,'Never_worked':4})
raw_data['smoking_status'] = raw_data['smoking_status'].replace({'never smoked':0,'Unknown':1,'formerly smoked': 2,'smokes':3})

In [14]:
raw_data

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,0,67.0,0,1,1,0,1,228.69,36.6,2,1
1,0,80.0,0,1,1,0,0,105.92,32.5,0,1
2,1,49.0,0,0,1,0,1,171.23,34.4,3,1
3,1,79.0,1,0,1,1,0,174.12,24.0,0,1
4,0,81.0,0,0,1,0,1,186.21,29.0,2,1
...,...,...,...,...,...,...,...,...,...,...,...
4903,1,13.0,0,0,0,3,0,103.08,18.6,1,0
4904,1,81.0,0,0,1,1,1,125.20,40.0,0,0
4905,1,35.0,0,0,1,1,0,82.99,30.6,0,0
4906,0,51.0,0,0,1,0,0,166.29,25.6,2,0


In [15]:
corr=raw_data.corr()
corr

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,residence_type,avg_glucose_level,bmi,smoking_status,stroke
gender,1.0,0.03028,-0.021811,-0.08295,0.03638,-0.070889,0.004351,-0.053161,0.026164,-0.064168,-0.006904
age,0.03028,1.0,0.274395,0.257104,0.680742,-0.415506,0.010795,0.236,0.333314,0.080461,0.232313
hypertension,-0.021811,0.274395,1.0,0.115978,0.16235,-0.073458,-0.00114,0.180614,0.16777,0.010515,0.142503
heart_disease,-0.08295,0.257104,0.115978,1.0,0.111203,-0.054965,-0.002409,0.154577,0.041322,0.067151,0.137929
ever_married,0.03638,0.680742,0.16235,0.111203,1.0,-0.378109,0.004707,0.151657,0.341553,0.085798,0.105051
work_type,-0.070889,-0.415506,-0.073458,-0.054965,-0.378109,1.0,0.012931,-0.063029,-0.347412,-0.050086,-0.057569
residence_type,0.004351,0.010795,-0.00114,-0.002409,0.004707,0.012931,1.0,-0.007441,-0.000293,0.032694,0.005988
avg_glucose_level,-0.053161,0.236,0.180614,0.154577,0.151657,-0.063029,-0.007441,1.0,0.175672,0.018793,0.138984
bmi,0.026164,0.333314,0.16777,0.041322,0.341553,-0.347412,-0.000293,0.175672,1.0,0.047944,0.042341
smoking_status,-0.064168,0.080461,0.010515,0.067151,0.085798,-0.050086,0.032694,0.018793,0.047944,1.0,0.029772


In [16]:
raw_data.drop(['gender', 'residence_type', 'bmi', 'work_type', 'smoking_status'], inplace=True, axis=1)
raw_data

Unnamed: 0,age,hypertension,heart_disease,ever_married,avg_glucose_level,stroke
0,67.0,0,1,1,228.69,1
1,80.0,0,1,1,105.92,1
2,49.0,0,0,1,171.23,1
3,79.0,1,0,1,174.12,1
4,81.0,0,0,1,186.21,1
...,...,...,...,...,...,...
4903,13.0,0,0,0,103.08,0
4904,81.0,0,0,1,125.20,0
4905,35.0,0,0,1,82.99,0
4906,51.0,0,0,1,166.29,0


In [23]:
!pip install -q --user pycaret



In [30]:
!pip install --upgrade scikit-learn





In [17]:
from pycaret.classification import *
clf1 = setup(data=raw_data, target='stroke')

Unnamed: 0,Description,Value
0,Session id,1700
1,Target,stroke
2,Target type,Binary
3,Original data shape,"(4908, 6)"
4,Transformed data shape,"(4908, 6)"
5,Transformed train set shape,"(3435, 6)"
6,Transformed test set shape,"(1473, 6)"
7,Numeric features,5
8,Preprocess,True
9,Imputation type,simple


In [18]:
compare_models(sort='F1')

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
qda,Quadratic Discriminant Analysis,0.8809,0.8411,0.4319,0.1617,0.2347,0.1845,0.2113,0.042
nb,Naive Bayes,0.8748,0.8367,0.439,0.1562,0.23,0.1784,0.2072,0.017
dt,Decision Tree Classifier,0.9243,0.5548,0.151,0.15,0.1468,0.1083,0.1097,0.019
lda,Linear Discriminant Analysis,0.9482,0.8446,0.0752,0.1932,0.1068,0.0869,0.0974,0.015
lightgbm,Light Gradient Boosting Machine,0.9502,0.8133,0.0405,0.1683,0.064,0.048,0.0618,0.098
et,Extra Trees Classifier,0.9397,0.6822,0.0476,0.0962,0.0622,0.0351,0.038,0.13
rf,Random Forest Classifier,0.9467,0.7338,0.0271,0.0992,0.0405,0.022,0.0287,0.15
svm,SVM - Linear Kernel,0.9467,0.0,0.05,0.0272,0.0338,0.025,0.027,0.019
gbc,Gradient Boosting Classifier,0.952,0.8317,0.0133,0.0833,0.0229,0.0119,0.018,0.135
lr,Logistic Regression,0.9578,0.8542,0.0067,0.1,0.0125,0.012,0.0253,0.803


In [21]:
from sklearn.model_selection import train_test_split

data, labels = raw_data.iloc[:,:4], raw_data['stroke']

raw_data.to_csv("stroke_data_transformed.csv", index=False)

X_train, X_test, y_train, y_test = train_test_split(data, labels, 
                                                    test_size=0.3, 
                                                    random_state=0, 
                                                    shuffle=True, 
                                                    stratify=labels)

In [22]:
from imblearn.over_sampling import SMOTE
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

In [23]:
classifier = QuadraticDiscriminantAnalysis()
model = classifier.fit(X_train, y_train)

In [24]:
y_pred = model.predict(X_test)

In [42]:
df_result = pd.DataFrame(columns=['model', 'tp', 'tn', 'fp', 'fn', 'correct', 'incorrect',
                                  'accuracy', 'precision', 'recall', 'f1', 'roc_auc','avg_pre'])

tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred)
avg_precision = average_precision_score(y_test, y_pred)
row = {'model': 'QuadraticDiscriminantAnalysis without SMOTE',
        'tp': tp,
        'tn': tn,
        'fp': fp,
        'fn': fn,
        'correct': tp+tn,
        'incorrect': fp+fn,
        'accuracy': round(accuracy,3),
        'precision': round(precision,3),
        'recall': round(recall,3),
        'f1': round(f1,3),
        'roc_auc': round(roc_auc,3),
        'avg_pre': round(avg_precision,3),
    }

df_result = df_result.append(row, ignore_index=True)
df_result.head()

Unnamed: 0,model,tp,tn,fp,fn,correct,incorrect,accuracy,precision,recall,f1,roc_auc,avg_pre
0,QuadraticDiscriminantAnalysis without SMOTE,24,1280,130,39,1304,169,0.885,0.156,0.381,0.221,0.644,0.086


In [43]:
X_train, X_test, y_train, y_test = train_test_split(data, labels, 
                                                    test_size=0.3, 
                                                    random_state=0, 
                                                    shuffle=True, 
                                                    stratify=labels)

In [44]:
oversampled = SMOTE(random_state=0)
X_train_smote, y_train_smote = oversampled.fit_resample(X_train, y_train)

In [45]:
y_train_smote.value_counts()

0    3289
1    3289
Name: stroke, dtype: int64

In [46]:
classifier_smote = QuadraticDiscriminantAnalysis()
model = classifier_smote.fit(X_train_smote, y_train_smote)
y_pred_smote = model.predict(X_test)

In [47]:
tn, fp, fn, tp = confusion_matrix(y_test, y_pred_smote).ravel()
accuracy = accuracy_score(y_test, y_pred_smote)
precision = precision_score(y_test, y_pred_smote)
recall = recall_score(y_test, y_pred_smote)
f1 = f1_score(y_test, y_pred_smote)
roc_auc = roc_auc_score(y_test, y_pred_smote)
avg_precision = average_precision_score(y_test, y_pred_smote)

In [48]:
row = {'model': 'QuadraticDiscriminantAnalysis with SMOTE',
        'tp': tp,
        'tn': tn,
        'fp': fp,
        'fn': fn,
        'correct': tp+tn,
        'incorrect': fp+fn,
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'roc_auc': roc_auc,
        'avg_pre': round(avg_precision,3),       
    }
df_result = df_result.append(row, ignore_index=True)
df_result.head()

Unnamed: 0,model,tp,tn,fp,fn,correct,incorrect,accuracy,precision,recall,f1,roc_auc,avg_pre
0,QuadraticDiscriminantAnalysis without SMOTE,24,1280,130,39,1304,169,0.885,0.156,0.381,0.221,0.644,0.086
1,QuadraticDiscriminantAnalysis with SMOTE,49,973,437,14,1022,451,0.693822,0.100823,0.777778,0.178506,0.733924,0.088
