In [1]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

import pandas as pd
import plotly.express as px
import matplotlib.pyplot as plt
import numpy as np

# Read the complete dataset from the CSV next to the notebook and list its
# column names so the feature/target selection below can be checked against them.
df = pd.read_csv("complete_dataset.csv")
df.columns

Index(['id', 'gender', 'age', 'hypertension', 'heart_disease', 'ever_married',
       'work_type', 'Residence_type', 'avg_glucose_level', 'bmi',
       'smoking_status', 'stroke'],
      dtype='object')

In [2]:
# Predictors: the ten descriptive columns (the row `id` and the label are left out).
X = df.loc[:, [
    'gender', 'age', 'hypertension', 'heart_disease', 'ever_married',
    'work_type', 'Residence_type', 'avg_glucose_level', 'bmi',
    'smoking_status',
]]
# Target: the `stroke` column.
y = df.loc[:, "stroke"]

In [3]:
y.value_counts()

0    42617
1      783
Name: stroke, dtype: int64

In [4]:
from imblearn.under_sampling import RandomUnderSampler

# Randomly discard majority-class rows. A float `sampling_strategy` is the
# minority/majority ratio to reach after resampling; the fixed seed keeps the
# retained subset identical between runs.
rus = RandomUnderSampler(sampling_strategy=0.15, random_state=42)
X_res, y_res = rus.fit_resample(X, y)

In [5]:
y_res.value_counts()

0    5220
1     783
Name: stroke, dtype: int64

In [6]:
# Class counts among the resampled rows whose `bmi` is missing.
y_res[X_res['bmi'].isna()].value_counts()

0    152
1    140
Name: stroke, dtype: int64

# DATA CLEANING

In [7]:
X_res['gender'].value_counts()

Female    3554
Male      2448
Other        1
Name: gender, dtype: int64

In [8]:
# Remove every row whose gender is "Other" from both the features and the
# target. The rows are located through the mask instead of a fixed index label,
# because a hard-coded label only matches one particular resampling result.
m = X_res['gender'] == "Other"
other_index = X_res.index[m]
X_res = X_res.drop(index=other_index)
y_res = y_res.drop(index=other_index)

In [9]:
X_res['gender'].value_counts()

Female    3554
Male      2448
Name: gender, dtype: int64

In [10]:
X_res['work_type'].value_counts()

Private          3458
Self-employed    1060
children          734
Govt_job          725
Never_worked       25
Name: work_type, dtype: int64

In [11]:
# Fold the "children" category into "Never_worked".
X_res['work_type'] = X_res['work_type'].replace({"children": "Never_worked"})

In [12]:
X_res['work_type'].value_counts()

Private          3458
Self-employed    1060
Never_worked      759
Govt_job          725
Name: work_type, dtype: int64

In [13]:
# Record which rows had no bmi (as a 0/1 column), then fill the gaps with the
# mean of the observed values. `assign` evaluates its arguments in order, so the
# indicator is computed before the column is filled.
X_res = X_res.assign(
    bmi_unknown=lambda frame: frame['bmi'].isna().astype(int),
    bmi=lambda frame: frame['bmi'].fillna(frame['bmi'].mean()),
)
X_res

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,bmi_unknown
0,Male,79.0,0,0,Yes,Private,Rural,201.49,24.200000,smokes,0
1,Female,25.0,0,0,No,Private,Urban,90.01,26.400000,smokes,0
2,Female,68.0,0,0,Yes,Private,Urban,109.23,31.300000,never smoked,0
3,Male,49.0,0,0,Yes,Private,Rural,99.39,36.300000,formerly smoked,0
4,Female,20.0,0,0,No,Private,Rural,98.50,30.200000,never smoked,0
...,...,...,...,...,...,...,...,...,...,...,...
5998,Female,56.0,0,0,Yes,Private,Rural,83.27,32.900000,smokes,0
5999,Male,69.0,0,0,Yes,Self-employed,Rural,97.36,28.000000,,0
6000,Female,80.0,0,0,Yes,Self-employed,Urban,75.91,26.700000,never smoked,0
6001,Male,62.0,1,1,Yes,Private,Rural,77.97,31.500000,formerly smoked,0


In [14]:
# Missing smoking status becomes its own explicit category.
X_res['smoking_status'] = X_res['smoking_status'].mask(X_res['smoking_status'].isna(), 'unknown')

In [15]:
X_res

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,bmi_unknown
0,Male,79.0,0,0,Yes,Private,Rural,201.49,24.200000,smokes,0
1,Female,25.0,0,0,No,Private,Urban,90.01,26.400000,smokes,0
2,Female,68.0,0,0,Yes,Private,Urban,109.23,31.300000,never smoked,0
3,Male,49.0,0,0,Yes,Private,Rural,99.39,36.300000,formerly smoked,0
4,Female,20.0,0,0,No,Private,Rural,98.50,30.200000,never smoked,0
...,...,...,...,...,...,...,...,...,...,...,...
5998,Female,56.0,0,0,Yes,Private,Rural,83.27,32.900000,smokes,0
5999,Male,69.0,0,0,Yes,Self-employed,Rural,97.36,28.000000,unknown,0
6000,Female,80.0,0,0,Yes,Self-employed,Urban,75.91,26.700000,never smoked,0
6001,Male,62.0,1,1,Yes,Private,Rural,77.97,31.500000,formerly smoked,0


In [19]:
X_res

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,bmi_unknown
0,Male,79.0,0,0,Yes,Private,Rural,201.49,24.200000,smokes,0
1,Female,25.0,0,0,No,Private,Urban,90.01,26.400000,smokes,0
2,Female,68.0,0,0,Yes,Private,Urban,109.23,31.300000,never smoked,0
3,Male,49.0,0,0,Yes,Private,Rural,99.39,36.300000,formerly smoked,0
4,Female,20.0,0,0,No,Private,Rural,98.50,30.200000,never smoked,0
...,...,...,...,...,...,...,...,...,...,...,...
5998,Female,56.0,0,0,Yes,Private,Rural,83.27,32.900000,smokes,0
5999,Male,69.0,0,0,Yes,Self-employed,Rural,97.36,28.000000,unknown,0
6000,Female,80.0,0,0,Yes,Self-employed,Urban,75.91,26.700000,never smoked,0
6001,Male,62.0,1,1,Yes,Private,Rural,77.97,31.500000,formerly smoked,0


In [None]:
import imblearn.over_sampling

# SMOTENC must be told which columns are categorical. Their positions are
# resolved from the column names so the list stays correct if the column order
# of the input frame ever changes.
smote_input = X_res.drop("bmi_unknown", axis=1)
categorical_columns = ["gender", "hypertension", "heart_disease", "ever_married",
                       "work_type", "Residence_type", "smoking_status"]
categorical_positions = [smote_input.columns.get_loc(name) for name in categorical_columns]

# Seeded so the synthetic minority rows are the same on every run.
smote_nc = imblearn.over_sampling.SMOTENC(categorical_features=categorical_positions, random_state=42)
X_res_ov, y_res_ov = smote_nc.fit_resample(smote_input, y_res)

# WARNING: uncommenting the two lines below replaces the working data with the
# oversampled version for every later cell.
# X_res = X_res_ov
# y_res = y_res_ov

In [None]:
y_res_ov.value_counts()

In [None]:
import seaborn

# Attach the label to the oversampled features so it can drive the colouring
# of the pairwise plots of the three continuous columns.
tmp = X_res_ov.assign(stroke=y_res_ov)
seaborn.pairplot(data=tmp, hue="stroke", vars=["age", "avg_glucose_level", "bmi"])

# DATA VISUALIZATION

In [None]:
X_res[["age", "avg_glucose_level", "bmi"]].describe()

In [None]:
def _counts_in_order(series, categories):
    """Count how often each of `categories` occurs in `series`.

    The counts are returned in the order of `categories` (0 for a category
    that does not occur), so they can be paired safely with a fixed list of
    pie labels. Looking the counts up by label avoids relying on the
    frequency ordering of `value_counts()`.
    """
    counts = series.value_counts()
    return [counts.get(category, 0) for category in categories]


def _observed_counts(series):
    """Return (counts, labels) for every value observed in `series`, most frequent first."""
    counts = series.value_counts()
    return list(counts), list(counts.index)


# One (sizes, labels, title) entry per panel, in row-major grid order.
panels = [
    (_counts_in_order(X_res.gender, ["Female", "Male"]), ['Female', 'Male'], "gender"),
    (_counts_in_order(X_res.hypertension, [0, 1]), ['No hypertension', 'Hypertension'], "Hypertension"),
    (_counts_in_order(X_res.heart_disease, [0, 1]), ['No heart disease', 'Heart disease'], "Heart disease"),
    (_counts_in_order(X_res.ever_married, ["No", "Yes"]), ['Never married', 'Ever married'], "Marital status"),
    (*_observed_counts(X_res.work_type), "Work type"),
    (*_observed_counts(X_res.Residence_type), "Residence type"),
    (*_observed_counts(X_res.smoking_status), "Smoking status"),
    (_counts_in_order(y_res, [0, 1]), ["No stroke", "Had stroke"], "Stroke"),
]

# A tall figure keeps the eight pies and their labels from overlapping.
fig, ax = plt.subplots(4, 2, figsize=(10, 18))
for axis, (sizes, labels, title) in zip(ax.flat, panels):
    axis.pie(sizes, labels=labels, autopct='%1.1f%%')
    axis.set_title(title)
plt.show()

In [None]:
import plotly.express as px

# Work on a labelled copy: the numeric/yes-no codes are swapped for readable
# text so the chart segments are self-explanatory.
tmp = X_res.assign(stroke=y_res)
tmp["stroke"] = tmp["stroke"].replace({0: 'No stroke', 1: 'Had stroke'})
tmp["ever_married"] = tmp["ever_married"].replace({'Yes': 'Ever married', 'No': 'Never married'})
# fig = px.sunburst(tmp, path=['stroke', 'gender', 'ever_married', 'Residence_type', 'work_type'])
fig = px.sunburst(tmp, path=['stroke', 'gender'])
fig.show()

# DATA ENCODING

In [None]:
def encode_data(X_res, y_res):
    """Encode the categorical feature columns as integers.

    The input frame is not modified: encoding is applied to a copy, so the
    caller's data stays intact and the function can be called again on the
    same raw frame.

    Parameters
    ----------
    X_res : pandas.DataFrame
        Features containing at least the columns `gender`, `ever_married`,
        `Residence_type`, `smoking_status` and `work_type`.
    y_res : any
        Target values; passed through unchanged.

    Returns
    -------
    (pandas.DataFrame, same as y_res)
        The encoded copy of the features and the untouched target. `gender`,
        `ever_married`, `Residence_type` and `smoking_status` are replaced by
        integer codes, `smoking_unknown` flags rows whose smoking status code
        is 0, and `work_type` is replaced by one 0/1 column per work type.
        Values without a code are left as they are.
    """
    value_codes = {
        "gender": {"Male": 0, "Female": 1},
        "ever_married": {"No": 0, "Yes": 1},
        "Residence_type": {"Rural": 0, "Urban": 1},
        "smoking_status": {
            "unknown": 0,
            "never smoked": 1,
            "formerly smoked": 2,
            "smokes": 3,
        },
    }
    work_types = ("Private", "Self-employed", "Never_worked", "Govt_job")

    encoded = X_res.copy()
    for column, codes in value_codes.items():
        # Explicit lookup instead of `replace`: yields integer columns without
        # relying on pandas' deprecated downcasting, and keeps unknown values.
        encoded[column] = encoded[column].map(lambda value, codes=codes: codes.get(value, value))

    encoded["smoking_unknown"] = (encoded["smoking_status"] == 0).astype(int)
    for work_type in work_types:
        encoded["work_type_" + work_type] = (encoded["work_type"] == work_type).astype(int)

    return encoded.drop(columns="work_type"), y_res

In [None]:
# Encode the undersampled set and the oversampled set.
X_res, y_res = encode_data(X_res, y_res)
X_res_ov, y_res_ov = encode_data(X_res_ov, y_res_ov)

# Pairwise correlations of the encoded features together with the label,
# shaded with a diverging colour map whose lower bound is pinned at -1.
tmp = X_res.assign(stroke=y_res)
tmp.corr().style.background_gradient(vmin=-1, cmap="PiYG")

In [None]:
from sklearn.feature_selection import chi2, f_classif

# Score the encoded categorical/indicator columns against the stroke label with
# a χ² test and rank them by statistic, largest first. The p-values are
# returned by chi2 but are not used further in this cell.
cols = ["gender", "hypertension", "heart_disease", "ever_married", "Residence_type", "smoking_status", "work_type_Private", "work_type_Self-employed", "work_type_Never_worked", "work_type_Govt_job"]
categorical = X_res[cols]
chi2stats, pvals = chi2(categorical, y_res)
results = pd.Series(data=chi2stats, index=cols).sort_values(ascending=False)
results

In [None]:
# Horizontal bar chart of the ranked statistics currently held in `results`.
plt.barh(results.index, results.values)
plt.title("χ² test results")
plt.show()

In [None]:
# ANOVA F-test of each continuous feature against the stroke label, ranked by
# statistic (largest first). The frame gets its own name so it no longer
# overwrites `categorical` from the χ² cell with non-categorical data.
cols = ["age", "avg_glucose_level", "bmi"]
continuous = X_res[cols]
fstats, fpvals = f_classif(continuous, y_res)
results = pd.Series(data=fstats, index=cols).sort_values(ascending=False)
plt.barh(results.index, results.values)
plt.title("F-test results")
plt.xlabel("F statistic")
plt.show()

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Fit a forest on all encoded features and rank them by impurity-based
# importance. The seed makes the importances, and therefore the ranking,
# reproducible between runs.
model = RandomForestClassifier(random_state=42)
model.fit(X_res, y_res)
results = pd.Series(data=model.feature_importances_, index=X_res.columns).sort_values(ascending=False)
plt.barh(results.index, results.values)
plt.title("Random Forest feature importance")
plt.xlabel("Importance")
plt.show()

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of each data set. Stratifying keeps the class ratio the same in
# both partitions (the classes are imbalanced), and the seed makes the
# partitions reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X_res, y_res, test_size=0.30, stratify=y_res, random_state=42)
# NOTE(review): X_res_ov was oversampled before this split, so synthetic rows in
# the training partition can be derived from rows that end up in the test
# partition. Scores on X_test_ov are therefore likely optimistic; consider
# oversampling only the training partition.
X_train_ov, X_test_ov, y_train_ov, y_test_ov = train_test_split(
    X_res_ov, y_res_ov, test_size=0.30, stratify=y_res_ov, random_state=42)

In [None]:
from sklearn import metrics

def testModels(modellist, X_train, X_test, y_train, y_test):

    for m in modellist:
        model = m()
        model.fit(X_train, y_train)
        preds = model.predict(X_test)
        
        print(model)
        print("Accuracy:",metrics.accuracy_score(y_test, preds))
        print(metrics.classification_report(y_test, preds))
        metrics.plot_confusion_matrix(model, X_test, y_test)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

# Estimator classes, not instances: testModels instantiates each one itself
# with default settings.
mlist = [LogisticRegression, DecisionTreeClassifier, RandomForestClassifier, KNeighborsClassifier, SVC]

# Evaluate every model on the split of the SMOTENC-oversampled data.
testModels(mlist, X_train_ov, X_test_ov, y_train_ov, y_test_ov)

In [None]:
# Same models, evaluated on the split of the data that was not oversampled.
testModels(mlist, X_train, X_test, y_train, y_test)