In [1]:
import sklearn
sklearn.__version__

'1.0.2'

In [2]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.metrics import balanced_accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, log_loss
from sklearn.feature_selection import mutual_info_classif

In [3]:
def scores_print(y_true, predictions):
    print(f'accuracy = {accuracy_score(y_true, predictions)}')
    print(f'Cross-entropy = {log_loss(y_true, predictions)}')
    print(f'Confusion_matrix = \n{confusion_matrix(y_true, predictions)}')

In [4]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [5]:
path = "healthcare-dataset-stroke-data.csv"

In [6]:
df = pd.read_csv(path)
df.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5110 entries, 0 to 5109
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 5110 non-null   int64  
 1   gender             5110 non-null   object 
 2   age                5110 non-null   float64
 3   hypertension       5110 non-null   int64  
 4   heart_disease      5110 non-null   int64  
 5   ever_married       5110 non-null   object 
 6   work_type          5110 non-null   object 
 7   Residence_type     5110 non-null   object 
 8   avg_glucose_level  5110 non-null   float64
 9   bmi                4909 non-null   float64
 10  smoking_status     5110 non-null   object 
 11  stroke             5110 non-null   int64  
dtypes: float64(3), int64(4), object(5)
memory usage: 479.2+ KB


In [8]:
df.describe()

Unnamed: 0,id,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke
count,5110.0,5110.0,5110.0,5110.0,5110.0,4909.0,5110.0
mean,36517.829354,43.226614,0.097456,0.054012,106.147677,28.893237,0.048728
std,21161.721625,22.612647,0.296607,0.226063,45.28356,7.854067,0.21532
min,67.0,0.08,0.0,0.0,55.12,10.3,0.0
25%,17741.25,25.0,0.0,0.0,77.245,23.5,0.0
50%,36932.0,45.0,0.0,0.0,91.885,28.1,0.0
75%,54682.0,61.0,0.0,0.0,114.09,33.1,0.0
max,72940.0,82.0,1.0,1.0,271.74,97.6,1.0


In [9]:
data = df.copy()

In [10]:
# data.bmi.fillna(data.bmi.median(), inplace = True)
data.dropna(inplace = True)
data = data[data['gender']!="Other"]

In [11]:
data['age/bmi'] = data['age']/data['bmi']
data['age*bmi'] = data['age']*data['bmi']

In [12]:
data.shape

(4908, 14)

In [13]:
X, y = data.drop(['id', 'stroke'], axis = 1), data.stroke

In [14]:
X_oh = pd.get_dummies(X)

In [15]:
from imblearn.over_sampling import BorderlineSMOTE
sm = BorderlineSMOTE(random_state=123)
X_sm , y_sm = sm.fit_resample(X_oh,y)

In [16]:
X_train, X_val, y_train, y_val =  train_test_split(X_sm, y_sm, stratify = y_sm, random_state = 777)

In [17]:
X_train.columns

Index(['age', 'hypertension', 'heart_disease', 'avg_glucose_level', 'bmi',
       'age/bmi', 'age*bmi', 'gender_Female', 'gender_Male', 'ever_married_No',
       'ever_married_Yes', 'work_type_Govt_job', 'work_type_Never_worked',
       'work_type_Private', 'work_type_Self-employed', 'work_type_children',
       'Residence_type_Rural', 'Residence_type_Urban',
       'smoking_status_Unknown', 'smoking_status_formerly smoked',
       'smoking_status_never smoked', 'smoking_status_smokes'],
      dtype='object')

In [18]:
scaler = StandardScaler()
histgradient = HistGradientBoostingClassifier(random_state=0)
logistic = LogisticRegression(random_state=0)
randomforest = RandomForestClassifier(random_state = 0)

In [19]:
from imblearn.pipeline import Pipeline
histgradientmodel = Pipeline(
    steps = [
        ("scaler", scaler),
        ("classifier", histgradient)
    ]
)
cv_results = cross_validate(
    histgradientmodel, X_train, y_train, scoring="balanced_accuracy",
    return_train_score=True, return_estimator=True,
    n_jobs=-1
)
print(
    f"Balanced accuracy mean +/- std. dev.: "
    f"{cv_results['test_score'].mean():.3f} +/- "
    f"{cv_results['test_score'].std():.3f}"
)

scores = []
for fold_id, cv_model in enumerate(cv_results["estimator"]):
    scores.append(
        balanced_accuracy_score(
            y_val, cv_model.predict(X_val)
        )
    )
print(
    f"Balanced accuracy mean +/- std. dev.: "
    f"{np.mean(scores):.3f} +/- {np.std(scores):.3f}"
)

Balanced accuracy mean +/- std. dev.: 0.973 +/- 0.003
Balanced accuracy mean +/- std. dev.: 0.975 +/- 0.001


In [20]:
logisticmodel = Pipeline(
    steps = [
        ("scaler", scaler),
        ("classifier", logistic)
    ]
)
cv_results = cross_validate(
    logisticmodel, X_train, y_train, scoring="balanced_accuracy",
    return_train_score=True, return_estimator=True,
    n_jobs=-1
)
print(
    f"Balanced accuracy mean +/- std. dev.: "
    f"{cv_results['test_score'].mean():.3f} +/- "
    f"{cv_results['test_score'].std():.3f}"
)

scores = []
for fold_id, cv_model in enumerate(cv_results["estimator"]):
    scores.append(
        balanced_accuracy_score(
            y_val, cv_model.predict(X_val)
        )
    )
print(
    f"Balanced accuracy mean +/- std. dev.: "
    f"{np.mean(scores):.3f} +/- {np.std(scores):.3f}"
)

Balanced accuracy mean +/- std. dev.: 0.971 +/- 0.006
Balanced accuracy mean +/- std. dev.: 0.972 +/- 0.000


In [21]:
RFGmodel = Pipeline(
    steps = [
        ("scaler", scaler),
        ("classifier", randomforest)
    ]
)
cv_results = cross_validate(
    RFGmodel, X_train, y_train, scoring="balanced_accuracy",
    return_train_score=True, return_estimator=True,
    n_jobs=-1
)
print(
    f"Balanced accuracy mean +/- std. dev.: "
    f"{cv_results['test_score'].mean():.3f} +/- "
    f"{cv_results['test_score'].std():.3f}"
)

scores = []
for fold_id, cv_model in enumerate(cv_results["estimator"]):
    scores.append(
        balanced_accuracy_score(
            y_val, cv_model.predict(X_val)
        )
    )
print(
    f"Balanced accuracy mean +/- std. dev.: "
    f"{np.mean(scores):.3f} +/- {np.std(scores):.3f}"
)

Balanced accuracy mean +/- std. dev.: 0.974 +/- 0.001
Balanced accuracy mean +/- std. dev.: 0.974 +/- 0.002


In [22]:
logisticmodel.fit(X_train, y_train)
y_preds = logisticmodel.predict(X_val)
print()
scores_print(y_val, y_preds)


accuracy = 0.9719148936170213
Cross-entropy = 0.9700252093889818
Confusion_matrix = 
[[1175    0]
 [  66 1109]]


In [23]:
RFGmodel.fit(X_train, y_train)
y_preds = RFGmodel.predict(X_val)
scores_print(y_val, y_preds)

accuracy = 0.9757446808510638
Cross-entropy = 0.8377538080314612
Confusion_matrix = 
[[1161   14]
 [  43 1132]]


In [24]:
histgradientmodel.fit(X_train, y_train)
y_preds = histgradientmodel.predict(X_val)
scores_print(y_val, y_preds)

accuracy = 0.9761702127659575
Cross-entropy = 0.8230537343402475
Confusion_matrix = 
[[1169    6]
 [  50 1125]]


In [39]:
from sklearn.ensemble import VotingClassifier
vc = VotingClassifier(estimators = [
    ('logistic', logisticmodel), ('random forest', RFGmodel), ('histgradient', histgradientmodel)
], voting="soft")
vc.fit(X_train.values, y_train)
y_preds = vc.predict(X_val.values)
scores_print(y_val, y_preds)

accuracy = 0.9748936170212766
Cross-entropy = 0.8671451088040226
Confusion_matrix = 
[[1171    4]
 [  55 1120]]


In [40]:
import pickle
with open('Stroke_Prediction.pickle', 'wb') as  f:
    pickle.dump(vc, f)

In [37]:
import json
columns = {
    'data_columns': [col.lower() for col in X_oh.columns]
}
with open('columns.json', 'w') as f:
    f.write(json.dumps(columns))

In [34]:
for i, j in enumerate(columns['data_columns']):
    print(f'{i}: {j}')

0: age
1: hypertension
2: heart_disease
3: avg_glucose_level
4: bmi
5: age/bmi
6: age*bmi
7: gender_female
8: gender_male
9: ever_married_no
10: ever_married_yes
11: work_type_govt_job
12: work_type_never_worked
13: work_type_private
14: work_type_self-employed
15: work_type_children
16: residence_type_rural
17: residence_type_urban
18: smoking_status_unknown
19: smoking_status_formerly smoked
20: smoking_status_never smoked
21: smoking_status_smokes


In [33]:
columns

{'data_columns': ['age',
  'hypertension',
  'heart_disease',
  'avg_glucose_level',
  'bmi',
  'age/bmi',
  'age*bmi',
  'gender_female',
  'gender_male',
  'ever_married_no',
  'ever_married_yes',
  'work_type_govt_job',
  'work_type_never_worked',
  'work_type_private',
  'work_type_self-employed',
  'work_type_children',
  'residence_type_rural',
  'residence_type_urban',
  'smoking_status_unknown',
  'smoking_status_formerly smoked',
  'smoking_status_never smoked',
  'smoking_status_smokes']}