In [None]:
import glob
import urllib
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import root_mean_squared_error, mean_absolute_error, r2_score
from sklearn.metrics import confusion_matrix, accuracy_score, recall_score, precision_score, f1_score, fbeta_score
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier

import warnings
warnings.filterwarnings("ignore")

In [None]:
urllib.request.urlretrieve('https://raw.githubusercontent.com/zbz95/Intro_to_ML/refs/heads/main/hfp_dataset.csv', 'hfp_dataset.csv')

('hfp_dataset.csv', <http.client.HTTPMessage at 0x7e6ec203ed10>)

We are working with dataset from Davide Chicco, Giuseppe Jurman: Machine learning can predict survival of patients with heart failure from serum creatinine and ejection fraction alone. BMC Medical Informatics and Decision Making 20, 16 (2020)  (Link to the journal article - https://doi.org/10.1186/s12911-020-1023-5)

In [None]:
df = pd.read_csv('hfp_dataset.csv')

In [None]:
df.head()

Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time,DEATH_EVENT
0,75.0,0,582,0,20,1,265000.0,1.9,130,1,0,4,1
1,55.0,0,7861,0,38,0,263358.03,1.1,136,1,0,6,1
2,65.0,0,146,0,20,0,162000.0,1.3,129,1,1,7,1
3,50.0,1,111,0,20,0,210000.0,1.9,137,1,0,7,1
4,65.0,1,160,1,20,0,327000.0,2.7,116,0,0,8,1


Cardiovascular diseases (CVDs) are the number 1 cause of death globally, taking an estimated 17.9 million lives each year, which accounts for 31% of all deaths worlwide.
Heart failure is a common event caused by CVDs and this dataset contains 12 features that can be used to predict mortality by heart failure.


People with cardiovascular disease or who are at high cardiovascular risk (due to the presence of one or more risk factors such as hypertension, diabetes, hyperlipidaemia or already established disease) need early detection and management wherein a machine learning model can be of great help.

age - age of patient in years

anaemia - Decrease of red blood cells or hemoglobin (boolean)

creatinine_phosphokinase - Level of the CPK enzyme in the blood (mcg/L)

diabetes - If the patient has diabetes (boolean)

ejection_fraction - Percentage of blood leaving the heart at each contraction (percentage)

high_blood_pressure - If the patient has hypertension (boolean)

platelets - Platelets in the blood (kiloplatelets/mL)

serum_creatinine - Level of serum creatinine in the blood (mg/dL)

serum_sodium - Level of serum sodium in the blood (mEq/L)

sex - Woman or man (binary) Male = 1, Female =0

smoking - If the patient smokes or not (boolean)

time 	- Follow-up period (days)

DEATH_EVENT - If the patient deceased during the follow-up period (boolean) - Target column

In [None]:
df['DEATH_EVENT'].value_counts()

Unnamed: 0_level_0,count
DEATH_EVENT,Unnamed: 1_level_1
0,203
1,96


# Exercise 1. Split the dataframe into train and test by 80 to 20 ratio. Use train_test_split function from sklearn. Use 'stratify' argument to split data according to target column

In [None]:
train, test = train_test_split(df, test_size=0.2, random_state=42, stratify=df['DEATH_EVENT'])

#Lets try logistic regression model together. We will build the model and check its score using cross_val_score

In [None]:
model = LogisticRegression()
features = [col for col in df.columns if col != 'DEATH_EVENT']

scores = cross_val_score(model, train[features], train['DEATH_EVENT'], cv=5, scoring='f1')
print(scores)
print(np.mean(scores))

[0.62068966 0.53846154 0.88235294 0.64       0.73333333]
0.6829674936287513


#Average f1 score of Logistic Regression is 0.68.

#Exercise 2. Try building DecisionTreeClassifier with different parameters of max_depth from 2 to 10 and see if you can beat score of 0.68

In [None]:
### your code here
for k in range(2, 10):
    model = DecisionTreeClassifier(max_depth=k, random_state=42)
    scores = cross_val_score(model, train[features], train['DEATH_EVENT'], cv=5, scoring='f1')
    mean_score = np.mean(scores)

    print(f"{k}   {mean_score}")


2   0.6882163228927934
3   0.6884615384615385
4   0.6599533996118436
5   0.611533185020393
6   0.646494708994709
7   0.6397435897435897
8   0.6616379310344828
9   0.6433823529411764


#Exercise 3. Try building RandomForestClassifier with different parameters of max_depth from 2 to 10 and see if you can beat score of 0.68

In [None]:
### your code here

for k in range(2,10):
    model = RandomForestClassifier(max_depth=k, random_state=42)
    scores = cross_val_score(model, train[features], train['DEATH_EVENT'], cv=5, scoring='f1')
    mean_score = np.mean(scores)
    print(f"{k}   {mean_score}")


2   0.684417947314499
3   0.7628852240546393
4   0.7601203648047967
5   0.7769955524030101
6   0.7561657712151725
7   0.7626394718065201
8   0.7324408384043272
9   0.7355008210180625


# Exercise 4. Authors of the article claim that "ejection_fraction" and "serum_creatinine" are strong predictors of heart failure. Lets do some feature engineering and try to combine these features. Create two new features in dataframe  "ejection_fraction"x"serum_creatinine" and "ejection_fraction"/"serum_creatinine"

In [None]:
##your code here

train['ejection_fraction_x_serum_creatinine'] = train['ejection_fraction']*train['serum_creatinine']
train['ejection_fraction_/_serum_creatinine'] = train['ejection_fraction']/train['serum_creatinine']


#Exercise 5. add two nea features to your features list and try building RandomForestClassifier with different parameters again and see if your scores have improved

In [None]:
### Your code here

In [None]:
train['creatinine_per_day'] = train['serum_creatinine']/train['time']
train['platelet_to_creatinine_ratio'] = train['platelets']/train['serum_creatinine']


In [None]:
train

Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time,DEATH_EVENT,ejection_fraction_x_serum_creatinine,ejection_fraction_/_serum_creatinine,creatinine_per_day,platelet_to_creatinine_ratio
115,58.0,1,400,0,40,0,164000.0,1.0,139,0,0,91,0,40.0,40.000000,0.010989,164000.000000
23,53.0,0,63,1,60,0,368000.0,0.8,135,1,0,22,0,48.0,75.000000,0.036364,460000.000000
0,75.0,0,582,0,20,1,265000.0,1.9,130,1,0,4,1,38.0,10.526316,0.475000,139473.684211
247,64.0,0,143,0,25,0,246000.0,2.4,135,1,0,214,0,60.0,10.416667,0.011215,102500.000000
194,45.0,0,582,0,20,1,126000.0,1.6,135,1,0,180,1,32.0,12.500000,0.008889,78750.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
229,72.0,0,211,0,25,0,274000.0,1.2,134,0,0,207,0,30.0,20.833333,0.005797,228333.333333
131,60.0,1,1082,1,45,0,250000.0,6.1,131,1,0,107,0,274.5,7.377049,0.057009,40983.606557
177,49.0,1,69,0,50,0,132000.0,1.0,140,0,0,147,0,50.0,50.000000,0.006803,132000.000000
112,50.0,0,369,1,25,0,252000.0,1.6,136,1,0,90,0,40.0,15.625000,0.017778,157500.000000


In [None]:
for k in range(2,10):
    model = RandomForestClassifier(max_depth=k, random_state=42)
    scores = cross_val_score(model, train[features], train['DEATH_EVENT'], cv=5, scoring='f1')
    mean_score = np.mean(scores)
    print(f"{k}   {mean_score}")

2   0.684417947314499
3   0.7628852240546393
4   0.7601203648047967
5   0.7769955524030101
6   0.7561657712151725
7   0.7626394718065201
8   0.7324408384043272
9   0.7355008210180625
