In [1]:
import os

import numpy
import pandas
import sklearn.datasets

# Instructions

- Run this script to generate data for all experiments
- This script writes the data to `./data/heart_failure.csv`

In [2]:
# Fetch the 'heart-failure' dataset from OpenML as a DataFrame, name the row
# index 'id', and lowercase every column name.
data = (
    sklearn.datasets
    .fetch_openml('heart-failure', as_frame=True, parser='auto')
    .frame
    .rename_axis(index='id')
    .rename(columns=str.lower)
)
data

Unnamed: 0_level_0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time,death_event
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0,75.0,0.0,582.0,0.0,20.0,1.0,265000.00,1.9,130.0,1.0,0.0,4.0,1.0
1,55.0,0.0,7861.0,0.0,38.0,0.0,263358.03,1.1,136.0,1.0,0.0,6.0,1.0
2,65.0,0.0,146.0,0.0,20.0,0.0,162000.00,1.3,129.0,1.0,1.0,7.0,1.0
3,50.0,1.0,111.0,0.0,20.0,0.0,210000.00,1.9,137.0,1.0,0.0,7.0,1.0
4,65.0,1.0,160.0,1.0,20.0,0.0,327000.00,2.7,116.0,0.0,0.0,8.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
294,62.0,0.0,61.0,1.0,38.0,1.0,155000.00,1.1,143.0,1.0,1.0,270.0,0.0
295,55.0,0.0,1820.0,0.0,38.0,0.0,270000.00,1.2,139.0,0.0,0.0,271.0,0.0
296,45.0,0.0,2060.0,1.0,60.0,0.0,742000.00,0.8,138.0,0.0,0.0,278.0,0.0
297,45.0,0.0,2413.0,0.0,38.0,0.0,140000.00,1.4,140.0,1.0,1.0,280.0,0.0


In [3]:
# Text labels for the indicator columns that are stored as 0.0 / 1.0.
mappers = {
    'anaemia' : {1.0: 'yes', 0.0: 'no'},
    'diabetes': {1.0: 'yes', 0.0: 'no'},
    'high_blood_pressure': {1.0: 'yes', 0.0: 'no'},
    'sex'     : {1.0: 'male', 0.0: 'female'},
    'smoking' : {1.0: 'yes', 0.0: 'no'},
    'death_event': {1.0: 'yes', 0.0: 'no'},
}

# Use `replace` rather than `map`: `map` turns every value that has no entry in
# the dict into NaN, so re-running this cell on already-labelled columns would
# wipe them out. `replace` leaves values without a mapping untouched, which
# makes the cell safe to execute more than once.
for column, mapper in mappers.items():
    data[column] = data[column].replace(mapper)

data

Unnamed: 0_level_0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time,death_event
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0,75.0,no,582.0,no,20.0,yes,265000.00,1.9,130.0,male,no,4.0,yes
1,55.0,no,7861.0,no,38.0,no,263358.03,1.1,136.0,male,no,6.0,yes
2,65.0,no,146.0,no,20.0,no,162000.00,1.3,129.0,male,yes,7.0,yes
3,50.0,yes,111.0,no,20.0,no,210000.00,1.9,137.0,male,no,7.0,yes
4,65.0,yes,160.0,yes,20.0,no,327000.00,2.7,116.0,female,no,8.0,yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...
294,62.0,no,61.0,yes,38.0,yes,155000.00,1.1,143.0,male,yes,270.0,no
295,55.0,no,1820.0,no,38.0,no,270000.00,1.2,139.0,female,no,271.0,no
296,45.0,no,2060.0,yes,60.0,no,742000.00,0.8,138.0,female,no,278.0,no
297,45.0,no,2413.0,no,38.0,no,140000.00,1.4,140.0,male,yes,280.0,no


In [4]:
# Seed NumPy's global generator so the same cells are selected on every run.
numpy.random.seed(0)

# Boolean array with the frame's shape; each entry is True with probability 0.001.
missing_mask = numpy.random.choice(
    [False, True],
    size=data.shape,
    p=[0.999, 0.001],
)

# Replace the selected cells with NaN.
data = data.mask(cond=missing_mask, other=numpy.nan)
data

Unnamed: 0_level_0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time,death_event
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0,75.0,no,582.0,no,20.0,yes,265000.00,1.9,130.0,male,no,4.0,yes
1,55.0,no,7861.0,no,38.0,no,263358.03,1.1,136.0,male,no,6.0,yes
2,65.0,no,146.0,no,20.0,no,162000.00,1.3,129.0,male,yes,7.0,yes
3,50.0,yes,111.0,no,20.0,no,210000.00,1.9,137.0,male,no,7.0,yes
4,65.0,yes,160.0,yes,20.0,no,327000.00,2.7,116.0,female,no,8.0,yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...
294,62.0,no,61.0,yes,38.0,yes,155000.00,1.1,143.0,male,yes,270.0,no
295,55.0,no,1820.0,no,38.0,no,270000.00,1.2,139.0,female,no,271.0,no
296,45.0,no,2060.0,yes,60.0,no,742000.00,0.8,138.0,female,no,278.0,no
297,45.0,no,2413.0,no,38.0,no,140000.00,1.4,140.0,male,yes,280.0,no


In [5]:
# Create the output directory first: `to_csv` does not create missing parent
# directories and would fail if `./data` does not exist yet.
os.makedirs('./data', exist_ok=True)
data.to_csv('./data/heart_failure.csv')