In [8]:
# import essential libraries
import numpy as np
import pandas as pd
import gzip

## Export to CSV

### 10 Features - 1 Label

We'll only keep 10 features:
* age
* sex
* height
* weight
* heart_axis
* validated_by
* second_opinion
* validated_by_human
* pacemaker
* strat_fold

and 1 label: 
* ritmi

We drop the rows that are missing height or weight, which leaves 1803 data points with 10 features and 1 label, and then export the result to a CSV file.

In [4]:
# Load the raw cohort table: ';'-separated, header on the first row,
# and the first column used as the row index.
df = pd.read_csv(
    '../../../data/afib_data/coorteeqsrafva.csv',
    sep=';',
    header=0,
    index_col=0,
)
df.head()

Unnamed: 0,diagnosi,ecg_id,ritmi,patient_id,age,sex,height,weight,nurse,site,...,validated_by_human,baseline_drift,static_noise,burst_noise,electrodes_problems,extra_beats,pacemaker,strat_fold,filename_lr,filename_hr
0,STACH,10900,VA,15654.0,54.0,0,,,0.0,0.0,...,False,,,,,,,6,records100/10000/10900_lr,records500/10000/10900_hr
1,AFLT,10900,AF,15654.0,54.0,0,,,0.0,0.0,...,False,,,,,,,6,records100/10000/10900_lr,records500/10000/10900_hr
2,SR,8209,SR,12281.0,55.0,0,,,1.0,2.0,...,True,,,,,,,10,records100/08000/08209_lr,records500/08000/08209_hr
3,STACH,17620,VA,2007.0,29.0,1,164.0,56.0,7.0,1.0,...,True,,,,,,,1,records100/17000/17620_lr,records500/17000/17620_hr
4,SBRAD,12967,VA,8685.0,57.0,0,,,0.0,0.0,...,False,,", I-AVR,",,,,,1,records100/12000/12967_lr,records500/12000/12967_hr


In [5]:
# Columns that are not part of the 10-feature / 1-label subset.
unused_columns = [
    'diagnosi', 'ecg_id', 'patient_id', 'nurse', 'site', 'device',
    'recording_date', 'report', 'scp_codes', 'infarction_stadium1',
    'infarction_stadium2', 'initial_autogenerated_report', 'baseline_drift',
    'static_noise', 'burst_noise', 'electrodes_problems', 'extra_beats',
    'filename_lr', 'filename_hr',
]
sub_df = df.drop(columns=unused_columns)
sub_df.head()

Unnamed: 0,ritmi,age,sex,height,weight,heart_axis,validated_by,second_opinion,validated_by_human,pacemaker,strat_fold
0,VA,54.0,0,,,MID,,False,False,,6
1,AF,54.0,0,,,MID,,False,False,,6
2,SR,55.0,0,,,LAD,1.0,False,True,,10
3,VA,29.0,1,164.0,56.0,,0.0,False,True,,1
4,VA,57.0,0,,,MID,,False,False,,1


In [6]:
# Summarise column dtypes and non-null counts to see which features have missing values.
sub_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6428 entries, 0 to 6427
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   ritmi               6428 non-null   object 
 1   age                 6394 non-null   float64
 2   sex                 6428 non-null   int64  
 3   height              1866 non-null   float64
 4   weight              2428 non-null   float64
 5   heart_axis          4124 non-null   object 
 6   validated_by        3676 non-null   float64
 7   second_opinion      6428 non-null   bool   
 8   validated_by_human  6428 non-null   bool   
 9   pacemaker           294 non-null    object 
 10  strat_fold          6428 non-null   int64  
dtypes: bool(2), float64(4), int64(2), object(3)
memory usage: 514.7+ KB


In [7]:
# Keep only the rows where both height and weight are present. The explicit
# .copy() makes sub_df an independent frame, so the column assignments below
# act on it directly rather than on a slice of the previous frame (the
# situation that raises SettingWithCopyWarning).
sub_df = sub_df.dropna(subset=['height', 'weight']).copy()

# Fill missing ages with the mean age of the remaining rows. The result is
# assigned back to the column: calling fillna(inplace=True) on a selected
# column operates on a temporary object and is not guaranteed to update the
# frame (it has no effect under pandas Copy-on-Write).
sub_df['age'] = sub_df['age'].fillna(sub_df['age'].mean())

# fill missing values for validated_by with 0
sub_df['validated_by'] = sub_df['validated_by'].fillna(0)

# Fill missing heart_axis / pacemaker entries with the placeholder string
# 'Missing' (not 0); the categorical encoding step later maps 'Missing' to 0.
sub_df['heart_axis'] = sub_df['heart_axis'].fillna('Missing')
sub_df['pacemaker'] = sub_df['pacemaker'].fillna('Missing')

In [8]:
# Re-inspect dtypes and non-null counts after the missing-value handling.
sub_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1803 entries, 3 to 6426
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   ritmi               1803 non-null   object 
 1   age                 1803 non-null   float64
 2   sex                 1803 non-null   int64  
 3   height              1803 non-null   float64
 4   weight              1803 non-null   float64
 5   heart_axis          1803 non-null   object 
 6   validated_by        1803 non-null   float64
 7   second_opinion      1803 non-null   bool   
 8   validated_by_human  1803 non-null   bool   
 9   pacemaker           1803 non-null   object 
 10  strat_fold          1803 non-null   int64  
dtypes: bool(2), float64(4), int64(2), object(3)
memory usage: 144.4+ KB


In [7]:
# Integer codes for every categorical column. Series.map turns any value that
# has no entry in its table into NaN.
category_codes = {
    'ritmi': {'SR': 0, 'AF': 1, 'VA': 2},
    'second_opinion': {False: 0, True: 1},
    'validated_by_human': {False: 0, True: 1},
    'heart_axis': {'Missing': 0, 'LAD': 1, 'AXL': 2, 'MID': 3, 'RAD': 4, 'ALAD': 5, 'AXR': 6},
    'pacemaker': {'Missing': 0, 'PACE????, nan': 0, '?, nan': 0, 'ja, pacemaker': 1},
}
for column, codes in category_codes.items():
    encoded = sub_df[column].map(codes)
    sub_df[column] = encoded.values

# Renumber the rows 0..N-1 and discard the old index labels.
sub_df = sub_df.reset_index(drop=True)
sub_df

Unnamed: 0,ritmi,age,sex,height,weight,heart_axis,validated_by,second_opinion,validated_by_human,pacemaker,strat_fold
0,2,29.0,1,164.0,56.0,0,0.0,0,1,0,1
1,0,59.0,0,156.0,75.0,0,0.0,0,1,0,9
2,2,84.0,1,152.0,51.0,0,0.0,0,1,0,7
3,0,79.0,0,172.0,66.0,0,0.0,0,1,0,5
4,1,67.0,0,178.0,73.0,4,0.0,0,1,0,5
...,...,...,...,...,...,...,...,...,...,...,...
1798,0,76.0,0,166.0,67.0,0,0.0,0,1,0,4
1799,2,78.0,1,176.0,76.0,1,4.0,0,1,0,1
1800,2,81.0,0,178.0,70.0,1,0.0,0,1,0,4
1801,2,88.0,0,152.0,45.0,0,0.0,0,1,0,10


In [8]:
# Write the encoded table (10 features + 1 label) to disk without the row index.
sub_df.to_csv(
    '../../../data/afib_data/training_11_features.csv',
    index=False,
)

### 13 Features - 1 Label

We'll only keep 13 features:
* age
* sex
* height
* weight
* nurse
* site
* device
* heart_axis
* validated_by
* second_opinion
* validated_by_human
* pacemaker
* strat_fold

and 1 label: 
* ritmi

We fill missing values in the age, height, and weight columns with each column's mean, and missing values in the nurse, site, and validated_by columns with 0. Missing values in the heart_axis and pacemaker columns are first replaced with the placeholder 'Missing', which is then encoded as 0. After encoding, rows that still contain missing values are dropped, so we end up with 6366 data points with 13 features and 1 label, which we then export to a CSV file.

In [9]:
# Reload the raw cohort table from scratch for the 13-feature variant:
# ';'-separated, header on the first row, first column as the row index.
df = pd.read_csv(
    '../../../data/afib_data/coorteeqsrafva.csv',
    sep=';',
    header=0,
    index_col=0,
)
df.head()

Unnamed: 0,diagnosi,ecg_id,ritmi,patient_id,age,sex,height,weight,nurse,site,...,validated_by_human,baseline_drift,static_noise,burst_noise,electrodes_problems,extra_beats,pacemaker,strat_fold,filename_lr,filename_hr
0,STACH,10900,VA,15654.0,54.0,0,,,0.0,0.0,...,False,,,,,,,6,records100/10000/10900_lr,records500/10000/10900_hr
1,AFLT,10900,AF,15654.0,54.0,0,,,0.0,0.0,...,False,,,,,,,6,records100/10000/10900_lr,records500/10000/10900_hr
2,SR,8209,SR,12281.0,55.0,0,,,1.0,2.0,...,True,,,,,,,10,records100/08000/08209_lr,records500/08000/08209_hr
3,STACH,17620,VA,2007.0,29.0,1,164.0,56.0,7.0,1.0,...,True,,,,,,,1,records100/17000/17620_lr,records500/17000/17620_hr
4,SBRAD,12967,VA,8685.0,57.0,0,,,0.0,0.0,...,False,,", I-AVR,",,,,,1,records100/12000/12967_lr,records500/12000/12967_hr


In [10]:
# Columns that are not part of the 13-feature / 1-label subset.
unused_columns = [
    'diagnosi', 'ecg_id', 'patient_id', 'recording_date', 'report',
    'scp_codes', 'infarction_stadium1', 'infarction_stadium2',
    'initial_autogenerated_report', 'baseline_drift', 'static_noise',
    'burst_noise', 'electrodes_problems', 'extra_beats', 'filename_lr',
    'filename_hr',
]
sub_df = df.drop(columns=unused_columns)
sub_df.head()

Unnamed: 0,ritmi,age,sex,height,weight,nurse,site,device,heart_axis,validated_by,second_opinion,validated_by_human,pacemaker,strat_fold
0,VA,54.0,0,,,0.0,0.0,CS100 3,MID,,False,False,,6
1,AF,54.0,0,,,0.0,0.0,CS100 3,MID,,False,False,,6
2,SR,55.0,0,,,1.0,2.0,CS-12,LAD,1.0,False,True,,10
3,VA,29.0,1,164.0,56.0,7.0,1.0,AT-6 C 5.6,,0.0,False,True,,1
4,VA,57.0,0,,,0.0,0.0,CS100 3,MID,,False,False,,1


In [11]:
# Fill missing age, height and weight values with the mean of their own column.
# The result is assigned back to the column: calling fillna(inplace=True) on a
# selected column operates on a temporary object and is not guaranteed to
# update the frame (it has no effect under pandas Copy-on-Write).
for column in ['age', 'height', 'weight']:
    sub_df[column] = sub_df[column].fillna(sub_df[column].mean())

# Fill missing values with 0 for the numeric identifier columns.
sub_df['nurse'] = sub_df['nurse'].fillna(0)
sub_df['site'] = sub_df['site'].fillna(0)
sub_df['validated_by'] = sub_df['validated_by'].fillna(0)

# Fill missing heart_axis / pacemaker entries with the placeholder string
# 'Missing' (not 0); the categorical encoding step later maps 'Missing' to 0.
sub_df['heart_axis'] = sub_df['heart_axis'].fillna('Missing')
sub_df['pacemaker'] = sub_df['pacemaker'].fillna('Missing')

In [12]:
# Integer codes for every categorical column. Series.map turns any value that
# has no entry in its table into NaN. The whitespace inside the device keys is
# significant and must match the raw strings exactly.
category_codes = {
    'ritmi': {'SR': 0, 'AF': 1, 'VA': 2},
    'second_opinion': {False: 0, True: 1},
    'validated_by_human': {False: 0, True: 1},
    'heart_axis': {'Missing': 0, 'LAD': 1, 'AXL': 2, 'MID': 3, 'RAD': 4, 'ALAD': 5, 'AXR': 6},
    'pacemaker': {'Missing': 0, 'PACE????, nan': 0, '?, nan': 0, 'ja, pacemaker': 1},
    'device': {
        'CS100    3': 0,
        'CS-12': 1,
        'AT-6 C 5.5': 2,
        'CS-12   E': 3,
        'AT-6     6': 4,
        'AT-60    3': 5,
        'AT-6 C 5.8': 6,
        'AT-6 C': 7,
        'AT-6 C 5.0': 8,
        'AT-6 C 5.3': 9,
        'AT-6 C 5.6': 10,
    },
}
for column, codes in category_codes.items():
    encoded = sub_df[column].map(codes)
    sub_df[column] = encoded.values

In [13]:
# Discard every row that still holds a missing value in any column.
sub_df = sub_df.dropna(axis=0, how='any')

# Sanity check: report the remaining (rows, columns) and preview the frame.
print(sub_df.shape)
sub_df.head()

(6366, 14)


Unnamed: 0,ritmi,age,sex,height,weight,nurse,site,device,heart_axis,validated_by,second_opinion,validated_by_human,pacemaker,strat_fold
0,2,54.0,0,166.796356,69.841845,0.0,0.0,0,3.0,0.0,0,0,0.0,6
1,1,54.0,0,166.796356,69.841845,0.0,0.0,0,3.0,0.0,0,0,0.0,6
2,0,55.0,0,166.796356,69.841845,1.0,2.0,1,1.0,1.0,0,1,0.0,10
3,2,29.0,1,164.0,56.0,7.0,1.0,10,0.0,0.0,0,1,0.0,1
4,2,57.0,0,166.796356,69.841845,0.0,0.0,0,3.0,0.0,0,0,0.0,1


In [22]:
# Write the encoded table (13 features + 1 label) to disk without the row index.
sub_df.to_csv(
    '../../../data/afib_data/training_13_features.csv',
    index=False,
)

### Compress 3D Numpy Array Features

Because the NumPy data file is too large, we compress it with gzip to reduce its size.

In [None]:
# Save the ECG array as a gzip-compressed .npy stream. The with-block closes
# the archive even if np.save raises, so a failed write cannot leave the file
# handle open.
# NOTE(review): ecg_arr is not defined in any visible cell of this notebook;
# it has to be loaded before this cell can run on a fresh kernel.
with gzip.GzipFile("../../../data/afib_data/compressed_npy.gz", "w") as f:
    np.save(file=f, arr=ecg_arr)

### 3D Numpy Arr to CSV

After reading in the compressed NumPy data file, we reshape the 3D array into a 2D array and convert it to a DataFrame. We end up with 30948374 data points, 12 features, and 1 label.

In [28]:
# Read in the compressed numpy array file; the with-block closes the archive
# as soon as the array has been loaded.
with gzip.GzipFile('../../../data/afib_data/compressed_npy.gz', "r") as f:
    np_arr = np.load(f)
# Show the array shape (the recorded output of this cell is (6428, 5000, 12)).
print(np_arr.shape)

# Transform the 3D array to 2D: stack the first two axes into rows and prepend
# a column holding each row's position along the first axis, so every row can
# be traced back to the record it came from.
m, n, r = np_arr.shape
out_arr = np.column_stack((np.repeat(np.arange(m), n), np_arr.reshape(m * n, -1)))

# Convert to a dataframe and drop rows that are exact duplicates.
out_df = pd.DataFrame(out_arr)
out_df = out_df.drop_duplicates()
# NOTE(review): these names assume the last axis of np_arr is ordered exactly
# like this list - confirm the lead order against the source recordings.
out_df.columns = ['index', 'I', 'II', 'III', 'aVF', 'aVR', 'aVL', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6']
# column_stack promoted the record position to the array's dtype; store it as int32.
out_df['index'] = out_df['index'].astype('int32')

(6428, 5000, 12)

In [None]:
# Build a dataframe holding only the label column. The CSV is read from the
# same data directory as the other cells of this notebook, so the cell does
# not depend on a second copy of the file in the working directory.
df = pd.read_csv('../../../data/afib_data/coorteeqsrafva.csv', sep=';', header=0, index_col=0)

# .copy() makes label_df an independent frame, so the column assignments below
# do not write into a slice of df (which raises SettingWithCopyWarning).
label_df = df[['ritmi']].copy()

# Encode the label as an integer; values outside the mapping become NaN.
label_df['ritmi'] = label_df['ritmi'].map({'SR': 0, 'AF': 1, 'VA': 2}).values

# Positional id (0..N-1) of each row, used as the join key for the merge.
label_df['unique_id'] = np.arange(label_df.shape[0])

In [None]:
# Join each row of out_df to the label whose unique_id equals the row's
# 'index' value, then discard the two join-key columns.
merged_df = (
    out_df
    .merge(label_df, how='inner', left_on='index', right_on='unique_id')
    .drop(columns=['index', 'unique_id'])
)

# Write the labelled table to disk without the row index.
merged_df.to_csv('../../../data/afib_data/3d_numpy_features.csv', index=False)