# Data Loader for waveform data

In [1]:
import pandas as pd
import ast
import numpy as np

# Locations of the two CSV tables read by this notebook (relative to the notebook's directory).
scp_statements_path = (
    '../../../../../data/padmalab_external/special_project/'
    'physionet.org/files/ptb-xl/1.0.3/scp_statements.csv'
)
database_path = (
    '../../../../../data/padmalab_external/special_project/'
    'physionet.org/files/ptb-xl/1.0.3/ptbxl_database.csv'
)

In [2]:
# Load the SCP statements table from `scp_statements_path` and preview its first rows.
df = pd.read_csv(scp_statements_path)
df.head()

Unnamed: 0.1,Unnamed: 0,description,diagnostic,form,rhythm,diagnostic_class,diagnostic_subclass,Statement Category,SCP-ECG Statement Description,AHA code,aECG REFID,CDISC Code,DICOM Code
0,NDT,non-diagnostic T abnormalities,1.0,1.0,,STTC,STTC,other ST-T descriptive statements,non-diagnostic T abnormalities,,,,
1,NST_,non-specific ST changes,1.0,1.0,,STTC,NST_,Basic roots for coding ST-T changes and abnorm...,non-specific ST changes,145.0,MDC_ECG_RHY_STHILOST,,
2,DIG,digitalis-effect,1.0,1.0,,STTC,STTC,other ST-T descriptive statements,suggests digitalis-effect,205.0,,,
3,LNGQT,long QT-interval,1.0,1.0,,STTC,STTC,other ST-T descriptive statements,long QT-interval,148.0,,,
4,NORM,normal ECG,1.0,,,NORM,NORM,Normal/abnormal,normal ECG,1.0,,,F-000B7


In [3]:
# Load the record database table from `database_path` and preview its first rows.
df2 = pd.read_csv(database_path)
df2.head()

Unnamed: 0,ecg_id,patient_id,age,sex,height,weight,nurse,site,device,recording_date,...,validated_by_human,baseline_drift,static_noise,burst_noise,electrodes_problems,extra_beats,pacemaker,strat_fold,filename_lr,filename_hr
0,1,15709.0,56.0,1,,63.0,2.0,0.0,CS-12 E,1984-11-09 09:17:34,...,True,,", I-V1,",,,,,3,records100/00000/00001_lr,records500/00000/00001_hr
1,2,13243.0,19.0,0,,70.0,2.0,0.0,CS-12 E,1984-11-14 12:55:37,...,True,,,,,,,2,records100/00000/00002_lr,records500/00000/00002_hr
2,3,20372.0,37.0,1,,69.0,2.0,0.0,CS-12 E,1984-11-15 12:49:10,...,True,,,,,,,5,records100/00000/00003_lr,records500/00000/00003_hr
3,4,17014.0,24.0,0,,82.0,2.0,0.0,CS-12 E,1984-11-15 13:44:57,...,True,", II,III,AVF",,,,,,3,records100/00000/00004_lr,records500/00000/00004_hr
4,5,17448.0,19.0,1,,70.0,2.0,0.0,CS-12 E,1984-11-17 10:43:15,...,True,", III,AVR,AVF",,,,,,4,records100/00000/00005_lr,records500/00000/00005_hr


In [4]:
# Total number of records, then the number flagged as validated by a human.
print(len(df2))
print(int((df2['validated_by_human'] == True).sum()))

21799
16056


In [5]:
# Read the database keyed by ecg_id, then parse every stringified
# scp_codes entry into the Python literal it spells out.
Y = pd.read_csv(database_path, index_col='ecg_id')
Y['scp_codes'] = Y['scp_codes'].apply(ast.literal_eval)

In [6]:
# Display the parsed scp_codes column of Y.
Y.scp_codes

ecg_id
1                 {'NORM': 100.0, 'LVOLT': 0.0, 'SR': 0.0}
2                             {'NORM': 80.0, 'SBRAD': 0.0}
3                               {'NORM': 100.0, 'SR': 0.0}
4                               {'NORM': 100.0, 'SR': 0.0}
5                               {'NORM': 100.0, 'SR': 0.0}
                               ...                        
21833    {'NDT': 100.0, 'PVC': 100.0, 'VCLVH': 0.0, 'ST...
21834             {'NORM': 100.0, 'ABQRS': 0.0, 'SR': 0.0}
21835                           {'ISCAS': 50.0, 'SR': 0.0}
21836                           {'NORM': 100.0, 'SR': 0.0}
21837                           {'NORM': 100.0, 'SR': 0.0}
Name: scp_codes, Length: 21799, dtype: object

In [7]:
# Statement table indexed by its first column, restricted to the rows
# whose `diagnostic` flag equals 1 (used for diagnostic aggregation).
agg_df = pd.read_csv(scp_statements_path, index_col=0).loc[
    lambda statements: statements['diagnostic'] == 1
]

def aggregate_diagnostic(y_dic):
    """Collect the diagnostic classes of the codes found in one record.

    Args:
        y_dic (dict): Mapping keyed by SCP statement code; only the keys are
            used.

    Returns:
        list: The distinct ``diagnostic_class`` values of the keys that appear
        in the index of the module-level ``agg_df``, in sorted order. Sorting
        keeps the result identical across runs, whereas iterating a ``set`` of
        strings depends on hash randomisation. Empty when no key matches.
    """
    classes = {
        agg_df.loc[key].diagnostic_class
        for key in y_dic
        if key in agg_df.index
    }
    return sorted(classes)

# Derive one list of diagnostic superclasses per record from its scp_codes.
Y['diagnostic_superclass'] = Y['scp_codes'].apply(aggregate_diagnostic)

In [8]:
# Preview the first 20 rows, including the new diagnostic_superclass column.
Y.head(20)

Unnamed: 0_level_0,patient_id,age,sex,height,weight,nurse,site,device,recording_date,report,...,baseline_drift,static_noise,burst_noise,electrodes_problems,extra_beats,pacemaker,strat_fold,filename_lr,filename_hr,diagnostic_superclass
ecg_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,15709.0,56.0,1,,63.0,2.0,0.0,CS-12 E,1984-11-09 09:17:34,sinusrhythmus periphere niederspannung,...,,", I-V1,",,,,,3,records100/00000/00001_lr,records500/00000/00001_hr,[NORM]
2,13243.0,19.0,0,,70.0,2.0,0.0,CS-12 E,1984-11-14 12:55:37,sinusbradykardie sonst normales ekg,...,,,,,,,2,records100/00000/00002_lr,records500/00000/00002_hr,[NORM]
3,20372.0,37.0,1,,69.0,2.0,0.0,CS-12 E,1984-11-15 12:49:10,sinusrhythmus normales ekg,...,,,,,,,5,records100/00000/00003_lr,records500/00000/00003_hr,[NORM]
4,17014.0,24.0,0,,82.0,2.0,0.0,CS-12 E,1984-11-15 13:44:57,sinusrhythmus normales ekg,...,", II,III,AVF",,,,,,3,records100/00000/00004_lr,records500/00000/00004_hr,[NORM]
5,17448.0,19.0,1,,70.0,2.0,0.0,CS-12 E,1984-11-17 10:43:15,sinusrhythmus normales ekg,...,", III,AVR,AVF",,,,,,4,records100/00000/00005_lr,records500/00000/00005_hr,[NORM]
6,19005.0,18.0,1,,58.0,2.0,0.0,CS-12 E,1984-11-28 13:32:13,sinusrhythmus normales ekg,...,", V1",,,,,,4,records100/00000/00006_lr,records500/00000/00006_hr,[NORM]
7,16193.0,54.0,0,,83.0,2.0,0.0,CS-12 E,1984-11-28 13:32:22,"sinusrhythmus linkstyp t abnormal, wahrscheinl...",...,,,,,,,7,records100/00000/00007_lr,records500/00000/00007_hr,[NORM]
8,11275.0,48.0,0,,95.0,2.0,0.0,CS-12 E,1984-12-01 14:49:52,sinusrhythmus linkstyp qrs(t) abnormal infe...,...,", II,AVF",", I-AVF,",,,,,9,records100/00000/00008_lr,records500/00000/00008_hr,[MI]
9,18792.0,55.0,0,,70.0,2.0,0.0,CS-12 E,1984-12-08 09:44:43,sinusrhythmus normales ekg,...,,", I-AVR,",,,,,10,records100/00000/00009_lr,records500/00000/00009_hr,[NORM]
10,9456.0,22.0,1,,56.0,2.0,0.0,CS-12 E,1984-12-12 14:12:46,sinusrhythmus normales ekg,...,,,,,,,9,records100/00000/00010_lr,records500/00000/00010_hr,[NORM]


In [9]:
# Records in stratification fold `test_fold` form the test labels;
# every other fold contributes to the training labels.
test_fold = 10

y_test = Y.loc[Y['strat_fold'] == test_fold, 'diagnostic_superclass']
y_train = Y.loc[Y['strat_fold'] != test_fold, 'diagnostic_superclass']

In [10]:
# Keep the training records whose superclass list has exactly one entry.
y_train_single_class = y_train[y_train.apply(len) == 1]

# Distinct label lists that remain after the filter.
unique_classes = np.unique(y_train_single_class)
print(unique_classes)

# Cell output: number of single-class training records.
len(y_train_single_class)

[list(['CD']) list(['HYP']) list(['MI']) list(['NORM']) list(['STTC'])]


14594

In [11]:
y_train_single_class[:30]

ecg_id
1     [NORM]
2     [NORM]
3     [NORM]
4     [NORM]
5     [NORM]
6     [NORM]
7     [NORM]
8       [MI]
10    [NORM]
11    [NORM]
12    [NORM]
13    [NORM]
14    [NORM]
15    [NORM]
16    [NORM]
19    [NORM]
21    [NORM]
22    [STTC]
24    [NORM]
25    [NORM]
26    [STTC]
27    [NORM]
28    [STTC]
29    [NORM]
30     [HYP]
31    [NORM]
32      [CD]
33    [NORM]
35    [NORM]
36    [NORM]
Name: diagnostic_superclass, dtype: object

In [12]:
from sklearn.preprocessing import LabelEncoder
# Unwrap each single-element label list into the bare class name.
y_train_flat = y_train_single_class.map(lambda labels: labels[0])

# Fit the encoder on the class names and encode them as integers in one step.
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train_flat)

print("Integer Encoded Labels: ", y_train_encoded[:30])

Integer Encoded Labels:  [3 3 3 3 3 3 3 2 3 3 3 3 3 3 3 3 3 4 3 3 4 3 4 3 1 3 0 3 3 3]


In [13]:
# Show which integer the fitted encoder assigns to each class name.
class_names = label_encoder.classes_
label_mapping = {
    name: code
    for name, code in zip(class_names, label_encoder.transform(class_names))
}
print("Label Mapping: ", label_mapping)

Label Mapping:  {'CD': 0, 'HYP': 1, 'MI': 2, 'NORM': 3, 'STTC': 4}


## Run the following cells to build and export the test-set index (record paths and labels)

In [25]:
import os
import pandas as pd
import ast
import torch
from torch.utils.data import Dataset, DataLoader
from PIL import Image
from sklearn.preprocessing import LabelEncoder
import torchvision.transforms as transforms

# Export settings for this cell.
split = 'test'  # 'test' selects fold `test_fold`; any other value selects the remaining folds
freq = 500      # 100 selects the `filename_lr` column, any other value `filename_hr`
prefix = "../../../../../data/padmalab_external/special_project/physionet.org/files/ptb-xl/1.0.3/"

# Load the database file and parse the stringified scp_codes entries
ptb_xl_database_df = pd.read_csv(database_path, index_col='ecg_id')
ptb_xl_database_df['scp_codes'] = ptb_xl_database_df['scp_codes'].apply(ast.literal_eval)

# Load scp_statements.csv for diagnostic aggregation (rows with diagnostic == 1 only)
agg_df = pd.read_csv(scp_statements_path, index_col=0)
agg_df = agg_df[agg_df.diagnostic == 1]

def aggregate_diagnostic(y_dic):
    """Return the sorted, distinct diagnostic classes of the codes in ``y_dic``.

    Only keys present in the index of ``agg_df`` contribute. The result is
    sorted so that it does not depend on set iteration order.
    """
    classes = {
        agg_df.loc[key].diagnostic_class
        for key in y_dic
        if key in agg_df.index
    }
    return sorted(classes)

# Apply diagnostic superclass
ptb_xl_database_df['diagnostic_superclass'] = ptb_xl_database_df.scp_codes.apply(aggregate_diagnostic)
Y = ptb_xl_database_df

# Split data into train and test
test_fold = 10
y_train = Y[Y.strat_fold != test_fold]
y_test = Y[Y.strat_fold == test_fold]

y = y_test if split == 'test' else y_train

# Build the full record path for the requested sampling frequency
filename_column = 'filename_lr' if freq == 100 else 'filename_hr'
y_file_names = y[filename_column].apply(lambda x: prefix + x)

# Filter to get only elements with one class (mask computed once, used for both)
is_single_class = y.diagnostic_superclass.apply(len) == 1
y_single_class = y[is_single_class]
y_file_names = y_file_names[is_single_class]

# Flatten the list structure
y_single_class_flat = y_single_class.diagnostic_superclass.apply(lambda x: x[0])

# Fit the encoder on the single-class labels of the WHOLE database rather than
# on the selected split, so the class -> integer mapping cannot differ between
# the train and test exports if a class happens to be missing from one split.
all_superclasses = Y.diagnostic_superclass
all_single_class_flat = all_superclasses[all_superclasses.apply(len) == 1].apply(lambda x: x[0])
label_encoder = LabelEncoder()
label_encoder.fit(all_single_class_flat)
y_encoded = label_encoder.transform(y_single_class_flat)

# Print the mapping of integers to original labels
label_mapping = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))
print("Label Mapping: ", label_mapping)

y_labels = y_encoded
# Positional 0..n-1 index so that y_paths lines up with the y_labels array
y_paths = y_file_names.loc[y_single_class.index].reset_index(drop=True)
y_paths.index.name = 'index'

assert len(y_paths) == len(y_labels), "paths and labels must stay aligned"

Label Mapping:  {'CD': 0, 'HYP': 1, 'MI': 2, 'NORM': 3, 'STTC': 4}


In [26]:
# The two lengths should match: one encoded label per record path.
len(y_paths), len(y_labels)

(1650, 1650)

In [27]:
# Inspect the first record path (the index was reset, so label 0 is the first row).
y_paths[0]

'../../../../../data/padmalab_external/special_project/physionet.org/files/ptb-xl/1.0.3/records500/00000/00009_hr'

In [28]:
# Check the container type of the encoded labels.
type(y_labels)

numpy.ndarray

In [29]:
import pandas as pd

# One row per record: its file path and its integer-encoded label.
df = pd.DataFrame(dict(path=y_paths.values, label=y_labels))

# File name encodes the split and the sampling frequency, e.g. "<split>-<freq>.csv".
output_csv_path = f'{split}-{freq}.csv'
df.to_csv(output_csv_path, index=False)

print(f"DataFrame saved to {output_csv_path}")


DataFrame saved to test-500.csv


In [19]:
import wfdb

# Read every record listed in y_paths with wfdb, one after another.
# NOTE(review): presumably a smoke test that all files can be loaded; `ecg` is
# overwritten on each iteration, so only the last record is kept.
for path in y_paths:    
    ecg = wfdb.rdrecord(path)


## Run the following cells to build and export the train-set index (record paths and labels)

In [30]:
import os
import pandas as pd
import ast
import torch
from torch.utils.data import Dataset, DataLoader
from PIL import Image
from sklearn.preprocessing import LabelEncoder
import torchvision.transforms as transforms

# Export settings for this cell.
split = 'train'  # 'test' selects fold `test_fold`; any other value selects the remaining folds
freq = 500       # 100 selects the `filename_lr` column, any other value `filename_hr`
prefix = "../../../../../data/padmalab_external/special_project/physionet.org/files/ptb-xl/1.0.3/"

# Load the database file and parse the stringified scp_codes entries
ptb_xl_database_df = pd.read_csv(database_path, index_col='ecg_id')
ptb_xl_database_df['scp_codes'] = ptb_xl_database_df['scp_codes'].apply(ast.literal_eval)

# Load scp_statements.csv for diagnostic aggregation (rows with diagnostic == 1 only)
agg_df = pd.read_csv(scp_statements_path, index_col=0)
agg_df = agg_df[agg_df.diagnostic == 1]

def aggregate_diagnostic(y_dic):
    """Return the sorted, distinct diagnostic classes of the codes in ``y_dic``.

    Only keys present in the index of ``agg_df`` contribute. The result is
    sorted so that it does not depend on set iteration order.
    """
    classes = {
        agg_df.loc[key].diagnostic_class
        for key in y_dic
        if key in agg_df.index
    }
    return sorted(classes)

# Apply diagnostic superclass
ptb_xl_database_df['diagnostic_superclass'] = ptb_xl_database_df.scp_codes.apply(aggregate_diagnostic)
Y = ptb_xl_database_df

# Split data into train and test
test_fold = 10
y_train = Y[Y.strat_fold != test_fold]
y_test = Y[Y.strat_fold == test_fold]

y = y_test if split == 'test' else y_train

# Build the full record path for the requested sampling frequency
filename_column = 'filename_lr' if freq == 100 else 'filename_hr'
y_file_names = y[filename_column].apply(lambda x: prefix + x)

# Filter to get only elements with one class (mask computed once, used for both)
is_single_class = y.diagnostic_superclass.apply(len) == 1
y_single_class = y[is_single_class]
y_file_names = y_file_names[is_single_class]

# Flatten the list structure
y_single_class_flat = y_single_class.diagnostic_superclass.apply(lambda x: x[0])

# Fit the encoder on the single-class labels of the WHOLE database rather than
# on the selected split, so the class -> integer mapping cannot differ between
# the train and test exports if a class happens to be missing from one split.
all_superclasses = Y.diagnostic_superclass
all_single_class_flat = all_superclasses[all_superclasses.apply(len) == 1].apply(lambda x: x[0])
label_encoder = LabelEncoder()
label_encoder.fit(all_single_class_flat)
y_encoded = label_encoder.transform(y_single_class_flat)

# Print the mapping of integers to original labels
label_mapping = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))
print("Label Mapping: ", label_mapping)

y_labels = y_encoded
# Positional 0..n-1 index so that y_paths lines up with the y_labels array
y_paths = y_file_names.loc[y_single_class.index].reset_index(drop=True)
y_paths.index.name = 'index'

assert len(y_paths) == len(y_labels), "paths and labels must stay aligned"

Label Mapping:  {'CD': 0, 'HYP': 1, 'MI': 2, 'NORM': 3, 'STTC': 4}


In [31]:
import pandas as pd

# Convert y_paths and y_labels to a DataFrame (one row per record)
df = pd.DataFrame({
    'path': y_paths.values,
    'label': y_labels
})

# The specific path you want to remove.
# NOTE(review): this is a `records100/..._lr` file name, while the cell that
# builds y_paths uses freq = 500 and therefore `records500/..._hr` names, so
# the filter below matches nothing for this export. Confirm whether the
# 500 Hz file of record 20711 has to be excluded as well.
path_to_remove = '../../../../../data/padmalab_external/special_project/physionet.org/files/ptb-xl/1.0.3/records100/20000/20711_lr'

# Remove the row containing this path and renumber the remaining rows
keep_mask = df['path'] != path_to_remove
n_removed = int((~keep_mask).sum())
df = df[keep_mask].reset_index(drop=True)

# Make a no-op exclusion visible instead of letting it pass silently
if n_removed == 0:
    print(f"Warning: {path_to_remove} is not among the exported paths; no row was removed")
else:
    print(f"Removed {n_removed} row(s) matching {path_to_remove}")

# Save the DataFrame to a CSV file
output_csv_path = f'{split}-{freq}.csv'
df.to_csv(output_csv_path, index=False)

print(f"DataFrame saved to {output_csv_path}")


DataFrame saved to train-500.csv


In [32]:
# import wfdb

# for path in y_paths:    
#     ecg = wfdb.rdrecord(path)

# Data Loader and dataset class

In [24]:
import torch
from torch.utils.data import Dataset
import pandas as pd
import wfdb
from torch.utils.data import Dataset, DataLoader

import torch
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import wfdb

class PTBXL_Dataset(Dataset):
    """Map-style dataset that loads ECG records listed in a CSV index.

    The CSV is read once in ``__init__``; its first column must hold the
    record path handed to ``wfdb.rdrecord`` and its second column the label.
    Waveforms are read from disk lazily, one record per ``__getitem__`` call.
    """

    def __init__(self, csv_file, reshape=False):
        """
        Args:
            csv_file (str): Path to the CSV file with ECG paths and labels.
            reshape (bool): Whether to reshape the ECG signal to (3, 100, -1).
        """
        self.data_frame = pd.read_csv(csv_file)
        self.reshape = reshape

    def __len__(self):
        """Return the number of rows in the CSV index."""
        return len(self.data_frame)

    def __getitem__(self, idx):
        """Load, normalise and return the record at position ``idx``.

        Args:
            idx (int or torch.Tensor): Row position in the CSV index.

        Returns:
            tuple: ``(ecg_signal, label)`` where ``ecg_signal`` is a float32
            tensor and ``label`` is the value stored in the second CSV column.
        """
        if torch.is_tensor(idx):
            idx = idx.tolist()

        ecg_path = self.data_frame.iloc[idx, 0]  # Path of the ECG record
        label = self.data_frame.iloc[idx, 1]     # Corresponding label

        # Load the ECG waveform as a NumPy array
        # (assumes p_signal is laid out as (samples, channels) - TODO confirm)
        ecg_signal = wfdb.rdrecord(ecg_path).p_signal

        # Normalize along axis 0 (mean = 0, std = 1 per column). A constant
        # column has std == 0; dividing by it would fill the tensor with
        # NaN/inf, so such columns are divided by 1 and come out as all zeros.
        channel_mean = ecg_signal.mean(axis=0)
        channel_std = ecg_signal.std(axis=0, keepdims=True)
        channel_std[channel_std == 0] = 1.0
        ecg_signal = (ecg_signal - channel_mean) / channel_std

        # Convert to a PyTorch tensor
        ecg_signal = torch.tensor(ecg_signal, dtype=torch.float32)

        if self.reshape:
            # Swap the two axes, then fold into (3, 100, -1); this requires the
            # total number of values to be a multiple of 300.
            ecg_signal = ecg_signal.permute(1, 0).contiguous().view(3, 100, -1)

        return ecg_signal, label

# Example usage
csv_file = 'train-500.csv'
ptbxl_dataset = PTBXL_Dataset(csv_file=csv_file, reshape=True)

# Example DataLoader
dataloader = DataLoader(ptbxl_dataset, batch_size=32, shuffle=True, num_workers=4)

# Smoke-test the pipeline on the first batch only: reading the whole split from
# disk just to print the same shapes once per batch is slow and floods the output.
for ecg_signals, labels in dataloader:
    print(ecg_signals.size(), labels.size())
    break
