In [2]:
import wfdb
import glob
import os
import tqdm
import pandas as pd
import numpy as np

In [3]:
base_dir = "/Users/taniapazospuig/Desktop/bio/ptb-xl-a-large-publicly-available-electrocardiography-dataset-1.0.3/"
csv_path = os.path.join(base_dir, "ptbxl_database.csv")
record_dir = os.path.join(base_dir, "records500")

In [4]:
# Load metadata
variables = pd.read_csv(csv_path, index_col=0)

# Find all .dat files from records500 with the raw ECG signals
files = glob.glob(os.path.join(record_dir, "**", "*.dat"), recursive=True)

# Extract ecg_id from filenames
labels = [os.path.splitext(os.path.basename(f))[0] for f in files]
ecg_ids = [int(label.split("_")[0]) for label in labels]

# Filter metadata to keep only rows for which we have actual ECG waveform files
variables = variables.loc[variables.index.isin(ecg_ids)]

# Reorder filtered metadata to match the order of the waveform files
ordered_indices = [id for id in ecg_ids if id in variables.index]
variables = variables.loc[ordered_indices]

In [5]:
# Shape and preview
print("Shape of variables:", variables.shape) # Rows are ECGs and columns are metadata
variables.head()

Shape of variables: (21799, 27)


Unnamed: 0_level_0,patient_id,age,sex,height,weight,nurse,site,device,recording_date,report,...,validated_by_human,baseline_drift,static_noise,burst_noise,electrodes_problems,extra_beats,pacemaker,strat_fold,filename_lr,filename_hr
ecg_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
20,13619.0,56.0,0,,,2.0,0.0,CS-12 E,1985-01-23 12:55:32,supraventrikulÄre ersatzsystole(n) interponier...,...,True,,,,,VES,,9,records100/00000/00020_lr,records500/00000/00020_hr
771,3063.0,63.0,0,,,10.0,1.0,AT-6 6,1987-05-10 17:22:51,trace only requested.,...,True,,,,,,,9,records100/00000/00771_lr,records500/00000/00771_hr
297,4845.0,73.0,1,170.0,103.0,1.0,1.0,AT-6 C 5.5,1986-09-12 10:22:10,premature ventricular contraction(s). sinus rh...,...,True,,,,,,,7,records100/00000/00297_lr,records500/00000/00297_hr
120,11860.0,45.0,1,,57.0,2.0,0.0,CS-12 E,1986-01-16 06:41:58,sinusrhythmus normales ekg,...,True,,", alles,",,,,,4,records100/00000/00120_lr,records500/00000/00120_hr
671,3977.0,76.0,1,167.0,45.0,3.0,1.0,AT-6 6,1987-04-25 19:35:42,sinus rhythm. normal ecg.,...,True,,,,,,,3,records100/00000/00671_lr,records500/00000/00671_hr


In [6]:
# Check for missing values
missing = variables.isnull().sum()
missing[missing > 0].sort_values(ascending=False)

electrodes_problems    21769
infarction_stadium2    21696
pacemaker              21508
burst_noise            21186
baseline_drift         20201
extra_beats            19850
static_noise           18539
infarction_stadium1    16187
height                 14825
weight                 12378
validated_by            9378
heart_axis              8468
nurse                   1473
site                      17
dtype: int64

Features like `electrodes_problems`, `infarction_stadium2`, `pacemaker`, `burst_noise`, `baseline_drift`, `extra_beats`, and `static_noise` were excluded from analysis because they are mostly or entirely missing across the dataset. Including them would either add noise to the model or require imputation strategies that could introduce bias. Additionally, columns such as `nurse` and `site` were ignored due to their low relevance to ECG signal interpretation. These features reflect administrative or acquisition metadata, which are unlikely to provide generalizable diagnostic value and may lead to overfitting if retained.

In [7]:
print("Sex distribution:\n", variables["sex"].value_counts(), "\n")
print("Age summary:\n", variables["age"].describe())

Sex distribution:
 sex
0    11354
1    10445
Name: count, dtype: int64 

Age summary:
 count    21799.000000
mean        62.769301
std         32.308813
min          2.000000
25%         50.000000
50%         62.000000
75%         72.000000
max        300.000000
Name: age, dtype: float64


In the PTB-XL dataset, patient age is provided at the time of ECG recording. However, in compliance with HIPAA privacy standards, all patients older than 89 years are assigned a value of 300. This is a form of pseudonymization to prevent potential re-identification of elderly individuals. Since this value does not represent a real age and could skew the model or statistical summaries, we cap all age values at 89.

In [8]:
# Cap age at 89
variables["age"] = variables["age"].apply(lambda x: 89 if x == 300 else x)

In [9]:
variables["scp_codes"]

ecg_id
20                 {'AFLT': 100.0, 'ABQRS': 0.0}
771                              {'NORM': 100.0}
297        {'NORM': 80.0, 'PVC': 0.0, 'SR': 0.0}
120                   {'NORM': 100.0, 'SR': 0.0}
671                   {'NORM': 100.0, 'SR': 0.0}
                          ...                   
17141                  {'NDT': 100.0, 'SR': 0.0}
17710                  {'NORM': 80.0, 'SR': 0.0}
17041                {'CLBBB': 100.0, 'SR': 0.0}
17805    {'NORM': 80.0, 'HVOLT': 0.0, 'SR': 0.0}
17905                  {'NDT': 100.0, 'SR': 0.0}
Name: scp_codes, Length: 21799, dtype: object

In [14]:
from ast import literal_eval

# Load statement reference table
scp_df = pd.read_csv(os.path.join(base_dir, "scp_statements.csv"), index_col=0)

# Keep rows with a valid diagnostic class
scp_diagnostic_map = scp_df[scp_df["diagnostic_class"].notnull()]["diagnostic_class"].to_dict()

# Convert columns scp_codes from string to dictionary
variables["scp_codes"] = variables["scp_codes"].apply(literal_eval)

# Map scp_codes to one or more diagnostic superclasses
def map_to_superclasses(scp_code_dict):
    return list({scp_diagnostic_map[code] for code in scp_code_dict if code in scp_diagnostic_map})

# Add a new column with the mapped superclasses
variables["diagnostic_superclass_mapped"] = variables["scp_codes"].apply(map_to_superclasses)

variables[["scp_codes", "diagnostic_superclass_mapped"]].head()


ValueError: malformed node or string: {'AFLT': 100.0, 'ABQRS': 0.0}