In this notebook we go from the raw data to a filtered, organized and normalized dataset of ECGs, in which each recording has been pre-processed so that it is aligned on the R-peak near the 2-second mark.

First we load the data

In [1]:
import pickle
import pandas as pd
import numpy as np
# NOTE(security): pickle can execute arbitrary code while loading, so only open
# files that come from a trusted source.
# The context manager closes the file handle as soon as loading finishes
# (the bare `open(...)` previously left it open until garbage collection).
with open('../all_points_may_2024.pkl', 'rb') as pickle_file:
    data = pickle.load(pickle_file)
# Transpose so that 'Structures' ends up as a column; the cells below read
# `data['Structures']`.
data = pd.DataFrame(data).T
import os

Inspecting the data, we see that ECGs are stored in the "Structures" column. A sample of its structure:

In [10]:
import pandas as pd

def unpack_structures(data):
    """Flatten the nested 'Structures' entries into one row per position.

    Parameters
    ----------
    data : mapping or DataFrame
        ``data['Structures']`` must expose ``.items()`` yielding
        ``(patient_id, structures)`` pairs. Entries whose ``structures`` value
        is not a dict are skipped.

    Returns
    -------
    pandas.DataFrame
        One row per (patient, anatomical region, position) with the columns
        'patient', 'anatomical region', 'position' and 'ecg'. The 'ecg' cell
        holds the innermost value unchanged (per the original note, the dict
        with the 12 leads).
    """
    records = [
        {
            'patient': patient_id,
            'anatomical region': region_name,
            'position': position_name,
            'ecg': leads,  # innermost value, stored as-is
        }
        for patient_id, regions in data['Structures'].items()
        if isinstance(regions, dict)
        for region_name, positions in regions.items()
        for position_name, leads in positions.items()
    ]
    return pd.DataFrame(records)

# Assumes `data` is the original DataFrame and that 'Structures' is one of its columns
result_df = unpack_structures(data)
display(result_df)

Unnamed: 0,patient,anatomical region,position,ecg
0,P186,2-LV,P36,"{'I': [-0.075, -0.075, -0.07200000000000001, -..."
1,P186,2-LV,P122,"{'I': [-0.042, -0.045, -0.048, -0.048, -0.048,..."
2,P186,2-LV,P85,"{'I': [-0.048, -0.045, -0.042, -0.042, -0.042,..."
3,P186,2-LV,P103,"{'I': [-0.045, -0.048, -0.048, -0.048, -0.045,..."
4,P186,2-LV,P86,"{'I': [0.003, 0.009000000000000001, 0.015, 0.0..."
...,...,...,...,...
29148,P230,1-AO,P9,"{'I': [0.027, 0.018000000000000002, 0.015, 0.0..."
29149,P230,1-AO,P2,"{'I': [-0.027, -0.024, -0.024, -0.021, -0.0180..."
29150,P230,1-AO,P1,"{'I': [0.03, 0.03, 0.033, 0.033, 0.033, 0.039,..."
29151,P230,1-AO,P4,"{'I': [0.048, 0.045, 0.039, 0.03, 0.024, 0.021..."


In the following code we split the 12 leads into 12 separate columns; that is, we obtain a DataFrame with the columns patient, anatomical region, position, I, II, III, ...

In [2]:
def unpack_structures_expand_ecg(data):
    """Flatten 'Structures' into one row per position, one column per ECG lead.

    Parameters
    ----------
    data : mapping or DataFrame
        ``data['Structures']`` must expose ``.items()`` yielding
        ``(patient_id, structures)`` pairs. Entries whose ``structures`` value
        is not a dict are skipped.

    Returns
    -------
    pandas.DataFrame
        Columns 'patient', 'anatomical region' and 'position', followed by one
        column per key found in the innermost ECG dicts. A position whose
        innermost value is not a dict still gets a row, with the lead columns
        left missing.
    """
    records = []
    for patient_id, regions in data['Structures'].items():
        if not isinstance(regions, dict):
            continue
        for region_name, positions in regions.items():
            for position_name, leads in positions.items():
                # Only a dict of leads contributes extra columns
                lead_columns = leads if isinstance(leads, dict) else {}
                records.append({
                    'patient': patient_id,
                    'anatomical region': region_name,
                    'position': position_name,
                    **lead_columns,
                })
    return pd.DataFrame(records)

# One row per (patient, anatomical region, position), with the ECG dict spread into one column per lead
result_df_leads = unpack_structures_expand_ecg(data)


In [13]:
# One-hot encode only the 'anatomical region' and 'position' columns; the other columns are passed through.
# NOTE(review): the double underscore in `result_df__leads_encoded` looks like a typo — confirm no
# other cell references this name before renaming it.
result_df__leads_encoded = pd.get_dummies(result_df_leads, columns=['anatomical region', 'position'])

In [14]:
from scipy import signal

# Sampling frequency, passed as `fs=` to the Butterworth filter design below (presumably Hz — confirm)
fs = 500  # Change this if your ECGs were sampled at a different rate

def calculate_snr_leadwise(ecg_signal):
    ecg_signal = np.array(ecg_signal)

    # Filtros pasa alto y pasa bajo
    b_high, a_high = signal.butter(2, 0.5, 'high', fs=fs)
    b_low, a_low = signal.butter(2, 45.0, 'low', fs=fs)

    filtered = signal.filtfilt(b_high, a_high, ecg_signal)
    filtered = signal.filtfilt(b_low, a_low, filtered)
    filtered = signal.detrend(filtered)

    noise = ecg_signal - filtered

    P_signal = np.mean(filtered**2)
    P_noise = np.mean(noise**2)

    if P_noise == 0:
        return np.inf  # sin ruido

    snr_db = 10 * np.log10(P_signal / P_noise)
    return snr_db

def compute_snr_for_dataframe(df):
    """Return a copy of ``df`` with per-row 'median_snr' and 'mean_snr' columns.

    A column counts as a lead when its first non-null value is a list or a
    numpy array. For every row, ``calculate_snr_leadwise`` is evaluated on
    each lead; leads whose SNR cannot be computed are ignored for that row.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding one signal (list / ndarray) per lead column.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` (the input is left untouched) with two extra columns:
        'median_snr' and 'mean_snr'. A row with no usable lead gets NaN.
    """
    # Detect lead columns (columns whose values are arrays). Columns that are
    # entirely null are skipped here: calling .iloc[0] on their empty
    # dropna() result used to raise IndexError.
    lead_columns = []
    for col in df.columns:
        non_null = df[col].dropna()
        if not non_null.empty and isinstance(non_null.iloc[0], (np.ndarray, list)):
            lead_columns.append(col)

    snr_medians = []
    snr_means = []

    for _, row in df.iterrows():
        snrs = []
        for lead in lead_columns:
            try:
                snrs.append(calculate_snr_leadwise(row[lead]))
            except Exception:
                # Deliberate best-effort: a lead that cannot be processed
                # must not abort the whole pass
                snrs.append(np.nan)
        snrs = [s for s in snrs if not np.isnan(s)]
        snr_medians.append(np.median(snrs) if snrs else np.nan)
        snr_means.append(np.mean(snrs) if snrs else np.nan)

    df = df.copy()
    df["median_snr"] = snr_medians
    df["mean_snr"] = snr_means

    return df


In [15]:
# Asegúrate de tener definido fs antes
# Adds the 'median_snr' and 'mean_snr' columns returned by compute_snr_for_dataframe
result_df_with_snr = compute_snr_for_dataframe(result_df_leads)


In [None]:

def calculate_snr(signal):
    """Return the squared mean of ``signal`` divided by its variance.

    Parameters
    ----------
    signal : array-like
        Samples of one signal.

    Returns
    -------
    float
        ``mean**2 / variance`` as a plain ratio (not dB), or ``np.inf`` when
        the variance is exactly zero.
    """
    samples = np.array(signal)
    baseline = np.mean(samples)
    # Variance of the samples around their own mean
    spread = np.var(samples - baseline)
    return np.inf if spread == 0 else baseline**2 / spread

def compute_mean_snr_column(df):
    """Return a copy of ``df`` with a 'mean_snr' column.

    A column counts as a lead when its first non-null value is a list or a
    numpy array. For every row, ``calculate_snr`` is evaluated on each lead
    cell that actually holds a list / array, and the results are averaged.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding one signal (list / ndarray) per lead column.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with 'mean_snr' added (or replaced). The input frame
        is not modified, matching ``compute_snr_for_dataframe``. A row with no
        usable lead gets NaN.
    """
    # Find the lead columns from the first non-null value. Looking only at
    # row 0 raised IndexError on an empty frame and missed any lead whose
    # first row was missing.
    ecg_leads = []
    for col in df.columns:
        non_null = df[col].dropna()
        if not non_null.empty and isinstance(non_null.iloc[0], (list, np.ndarray)):
            ecg_leads.append(col)

    snr_values = []
    for _, row in df.iterrows():
        snrs = [
            calculate_snr(row[lead])
            for lead in ecg_leads
            if isinstance(row[lead], (list, np.ndarray))
        ]
        snr_values.append(np.mean(snrs) if snrs else np.nan)

    # Work on a copy so re-running the cell never changes the caller's frame
    result = df.copy()
    result['mean_snr'] = snr_values
    return result




In [13]:
# Adds the 'mean_snr' column computed by compute_mean_snr_column
result_df_leads_SNR = compute_mean_snr_column(result_df_leads)


[]

In [3]:

def expand_ecg_ultrafast(df):
    """Spread every lead column into one numeric column per sample.

    A column counts as a lead when the value in its first row is a list or a
    numpy array. Each lead is stacked into a 2D matrix (rows x samples) and
    its columns are named ``"<lead>_<sample index>"``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose lead columns hold equally long signals in every row.

    Returns
    -------
    pandas.DataFrame
        The non-lead columns (index reset) followed by the expanded sample
        columns, or only the sample columns when there are no non-lead ones.
    """
    lead_names = [
        name for name in df.columns
        if isinstance(df[name].iloc[0], (list, np.ndarray))
    ]
    other_names = [name for name in df.columns if name not in lead_names]

    # Turn each lead column into a 2D matrix and build the matching headers
    blocks = [np.stack(df[name].values) for name in lead_names]
    headers = [
        f"{name}_{sample}"
        for name, block in zip(lead_names, blocks)
        for sample in range(block.shape[1])
    ]

    # Put all leads side by side in a single frame
    wide = pd.DataFrame(np.hstack(blocks), columns=headers)

    # Prepend the extra columns (such as the ID) when there are any
    if not other_names:
        return wide
    return pd.concat([df[other_names].reset_index(drop=True), wide], axis=1)


In [4]:
# 1. Expandim el DataFrame (aquí és on triga una mica)
# 1. Expand the DataFrame (this is the step that takes a while)
expanded_df = expand_ecg_ultrafast(result_df_leads)

# 2. Once it has finished and is in memory, save it
expanded_df.to_parquet("expanded_ecg.parquet", index=False)
# Or as CSV if you prefer
# expanded_df.to_csv("expanded_ecg.csv", index=False)


: 