In this notebook we want to go from raw data, to a filtered, organized and normalized dataset of ECGs, where we have pre-processed each to basically align it according to the R-peak near the 2-second mark.

First we load the data

In [None]:
# %%
import pickle
import pandas as pd
import numpy as np
# Load the raw dataset. The context manager guarantees the file handle is closed.
# NOTE(security): pickle.load can execute arbitrary code; only load files from a trusted source.
with open('../all_points_may_2024.pkl', 'rb') as pickle_file:
    data = pickle.load(pickle_file)
# Build a DataFrame from the loaded object and transpose it
# (presumably so that each patient becomes a row - TODO confirm against the pickle layout).
data = pd.DataFrame(data).T
import os


In [None]:
# Inspecting the data, we see that ECGs are stored in the "Structures" column. A sample of its structure:

# %%
def unpack_structures(data):
    """Flatten the nested 'Structures' column into one row per ECG record.

    Parameters:
        data: DataFrame with a 'Structures' column. Each dict entry is read as
            {anatomical region: {position: ecg}}; non-dict entries are skipped.

    Returns:
        DataFrame with the columns 'patient' (index label of `data`),
        'anatomical region', 'position' and 'ecg' (the innermost object, unchanged).
    """
    records = [
        {
            'patient': patient_id,
            'anatomical region': region_name,
            'position': position_name,
            'ecg': lead_signals,  # expected to be the dict holding the 12 leads
        }
        for patient_id, structures in data['Structures'].items()
        if isinstance(structures, dict)
        for region_name, by_position in structures.items()
        for position_name, lead_signals in by_position.items()
    ]
    return pd.DataFrame(records)

# Assume `data` is the original DataFrame and 'Structures' is one of its columns.
# Flatten it into one row per (patient, anatomical region, position) and show the result.
result_df = unpack_structures(data)
display(result_df)

In [None]:
# %%
# List of patients to remove
patients_to_remove = ['P186', 'P173', 'P268', 'P164', 'P295', 'P185']

# Remove them from 'data' (errors='ignore' skips labels that are not present in the index)
data = data.drop(patients_to_remove, errors='ignore')

# Remove them from 'result_df' by keeping only rows whose patient is not in the list
result_df = result_df[~result_df['patient'].isin(patients_to_remove)]


# %%
display(result_df)

In [None]:
# %% [markdown]
# The following code splits the 12 leads into 12 separate columns, i.e. we obtain a DataFrame with the columns patient, anatomical region, position, I, II, III, ...

# %%
def unpack_structures_expand_ecg(data):
    """Flatten the nested 'Structures' column, giving each ECG lead its own column.

    Parameters:
        data: DataFrame with a 'Structures' column. Each dict entry is read as
            {anatomical region: {position: ecg}}; non-dict entries are skipped.

    Returns:
        DataFrame with 'patient', 'anatomical region' and 'position', plus one
        column per key of the innermost ECG dict. Records whose ECG payload is
        not a dict only carry the three identifying columns.
    """
    def _build_record(patient_id, region_name, position_name, lead_signals):
        # Identifying columns shared by every record
        record = {
            'patient': patient_id,
            'anatomical region': region_name,
            'position': position_name,
        }
        # Add every lead of the ECG as its own column
        if isinstance(lead_signals, dict):
            return {**record, **lead_signals}
        return record

    return pd.DataFrame([
        _build_record(patient_id, region_name, position_name, lead_signals)
        for patient_id, structures in data['Structures'].items()
        if isinstance(structures, dict)
        for region_name, by_position in structures.items()
        for position_name, lead_signals in by_position.items()
    ])

# Build the wide table: one row per record, with one column per key found in each ECG dict
result_df_leads = unpack_structures_expand_ecg(data)
display(result_df_leads)

In [None]:
# %%
# One-hot encode the two categorical columns; all other columns are passed through unchanged
result_df__leads_encoded = pd.get_dummies(result_df_leads, columns=['anatomical region', 'position'])
display(result_df__leads_encoded)

In [None]:
# %%
# Select the columns of the 12 leads together with the patient identifier
lead_columns = ['patient', 'I', 'II', 'III', 'AVR', 'AVL', 'AVF', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6']
# .copy() so that later column assignments do not write into result_df_leads
df_leads_only = result_df_leads[lead_columns].copy()
display(df_leads_only)


# %%
# NOTE(review): the triple-quoted string below is disabled code kept for reference
# (expanding each lead into one column per sample). It is a bare string expression,
# so it is evaluated but nothing inside it runs.
'''# Expand each lead's list of samples into separate columns
lead_names = ['I', 'II', 'III', 'AVR', 'AVL', 'AVF', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6']

# Stack all leads horizontally for each row
expanded_leads = []
column_names = []

for lead in lead_names:
    lead_matrix = np.stack(df_leads_only[lead].values)
    expanded_leads.append(lead_matrix)
    column_names.extend([f"{lead}_{i}" for i in range(lead_matrix.shape[1])])

# Concatenate all leads horizontally
signal_matrix = np.hstack(expanded_leads)

# Create a DataFrame with the expanded columns
expanded_df = pd.DataFrame(signal_matrix, columns=column_names)

# Add the patient column back
expanded_df.insert(0, 'patient', df_leads_only['patient'].values)
display(expanded_df)'''

# %% [markdown]
# Next we segment the ECGs so that they can be aligned.

# %%

In [None]:
!pip install git+https://github.com/guillermo-jimenez/sak.git
!pip install timm

In [None]:
# %%
#Example with ECG segmentation

import torch
import sak
from functools import partial
import math
import scipy as sp
import skimage
import skimage.util
from typing import List,Tuple
import numpy as np
# Load the 5 members of the segmentation ensemble.
# map_location="cpu" lets checkpoints that were saved on a GPU load on CPU-only machines;
# predict_ecg moves each model to the available device before running inference.
# NOTE(security): torch.load unpickles arbitrary objects; only load trusted checkpoints.
# NOTE(review): recent PyTorch releases default to weights_only=True, which rejects fully
# pickled models - pass weights_only=False there if these files store whole modules.
models_ECG = [
    torch.load(f"modelos/model.{i+1}", map_location="cpu") for i in range(5)
]

In [None]:
# %% [markdown]
# predict_ecg: delineate an ECG and return the full segmentation together with the QRS time stamps

# %%
def predict_ecg(ecg: np.ndarray, fs: float, model: List[torch.nn.Module],
                window_size: int = 2048, stride: int = 256, threshold_ensemble: float = 0.5,
                thr_dice=0.9, percentile=95, ptg_voting = 0.5, batch_size = 16,
                normalize=True, norm_threshold: float = 1e-6, filter = True) -> Tuple[np.ndarray, np.ndarray]:
    """Delineate an ECG recording of arbitrary size ('ecg' variable, SAMPLES x LEADS)
    and obtain the fiducials for the P, QRS and T segments as a binary mask of shape 3 x SAMPLES.

    Inputs:
    ecg                <- some array, e.g. shape 198484 x 12. A 2D input with fewer rows than columns
                          is transposed, a 1D input is treated as a single lead
    fs                 <- sampling frequency, e.g. 1000Hz. Will downsample to 250Hz for the AI model to work
    model              <- list of segmentation models for the ensemble (a single model is also accepted)
    window_size        <- the "chunk" size that will be processed at a time of the input ecg (e.g., in the example,
                          2048 samples out of the 198484 samples of the ECG)
    stride             <- the "stride" parameter allows for some overlap between the windows of the window_size
    threshold_ensemble <- Fraction [0-1] of the ensemble models that must vote positive for a sample
    thr_dice           <- Threshold applied to each model's sigmoid output to consider a sample as positive
    percentile         <- Percentile for the amplitude normalization
    ptg_voting         <- Fraction [0-1] of the leads that must vote positive for a sample to be kept
    batch_size         <- Number of windows that fit in the batch
    normalize          <- Boolean to indicate whether the ECG has to be normalized. In general, set this to True
                          always, as the ECGs must have the amplitude of a normal sinus rhythm around amplitude of
                          "1" to work, as that was the preprocessing for model training. Only change if a
                          comparable pre-processing is performed
    norm_threshold     <- Threshold for the normalization, to avoid passing baseline wander or noise as signal
    filter             <- Filter the signal (2nd-order Butterworth high-pass and low-pass, plus 50 Hz and
                          60 Hz notch filters)

    Returns:
    (segmentation, time_QRS), where `segmentation` is the 3 x SAMPLES mask interpolated back to the
    original number of samples and `time_QRS` holds the time stamps (in seconds, computed from `fs`)
    of the samples flagged in row 1 of the mask (the QRS row).
    NOTE: a 0-dimensional input returns a single empty array instead of a tuple (behavior kept as-is).

    Raises:
    ValueError if the squeezed input has more than 2 dimensions.
    """
    # Preprocess signal
    ecg = np.copy(ecg).squeeze()
    if ecg.ndim == 0:
        return np.array([])
    elif ecg.ndim == 1:
        ecg = ecg[:,None]
    elif ecg.ndim == 2:
        if ecg.shape[0] < ecg.shape[1]:
            ecg = ecg.T
    else:
        raise ValueError("2 dims max allowed")
    ecg_250 = sak.signal.interpolate.interp1d(ecg,round(ecg.shape[0]*250/fs),axis=0)


    # Pad if necessary. `padding` accumulates every sample appended at the end so that exactly
    # that amount can be removed again after segmentation; it stays 0 when the signal already
    # fits the window/stride grid (previously the variable was undefined in that case).
    padding = 0
    if ecg_250.shape[0] < window_size:
        pad_now = math.ceil(ecg_250.shape[0]/window_size)*window_size-ecg_250.shape[0]
        ecg_250 = np.pad(ecg_250,((0,pad_now),(0,0)),mode='edge')
        padding += pad_now
    if (ecg_250.shape[0]-window_size)%stride != 0:
        # NOTE(review): this pads more than strictly needed when the signal is longer than
        # 2*window_size; kept unchanged so that results stay identical to the previous version.
        pad_now = math.ceil((ecg_250.shape[0]-window_size)/stride)*stride-(ecg_250.shape[0]%window_size)
        ecg_250 = np.pad(ecg_250,((0,pad_now),(0,0)),mode='edge')
        padding += pad_now

    # Get dimensions
    N,L = ecg_250.shape

    # (Optional) Normalize amplitudes
    if normalize:
        # Get ecg_250 when it's not flat zero
        norm_signal = ecg_250[np.all(np.abs(np.diff(ecg_250,axis=0,append=0)) >= norm_threshold,axis=1),:]

        # High pass filter normalized ecg_250 to avoid issues with baseline wander
        norm_signal = sp.signal.filtfilt(*sp.signal.butter(2, 0.5/250., 'high'),norm_signal, axis=0)

        # Compute amplitude for those segments
        amplitude = np.array(sak.signal.moving_lambda(
            norm_signal,
            256,
            partial(sak.signal.amplitude,axis=0),
            axis=0
        ))
        amplitude = amplitude[np.all(amplitude > norm_threshold,axis=1),]
        amplitude = np.percentile(amplitude, percentile, axis=0)

        # Apply normalization
        ecg_250 = ecg_250/amplitude[None,:]

    # (Optional) Filter ecg_250
    if filter:
        ecg_250 = sp.signal.filtfilt(*sp.signal.butter(2,   0.5/250., 'high'),ecg_250,axis=0)
        ecg_250 = sp.signal.filtfilt(*sp.signal.butter(2, 125.0/250.,  'low'),ecg_250,axis=0)
        ecg_250 = sp.signal.lfilter(*sp.signal.iirnotch(50,20.0,250.),ecg_250,axis=0)
        ecg_250 = sp.signal.lfilter(*sp.signal.iirnotch(60,20.0,250.),ecg_250,axis=0)

    # Avoid issues with negative strides due to filtering:
    if np.any(np.array(ecg_250.strides) < 0):
        ecg_250 = ecg_250.copy()

    # Data structure for computing the segmentation
    windowed_signal = skimage.util.view_as_windows(ecg_250,(window_size,1),(stride,1))

    # Flat batch shape
    new_shape = (windowed_signal.shape[0]*windowed_signal.shape[1],*windowed_signal.shape[2:])
    windowed_signal = np.reshape(windowed_signal,new_shape)

    # Exchange channel position
    windowed_signal = np.swapaxes(windowed_signal,1,2)

    # Output structures
    windowed_mask = np.zeros((windowed_signal.shape[0],3,windowed_signal.shape[-1]),dtype=int)

    # Check device for segmentation
    device = "cuda" if torch.cuda.is_available() else "cpu"

    # Compute segmentation for all leads independently
    with torch.no_grad():
        if isinstance(model,list):
            # Ensemble: count, per sample, how many models exceed thr_dice
            for m in model:
                m = m.to(device)
                for i in range(0,windowed_signal.shape[0],batch_size):
                    inputs = {"x": torch.tensor(windowed_signal[i:i+batch_size]).float().to(device)}
                    outputs = m(inputs)["sigmoid"].cpu().detach().numpy()
                    windowed_mask[i:i+batch_size] += outputs > thr_dice
            # Keep the samples voted by at least the requested fraction of models
            windowed_mask = windowed_mask >= len(model)*threshold_ensemble
        else:
            model = model.to(device)
            for i in range(0,windowed_signal.shape[0],batch_size):
                inputs = {"x": torch.tensor(windowed_signal[i:i+batch_size]).to(device).float()}
                outputs = model(inputs)["sigmoid"].cpu().detach().numpy()
                windowed_mask[i:i+batch_size] = outputs > thr_dice

    # Retrieve mask as 1D
    counter = np.zeros((N), dtype=int)
    segmentation_250 = np.zeros((3,N))

    # Iterate over windows: each group of L consecutive entries holds the L leads of one window
    for i in range(0,windowed_mask.shape[0],L):
        counter[(i//L)*stride:(i//L)*stride+window_size] += 1
        segmentation_250[:,(i//L)*stride:(i//L)*stride+window_size] += windowed_mask[i:i+L].sum(0)
    # Average over the overlapping windows and require ptg_voting of the leads to agree
    segmentation_250 = ((segmentation_250/counter) >= (ecg_250.shape[-1]*ptg_voting))

    # Correct padding (a plain [:, :-0] slice would wrongly return an empty array)
    if padding > 0:
        segmentation_250 = segmentation_250[:,:-padding]

    # Interpolate back to original sampling frequency
    segmentation     = sak.signal.interpolate.interp1d(segmentation_250,ecg.shape[0],axis=-1,kind="nearest")
    # Build the time vector of the whole recording
    time_vector = np.linspace(0, ecg.shape[0]/fs, ecg.shape[0], endpoint=False)

    # Keep only the time stamps that fall inside a QRS segment (row 1 of the mask)
    time_QRS = time_vector[segmentation[1].astype(bool)]

    return segmentation,time_QRS

In [None]:
# %% [markdown]
# Segmentation for record 23047 (a row label of df_leads_only, not a patient ID)

# %%
from sak.signal import StandardHeader
# Collect the signal of every lead listed in StandardHeader for the record with row label 23047
ecg_signals1 = []
for lead in StandardHeader:
    # df_leads_only[lead] selects the lead column; [23047] then selects by index label
    ecg_signals1.append(df_leads_only[lead][23047])
    
# Transpose so that the first axis is samples and the second is leads
# (assumes every lead has the same number of samples - TODO confirm)
ecg_signals1 = np.asarray(ecg_signals1).T
print(ecg_signals1.shape)

In [None]:
# %% [markdown]
# We filter the signal (code provided by the instructor) and then apply the predict_ecg function.

# %%
# Apply the segmentation to a single record (TODO: wrap in a function to process multiple signals)
fs = 1000
fs_high,fs_low = 0.5,100.0
# Resample to 250 Hz and transpose so that the samples lie on the last axis for the filters below
ecg_250 = sak.signal.interpolate.interp1d(ecg_signals1,round(ecg_signals1.shape[0]*250/fs), axis=0).T
ecg_250 = sp.signal.filtfilt(*sp.signal.butter(2, fs_high/250., 'high'), ecg_250, axis=-1)
ecg_250 = sp.signal.filtfilt(*sp.signal.butter(2,  fs_low/250.,  'low'), ecg_250, axis=-1)

# The signal is already resampled and filtered, hence fs=250 and filter=False
segmentation_250, time_QRS= predict_ecg(ecg_250, 250., models_ECG, normalize=True, filter=False)

# Bring the mask back to the original number of samples
segmentation1 = sak.signal.interpolate.interp1d(segmentation_250, ecg_signals1.shape[0], axis=-1, kind="nearest")

# %% [markdown]
# We print what the segmentation returns. It is a binarized array where 1 indicates the detected location of the desired segment. segmentation1 is composed of three arrays: the first one indicates the locations of the P wave, the second one the locations of the QRS complex and the last one the locations of the T wave.
# Print the entire array without truncation, but only inside this block so that the
# global NumPy print options are left untouched for the rest of the notebook.
with np.printoptions(threshold=np.inf):
    print(segmentation1)


In [None]:
# %%
import numpy as np
# %% [markdown]
# We plot the segmented signal
import matplotlib.pyplot as plt
fig, ax = plt.subplots(4,3)
# Time axis derived from the actual record length instead of a hard-coded 2.5 s duration
n_samples = ecg_signals1.shape[0]
x = np.linspace(0, n_samples/fs, n_samples)
# ax.flat walks the 4x3 grid row by row, one subplot per lead
for lead_ax, sig in zip(ax.flat, ecg_signals1.T):
    lead_ax.plot(x, sig)
    # Shade the three rows of the mask (P, QRS, T) with one color each
    for wave_idx, wave_color in enumerate(('C0', 'C1', 'C2')):
        lead_ax.fill_between(x, np.min(sig), np.max(sig),
                             where=(segmentation1[wave_idx,:] == 1), color=wave_color, alpha = 0.3 )
plt.show()


In [None]:
# %% [markdown]
# # PLEASE HAVE FAITH THIS SHOULD WORK
# una mica d'anims per sobreviure i fer aquest treball

# %% [markdown]
# We create a dataframe of the ecg signal for a random patient to test. We take its segmentation and the ecg signal in order to take the last qrs segment. 

# %%
# Initialize the list that collects one result dict per processed signal
segmentation_results = []

# Extract a single test record (positional row 19654 of df_leads_only)
row = df_leads_only.iloc[19654]
patient_id = row['patient']
ecg_signal = np.array(row['V6'])  # Extract the V6 lead signal
fs = 1000  # Sampling frequency

print("Processing patient:", patient_id)

# Preprocess the signal: resample to 250 Hz, then high-pass and low-pass filter.
# fs_high and fs_low are the cut-off values defined in the earlier single-record cell.
ecg_250 = sak.signal.interpolate.interp1d(ecg_signal, round(ecg_signal.shape[0] * 250 / fs), axis=0).T
ecg_250 = sp.signal.filtfilt(*sp.signal.butter(2, fs_high / 250., 'high'), ecg_250, axis=-1)
ecg_250 = sp.signal.filtfilt(*sp.signal.butter(2, fs_low / 250., 'low'), ecg_250, axis=-1)

# Apply segmentation (signal already at 250 Hz and filtered, hence filter=False)
segmentation_250, time_QRS = predict_ecg(ecg_250, 250., models_ECG, normalize=True, filter=False)

# Interpolate back to the original sampling frequency
# NOTE: set_printoptions changes NumPy printing globally for the rest of the session
np.set_printoptions(threshold=np.inf)
segmentation = sak.signal.interpolate.interp1d(segmentation_250, ecg_signal.shape[0], axis=-1, kind="nearest")

# Store the results
segmentation_results.append({
    'patient': patient_id,
    'lead': 'V6',
    'segmentation': segmentation,
    'time_QRS': time_QRS, 
    'ecg_signal': ecg_signal
})

# Convert the results into a DataFrame
segmentation_results_df = pd.DataFrame(segmentation_results)
display(segmentation_results_df)



# %% [markdown]
# We look for the indices of these last qrs segment, with the aim of finding the index where the R peak of the last QRS segment is located

# %%
# Función para extraer los índices del último QRS
def find_last_qrs_indices(segmentation_triplet):
    """Return the sample indices of the last contiguous QRS run.

    Parameters:
        segmentation_triplet: sequence whose element 1 is the QRS mask
            (samples equal to 1 belong to a QRS segment).

    Returns:
        List of consecutive indices forming the final run of ones in the QRS mask,
        or an empty list when the mask contains no ones.
    """
    qrs_mask = np.array(segmentation_triplet[1])
    hits = np.nonzero(qrs_mask == 1)[0]
    if hits.size == 0:
        return []
    # A jump larger than 1 between consecutive hits marks the end of a run
    gaps = np.nonzero(np.diff(hits) > 1)[0]
    run_start = gaps[-1] + 1 if gaps.size else 0
    return hits[run_start:].tolist()

# Apply the helper to obtain the indices of the last QRS of every row
segmentation_results_df["last_qrs_indices"] = segmentation_results_df["segmentation"].apply(find_last_qrs_indices)

# Extract the signal samples at those indices (NaN when no QRS was found)
segmentation_results_df["last_qrs_signal"] = segmentation_results_df.apply(
    lambda row: np.array(row["ecg_signal"])[row["last_qrs_indices"]]
    if len(row["last_qrs_indices"]) > 0 else np.nan,
    axis=1
)

display(segmentation_results_df)

# Export to CSV
# NOTE(review): array-valued cells are written as their text representation, so the CSV
# cannot be read back as numeric arrays; a later cell loads '../segmentation_results.pkl'
# instead - confirm which file is the intended hand-over.
segmentation_results_df.to_csv("segmentation_results.csv", index=False)


# %% [markdown]
# now we try to plot the detected segmentation to try to see if the process has been done correctly. 

# %%
import matplotlib.pyplot as plt

# Pick one row to visualize
row = segmentation_results_df.iloc[0]  # Change the index to pick a different example

# Get patient ID and lead
patient = row["patient"]
lead = row["lead"]
qrs_indices = row["last_qrs_indices"]

# Use the ECG signal stored alongside the segmentation. This guarantees that the plotted
# signal is the one the indices were computed on, instead of re-reading df_leads_only with
# a hard-coded row number that had to be kept in sync with the previous cell by hand.
ecg_signal = np.array(row["ecg_signal"])

# Get QRS start and end
if qrs_indices:
    qrs_start = qrs_indices[0]
    qrs_end = qrs_indices[-1]
else:
    print("No QRS complex detected.")
    qrs_start = qrs_end = None

# Time axis (in seconds)
fs = 1000  # sampling frequency
time = np.arange(len(ecg_signal)) / fs

# Plot ECG with the boundaries of the last QRS marked
plt.figure(figsize=(12, 4))
plt.plot(time, ecg_signal, label='ECG Signal')
if qrs_start is not None:
    plt.plot(time[qrs_start], ecg_signal[qrs_start], 'ro', label='QRS Start')
    plt.plot(time[qrs_end], ecg_signal[qrs_end], 'ro', label='QRS End')

plt.title(f"ECG Lead {lead} - Patient {patient}")
plt.xlabel("Time (s)")
plt.ylabel("Amplitude")
plt.legend()
plt.grid(True)
plt.show()



In [None]:
# %% [markdown]
# We see the segmentation is done correctly. Now we load the pickled file with the segmentations computed for all ECG signals.

# %%
# Load the segmentation results. The context manager guarantees the file handle is closed.
# NOTE(security): pickle.load can execute arbitrary code; only load files from a trusted source.
with open('../segmentation_results.pkl', 'rb') as pickle_file:
    ecg = pickle.load(pickle_file)

ecg = pd.DataFrame(ecg)
display(ecg.head())

In [None]:
# %% [markdown]
# We choose to compute time shift with v6 reference as it is the lead closest to the left ventricle, which means that the R peak will be best detected.

# %% [markdown]
# In the next cell we compute the time shift that will need to be induced to our signal in order to obtain the last R peak perfectly at 2s.

# %%
def compute_time_shift(row, fs=1000, target_time=2):
    """Compute the shift (in seconds) that moves the last R peak to `target_time`.

    The R peak is taken as the sample with the largest absolute amplitude among
    the samples listed in `row['last_qrs_indices']`.

    Parameters:
        row: mapping / Series with 'last_qrs_indices' (sequence of sample indices)
            and 'ecg_signal' (1D sequence of samples).
        fs: sampling frequency in Hz used to convert the sample index to seconds.
        target_time: time (in seconds) at which the R peak should end up.

    Returns:
        target_time - peak_index / fs, or NaN when there are no QRS indices
        (empty, None or NaN) or the signal is empty or missing.
    """
    qrs_indices = row['last_qrs_indices']
    # None and scalar NaN (e.g. a missing cell) mean "no QRS detected"
    if qrs_indices is None or np.ndim(qrs_indices) == 0:
        return np.nan
    qrs_indices = np.asarray(qrs_indices, dtype=int)

    # Accept lists as well as arrays for the signal
    ecg_signal = np.asarray(row['ecg_signal'])
    if qrs_indices.size == 0 or ecg_signal.ndim == 0 or ecg_signal.size == 0:
        return np.nan

    # Position (within the QRS) of the sample with the largest absolute amplitude
    peak_pos = int(np.argmax(np.abs(ecg_signal[qrs_indices])))
    peak_sample = int(qrs_indices[peak_pos])

    return target_time - (peak_sample / fs)


# %%
# Apply the function to every row and store the result in a new column
ecg['time_shift'] = ecg.apply(compute_time_shift, axis=1)

# Show only a preview: printing one line per record floods the notebook output
N_PREVIEW_ROWS = 20
for idx, row in ecg.head(N_PREVIEW_ROWS).iterrows():
    print(f"Patient: {row['patient']}, Lead: {row['lead']}, Time shift: {row['time_shift']:.4f} seconds")
print(f"... {len(ecg)} records in total, {int(ecg['time_shift'].isna().sum())} without a time shift")



In [None]:

import matplotlib.pyplot as plt
import numpy as np

def plot_shifted_ecg(ecg_df, patient_index, fs=1000):
    """
    Plot an ECG signal before and after applying its time shift.

    Parameters:
        ecg_df: DataFrame with the columns 'ecg_signal', 'time_shift', 'patient' and 'lead'
        patient_index: integer position of the row to plot within the DataFrame
        fs: sampling frequency (default 1000 Hz)

    Rows whose time shift is NaN are reported and skipped, since there is no shift to draw.
    """
    row = ecg_df.iloc[patient_index]
    signal = np.asarray(row['ecg_signal'])
    time_shift = row['time_shift']
    patient_id = row['patient']
    lead = row['lead']

    # A NaN shift cannot be converted to a number of samples
    if pd.isna(time_shift):
        print(f"Patient {patient_id} - Lead {lead}: no time shift available, nothing to plot.")
        return

    # Original time axis
    t = np.arange(len(signal)) / fs

    # Shift expressed in samples
    shift_samples = int(round(time_shift * fs))

    # Apply the shift keeping the original length: the gap is filled with the edge value
    if shift_samples > 0:
        # Delay: prepend copies of the first sample and drop the tail
        shifted_signal = np.pad(signal, (shift_samples, 0), mode='constant',  constant_values=(signal[0],))[:len(signal)]
    elif shift_samples < 0:
        # Advance: append copies of the last sample and drop the head
        shifted_signal = np.pad(signal, (0, -shift_samples), mode='constant',  constant_values=(signal[-1],))[-shift_samples:]
    else:
        shifted_signal = signal.copy()

    # Plot both versions on the same axes
    plt.figure(figsize=(12, 5))
    plt.plot(t, signal, label='Original', linewidth=1.5)
    plt.plot(t, shifted_signal, label=f'Shifted ({time_shift:.3f} s)', linestyle='--')
    plt.title(f'Patient {patient_id} - Lead {lead}\nTime shift: {time_shift:.3f} s ({shift_samples} samples)')
    plt.xlabel('Time (s)')
    plt.ylabel('Amplitude')
    plt.legend()
    plt.grid(True)
    plt.tight_layout()
    plt.show()



In [None]:
# %% [markdown]
# Just to check if the shift is done correctly

# %%
# Visual check on two rows of `ecg` (positional indices 0 and 23455)
plot_shifted_ecg(ecg, patient_index=0) 
plot_shifted_ecg(ecg, patient_index=23455)


In [None]:
# %% [markdown]
# We put a new column to the df_leads_only the time shift calculated

# %%
# Preview of `ecg`, which now includes the 'time_shift' column
display(ecg.head())

# %% [markdown]
# Now we do a new dataframe with only the signals shifted

# %%
import numpy as np
import pandas as pd

fs = 1000
lead_columns = ['I', 'II', 'III', 'AVR', 'AVL', 'AVF', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6']

# Clean-up and merge
# NOTE(review): drop_duplicates keeps only the first time_shift found for each patient, and
# the merge below applies that single value to every row of that patient in df_leads_only.
# If time_shift is meant to be specific to each record, this looks unintended - confirm.
ecg_unique = ecg[['patient', 'time_shift']].drop_duplicates(subset='patient')
# Cast both keys to str so the merge compares like with like
df_leads_only['patient'] = df_leads_only['patient'].astype(str)
ecg_unique['patient'] = ecg_unique['patient'].astype(str)

# Left merge: rows whose patient has no entry in ecg_unique get NaN in 'time_shift'
df_with_shift = df_leads_only.merge(ecg_unique, on='patient', how='left')
assert 'time_shift' in df_with_shift.columns, "❌ La columna 'time_shift' no está presente tras el merge"

# Función de shift
def shift_row_leads(row):
    """Shift every lead of one record by the record's time shift.

    Reads the notebook-level `fs` (sampling frequency) and `lead_columns`.

    Parameters:
        row: mapping / Series with 'time_shift' (seconds), 'patient' and one entry per lead.

    Returns:
        pd.Series with one shifted signal per lead plus 'patient'. Signals keep their
        length; the gap opened by the shift is filled with the edge sample. A NaN time
        shift (no QRS found or no match in the merge) leaves the signals unshifted.
        Leads that do not hold a sequence are replaced by 5000 zeros.
    """
    time_shift = row['time_shift']
    # NaN cannot be converted to a sample count; treat it as "no shift"
    shift_samples = 0 if pd.isna(time_shift) else int(round(time_shift * fs))

    shifted = {}
    for lead in lead_columns:
        signal = row[lead]
        if isinstance(signal, (np.ndarray, list, tuple)):
            # Lists/tuples are accepted too (previously they were silently replaced by zeros)
            signal = np.asarray(signal)
            if shift_samples > 0:
                # Delay: prepend copies of the first sample and drop the tail
                shifted_signal = np.pad(signal, (shift_samples, 0), mode='constant', constant_values=(signal[0],))[:len(signal)]
            elif shift_samples < 0:
                # Advance: append copies of the last sample and drop the head
                shifted_signal = np.pad(signal, (0, -shift_samples), mode='constant', constant_values=(signal[-1],))[-shift_samples:]
            else:
                shifted_signal = signal.copy()
        else:
            # Default value when the lead is missing or malformed
            # NOTE(review): 5000 may not match the length of the real signals - confirm
            shifted_signal = np.zeros(5000)
        shifted[lead] = shifted_signal
    shifted['patient'] = row['patient']
    return pd.Series(shifted)

# Chunked processing
# Chunked processing (only used to report progress)
n_chunks = 10
# Split by row position and slice with .iloc: calling np.array_split directly on a DataFrame
# relies on DataFrame.swapaxes, which is deprecated in recent pandas versions.
chunk_positions = np.array_split(np.arange(len(df_with_shift)), n_chunks)
df_chunks = [df_with_shift.iloc[positions] for positions in chunk_positions]

all_shifted = []
for i, chunk in enumerate(df_chunks):
    print(f"🔄 Procesando chunk {i+1}/{n_chunks}...")
    shifted_chunk = chunk.apply(shift_row_leads, axis=1)
    all_shifted.append(shifted_chunk)

df_shifted = pd.concat(all_shifted, ignore_index=True)

# Reorder columns: 'patient' first
cols = ['patient'] + [col for col in df_shifted.columns if col != 'patient']
df_shifted = df_shifted[cols]



# %%
# Preview of the shifted signals (one array per lead and row)
display(df_shifted.head())

# %% [markdown]
# We plot some of these new signals to check that the stored signal is correctly shifted

# %%
def plot_shifted_lead(df, row_position, lead='V6'):
    """Plot one lead of one record of a shifted-signal DataFrame.

    Parameters:
        df: DataFrame with a 'patient' column and one array-valued column per lead
        row_position: integer position of the record to plot
        lead: name of the lead column to plot

    Uses the notebook-level sampling frequency `fs` to build the time axis.
    """
    row = df.iloc[row_position]
    signal = row[lead]
    patient_id = row['patient']
    t = np.arange(len(signal)) / fs

    plt.figure(figsize=(12, 5))
    # The data comes from df_shifted, so label it as the shifted signal
    plt.plot(t, signal, label=f'Shifted {lead}', linewidth=1.5)
    plt.title(f'Patient {patient_id}')
    plt.xlabel('Time (s)')
    plt.ylabel('Amplitude')
    plt.legend()
    plt.grid(True)
    plt.tight_layout()
    plt.show()


# Same two records as before, now through a single helper instead of copy-pasted code
for row_position in (0, 14800):
    plot_shifted_lead(df_shifted, row_position)


# %% [markdown]
# We see that it is correctly stored

# %% [markdown]
# Now we have to do a dataframe to store the samples divided by columns

In [None]:

# %%
from tqdm import tqdm

# Parameters
n_chunks = 30  # Adjust according to the available RAM
lead_columns = ['I', 'II', 'III', 'AVR', 'AVL', 'AVF', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6']
sample_length = len(df_shifted.iloc[0]['I'])  # We assume every signal has the same length

# Column names are identical for every chunk: <lead>_<sample index>, lead by lead
sample_columns = [f"{lead}_{j}" for lead in lead_columns for j in range(sample_length)]

# Chunk-wise expansion with a progress bar
flat_chunks = []

print("\n Expandiendo señales en columnas...")

# Split by row position and slice with .iloc: calling np.array_split directly on a DataFrame
# relies on DataFrame.swapaxes, which is deprecated in recent pandas versions.
chunk_positions = np.array_split(np.arange(len(df_shifted)), n_chunks)

for positions in tqdm(chunk_positions, desc="Chunks procesados"):
    if len(positions) == 0:
        continue
    chunk = df_shifted.iloc[positions]

    # Stack each lead into a (rows x samples) matrix and place the leads side by side.
    # This replaces building a dict entry per individual sample, which was very slow.
    lead_matrices = [
        np.vstack([np.asarray(signal)[:sample_length] for signal in chunk[lead]])
        for lead in lead_columns
    ]
    df_expanded = pd.DataFrame(np.hstack(lead_matrices), columns=sample_columns)
    df_expanded.insert(0, 'patient', chunk['patient'].to_numpy())
    flat_chunks.append(df_expanded)

# Join all chunks
df_samples = pd.concat(flat_chunks, ignore_index=True)

# Save to a pickle file
df_samples.to_pickle("ecg_flattened_samples.pkl")
print(" Archivo guardado como 'ecg_flattened_samples.pkl'")

