In [60]:
import wfdb    
import os
import pandas as pd
import numpy as np
from scipy.signal import butter, filtfilt
import pywt
from scipy.signal import detrend


In [61]:
record = wfdb.rdrecord('10', pn_dir='ludb/1.0.1/data/')



In [62]:
def normalize(ecg_signal, sampling_rate):
    # High-pass filtering
    # cutoff_hz = 0.5  # Removing baseline wander
    # normalized_cutoff = cutoff_hz / (0.5 * sampling_rate)
    # b, a = butter(4, normalized_cutoff, btype='high', analog=False)
    # filtered_ecg = filtfilt(b, a, ecg_signal)

    # Bandpass
    low = 0.5 / (0.5 * sampling_rate)
    high = 20 / (0.5 * sampling_rate)
    b, a = butter(4, [low, high], btype='band')
    filtered_ecg = filtfilt(b, a, ecg_signal)

    # Use robust baseline estimation (e.g., median)
    #baseline = np.median(filtered_ecg)
    #window_size = int(1.5 * sampling_rate) # 1.5 seconds if fs is the sampling frequency
    #baseline = np.convolve(filtered_ecg, np.ones(window_size)/window_size, mode='same')

    # Subtract the baseline
    #ecg_signal_corrected = filtered_ecg - baseline


    # Detrend the signal to remove linear drifts
    #ecg_signal_detrended = detrend(ecg_signal_corrected)

    #ecg_signal_detrended -= np.mean(ecg_signal_detrended)

    #ecg_min, ecg_max = ecg_signal_corrected.min(), ecg_signal_corrected.max()
    #ecg_normalized = 2 * ((ecg_signal_corrected - ecg_min) / (ecg_max - ecg_min)) - 1

    #ecg_normalized = 2 * ecg_normalized - 1

    return filtered_ecg

In [63]:
data_dir = 'ludb/1.0.1/data/'
data_path = '../lobachevsky-university-electrocardiography-database-1.0.1/data/'
fs = 500

# Get all records
records = [f.split('.')[0] for f in os.listdir(data_path) if f.endswith('.dat')]
records = sorted(records)
dfs = []  # List to store individual dataframes

# for record_name in records:
#     # Read the ECG record
#     record = wfdb.rdrecord(record_name, pn_dir=data_dir)

#     # Convert the signal for lead i to a DataFrame
#     lead_i_idx = record.sig_name.index('ii')
#     df_signals = pd.DataFrame({'ii': normalize(record.p_signal[:, lead_i_idx], fs) })

#     # Read the annotations for lead i
#     annotations = wfdb.rdann(record_name, 'ii', pn_dir=data_dir)
#     # Create a column for the lead i annotations and fill with 0
#     df_signals['target'] = 0

#     # Track if annotations 'N', 'p', 't' are present
#     has_N, has_p, has_t = False, False, False
    
#     for index, symbol in zip(annotations.sample, annotations.symbol):
#         if symbol == 'N':
#             df_signals.at[index, 'target'] = 2
#             has_N = True
#         elif symbol == 'p':
#             df_signals.at[index, 'target'] = 1
#             has_p = True
#         elif symbol == 't':
#             df_signals.at[index, 'target'] = 3
#             has_t = True

#     # Check if all three annotations are present
#     if has_N and has_p and has_t:
#         # Add a column to identify the record
#         df_signals.insert(0, 'record', record_name)

#         # Append this DataFrame to the list
#         dfs.append(df_signals)

for record_name in records:
    # Read the ECG record
    record = wfdb.rdrecord(record_name, pn_dir=data_dir)

    # Ensure signal is long enough to be trimmed
    #if record.p_signal.shape[0] >= 5000:  # Assuming the signals are at least 5000 samples long
    # Trim the signal to 3000 samples in length by removing the first 1000 and last 1000 samples
    trimmed_signal = record.p_signal[1000:-1000]

    # Convert the trimmed signal for lead ii to a DataFrame
    lead_i_idx = record.sig_name.index('ii')
    df_signals = pd.DataFrame({'ii': normalize(trimmed_signal[:, lead_i_idx], fs)})

    # Read the annotations for lead ii
    annotations = wfdb.rdann(record_name, 'ii', pn_dir=data_dir)

    # Filter annotations to include only those that fall within the trimmed range
    valid_annotations = [(index-1000, symbol) for index, symbol in zip(annotations.sample, annotations.symbol)
                            if 1000 <= index < record.p_signal.shape[0]-1000]

    # Create a column for the lead ii annotations and fill with 0
    df_signals['target'] = 0

    # Initialize flags for the presence of annotations 'N', 'p', 't'
    has_N, has_p, has_t = False, False, False

    # Apply valid annotations to the DataFrame
    for index, symbol in valid_annotations:
        if symbol == 'N':
            df_signals.at[index, 'target'] = 2
            has_N = True
        elif symbol == 'p':
            df_signals.at[index, 'target'] = 1
            has_p = True
        elif symbol == 't':
            df_signals.at[index, 'target'] = 3
            has_t = True

    # Check if all three annotations are present
    if has_N and has_p and has_t:
        # Add a column to identify the record
        df_signals.insert(0, 'record', record_name)

        # Append this DataFrame to the list
        dfs.append(df_signals)



# Concatenate all DataFrames into one
df_filtered = pd.concat(dfs, ignore_index=True)


In [64]:
df_filtered.to_csv('ludb_lead_ii_data_2.csv', header=False, index=False)

In [118]:
def split_dataset(dataset, target, split):
    train_data = dataset[:split[0]]
    train_target = target[:split[0]]
    train_tuple = (train_data, train_target)

    val_data = dataset[split[0]:split[0] + split[1]]
    val_target = target[split[0]:split[0] + split[1]]
    val_tuple = (val_data, val_target)

    test_data = dataset[split[0] + split[1]:split[0] + split[1] + split[2]]
    test_target = target[split[0] + split[1]:split[0] + split[1] + split[2]]
    test_tuple = (test_data, test_target)

    return train_tuple, val_tuple, test_tuple

df = pd.read_csv('ludb_lead_ii_data_2.csv', header=None)

    # Assuming each signal has 5000 entries
signal_length = 3000

# Calculate the number of signals
num_signals = df.shape[0] // signal_length

# Extract the signal values and target values without column names
signals = df.iloc[:num_signals * signal_length, 1].to_numpy().reshape(-1, signal_length)
targets = df.iloc[:num_signals * signal_length, 2].to_numpy().reshape(-1, signal_length)

trn_tuple, val_tuple, tst_tuple = split_dataset(signals, targets, [120, 51, 5])

# trn_tuple_seg = segment_by_r_peaks(trn_tuple)
# val_tuple_seg = segment_by_r_peaks(val_tuple)
# tst_tuple_seg = segment_by_r_peaks(tst_tuple)


# t, signal_matrix, target_matrix = dataset(1505)

# trn_tuple, val_tuple, tst_tuple = split_dataset(signal_matrix, target_matrix, [1000, 500, 5])
#initialize data structure
data = {'trn': {'x': trn_tuple[0], 'y': trn_tuple[1]},
        'val': {'x': val_tuple[0], 'y': val_tuple[1]},
        'tst': {'x': tst_tuple[0], 'y': tst_tuple[1]}}


In [205]:
def segment_heartbeats(data_tuple):
    signals, targets = data_tuple  # Unpack the tuple into signals and targets
    heartbeats_x = []
    heartbeats_y = []

    for signal, target in zip(signals, targets):
        # Assuming '2' indicates an R-peak, find the indices of R-peaks
        r_peaks_indices = np.where(target == 2)[0]
        for start, end in zip(r_peaks_indices[:-1], r_peaks_indices[1:]):
            # Segment the signal between two R-peaks
            heartbeat_signal = signal[start:end]
            heartbeat_target = target[start:end]
            heartbeats_x.append(heartbeat_signal)
            heartbeats_y.append(heartbeat_target)

    # Convert lists to arrays of objects to handle variable-length segments
    heartbeats_x_array = np.array(heartbeats_x, dtype=object)
    heartbeats_y_array = np.array(heartbeats_y, dtype=object)

    return (heartbeats_x_array, heartbeats_y_array)




In [203]:
trn_tuple[0]

array([[-0.00652167, -0.00669428, -0.00686972, ...,  0.12356657,
         0.12492934,  0.12529296],
       [-0.00755651, -0.00734837, -0.00717049, ..., -0.03413886,
        -0.03128428, -0.02842715],
       [-0.0134488 , -0.01484789, -0.0160608 , ..., -0.01386221,
        -0.01315323, -0.01242528],
       ...,
       [-0.0298723 , -0.02954573, -0.02922914, ..., -0.26962933,
        -0.23509315, -0.19994523],
       [-0.0147393 , -0.01577118, -0.01679412, ..., -0.02807013,
        -0.02622329, -0.02433966],
       [-0.02209376, -0.02568411, -0.02926717, ...,  0.03155599,
         0.03162091,  0.03161735]])

In [206]:
segment_heartbeats(trn_tuple)

(array([array([ 5.81850435e-01,  5.88978987e-01,  5.85747501e-01,  5.72451795e-01,
                5.49796612e-01,  5.18840486e-01,  4.80921476e-01,  4.37571227e-01,
                3.90424884e-01,  3.41133422e-01,  2.91283571e-01,  2.42329146e-01,
                1.95536363e-01,  1.51944658e-01,  1.12343491e-01,  7.72646607e-02,
                4.69887590e-02,  2.15637461e-02,  8.33162879e-04, -1.55287641e-02,
               -2.79779016e-02, -3.70594899e-02, -4.33682650e-02, -4.75114719e-02,
               -5.00764895e-02, -5.16043715e-02, -5.25700880e-02, -5.33696877e-02,
               -5.43140753e-02, -5.56286512e-02, -5.74577360e-02, -5.98725206e-02,
               -6.28813010e-02, -6.64409436e-02, -7.04687911e-02, -7.48544618e-02,
               -7.94711280e-02, -8.41859078e-02, -8.88690248e-02, -9.34014516e-02,
               -9.76808360e-02, -1.01625610e-01, -1.05177270e-01, -1.08300924e-01,
               -1.10984264e-01, -1.13235228e-01, -1.15078672e-01, -1.16552402e-01,
    

In [213]:
segment = np.zeros(300)
b =segment[72:]
len(b)

228