In [15]:
# -*- coding: utf-8 -*-
"""Script for formatting the MIT-BIH Long-Term ECG Database (LTDB)

Steps:
    1. Download the ZIP database from https://physionet.org/content/ltdb/1.0.0/
    2. Open it with a zip-opener (WinZip, 7zip).
    3. Extract the folder of the same name (named 'mit-bih-long-term-ecg-database-1.0.0') to the same folder as this script.
    4. Run this script.

Credits:
    https://github.com/berndporr/py-ecg-detectors/blob/master/tester_MITDB.py by Bernd Porr
"""
import os

import numpy as np
import pandas as pd
import wfdb

# Folder extracted from the PhysioNet ZIP archive (see the module docstring).
DATABASE_DIR = "mit-bih-long-term-ecg-database-1.0.0"
SAMPLING_RATE = 128

# Only a 2 h excerpt of every record is kept (otherwise the CSV is too big):
# 460800 samples = 1 h at 128 Hz, so the half-open sample window
# [460800, 3 * 460800) covers the second and third hour of the recording.
WINDOW_START = 460800
WINDOW_END = 460800 * 3

# Sorted so that participant numbering is reproducible across machines
# (os.listdir returns entries in arbitrary order). Only true ".dat" files are
# kept because the 4-character extension is stripped below.
data_files = sorted(
    DATABASE_DIR + "/" + file
    for file in os.listdir(DATABASE_DIR)
    if file.endswith(".dat")
)
if not data_files:
    raise FileNotFoundError("No .dat records found in '" + DATABASE_DIR + "'")

dfs_ecg = []
dfs_rpeaks = []

for participant, file in enumerate(data_files):

    print("Participant: " + str(participant + 1) + "/" + str(len(data_files)))

    record = file[:-4]  # wfdb expects the record path without extension
    participant_id = "mit-long_%.2i" % (participant)

    # Get signal (the record is read once; column 1 of the signal array is used)
    sample, fields = wfdb.rdsamp(record)
    data = pd.DataFrame({"ECG": sample[:, 1]})

    # NOTE(review): assumes the first header comment looks like
    # "Age: 46  Sex: M" so that token 1 is the age and token 4 the sex - confirm.
    header_tokens = fields['comments'][0].split(' ')
    gender = header_tokens[4]
    age = int(header_tokens[1])

    data["Participant"] = participant_id
    data["Sample"] = range(len(data))
    data["Sampling_Rate"] = SAMPLING_RATE
    data["Database"] = "mit-long"
    data["Gender"] = gender
    data["Age"] = age

    # Getting annotations: keep only the beats labelled "N"
    anno = wfdb.rdann(record, 'atr')
    anno = anno.sample[np.array(anno.symbol) == "N"]
    anno = pd.DataFrame({"Rpeaks": anno})
    anno["Participant"] = participant_id
    anno["Sampling_Rate"] = SAMPLING_RATE
    anno["Database"] = "mit-long"
    anno["Gender"] = gender
    anno["Age"] = age

    # Select only 2h of recording (otherwise it's too big).
    # The annotation filter uses the same half-open window as the sample slice,
    # so every re-based R-peak is a valid row index of `data`
    # (0 <= Rpeaks < len(data)).
    data = data[WINDOW_START:WINDOW_END].reset_index(drop=True)
    in_window = (anno["Rpeaks"] >= WINDOW_START) & (anno["Rpeaks"] < WINDOW_END)
    anno = anno[in_window].reset_index(drop=True)
    anno["Rpeaks"] = anno["Rpeaks"] - WINDOW_START

    # Store with the rest
    dfs_ecg.append(data)
    dfs_rpeaks.append(anno)


# Save (to_csv returns None when given a path, so keep the frames in their own
# variables instead of binding the return value)
df_ecg = pd.concat(dfs_ecg)
df_ecg.to_csv("mit-long_ECGs.csv", index=False)
df_rpeaks = pd.concat(dfs_rpeaks)
df_rpeaks.to_csv("mit-long_Rpeaks.csv", index=False)


# Quick test
#import neurokit2 as nk
#nk.events_plot(anno["Rpeaks"][anno["Rpeaks"] <= 1000], data["ECG"][0:1001])

Participant: 1/7
Participant: 2/7
Participant: 3/7
Participant: 4/7
Participant: 5/7
Participant: 6/7
Participant: 7/7


Processing data for Participant: MIT-LongTerm_00
Processing data for Participant: MIT-LongTerm_01
Processing data for Participant: MIT-LongTerm_02
Processing data for Participant: MIT-LongTerm_03
Processing data for Participant: MIT-LongTerm_04
Processing data for Participant: MIT-LongTerm_05
Processing data for Participant: MIT-LongTerm_06
Extracted features saved to 'MIT_long_features.csv'.


In [18]:
import math
import numpy as np
import pandas as pd
import neurokit2 as nk

def _paired_valid(first, second):
    """Return both index sequences as float arrays restricted to beats where neither is NaN."""
    # Truncate to the common length so a mismatch cannot raise IndexError.
    n_beats = min(len(first), len(second))
    first = np.asarray(first, dtype=float)[:n_beats]
    second = np.asarray(second, dtype=float)[:n_beats]
    keep = ~(np.isnan(first) | np.isnan(second))
    return first[keep], second[keep]


def _mean_ignoring_nan(values):
    """Return the mean of the non-NaN entries, or NaN when there are none."""
    values = values[~np.isnan(values)]
    return np.mean(values) if values.size > 0 else np.nan


def extract_pqrst_features(ecg_signal, sample_rate):
    """Compute mean PQRST amplitude and interval features for one ECG segment.

    The signal is cleaned with ``nk.ecg_clean``, R-peaks are located with
    ``nk.ecg_peaks`` and the remaining waves with
    ``nk.ecg_delineate(..., method="peak")``.

    Parameters
    ----------
    ecg_signal : array-like
        ECG samples of the segment.
    sample_rate : int or float
        Sampling rate passed to NeuroKit; also used to convert sample
        distances into seconds.

    Returns
    -------
    dict
        ``{P,Q,S,T}_minus_R_amp_mean``: mean difference between the cleaned
        signal at the wave peak and at the R-peak of the same beat.
        ``PQ/QR/RS/ST_interval_mean``: mean distance in seconds between the
        two landmarks (PQ runs from the P onset to the Q peak).
        A feature is NaN when no beat has both landmarks defined.
    """
    cleaned_ecg = nk.ecg_clean(ecg_signal, sampling_rate=sample_rate)
    _, rpeaks = nk.ecg_peaks(cleaned_ecg, sampling_rate=sample_rate)
    _, waves_peak = nk.ecg_delineate(cleaned_ecg, rpeaks, sampling_rate=sample_rate, method="peak")

    signal = np.asarray(cleaned_ecg)
    r_peaks = rpeaks['ECG_R_Peaks']

    def calculate_amplitude_differences(peak_indices, reference_peak_indices):
        # Amplitude at the wave peak minus amplitude at the reference peak,
        # averaged over the beats where both indices are defined.
        peaks, references = _paired_valid(peak_indices, reference_peak_indices)
        amplitudes = signal[peaks.astype(int)] - signal[references.astype(int)]
        return _mean_ignoring_nan(amplitudes)

    def calculate_intervals(start_peaks, end_peaks):
        # Sample distance between the two landmarks, converted to seconds.
        starts, ends = _paired_valid(start_peaks, end_peaks)
        return _mean_ignoring_nan((ends - starts) / sample_rate)

    features = {}

    # Amplitude differences for P, Q, S, T peaks with respect to R-peaks
    for wave in ('P', 'Q', 'S', 'T'):
        features[wave + '_minus_R_amp_mean'] = calculate_amplitude_differences(
            waves_peak['ECG_' + wave + '_Peaks'], r_peaks
        )

    # Interval features
    features['PQ_interval_mean'] = calculate_intervals(waves_peak['ECG_P_Onsets'], waves_peak['ECG_Q_Peaks'])
    features['QR_interval_mean'] = calculate_intervals(waves_peak['ECG_Q_Peaks'], r_peaks)
    features['RS_interval_mean'] = calculate_intervals(r_peaks, waves_peak['ECG_S_Peaks'])
    features['ST_interval_mean'] = calculate_intervals(waves_peak['ECG_S_Peaks'], waves_peak['ECG_T_Peaks'])

    return features

import math

def process_ecg_data(df_ecg, sample_rate, segment_seconds=20):
    """Extract PQRST features from consecutive fixed-length windows per participant.

    Parameters
    ----------
    df_ecg : pandas.DataFrame
        Must contain the columns 'Participant', 'ECG', 'Database', 'Gender'
        and 'Age'. Rows are grouped by 'Participant'.
    sample_rate : int or float
        Sampling rate of the ECG, in samples per second.
    segment_seconds : int or float, optional
        Length of each analysis window in seconds (default 20).

    Returns
    -------
    pandas.DataFrame
        One row per full window: the features returned by
        ``extract_pqrst_features`` plus 'Participant', 'Sample' (row offset of
        the window start within the participant's rows), 'Sampling_Rate',
        'Database', 'Gender' and 'Age'. A trailing partial window is skipped.

    Raises
    ------
    ValueError
        If the window would contain no samples.
    """
    # Cast to int so a float sampling rate (e.g. 128.0) can be used as a range step.
    samples_per_segment = int(round(sample_rate * segment_seconds))
    if samples_per_segment <= 0:
        raise ValueError("sample_rate * segment_seconds must be at least one sample")

    # Prepare to collect all features
    all_features = []

    for _, group in df_ecg.groupby('Participant'):
        ecg = group['ECG'].values
        # Metadata is taken from the first row of the group and is the same
        # for every window of that participant.
        participant = group['Participant'].iloc[0]
        database = group['Database'].iloc[0]
        gender = group['Gender'].iloc[0]
        age = group['Age'].iloc[0]

        # Stop early enough that only full windows are visited.
        last_start = len(ecg) - samples_per_segment
        for start in range(0, last_start + 1, samples_per_segment):
            features = extract_pqrst_features(ecg[start:start + samples_per_segment], sample_rate)
            features.update({
                'Participant': participant,
                'Sample': start,
                'Sampling_Rate': sample_rate,
                'Database': database,
                'Gender': gender,
                'Age': age
            })
            all_features.append(features)

    return pd.DataFrame(all_features)

import os  # needed here to create the output folder; harmless if already imported

# Folder that receives one feature CSV per participant.
OUTPUT_DIR = "mit-long_features"

df_ecg = pd.read_csv("mit-long_ECGs.csv")

# drop NaNs
df_ecg = df_ecg.dropna()

# Sampling rate of the stored ECG, in Hz (matches the 'Sampling_Rate' column
# written by the conversion script)
sample_rate = 128

# Create the output folder up front: without it every to_csv call below raises,
# and the broad except would report each participant as failed.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Process and save features by participant in df_ecg, save each user's data in a separate CSV file
participants = df_ecg['Participant'].unique()
# Report how many participants are being processed and how many failed
pcount = len(participants)
print(f"Processing data for {pcount} participants.")
failed = 0
for participant in participants:
    print(f"Processing data for Participant: {participant}")
    participant_data = df_ecg[df_ecg['Participant'] == participant]
    output_path = f"{OUTPUT_DIR}/mit-long_features_{participant}.csv"
    try:
        features = process_ecg_data(participant_data, sample_rate)
        # Save the extracted features to a new CSV file
        features.to_csv(output_path, index=False)
        print(f"Extracted features saved to '{output_path}'.")
    except Exception as e:
        # Best effort: report the failure and carry on with the next participant.
        print(f"Error processing data for Participant: {participant}")
        print(e)
        failed += 1
print(f"Processing completed. {failed} participants failed to process.")

Processing data for 7 participants.
Processing data for Participant: mit-long_00
Extracted features saved to 'mit-long_features/mit-long_features_mit-long_00.csv'.
Processing data for Participant: mit-long_01
Extracted features saved to 'mit-long_features/mit-long_features_mit-long_01.csv'.
Processing data for Participant: mit-long_02
Extracted features saved to 'mit-long_features/mit-long_features_mit-long_02.csv'.
Processing data for Participant: mit-long_03
Extracted features saved to 'mit-long_features/mit-long_features_mit-long_03.csv'.
Processing data for Participant: mit-long_04
Extracted features saved to 'mit-long_features/mit-long_features_mit-long_04.csv'.
Processing data for Participant: mit-long_05
Extracted features saved to 'mit-long_features/mit-long_features_mit-long_05.csv'.
Processing data for Participant: mit-long_06
Extracted features saved to 'mit-long_features/mit-long_features_mit-long_06.csv'.
Processing completed. 0 participants failed to process.
