In [4]:
# -*- coding: utf-8 -*-
"""Script for formatting the MIT-Long-Term ECG Database

Steps:
    1. Download the ZIP database from https://physionet.org/content/ltdb/1.0.0/
    2. Open it with a zip-opener (WinZip, 7zip).
    3. Extract the folder of the same name (named 'mit-bih-long-term-ecg-database-1.0.0') to the same folder as this script.
    4. Run this script.

Credits:
    https://github.com/berndporr/py-ecg-detectors/blob/master/tester_MITDB.py by Bernd Porr
"""
import os

import numpy as np
import pandas as pd
import wfdb

def extract_age_and_gender_from_comments(comments):
    age = None
    gender = None

    if comments:
        # Assuming comments is a list with a single string element
        comment = comments[0]  # Access the first (and assumed only) element

        # Split the comment by spaces and iterate over parts
        parts = comment.split()
        for i, part in enumerate(parts):
            if part == 'Age:':
                age = int(parts[i + 1])  # The next part is the age number
            if part == 'Sex:':
                gender = parts[i + 1]  # The next part is the gender

    return age, gender

data_files = ["chfdb/" + file for file in os.listdir("chfdb") if ".dat" in file]



dfs_ecg = []
dfs_rpeaks = []

for participant, file in enumerate(data_files):

    print("Participant: " + str(participant + 1) + "/" + str(len(data_files)))
    print("File: " + file)
    age, gender = extract_age_and_gender_from_comments(wfdb.rdheader(file[:-4]).comments)

    # Get signal
    sample, fields = wfdb.rdsamp(file[:-4])
    print(fields)
    # print(sample)
    data = pd.DataFrame({"ECG": wfdb.rdsamp(file[:-4])[0][:, 1]})
    
    data["Participant"] = "chfdb_%.2i" %(participant)
    data["Sample"] = range(len(data))
    data["Sampling_Rate"] = 250
    data["Database"] = "chfdb"
    data['Age'] = age
    data['Gender'] = gender

    # # getting annotations
    # anno = wfdb.rdann(file[:-4], 'qrs')
    # anno = anno.sample[np.where(np.array(anno.symbol) == "N")[0]]
    # anno = pd.DataFrame({"Rpeaks": anno})
    # anno["Participant"] = "smart_%.2i" %(participant)
    # anno["Sampling_Rate"] = 250
    # anno["Database"] = "chfdb"

    # Select only 2h of recording (otherwise it's too big)
    data = data[460800:460800+250*60*120].reset_index(drop=True)
    # anno = anno[(anno["Rpeaks"] > 460800) & (anno["Rpeaks"] <= 460800*3)].reset_index(drop=True)
    # anno["Rpeaks"] = anno["Rpeaks"] - 460800
    
    # Get the p wave features
    


    # Store with the rest
    dfs_ecg.append(data)
    # dfs_rpeaks.append(anno)



# Save
df_ecg = pd.concat(dfs_ecg).to_csv("chfdb_ECGs.csv", index=False)
# dfs_rpeaks = pd.concat(dfs_rpeaks).to_csv("chfdb-Rpeaks.csv", index=False)


# Quick test
#import neurokit2 as nk
#nk.events_plot(anno["Rpeaks"][anno["Rpeaks"] <= 1000], data["ECG"][0:1001])

Participant: 1/15
File: chfdb/chf02.dat
{'fs': 250, 'sig_len': 17793024, 'n_sig': 2, 'base_date': None, 'base_time': datetime.time(13, 22), 'units': ['mV', 'mV'], 'sig_name': ['ECG1', 'ECG2'], 'comments': ['Age: 61  Sex: F  NYHA class: III-IV']}
Participant: 2/15
File: chfdb/chf03.dat
{'fs': 250, 'sig_len': 17998848, 'n_sig': 2, 'base_date': None, 'base_time': datetime.time(8, 35), 'units': ['mV', 'mV'], 'sig_name': ['ECG1', 'ECG2'], 'comments': ['Age: 63  Sex: M  NYHA class: III-IV']}
Participant: 3/15
File: chfdb/chf01.dat
{'fs': 250, 'sig_len': 17994491, 'n_sig': 2, 'base_date': None, 'base_time': datetime.time(10, 0), 'units': ['mV', 'mV'], 'sig_name': ['ECG1', 'ECG2'], 'comments': ['Age: 71  Sex: M  NYHA class: III-IV']}
Participant: 4/15
File: chfdb/chf15.dat
{'fs': 250, 'sig_len': 17993443, 'n_sig': 2, 'base_date': None, 'base_time': datetime.time(9, 10), 'units': ['mV', 'mV'], 'sig_name': ['ECG1', 'ECG2'], 'comments': ['Age: 53  Sex: M  NYHA class: III-IV']}
Participant: 5/15
F

In [1]:
import math
import numpy as np
import pandas as pd
import neurokit2 as nk

def extract_pqrst_features(ecg_signal, sample_rate):
    cleaned_ecg = nk.ecg_clean(ecg_signal, sampling_rate=sample_rate)
    _, rpeaks = nk.ecg_peaks(cleaned_ecg, sampling_rate=sample_rate)
    _, waves_peak = nk.ecg_delineate(cleaned_ecg, rpeaks, sampling_rate=sample_rate, method="peak")

    # Initialize dictionary to store features
    features = {}

    # Check and calculate amplitude differences where indices are valid
    def calculate_amplitude_differences(peak_indices, reference_peak_indices):
        valid_indices = [i for i in range(len(peak_indices)) if not math.isnan(peak_indices[i]) and not math.isnan(reference_peak_indices[i])]
        amplitudes = np.array([cleaned_ecg[int(peak_indices[i])] - cleaned_ecg[int(reference_peak_indices[i])] for i in valid_indices])
        return np.mean(amplitudes[~np.isnan(amplitudes)]) if amplitudes.size > 0 else np.nan

    # Calculate amplitude differences for P, Q, S, T peaks with respect to R-peaks
    features['P_minus_R_amp_mean'] = calculate_amplitude_differences(waves_peak['ECG_P_Peaks'], rpeaks['ECG_R_Peaks'])
    features['Q_minus_R_amp_mean'] = calculate_amplitude_differences(waves_peak['ECG_Q_Peaks'], rpeaks['ECG_R_Peaks'])
    features['S_minus_R_amp_mean'] = calculate_amplitude_differences(waves_peak['ECG_S_Peaks'], rpeaks['ECG_R_Peaks'])
    features['T_minus_R_amp_mean'] = calculate_amplitude_differences(waves_peak['ECG_T_Peaks'], rpeaks['ECG_R_Peaks'])

    # Calculate interval features
    def calculate_intervals(start_peaks, end_peaks):
        valid_indices = [i for i in range(len(start_peaks)) if not math.isnan(start_peaks[i]) and not math.isnan(end_peaks[i])]
        intervals = np.array([(end_peaks[i] - start_peaks[i]) / sample_rate for i in valid_indices])  # convert to seconds
        return np.mean(intervals[~np.isnan(intervals)]) if intervals.size > 0 else np.nan

    features['PQ_interval_mean'] = calculate_intervals(waves_peak['ECG_P_Onsets'], waves_peak['ECG_Q_Peaks'])
    features['QR_interval_mean'] = calculate_intervals(waves_peak['ECG_Q_Peaks'], rpeaks['ECG_R_Peaks'])
    features['RS_interval_mean'] = calculate_intervals(rpeaks['ECG_R_Peaks'], waves_peak['ECG_S_Peaks'])
    features['ST_interval_mean'] = calculate_intervals(waves_peak['ECG_S_Peaks'], waves_peak['ECG_T_Peaks'])

    return features

import math

def process_ecg_data(df_ecg, sample_rate):
    # Determine the number of samples per 20 seconds
    samples_per_minute = sample_rate * 20

    # Prepare to collect all features
    all_features = []
                
    for _, group in df_ecg.groupby('Participant'):
        for i in range(0, len(group), samples_per_minute):
            ecg_segment = group.iloc[i:i+samples_per_minute]
            if len(ecg_segment) == samples_per_minute:
                features = extract_pqrst_features(ecg_segment['ECG'].values, sample_rate)
                features.update({
                    'Participant': group['Participant'].iloc[0],
                    'Sample': i,
                    'Sampling_Rate': sample_rate,
                    'Database': group['Database'].iloc[0],
                    'Gender': group['Gender'].iloc[0],
                    'Age': group['Age'].iloc[0]
                })
                all_features.append(features)

    return pd.DataFrame(all_features)


df_ecg = pd.read_csv("chfdb_ECGs.csv")

# drop NaNs
df_ecg = df_ecg.dropna()

# Assume the sampling rate needs to be defined
sample_rate = 250  # Define the correct sample rate for your data

# Process and extract features from the ECG data
# features_df = process_ecg_data(df_ecg, sample_rate)

# Process and save features by participant in df_ecg, save each user's data in a separate CSV file
participants = df_ecg['Participant'].unique()
# Report how many participants are being processed and how many failed
pcount = len(participants)
print(f"Processing data for {pcount} participants.")
failed = 0
for participant in participants:
    print(f"Processing data for Participant: {participant}")
    participant_data = df_ecg[df_ecg['Participant'] == participant]
    try:
        features = process_ecg_data(participant_data, sample_rate)
        # Save the extracted features to a new CSV file
        features.to_csv(f"chfdb_features/chfdb_features_{participant}.csv", index=False)
        print(f"Extracted features saved to 'chfdb_features/chfdb_features_{participant}.csv'.")
    except Exception as e:
        print(f"Error processing data for Participant: {participant}")
        print(e)
        failed += 1
print(f"Processing completed. {failed} participants failed to process.")
        
        
        


# # Save the extracted features to a new CSV file
# features_df.to_csv("smart_features.csv", index=False)
# print("Extracted features saved to 'smart_features.csv'.")

Processing data for 15 participants.
Processing data for Participant: chfdb_00
Extracted features saved to 'chfdb_features/chfdb_features_chfdb_00.csv'.
Processing data for Participant: chfdb_01
Extracted features saved to 'chfdb_features/chfdb_features_chfdb_01.csv'.
Processing data for Participant: chfdb_02


  warn(


Error processing data for Participant: chfdb_02
cannot convert float NaN to integer
Processing data for Participant: chfdb_03
Extracted features saved to 'chfdb_features/chfdb_features_chfdb_03.csv'.
Processing data for Participant: chfdb_04


  warn(


Error processing data for Participant: chfdb_04
cannot convert float NaN to integer
Processing data for Participant: chfdb_05
Extracted features saved to 'chfdb_features/chfdb_features_chfdb_05.csv'.
Processing data for Participant: chfdb_06
Extracted features saved to 'chfdb_features/chfdb_features_chfdb_06.csv'.
Processing data for Participant: chfdb_07


  warn(


Error processing data for Participant: chfdb_07
cannot convert float NaN to integer
Processing data for Participant: chfdb_08
Extracted features saved to 'chfdb_features/chfdb_features_chfdb_08.csv'.
Processing data for Participant: chfdb_09
Extracted features saved to 'chfdb_features/chfdb_features_chfdb_09.csv'.
Processing data for Participant: chfdb_10
Extracted features saved to 'chfdb_features/chfdb_features_chfdb_10.csv'.
Processing data for Participant: chfdb_11
Extracted features saved to 'chfdb_features/chfdb_features_chfdb_11.csv'.
Processing data for Participant: chfdb_12
Extracted features saved to 'chfdb_features/chfdb_features_chfdb_12.csv'.
Processing data for Participant: chfdb_13
Extracted features saved to 'chfdb_features/chfdb_features_chfdb_13.csv'.
Processing data for Participant: chfdb_14
Extracted features saved to 'chfdb_features/chfdb_features_chfdb_14.csv'.
Processing completed. 3 participants failed to process.
