In [7]:
# -*- coding: utf-8 -*-
"""Script for formatting the MIT-Long-Term ECG Database

Steps:
    1. Download the ZIP database from https://physionet.org/content/ltdb/1.0.0/
    2. Open it with a zip-opener (WinZip, 7zip).
    3. Extract the folder of the same name (named 'mit-bih-long-term-ecg-database-1.0.0') to the same folder as this script.
    4. Run this script.

Credits:
    https://github.com/berndporr/py-ecg-detectors/blob/master/tester_MITDB.py by Bernd Porr
"""
import os

import numpy as np
import pandas as pd
import wfdb

data_files = ["mit-bih-arrhythmia-database-1.0.0/" + file for file in os.listdir("mit-bih-arrhythmia-database-1.0.0/") if ".dat" in file]

dfs_ecg = []
dfs_rpeaks = []

for participant, file in enumerate(data_files):
    print("Participant: " + str(participant + 1) + "/" + str(len(data_files)))

    # Get signal
    sample, fields = wfdb.rdsamp(file[:-4])
    data = pd.DataFrame({"ECG": wfdb.rdsamp(file[:-4])[0][:, 1]})
    
    gender = fields['comments'][0].split(' ')[1]
    age = int(fields['comments'][0].split(' ')[0])
    
    data["Participant"] = "mit-bih_%.2i" %(participant)
    data["Sample"] = range(len(data))
    data["Sampling_Rate"] = 360
    data["Database"] = "mit-bih"
    data["Gender"] = gender
    data["Age"] = age

    # getting annotations
    anno = wfdb.rdann(file[:-4], 'atr')
    anno = anno.sample[np.where(np.array(anno.symbol) == "N")[0]]
    anno = pd.DataFrame({"Rpeaks": anno})
    anno["Participant"] = "mit-bih_%.2i" %(participant)
    anno["Sampling_Rate"] = 360
    anno["Database"] = "mit-bih"
    anno["Gender"] = gender
    anno["Age"] = age

    # Select only 2h of recording (otherwise it's too big)
    # data = data[460800:460800*3].reset_index(drop=True)
    # anno = anno[(anno["Rpeaks"] > 460800) & (anno["Rpeaks"] <= 460800*3)].reset_index(drop=True)
    # anno["Rpeaks"] = anno["Rpeaks"] - 460800
    
    # Get the p wave features
    
    additional_path = "mit-bih-arrhythmia-database-1.0.0/" + "x_mitdb/"
    # print(file.replace("mit-bih-arrhythmia-database-1.0.0/", ""))
    additional_file = additional_path + "x_" + file.replace("mit-bih-arrhythmia-database-1.0.0/", "")
    if additional_file in os.listdir(additional_path):
        print("  - Additional recording detected.")
        additional_sample, additional_fields = wfdb.rdsamp(additional_file[:-4])
        additional_data = pd.DataFrame({"ECG": wfdb.rdsamp(additional_file[:-4])[0][:, 1]})
        
        additional_gender = additional_fields['comments'][0].split(' ')[1]
        additional_age = int(additional_fields['comments'][0].split(' ')[0])
        
        additional_data["Participant"] = "mit-bih_%.2i" %(participant)
        additional_data["Sample"] = range(len(data))
        additional_data["Sampling_Rate"] = 360
        additional_data["Database"] = "mit-bih"
        additional_data["Gender"] = additional_gender
        additional_data["Age"] = additional_age

        # getting annotations
        additional_anno = wfdb.rdann(additional_file[:-4], 'atr')
        additional_anno = anno.sample[np.where(np.array(anno.symbol) == "N")[0]]
        additional_anno = pd.DataFrame({"Rpeaks": anno})
        additional_anno["Participant"] = "mit-bih_%.2i" %(participant)
        additional_anno["Sampling_Rate"] = 360
        additional_anno["Database"] = "mit-bih"
        additional_anno["Gender"] = additional_gender
        additional_anno["Age"] = additional_age
        # Concatenate the dataframes with their respective dataframes
        
        data = pd.concat([data, additional_data], ignore_index=True)
        anno = pd.concat([anno, additional_anno], ignore_index=True)

    # Store with the rest
    dfs_ecg.append(data)
    dfs_rpeaks.append(anno)

# Save
df_ecg = pd.concat(dfs_ecg).to_csv("mit-bih_ECGs.csv", index=False)
dfs_rpeaks = pd.concat(dfs_rpeaks).to_csv("mit-bih_Rpeaks.csv", index=False)


# Quick test
#import neurokit2 as nk
#nk.events_plot(anno["Rpeaks"][anno["Rpeaks"] <= 1000], data["ECG"][0:1001])

Participant: 1/48
Participant: 2/48
Participant: 3/48
Participant: 4/48
Participant: 5/48
Participant: 6/48
Participant: 7/48
Participant: 8/48
Participant: 9/48
Participant: 10/48
Participant: 11/48
Participant: 12/48
Participant: 13/48
Participant: 14/48
Participant: 15/48
Participant: 16/48
Participant: 17/48
Participant: 18/48
Participant: 19/48
Participant: 20/48
Participant: 21/48
Participant: 22/48
Participant: 23/48
Participant: 24/48
Participant: 25/48
Participant: 26/48
Participant: 27/48
Participant: 28/48
Participant: 29/48
Participant: 30/48
Participant: 31/48
Participant: 32/48
Participant: 33/48
Participant: 34/48
Participant: 35/48
Participant: 36/48
Participant: 37/48
Participant: 38/48
Participant: 39/48
Participant: 40/48
Participant: 41/48
Participant: 42/48
Participant: 43/48
Participant: 44/48
Participant: 45/48
Participant: 46/48
Participant: 47/48
Participant: 48/48


In [7]:
import math
import numpy as np
import pandas as pd
import neurokit2 as nk

def extract_pqrst_features(ecg_signal, sample_rate):
    cleaned_ecg = nk.ecg_clean(ecg_signal, sampling_rate=sample_rate)
    _, rpeaks = nk.ecg_peaks(cleaned_ecg, sampling_rate=sample_rate)
    _, waves_peak = nk.ecg_delineate(cleaned_ecg, rpeaks, sampling_rate=sample_rate, method="peak")

    # Initialize dictionary to store features
    features = {}

    # Check and calculate amplitude differences where indices are valid
    def calculate_amplitude_differences(peak_indices, reference_peak_indices):
        valid_indices = [i for i in range(len(peak_indices)) if not math.isnan(peak_indices[i]) and not math.isnan(reference_peak_indices[i])]
        amplitudes = np.array([cleaned_ecg[int(peak_indices[i])] - cleaned_ecg[int(reference_peak_indices[i])] for i in valid_indices])
        return np.mean(amplitudes[~np.isnan(amplitudes)]) if amplitudes.size > 0 else np.nan

    # Calculate amplitude differences for P, Q, S, T peaks with respect to R-peaks
    features['P_minus_R_amp_mean'] = calculate_amplitude_differences(waves_peak['ECG_P_Peaks'], rpeaks['ECG_R_Peaks'])
    features['Q_minus_R_amp_mean'] = calculate_amplitude_differences(waves_peak['ECG_Q_Peaks'], rpeaks['ECG_R_Peaks'])
    features['S_minus_R_amp_mean'] = calculate_amplitude_differences(waves_peak['ECG_S_Peaks'], rpeaks['ECG_R_Peaks'])
    features['T_minus_R_amp_mean'] = calculate_amplitude_differences(waves_peak['ECG_T_Peaks'], rpeaks['ECG_R_Peaks'])

    # Calculate interval features
    def calculate_intervals(start_peaks, end_peaks):
        valid_indices = [i for i in range(len(start_peaks)) if not math.isnan(start_peaks[i]) and not math.isnan(end_peaks[i])]
        intervals = np.array([(end_peaks[i] - start_peaks[i]) / sample_rate for i in valid_indices])  # convert to seconds
        return np.mean(intervals[~np.isnan(intervals)]) if intervals.size > 0 else np.nan

    features['PQ_interval_mean'] = calculate_intervals(waves_peak['ECG_P_Onsets'], waves_peak['ECG_Q_Peaks'])
    features['QR_interval_mean'] = calculate_intervals(waves_peak['ECG_Q_Peaks'], rpeaks['ECG_R_Peaks'])
    features['RS_interval_mean'] = calculate_intervals(rpeaks['ECG_R_Peaks'], waves_peak['ECG_S_Peaks'])
    features['ST_interval_mean'] = calculate_intervals(waves_peak['ECG_S_Peaks'], waves_peak['ECG_T_Peaks'])

    return features

import math

def process_ecg_data(df_ecg, sample_rate):
    # Determine the number of samples per 20 seconds
    samples_per_minute = sample_rate * 20

    # Prepare to collect all features
    all_features = []
                
    for _, group in df_ecg.groupby('Participant'):
        for i in range(0, len(group), samples_per_minute):
            ecg_segment = group.iloc[i:i+samples_per_minute]
            if len(ecg_segment) == samples_per_minute:
                features = extract_pqrst_features(ecg_segment['ECG'].values, sample_rate)
                features.update({
                    'Participant': group['Participant'].iloc[0],
                    'Sample': i,
                    'Sampling_Rate': sample_rate,
                    'Database': group['Database'].iloc[0],
                    'Gender': group['Gender'].iloc[0],
                    'Age': group['Age'].iloc[0]
                })
                all_features.append(features)

    return pd.DataFrame(all_features)

df_ecg = pd.read_csv("mit-bih_ECGs.csv")

# drop NaNs
df_ecg = df_ecg.dropna()

# Assume the sampling rate needs to be defined
sample_rate = 360  # Define the correct sample rate for your data

# Process and extract features from the ECG data
# features_df = process_ecg_data(df_ecg, sample_rate)

# Process and save features by participant in df_ecg, save each user's data in a separate CSV file
participants = df_ecg['Participant'].unique()
# Report how many participants are being processed and how many failed
pcount = len(participants)
print(f"Processing data for {pcount} participants.")
failed = 0
for participant in participants:
    print(f"Processing data for Participant: {participant}")
    participant_data = df_ecg[df_ecg['Participant'] == participant]
    try:
        features = process_ecg_data(participant_data, sample_rate)
        # Save the extracted features to a new CSV file
        features.to_csv(f"mit-bih_features/mit-bih_features_{participant}.csv", index=False)
        print(f"Extracted features saved to 'mit-bih_features/mit-bih_features_{participant}.csv'.")
    except Exception as e:
        print(f"Error processing data for Participant: {participant}")
        print(e)
        # raise e
        failed += 1
print(f"Processing completed. {failed} participants failed to process.")

mit-bih_01, mit-bih_16, mit-bih_17, mit-bih_18, mit-bih_22, mit-bih_41, mit-bih_44 

Processing data for 48 participants.
Processing data for Participant: mit-bih_00
Extracted features saved to 'mit-bih_features/mit-bih_features_mit-bih_00.csv'.
Processing data for Participant: mit-bih_01


  warn(


Error processing data for Participant: mit-bih_01
cannot convert float NaN to integer
Processing data for Participant: mit-bih_02
Extracted features saved to 'mit-bih_features/mit-bih_features_mit-bih_02.csv'.
Processing data for Participant: mit-bih_03
Extracted features saved to 'mit-bih_features/mit-bih_features_mit-bih_03.csv'.
Processing data for Participant: mit-bih_04
Extracted features saved to 'mit-bih_features/mit-bih_features_mit-bih_04.csv'.
Processing data for Participant: mit-bih_05
Extracted features saved to 'mit-bih_features/mit-bih_features_mit-bih_05.csv'.
Processing data for Participant: mit-bih_06
Extracted features saved to 'mit-bih_features/mit-bih_features_mit-bih_06.csv'.
Processing data for Participant: mit-bih_07
Extracted features saved to 'mit-bih_features/mit-bih_features_mit-bih_07.csv'.
Processing data for Participant: mit-bih_08
Extracted features saved to 'mit-bih_features/mit-bih_features_mit-bih_08.csv'.
Processing data for Participant: mit-bih_09
Ex

  warn(
  warn(


Error processing data for Participant: mit-bih_17
cannot convert float NaN to integer
Processing data for Participant: mit-bih_18
Error processing data for Participant: mit-bih_18
'[7207] not in index'
Processing data for Participant: mit-bih_19
Extracted features saved to 'mit-bih_features/mit-bih_features_mit-bih_19.csv'.
Processing data for Participant: mit-bih_20
Extracted features saved to 'mit-bih_features/mit-bih_features_mit-bih_20.csv'.
Processing data for Participant: mit-bih_21
Extracted features saved to 'mit-bih_features/mit-bih_features_mit-bih_21.csv'.
Processing data for Participant: mit-bih_22


  warn(


Error processing data for Participant: mit-bih_22
cannot convert float NaN to integer
Processing data for Participant: mit-bih_23
Extracted features saved to 'mit-bih_features/mit-bih_features_mit-bih_23.csv'.
Processing data for Participant: mit-bih_24
Extracted features saved to 'mit-bih_features/mit-bih_features_mit-bih_24.csv'.
Processing data for Participant: mit-bih_25
Extracted features saved to 'mit-bih_features/mit-bih_features_mit-bih_25.csv'.
Processing data for Participant: mit-bih_26
Extracted features saved to 'mit-bih_features/mit-bih_features_mit-bih_26.csv'.
Processing data for Participant: mit-bih_27
Extracted features saved to 'mit-bih_features/mit-bih_features_mit-bih_27.csv'.
Processing data for Participant: mit-bih_28
Extracted features saved to 'mit-bih_features/mit-bih_features_mit-bih_28.csv'.
Processing data for Participant: mit-bih_29
Extracted features saved to 'mit-bih_features/mit-bih_features_mit-bih_29.csv'.
Processing data for Participant: mit-bih_30
Ex

In [15]:
import math
import numpy as np
import pandas as pd
import neurokit2 as nk

def extract_pqrst_features(ecg_signal, sample_rate):
    cleaned_ecg = nk.ecg_clean(ecg_signal, sampling_rate=sample_rate)
    _, rpeaks = nk.ecg_peaks(cleaned_ecg, sampling_rate=sample_rate)
    _, waves_peak = nk.ecg_delineate(cleaned_ecg, rpeaks, sampling_rate=sample_rate, method="peak")

    # Initialize dictionary to store features
    features = {}

    # Check and calculate amplitude differences where indices are valid
    def calculate_amplitude_differences(peak_indices, reference_peak_indices):
        valid_indices = [i for i in range(len(peak_indices)) if not math.isnan(peak_indices[i]) and not math.isnan(reference_peak_indices[i])]
        amplitudes = np.array([cleaned_ecg[int(peak_indices[i])] - cleaned_ecg[int(reference_peak_indices[i])] for i in valid_indices])
        return np.mean(amplitudes[~np.isnan(amplitudes)]) if amplitudes.size > 0 else np.nan

    # Calculate amplitude differences for P, Q, S, T peaks with respect to R-peaks
    features['P_minus_R_amp_mean'] = calculate_amplitude_differences(waves_peak['ECG_P_Peaks'], rpeaks['ECG_R_Peaks'])
    features['Q_minus_R_amp_mean'] = calculate_amplitude_differences(waves_peak['ECG_Q_Peaks'], rpeaks['ECG_R_Peaks'])
    features['S_minus_R_amp_mean'] = calculate_amplitude_differences(waves_peak['ECG_S_Peaks'], rpeaks['ECG_R_Peaks'])
    features['T_minus_R_amp_mean'] = calculate_amplitude_differences(waves_peak['ECG_T_Peaks'], rpeaks['ECG_R_Peaks'])

    # Calculate interval features
    def calculate_intervals(start_peaks, end_peaks):
        valid_indices = [i for i in range(len(start_peaks)) if not math.isnan(start_peaks[i]) and not math.isnan(end_peaks[i])]
        intervals = np.array([(end_peaks[i] - start_peaks[i]) / sample_rate for i in valid_indices])  # convert to seconds
        return np.mean(intervals[~np.isnan(intervals)]) if intervals.size > 0 else np.nan

    features['PQ_interval_mean'] = calculate_intervals(waves_peak['ECG_P_Onsets'], waves_peak['ECG_Q_Peaks'])
    features['QR_interval_mean'] = calculate_intervals(waves_peak['ECG_Q_Peaks'], rpeaks['ECG_R_Peaks'])
    features['RS_interval_mean'] = calculate_intervals(rpeaks['ECG_R_Peaks'], waves_peak['ECG_S_Peaks'])
    features['ST_interval_mean'] = calculate_intervals(waves_peak['ECG_S_Peaks'], waves_peak['ECG_T_Peaks'])

    return features

import math

def process_ecg_data(df_ecg, sample_rate):
    # Determine the number of samples per 20 seconds
    samples_per_minute = sample_rate * 24

    # Prepare to collect all features
    all_features = []
                
    for _, group in df_ecg.groupby('Participant'):
        for i in range(0, len(group), samples_per_minute):
            ecg_segment = group.iloc[i:i+samples_per_minute]
            if len(ecg_segment) == samples_per_minute:
                features = extract_pqrst_features(ecg_segment['ECG'].values, sample_rate)
                features.update({
                    'Participant': group['Participant'].iloc[0],
                    'Sample': i,
                    'Sampling_Rate': sample_rate,
                    'Database': group['Database'].iloc[0],
                    'Gender': group['Gender'].iloc[0],
                    'Age': group['Age'].iloc[0]
                })
                all_features.append(features)

    return pd.DataFrame(all_features)

df_ecg = pd.read_csv("mit-bih_ECGs.csv")

# drop NaNs
df_ecg = df_ecg.dropna()

# Assume the sampling rate needs to be defined
sample_rate = 360  # Define the correct sample rate for your data

# Process and extract features from the ECG data
# features_df = process_ecg_data(df_ecg, sample_rate)

# Process and save features by participant in df_ecg, save each user's data in a separate CSV file
participants = df_ecg['Participant'].unique()
# Report how many participants are being processed and how many failed
pcount = len(participants)
print(f"Processing data for {pcount} participants.")
failed = 0
for participant in participants:
    if participant in ["mit-bih_16", "mit-bih_17"]:
        print(f"Processing data for Participant: {participant}")
        participant_data = df_ecg[df_ecg['Participant'] == participant]
        try:
            features = process_ecg_data(participant_data, sample_rate)
            # Save the extracted features to a new CSV file
            features.to_csv(f"mit-bih_features/mit-bih_features_{participant}.csv", index=False)
            print(f"Extracted features saved to 'mit-bih_features/mit-bih_features_{participant}.csv'.")
        except Exception as e:
            print(f"Error processing data for Participant: {participant}")
            print(e)
            # raise e
            failed += 1
    print(f"Processing completed. {failed} participants failed to process.")

Processing data for 48 participants.
Processing completed. 0 participants failed to process.
Processing completed. 0 participants failed to process.
Processing completed. 0 participants failed to process.
Processing completed. 0 participants failed to process.
Processing completed. 0 participants failed to process.
Processing completed. 0 participants failed to process.
Processing completed. 0 participants failed to process.
Processing completed. 0 participants failed to process.
Processing completed. 0 participants failed to process.
Processing completed. 0 participants failed to process.
Processing completed. 0 participants failed to process.
Processing completed. 0 participants failed to process.
Processing completed. 0 participants failed to process.
Processing completed. 0 participants failed to process.
Processing completed. 0 participants failed to process.
Processing completed. 0 participants failed to process.
Processing data for Participant: mit-bih_16
Error processing data f

  warn(


Error processing data for Participant: mit-bih_17
cannot convert float NaN to integer
Processing completed. 2 participants failed to process.
Processing completed. 2 participants failed to process.
Processing completed. 2 participants failed to process.
Processing completed. 2 participants failed to process.
Processing completed. 2 participants failed to process.
Processing completed. 2 participants failed to process.
Processing completed. 2 participants failed to process.
Processing completed. 2 participants failed to process.
Processing completed. 2 participants failed to process.
Processing completed. 2 participants failed to process.
Processing completed. 2 participants failed to process.
Processing completed. 2 participants failed to process.
Processing completed. 2 participants failed to process.
Processing completed. 2 participants failed to process.
Processing completed. 2 participants failed to process.
Processing completed. 2 participants failed to process.
Processing complet

  warn(


In [16]:
# rename the files "f"mit-bih_features/smart_features_{participant}.csv"" to "f"mit-bih_features/mit-bih_features_{participant}.csv""
import os
# give me the code
directory = "mit-bih_features/"
for filename in os.listdir(directory):
    if filename.startswith("smart_features_"):
        os.rename(directory + filename, directory + filename.replace("smart_features_", "mit-bih_features_"))
