In [16]:
# -*- coding: utf-8 -*-
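"""Script for formatting the 'smart' ECG database

NOTE: this header previously described the MIT-BIH Long-Term ECG Database
(https://physionet.org/content/ltdb/1.0.0/, folder 'mit-bih-long-term-ecg-database-1.0.0'),
but the code below reads its records from a local folder named 'smart'.

Steps:
    1. Obtain the database records (the '.dat' signal files together with their header files and 'qrs' annotation files) and the tab-separated 'info.txt' metadata file.
    2. If they come as a ZIP archive, open it with a zip-opener (WinZip, 7zip).
    3. Put all of these files in a folder named 'smart' located in the same folder as this script.
    4. Run this script.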

Credits:
    https://github.com/berndporr/py-ecg-detectors/blob/master/tester_MITDB.py by Bernd Porr
"""
import os

import numpy as np
import pandas as pd
import wfdb

# The recordings are stored with a sampling rate of 128 Hz; 460800 samples is therefore one hour.
SAMPLING_RATE = 128
WINDOW_START = 460800        # first sample that is kept (start of hour 1)
WINDOW_STOP = 460800 * 3     # one past the last sample that is kept (end of hour 3)

# os.listdir() returns entries in arbitrary order, so sort to make the participant ids
# reproducible. Match on the suffix because the record name is recovered below with file[:-4].
data_files = sorted("smart/" + file for file in os.listdir("smart") if file.endswith(".dat"))

dfs_ecg = []
dfs_rpeaks = []

# Per-record metadata: only the first 140 rows of smart/info.txt are read.
# The loop below uses the "Record", "Age" and "Gender" columns.
info = pd.read_csv("smart/info.txt", sep="\t", nrows=140)
# Compare record ids as text so the lookup works whether pandas parsed the column
# as numbers or as strings.
record_ids = info["Record"].astype(str)

for participant, file in enumerate(data_files):
    print("Participant: " + str(participant + 1) + "/" + str(len(data_files)))
    record_path = file[:-4]  # path without the ".dat" suffix, as expected by wfdb

    # Record number encoded in the file name (leading zeros are dropped by int()).
    file_num = str(int(file.split("/")[1].split(".")[0]))

    # Age and gender of the matching row in info.txt (IndexError if there is none).
    record_info = info[record_ids == file_num]
    age = record_info["Age"].values[0]
    gender = record_info["Gender"].values[0]

    # Get signal: read the record once and keep the channel at column index 1.
    sample = wfdb.rdsamp(record_path)
    data = pd.DataFrame({"ECG": sample[0][:, 1]})

    data["Participant"] = "smart_%.2i" %(participant)
    data["Sample"] = range(len(data))
    data["Sampling_Rate"] = SAMPLING_RATE
    data["Database"] = "smart"
    data["Age"] = age
    data["Gender"] = gender

    # Getting annotations: keep only the beats labelled "N" in the 'qrs' annotation file.
    anno = wfdb.rdann(record_path, 'qrs')
    anno = anno.sample[np.where(np.array(anno.symbol) == "N")[0]]
    anno = pd.DataFrame({"Rpeaks": anno})
    anno["Participant"] = "smart_%.2i" %(participant)
    anno["Sampling_Rate"] = SAMPLING_RATE
    anno["Database"] = "smart"
    anno["Age"] = age
    anno["Gender"] = gender

    # Select only 2h of recording (otherwise it's too big).
    # The R-peak filter uses the same half-open interval [WINDOW_START, WINDOW_STOP) as the
    # signal slice, so every kept peak maps to a row of `data` after the shift below.
    data = data[WINDOW_START:WINDOW_STOP].reset_index(drop=True)
    in_window = (anno["Rpeaks"] >= WINDOW_START) & (anno["Rpeaks"] < WINDOW_STOP)
    anno = anno[in_window].reset_index(drop=True)
    anno["Rpeaks"] = anno["Rpeaks"] - WINDOW_START

    # Store with the rest
    dfs_ecg.append(data)
    dfs_rpeaks.append(anno)


# Save (to_csv returns None when given a path, so keep the frames in their own names)
df_ecg = pd.concat(dfs_ecg)
df_ecg.to_csv("smart_ECGs.csv", index=False)
df_rpeaks = pd.concat(dfs_rpeaks)
df_rpeaks.to_csv("smart_Rpeaks.csv", index=False)


# Quick test
#import neurokit2 as nk
#nk.events_plot(anno["Rpeaks"][anno["Rpeaks"] <= 1000], data["ECG"][0:1001])

Participant: 1/139
Participant: 2/139
Participant: 3/139
Participant: 4/139
Participant: 5/139
Participant: 6/139
Participant: 7/139
Participant: 8/139
Participant: 9/139
Participant: 10/139
Participant: 11/139
Participant: 12/139
Participant: 13/139
Participant: 14/139
Participant: 15/139
Participant: 16/139
Participant: 17/139
Participant: 18/139
Participant: 19/139
Participant: 20/139
Participant: 21/139
Participant: 22/139
Participant: 23/139
Participant: 24/139
Participant: 25/139
Participant: 26/139
Participant: 27/139
Participant: 28/139
Participant: 29/139
Participant: 30/139
Participant: 31/139
Participant: 32/139
Participant: 33/139
Participant: 34/139
Participant: 35/139
Participant: 36/139
Participant: 37/139
Participant: 38/139
Participant: 39/139
Participant: 40/139
Participant: 41/139
Participant: 42/139
Participant: 43/139
Participant: 44/139
Participant: 45/139
Participant: 46/139
Participant: 47/139
Participant: 48/139
Participant: 49/139
Participant: 50/139
Participa

In [17]:
import math
import os

import neurokit2 as nk
import numpy as np
import pandas as pd

def extract_pqrst_features(ecg_signal, sample_rate):
    """Summarise the PQRST landmarks of one ECG segment as mean amplitudes and intervals.

    The segment is cleaned with ``nk.ecg_clean``, R-peaks are located with
    ``nk.ecg_peaks`` and the other landmarks with ``nk.ecg_delineate``
    (``method="peak"``). All amplitudes are read from the cleaned signal.

    Args:
        ecg_signal: ECG samples of a single segment.
        sample_rate: Sampling rate passed to the detectors and used to convert
            sample offsets into seconds.

    Returns:
        dict: Four ``*_minus_R_amp_mean`` entries (landmark amplitude minus R-peak
        amplitude) followed by four ``*_interval_mean`` entries (seconds). Each value
        is averaged over the beats where both landmarks are non-NaN, and is ``np.nan``
        when there is no such beat.
    """
    signal = nk.ecg_clean(ecg_signal, sampling_rate=sample_rate)
    _, r_info = nk.ecg_peaks(signal, sampling_rate=sample_rate)
    _, landmarks = nk.ecg_delineate(signal, r_info, sampling_rate=sample_rate, method="peak")
    r_locs = r_info['ECG_R_Peaks']

    def _usable_beats(first, second):
        # Beat positions (indexed along `first`) where neither landmark is NaN.
        return [k for k in range(len(first)) if not (math.isnan(first[k]) or math.isnan(second[k]))]

    def _mean_or_nan(values):
        # Mean of the non-NaN entries; NaN when no beat contributed at all.
        if values.size == 0:
            return np.nan
        return np.mean(values[~np.isnan(values)])

    def _amplitude_gap(wave_locs, ref_locs):
        # Cleaned-signal amplitude at the wave landmark minus the amplitude at the reference.
        gaps = [
            signal[int(wave_locs[k])] - signal[int(ref_locs[k])]
            for k in _usable_beats(wave_locs, ref_locs)
        ]
        return _mean_or_nan(np.array(gaps))

    def _duration(from_locs, to_locs):
        # Sample distance between two landmarks, converted to seconds.
        spans = [
            (to_locs[k] - from_locs[k]) / sample_rate
            for k in _usable_beats(from_locs, to_locs)
        ]
        return _mean_or_nan(np.array(spans))

    features = {}

    # Amplitude of the P, Q, S and T peaks relative to the R-peak of the same beat.
    amplitude_specs = (
        ('P_minus_R_amp_mean', 'ECG_P_Peaks'),
        ('Q_minus_R_amp_mean', 'ECG_Q_Peaks'),
        ('S_minus_R_amp_mean', 'ECG_S_Peaks'),
        ('T_minus_R_amp_mean', 'ECG_T_Peaks'),
    )
    for feature_name, landmark_key in amplitude_specs:
        features[feature_name] = _amplitude_gap(landmarks[landmark_key], r_locs)

    # Durations between consecutive landmarks (start landmark, end landmark).
    interval_specs = (
        ('PQ_interval_mean', landmarks['ECG_P_Onsets'], landmarks['ECG_Q_Peaks']),
        ('QR_interval_mean', landmarks['ECG_Q_Peaks'], r_locs),
        ('RS_interval_mean', r_locs, landmarks['ECG_S_Peaks']),
        ('ST_interval_mean', landmarks['ECG_S_Peaks'], landmarks['ECG_T_Peaks']),
    )
    for feature_name, start_locs, end_locs in interval_specs:
        features[feature_name] = _duration(start_locs, end_locs)

    return features

import math

def process_ecg_data(df_ecg, sample_rate, segment_seconds=20):
    """Extract PQRST features from consecutive fixed-length segments of each participant.

    Every participant's recording is cut into non-overlapping segments of
    ``segment_seconds`` seconds; a trailing segment that is shorter than that is
    skipped. Each full segment is passed to ``extract_pqrst_features``.

    Args:
        df_ecg: DataFrame with the columns 'Participant', 'ECG', 'Database',
            'Gender' and 'Age'.
        sample_rate: Sampling rate of the 'ECG' column, in samples per second.
        segment_seconds: Length of one segment in seconds (default 20, the value
            that was previously hard-coded).

    Returns:
        pandas.DataFrame: One row per full segment holding the extracted features plus
        'Participant', 'Sample' (row offset of the segment start within the
        participant's data), 'Sampling_Rate', 'Database', 'Gender' and 'Age'.

    Raises:
        ValueError: If the segment length works out to less than one sample.
    """
    samples_per_segment = int(round(sample_rate * segment_seconds))
    if samples_per_segment <= 0:
        raise ValueError("sample_rate * segment_seconds must be at least one sample")

    # Prepare to collect all features
    all_features = []

    for _, group in df_ecg.groupby('Participant'):
        # These values are constant within a group, so look them up once per participant.
        ecg_values = group['ECG'].values
        metadata = {
            'Participant': group['Participant'].iloc[0],
            'Database': group['Database'].iloc[0],
            'Gender': group['Gender'].iloc[0],
            'Age': group['Age'].iloc[0],
        }

        # Only iterate over starts that leave room for a complete segment.
        last_start = len(ecg_values) - samples_per_segment
        for start in range(0, last_start + 1, samples_per_segment):
            segment = ecg_values[start:start + samples_per_segment]
            features = extract_pqrst_features(segment, sample_rate)
            features.update({
                'Participant': metadata['Participant'],
                'Sample': start,
                'Sampling_Rate': sample_rate,
                'Database': metadata['Database'],
                'Gender': metadata['Gender'],
                'Age': metadata['Age'],
            })
            all_features.append(features)

    return pd.DataFrame(all_features)


df_ecg = pd.read_csv("smart_ECGs.csv")

# drop NaNs
df_ecg = df_ecg.dropna()

# Sampling rate passed to the feature extraction
sample_rate = 128  # Define the correct sample rate for your data

# Make sure the output folder exists; otherwise every to_csv() below would raise and
# each participant would be reported as failed by the broad except clause.
os.makedirs("smart_features", exist_ok=True)

# Process and save features by participant in df_ecg, save each user's data in a separate CSV file
participants = df_ecg['Participant'].unique()
# Report how many participants are being processed and how many failed
pcount = len(participants)
print(f"Processing data for {pcount} participants.")
failed = 0
# One groupby pass instead of re-filtering the whole table for every participant;
# sort=False keeps the order of first appearance, i.e. the order of `participants`.
for participant, participant_data in df_ecg.groupby('Participant', sort=False):
    print(f"Processing data for Participant: {participant}")
    output_path = f"smart_features/smart_features_{participant}.csv"
    try:
        features = process_ecg_data(participant_data, sample_rate)
        # Save the extracted features to a new CSV file
        features.to_csv(output_path, index=False)
        print(f"Extracted features saved to '{output_path}'.")
    except Exception as e:
        # Best effort: report the problem and carry on with the next participant.
        print(f"Error processing data for Participant: {participant}")
        print(e)
        failed += 1
print(f"Processing completed. {failed} participants failed to process.")
        
        
        


# # Save the extracted features to a new CSV file
# features_df.to_csv("smart_features.csv", index=False)
# print("Extracted features saved to 'smart_features.csv'.")

Processing data for 139 participants.
Processing data for Participant: smart_00
Extracted features saved to 'smart_features/smart_features_smart_00.csv'.
Processing data for Participant: smart_01
Extracted features saved to 'smart_features/smart_features_smart_01.csv'.
Processing data for Participant: smart_02
Extracted features saved to 'smart_features/smart_features_smart_02.csv'.
Processing data for Participant: smart_03
Extracted features saved to 'smart_features/smart_features_smart_03.csv'.
Processing data for Participant: smart_04
Extracted features saved to 'smart_features/smart_features_smart_04.csv'.
Processing data for Participant: smart_05
Extracted features saved to 'smart_features/smart_features_smart_05.csv'.
Processing data for Participant: smart_06
Extracted features saved to 'smart_features/smart_features_smart_06.csv'.
Processing data for Participant: smart_07
Extracted features saved to 'smart_features/smart_features_smart_07.csv'.
Processing data for Participant: s

In [3]:
# Load the ECG table under the name used when it is written and read elsewhere in this
# notebook ("smart_ECGs.csv", with an underscore rather than a hyphen).
dp = pd.read_csv("smart_ECGs.csv")

In [4]:
# Keep only the rows whose "Participant" value is "smart_02"
is_smart_02 = dp["Participant"] == "smart_02"
data = dp.loc[is_smart_02]


In [7]:
# Write the selected rows to "smart_02.csv" without the index column
# (this cell only exports the data; it does not itself check for missing values)
data.to_csv("smart_02.csv", index=False)