In [1]:
import os
import pandas as pd
import wfdb

# Define the base path where the dataset is located
base_path = "brno-university-of-technology-ecg-quality-database-but-qdb-1.0.0"

# Gather all .dat files from the subdirectories under the base path
data_files = []
for root, dirs, files in os.walk(base_path):
    for file in files:
        if file.endswith("_ECG.dat"):
            data_files.append(os.path.join(root, file))

dfs_ecg = []
dfs_rpeaks = []

info = pd.read_csv("brno-university-of-technology-ecg-quality-database-but-qdb-1.0.0/subject-info.csv", sep=';')

# Process each data file
for participant, file in enumerate(data_files):
    print("Processing data for Participant ID:", file.split('/')[-2])
   

    # Get signal
    record = wfdb.rdsamp(file[:-4])  # Remove '.dat' and pass the path without the extension
    
    age = info[info['ID'] == int(file.split('/')[-2])]['Age'].values[0]
    gender = info[info['ID'] == int(file.split('/')[-2])]['Gender'].values[0]
    
    print(record[0])
    data = pd.DataFrame({"ECG": record[0][:, 0]})  # Adjust index if needed based on lead configuration
    
    # Metadata
    data["Participant"] = "brno_%.2i" %(participant)  # Participant ID from the folder name
    data["Sample"] = range(len(data))
    data["Sampling_Rate"] = record[1]['fs']  # Sampling rate from the record metadata
    data["Database"] = "brno"
    data = data[:record[1]['fs']*60*120].reset_index(drop=True)
    data['Age'] = age
    data['Gender'] = gender

    # # Getting annotations
    # try:
    #     anno = wfdb.rdann(file[:-4], 'atr')  # Use correct annotation extension if different
    #     valid_rpeaks = anno.sample[np.isin(anno.symbol, ["N"])]  # Assuming 'N' denotes normal beats
    #     anno_df = pd.DataFrame({"Rpeaks": valid_rpeaks})
    #     anno_df["Participant"] = file.split('/')[-2]
    #     anno_df["Sampling_Rate"] = record[1]['fs']
    #     anno_df["Database"] = "BUT QDB"
    # except FileNotFoundError:
    #     print("Annotation file not found for", file.split('/')[-2])
    #     continue
    
    dfs_ecg.append(data)
    # dfs_rpeaks.append(anno_df)

# Save the concatenated DataFrame of all ECGs and R-peaks to CSV files
pd.concat(dfs_ecg).to_csv("brno_ECGs.csv", index=False)
# pd.concat(dfs_rpeaks).to_csv("BUT_QDB_Rpeaks.csv", index=False)

print("All ECG data and annotations have been saved successfully.")


Processing data for Participant ID: 118001
[[32767.65535311]
 [32767.65535311]
 [32767.65535311]
 ...
 [  832.01664033]
 [  853.01706034]
 [  903.01806036]]
Processing data for Participant ID: 100002
[[32768.11501187]
 [32768.11501187]
 [32768.11501187]
 ...
 [-1790.00064181]
 [-1777.16449522]
 [-1777.16449522]]
Processing data for Participant ID: 126001
[[213.00426009]
 [199.00398008]
 [197.00394008]
 ...
 [  0.        ]
 [ 15.00030001]
 [ 34.00068001]]
Processing data for Participant ID: 122001
[[32767.65535311]
 [32767.65535311]
 [32767.65535311]
 ...
 [ -301.00602012]
 [ -266.00532011]
 [ -334.00668013]]
Processing data for Participant ID: 123001
[[ 3861.86853577]
 [ 3943.07205744]
 [ 4024.27557911]
 ...
 [-2387.38353705]
 [-2389.94785879]
 [-2393.36695444]]
Processing data for Participant ID: 103001
[[19295.42797366]
 [19077.59522491]
 [18874.5307981 ]
 ...
 [   65.22675528]
 [   76.91834349]
 [  102.14756015]]
Processing data for Participant ID: 113001
[[6016.12032241]
 [6087.121

In [3]:
import math
import numpy as np
import pandas as pd
import neurokit2 as nk

def extract_pqrst_features(ecg_signal, sample_rate):
    cleaned_ecg = nk.ecg_clean(ecg_signal, sampling_rate=sample_rate)
    _, rpeaks = nk.ecg_peaks(cleaned_ecg, sampling_rate=sample_rate)
    _, waves_peak = nk.ecg_delineate(cleaned_ecg, rpeaks, sampling_rate=sample_rate, method="peak")

    # Initialize dictionary to store features
    features = {}

    # Check and calculate amplitude differences where indices are valid
    def calculate_amplitude_differences(peak_indices, reference_peak_indices):
        valid_indices = [i for i in range(len(peak_indices)) if not math.isnan(peak_indices[i]) and not math.isnan(reference_peak_indices[i])]
        amplitudes = np.array([cleaned_ecg[int(peak_indices[i])] - cleaned_ecg[int(reference_peak_indices[i])] for i in valid_indices])
        return np.mean(amplitudes[~np.isnan(amplitudes)]) if amplitudes.size > 0 else np.nan

    # Calculate amplitude differences for P, Q, S, T peaks with respect to R-peaks
    features['P_minus_R_amp_mean'] = calculate_amplitude_differences(waves_peak['ECG_P_Peaks'], rpeaks['ECG_R_Peaks']) / 1000  # convert to mV
    features['Q_minus_R_amp_mean'] = calculate_amplitude_differences(waves_peak['ECG_Q_Peaks'], rpeaks['ECG_R_Peaks']) / 1000
    features['S_minus_R_amp_mean'] = calculate_amplitude_differences(waves_peak['ECG_S_Peaks'], rpeaks['ECG_R_Peaks']) / 1000
    features['T_minus_R_amp_mean'] = calculate_amplitude_differences(waves_peak['ECG_T_Peaks'], rpeaks['ECG_R_Peaks']) / 1000

    # Calculate interval features
    def calculate_intervals(start_peaks, end_peaks):
        valid_indices = [i for i in range(len(start_peaks)) if not math.isnan(start_peaks[i]) and not math.isnan(end_peaks[i])]
        intervals = np.array([(end_peaks[i] - start_peaks[i]) / sample_rate for i in valid_indices])  # convert to seconds
        return np.mean(intervals[~np.isnan(intervals)]) if intervals.size > 0 else np.nan

    features['PQ_interval_mean'] = calculate_intervals(waves_peak['ECG_P_Onsets'], waves_peak['ECG_Q_Peaks'])
    features['QR_interval_mean'] = calculate_intervals(waves_peak['ECG_Q_Peaks'], rpeaks['ECG_R_Peaks'])
    features['RS_interval_mean'] = calculate_intervals(rpeaks['ECG_R_Peaks'], waves_peak['ECG_S_Peaks'])
    features['ST_interval_mean'] = calculate_intervals(waves_peak['ECG_S_Peaks'], waves_peak['ECG_T_Peaks'])

    return features

import math

def process_ecg_data(df_ecg, sample_rate):
    # Determine the number of samples per 20 seconds
    samples_per_minute = sample_rate * 20

    # Prepare to collect all features
    all_features = []
                
    for _, group in df_ecg.groupby('Participant'):
        for i in range(0, len(group), samples_per_minute):
            ecg_segment = group.iloc[i:i+samples_per_minute]
            if len(ecg_segment) == samples_per_minute:
                features = extract_pqrst_features(ecg_segment['ECG'].values, sample_rate)
                features.update({
                    'Participant': group['Participant'].iloc[0],
                    'Sample': i,
                    'Sampling_Rate': sample_rate,
                    'Database': group['Database'].iloc[0],
                    'Gender': group['Gender'].iloc[0],
                    'Age': group['Age'].iloc[0]
                })
                all_features.append(features)

    return pd.DataFrame(all_features)


df_ecg = pd.read_csv("brno_ECGs.csv")

# drop NaNs
df_ecg = df_ecg.dropna()

# Assume the sampling rate needs to be defined
sample_rate = 1000  # Define the correct sample rate for your data

# Process and extract features from the ECG data
# features_df = process_ecg_data(df_ecg, sample_rate)

# Process and save features by participant in df_ecg, save each user's data in a separate CSV file
participants = df_ecg['Participant'].unique()
# Report how many participants are being processed and how many failed
pcount = len(participants)
print(f"Processing data for {pcount} participants.")
failed = 0
for participant in participants:
    print(f"Processing data for Participant: {participant}")
    participant_data = df_ecg[df_ecg['Participant'] == participant]
    try:
        features = process_ecg_data(participant_data, sample_rate)
        # Save the extracted features to a new CSV file
        features.to_csv(f"brno_features/brno_features_{participant}.csv", index=False)
        print(f"Extracted features saved to 'brno_features/brno_features_{participant}.csv'.")
    except Exception as e:
        print(f"Error processing data for Participant: {participant}")
        print(e)
        failed += 1
print(f"Processing completed. {failed} participants failed to process.")
        
        
        


# # Save the extracted features to a new CSV file
# features_df.to_csv("smart_features.csv", index=False)
# print("Extracted features saved to 'smart_features.csv'.")

Processing data for 18 participants.
Processing data for Participant: brno_00
Error processing data for Participant: brno_00
'[20016] not in index'
Processing data for Participant: brno_01
Extracted features saved to 'brno_features/brno_features_brno_01.csv'.
Processing data for Participant: brno_02
Extracted features saved to 'brno_features/brno_features_brno_02.csv'.
Processing data for Participant: brno_03
Error processing data for Participant: brno_03
'[20032] not in index'
Processing data for Participant: brno_04
Extracted features saved to 'brno_features/brno_features_brno_04.csv'.
Processing data for Participant: brno_05
Extracted features saved to 'brno_features/brno_features_brno_05.csv'.
Processing data for Participant: brno_06
Extracted features saved to 'brno_features/brno_features_brno_06.csv'.
Processing data for Participant: brno_07
Extracted features saved to 'brno_features/brno_features_brno_07.csv'.
Processing data for Participant: brno_08
Extracted features saved to 

  warn(


Error processing data for Participant: brno_13
cannot convert float NaN to integer
Processing data for Participant: brno_14
Extracted features saved to 'brno_features/brno_features_brno_14.csv'.
Processing data for Participant: brno_15
Extracted features saved to 'brno_features/brno_features_brno_15.csv'.
Processing data for Participant: brno_16
Extracted features saved to 'brno_features/brno_features_brno_16.csv'.
Processing data for Participant: brno_17
Extracted features saved to 'brno_features/brno_features_brno_17.csv'.
Processing completed. 3 participants failed to process.


In [2]:
import pandas as pd
dp = pd.read_csv("brno-ECGs.csv")

In [3]:
dp.head(20)



Unnamed: 0,ECG,Participant,Sample,Sampling_Rate,Database
0,22.971089,118001,0,100,brno
1,10.970362,118001,1,100,brno
2,34.971816,118001,2,100,brno
3,34.971816,118001,3,100,brno
4,58.003515,118001,4,100,brno
5,58.003515,118001,5,100,brno
6,46.002788,118001,6,100,brno
7,22.971089,118001,7,100,brno
8,22.971089,118001,8,100,brno
9,46.002788,118001,9,100,brno


In [4]:
record

NameError: name 'record' is not defined