In [None]:
import os
import pandas as pd

# Directory that is scanned below for per-participant sub-folders.
base_dir = "NCKU-CBIC-ECG-Database"

# Row accumulators: one dict per output row, converted to DataFrames once
# every participant folder has been read.
combined_ecg_data, combined_rpeaks_data = [], []

def load_csv_data(filepath):
    """Return the first row of a header-less CSV file as a pandas Series.

    Parameters
    ----------
    filepath : str or path-like
        CSV file to read; it is parsed with ``header=None``.

    Returns
    -------
    pandas.Series
        The first row of the file. When the file cannot be read (missing,
        empty, malformed, ...) the error is printed and an empty ``float64``
        Series is returned, so callers can iterate over the result without
        special-casing failures.
    """
    try:
        # The data is expected to live entirely in the first row of the file.
        return pd.read_csv(filepath, header=None).iloc[0]
    except Exception as e:  # deliberate best-effort: report and carry on
        print(f"Failed to load {filepath}: {e}")
        # Explicit dtype: a bare pd.Series() falls back to an implicit object
        # dtype (and emits a warning about it on pandas 1.x).
        return pd.Series(dtype="float64")

def is_valid_folder(folder_name):
    """Tell whether ``folder_name`` begins with one of the prefixes '1_' to '6_'."""
    # str.startswith accepts a tuple and is true as soon as any entry matches.
    return folder_name.startswith(('1_', '2_', '3_', '4_', '5_', '6_'))

# Sampling rate stored in the 'Sampling_Rate' column of every output row.
SAMPLING_RATE = 400

# Column layouts are passed explicitly so the frames keep their columns (and
# the sorts below still work) even when no participant data was found.
ECG_COLUMNS = ['Participant', 'Session', 'Sample', 'ECG', 'Sampling_Rate']
RPEAK_COLUMNS = ['Participant', 'Session', 'Rpeak_Position', 'Label', 'Sampling_Rate']

for participant_folder in os.listdir(base_dir):
    participant_path = os.path.join(base_dir, participant_folder)

    # Skip anything that is not a participant directory *before* announcing it,
    # so the log only mentions folders that are actually read.
    if not is_valid_folder(participant_folder) or not os.path.isdir(participant_path):
        continue

    # Folder names look like "<id>_<suffix>"; the part before the first
    # underscore becomes the participant id.
    participant_id = participant_folder.split('_')[0]
    formatted_participant_id = f"NCKU_{participant_id}"

    print(f"Processing Participant {formatted_participant_id}")

    ecg_file = os.path.join(participant_path, "OUTPUT_ECG_data.csv")
    print(ecg_file)
    peak_label_file = os.path.join(participant_path, "OUTPUT_peak_label.csv")
    peak_position_file = os.path.join(participant_path, "OUTPUT_peak_position.csv")

    # Each loader returns an empty Series on failure, which simply yields no
    # rows for that file below.
    ecg_data = load_csv_data(ecg_file)
    peak_labels = load_csv_data(peak_label_file)
    peak_positions = load_csv_data(peak_position_file)

    # One row per ECG sample; 'Sample' is the sample's index within the file.
    for i, ecg_point in enumerate(ecg_data):
        combined_ecg_data.append({
            'Participant': formatted_participant_id,
            'Session': participant_path,
            'Sample': i,
            'ECG': ecg_point,
            'Sampling_Rate': SAMPLING_RATE
        })

    # zip() stops at the shorter input, so make a length mismatch visible
    # instead of silently dropping the unmatched entries.
    if len(peak_positions) != len(peak_labels):
        print(
            f"Warning: {len(peak_positions)} peak positions but "
            f"{len(peak_labels)} peak labels in {participant_path}; "
            f"unmatched entries are ignored"
        )

    # One row per annotated peak: its position paired with its label.
    for pos, label in zip(peak_positions, peak_labels):
        combined_rpeaks_data.append({
            'Participant': formatted_participant_id,
            'Session': participant_path,
            'Rpeak_Position': pos,
            'Label': label,
            'Sampling_Rate': SAMPLING_RATE
        })

df_ecg = pd.DataFrame(combined_ecg_data, columns=ECG_COLUMNS)
df_rpeaks = pd.DataFrame(combined_rpeaks_data, columns=RPEAK_COLUMNS)

# 'Sample' is part of the ECG sort key so the within-session sample order is
# explicit rather than depending on the stability of a multi-key sort.
df_ecg.sort_values(by=['Participant', 'Session', 'Sample'], inplace=True)
df_rpeaks.sort_values(by=['Participant', 'Session'], inplace=True)

df_ecg.to_csv("NCKU-ECGs.csv", index=False)
df_rpeaks.to_csv("NCKU-Rpeaks.csv", index=False)

print("Data processing complete. Files saved.")

In [None]:
import math
import numpy as np
import pandas as pd
import neurokit2 as nk

def extract_pqrst_features(ecg_signal, sample_rate):
    """Summarise one ECG segment as mean wave amplitudes and intervals.

    The signal is cleaned with ``nk.ecg_clean``, R-peaks are located with
    ``nk.ecg_peaks`` and the remaining waves with
    ``nk.ecg_delineate(method="peak")``. Beats for which either of the two
    indices involved in a feature is NaN are left out of that feature's mean.

    NOTE: a function with this name is defined again in a later cell of this
    notebook; whichever cell ran last provides the active definition.

    Parameters
    ----------
    ecg_signal : array-like
        Raw ECG samples of one segment (handed to ``nk.ecg_clean``).
    sample_rate : int or float
        Sampling rate passed to NeuroKit2 and used to convert sample offsets
        into seconds.

    Returns
    -------
    dict
        ``P/Q/S/T_minus_R_amp_mean``: mean cleaned-signal amplitude at the wave
        peak minus the amplitude at the matching R-peak.
        ``PQ/QR/RS/ST_interval_mean``: mean interval in seconds (the PQ
        interval is measured from the P *onset* to the Q peak).
        A feature is NaN when no beat provides both required indices.
    """
    cleaned_ecg = nk.ecg_clean(ecg_signal, sampling_rate=sample_rate)
    _, rpeaks = nk.ecg_peaks(cleaned_ecg, sampling_rate=sample_rate)
    _, waves_peak = nk.ecg_delineate(cleaned_ecg, rpeaks, sampling_rate=sample_rate, method="peak")
    r_peaks = rpeaks['ECG_R_Peaks']

    def valid_pairs(first, second):
        """Pair per-beat indices, skipping beats where either index is NaN."""
        # The delineation lists are expected to hold one entry per R-peak; if
        # the two sequences ever differ in length, pairing stops at the shorter.
        return [
            (a, b) for a, b in zip(first, second)
            if not (math.isnan(a) or math.isnan(b))
        ]

    def mean_or_nan(values):
        """Mean of the non-NaN values, or NaN when none remain."""
        values = np.asarray(values, dtype=float)
        values = values[~np.isnan(values)]
        # Size is checked *after* the NaN filter so np.mean is never called on
        # an empty slice (which would emit a RuntimeWarning).
        return np.mean(values) if values.size > 0 else np.nan

    def calculate_amplitude_differences(peak_indices, reference_peak_indices):
        """Mean of cleaned_ecg[peak] - cleaned_ecg[reference] over valid beats."""
        return mean_or_nan([
            cleaned_ecg[int(peak)] - cleaned_ecg[int(reference)]
            for peak, reference in valid_pairs(peak_indices, reference_peak_indices)
        ])

    def calculate_intervals(start_peaks, end_peaks):
        """Mean of (end - start) / sample_rate, i.e. seconds, over valid beats."""
        return mean_or_nan([
            (end - start) / sample_rate
            for start, end in valid_pairs(start_peaks, end_peaks)
        ])

    # Insertion order is kept so the resulting feature columns stay in the
    # same order as before.
    features = {}

    # Amplitude of the P, Q, S and T peaks relative to the R-peak.
    features['P_minus_R_amp_mean'] = calculate_amplitude_differences(waves_peak['ECG_P_Peaks'], r_peaks)
    features['Q_minus_R_amp_mean'] = calculate_amplitude_differences(waves_peak['ECG_Q_Peaks'], r_peaks)
    features['S_minus_R_amp_mean'] = calculate_amplitude_differences(waves_peak['ECG_S_Peaks'], r_peaks)
    features['T_minus_R_amp_mean'] = calculate_amplitude_differences(waves_peak['ECG_T_Peaks'], r_peaks)

    # Time between consecutive waves of the same beat.
    features['PQ_interval_mean'] = calculate_intervals(waves_peak['ECG_P_Onsets'], waves_peak['ECG_Q_Peaks'])
    features['QR_interval_mean'] = calculate_intervals(waves_peak['ECG_Q_Peaks'], r_peaks)
    features['RS_interval_mean'] = calculate_intervals(r_peaks, waves_peak['ECG_S_Peaks'])
    features['ST_interval_mean'] = calculate_intervals(waves_peak['ECG_S_Peaks'], waves_peak['ECG_T_Peaks'])

    return features

import math

def process_ecg_data(df_ecg, sample_rate, segment_seconds=30):
    """Extract PQRST features from consecutive fixed-length ECG windows.

    For every participant in ``df_ecg`` the 'ECG' column is cut into
    back-to-back windows of ``segment_seconds`` seconds and each *complete*
    window is passed to ``extract_pqrst_features``. A trailing partial window
    is discarded.

    NOTE(review): windows are taken over all rows of a participant in row
    order, so a window may span two sessions if a participant has more than
    one - confirm this is intended.

    Parameters
    ----------
    df_ecg : pandas.DataFrame
        Must contain the columns 'Participant' and 'ECG' (one sample per row).
    sample_rate : int or float
        Samples per second.
    segment_seconds : int or float, default 30
        Window length in seconds. The default reproduces the original
        half-minute windows.

    Returns
    -------
    pandas.DataFrame
        One row per complete window with the extracted features plus
        'Participant' and 'Half Minute'. 'Half Minute' is the zero-based
        window index; the column name is kept for compatibility even when
        ``segment_seconds`` is not 30. The frame is empty if no participant
        has a complete window.

    Raises
    ------
    ValueError
        If ``sample_rate * segment_seconds`` is less than one sample.
    """
    # Half-minute windows by default (the old name "samples_per_minute" was
    # misleading: the factor has always been 30 seconds).
    samples_per_segment = int(sample_rate * segment_seconds)
    if samples_per_segment <= 0:
        raise ValueError("sample_rate * segment_seconds must be at least one sample")

    all_features = []

    for participant in df_ecg['Participant'].unique():
        ecg_values = df_ecg.loc[df_ecg['Participant'] == participant, 'ECG'].values

        # Only complete windows are processed; integer division drops the
        # trailing partial one.
        for segment_index in range(len(ecg_values) // samples_per_segment):
            start = segment_index * samples_per_segment
            ecg_segment = ecg_values[start:start + samples_per_segment]

            features = extract_pqrst_features(ecg_segment, sample_rate)
            features.update({'Participant': participant, 'Half Minute': segment_index})
            all_features.append(features)

    return pd.DataFrame(all_features)


import os

# Load the per-sample ECG table (one row per sample, see the 'Participant'
# and 'ECG' columns used below).
df_ecg = pd.read_csv("NCKU-ECGs.csv")

# Drop rows that contain a NaN in any column.
df_ecg = df_ecg.dropna()

# Sampling rate of the recordings, forwarded to the feature extraction.
sample_rate = 400

# The per-participant CSVs are written into this folder; create it up front,
# otherwise every to_csv() call fails and is counted as a participant failure.
os.makedirs("NCKU_features", exist_ok=True)

# Process each participant separately so one failure does not abort the rest.
participants = df_ecg['Participant'].unique()
pcount = len(participants)
print(f"Processing data for {pcount} participants.")
failed = 0
for participant in participants:
    print(f"Processing data for Participant: {participant}")
    participant_data = df_ecg[df_ecg['Participant'] == participant]
    try:
        features = process_ecg_data(participant_data, sample_rate)
        # One feature file per participant.
        features.to_csv(f"NCKU_features/NCKU_features_{participant}.csv", index=False)
        print(f"Extracted features saved to 'NCKU_features/NCKU_features_{participant}.csv'.")
    except Exception as e:
        # Best-effort batch run: report the error, count it, move on.
        print(f"Error processing data for Participant: {participant}")
        print(e)
        failed += 1
print(f"Processing completed. {failed} participants failed to process.")

In [12]:
import math
import numpy as np
import pandas as pd
import neurokit2 as nk

def extract_pqrst_features(ecg_signal, sample_rate):
    """Summarise one ECG segment as mean wave amplitudes and intervals.

    The signal is cleaned with ``nk.ecg_clean``, R-peaks are located with
    ``nk.ecg_peaks`` and the remaining waves with
    ``nk.ecg_delineate(method="peak")``. Beats for which either of the two
    indices involved in a feature is NaN are left out of that feature's mean.

    NOTE: a function with this name is also defined in an earlier cell of this
    notebook; running this cell replaces that definition.

    Parameters
    ----------
    ecg_signal : array-like
        Raw ECG samples of one segment (handed to ``nk.ecg_clean``).
    sample_rate : int or float
        Sampling rate passed to NeuroKit2 and used to convert sample offsets
        into seconds.

    Returns
    -------
    dict
        ``P/Q/S/T_minus_R_amp_mean``: mean cleaned-signal amplitude at the wave
        peak minus the amplitude at the matching R-peak.
        ``PQ/QR/RS/ST_interval_mean``: mean interval in seconds (the PQ
        interval is measured from the P *onset* to the Q peak).
        A feature is NaN when no beat provides both required indices.
    """
    cleaned_ecg = nk.ecg_clean(ecg_signal, sampling_rate=sample_rate)
    _, rpeaks = nk.ecg_peaks(cleaned_ecg, sampling_rate=sample_rate)
    _, waves_peak = nk.ecg_delineate(cleaned_ecg, rpeaks, sampling_rate=sample_rate, method="peak")
    r_peaks = rpeaks['ECG_R_Peaks']

    def valid_pairs(first, second):
        """Pair per-beat indices, skipping beats where either index is NaN."""
        # The delineation lists are expected to hold one entry per R-peak; if
        # the two sequences ever differ in length, pairing stops at the shorter.
        return [
            (a, b) for a, b in zip(first, second)
            if not (math.isnan(a) or math.isnan(b))
        ]

    def mean_or_nan(values):
        """Mean of the non-NaN values, or NaN when none remain."""
        values = np.asarray(values, dtype=float)
        values = values[~np.isnan(values)]
        # Size is checked *after* the NaN filter so np.mean is never called on
        # an empty slice (which would emit a RuntimeWarning).
        return np.mean(values) if values.size > 0 else np.nan

    def calculate_amplitude_differences(peak_indices, reference_peak_indices):
        """Mean of cleaned_ecg[peak] - cleaned_ecg[reference] over valid beats."""
        return mean_or_nan([
            cleaned_ecg[int(peak)] - cleaned_ecg[int(reference)]
            for peak, reference in valid_pairs(peak_indices, reference_peak_indices)
        ])

    def calculate_intervals(start_peaks, end_peaks):
        """Mean of (end - start) / sample_rate, i.e. seconds, over valid beats."""
        return mean_or_nan([
            (end - start) / sample_rate
            for start, end in valid_pairs(start_peaks, end_peaks)
        ])

    # Insertion order is kept so the resulting feature columns stay in the
    # same order as before.
    features = {}

    # Amplitude of the P, Q, S and T peaks relative to the R-peak.
    features['P_minus_R_amp_mean'] = calculate_amplitude_differences(waves_peak['ECG_P_Peaks'], r_peaks)
    features['Q_minus_R_amp_mean'] = calculate_amplitude_differences(waves_peak['ECG_Q_Peaks'], r_peaks)
    features['S_minus_R_amp_mean'] = calculate_amplitude_differences(waves_peak['ECG_S_Peaks'], r_peaks)
    features['T_minus_R_amp_mean'] = calculate_amplitude_differences(waves_peak['ECG_T_Peaks'], r_peaks)

    # Time between consecutive waves of the same beat.
    features['PQ_interval_mean'] = calculate_intervals(waves_peak['ECG_P_Onsets'], waves_peak['ECG_Q_Peaks'])
    features['QR_interval_mean'] = calculate_intervals(waves_peak['ECG_Q_Peaks'], r_peaks)
    features['RS_interval_mean'] = calculate_intervals(r_peaks, waves_peak['ECG_S_Peaks'])
    features['ST_interval_mean'] = calculate_intervals(waves_peak['ECG_S_Peaks'], waves_peak['ECG_T_Peaks'])

    return features

import math
def clean_ecg_data(participant_data, sample_rate):
    """Return a copy of ``participant_data`` with a per-row 'Cleaned_ECG' column.

    Each entry of the 'ECG' column is treated as one whole signal:

    * a string is parsed as comma-separated floats into a NumPy array;
    * any other value is kept as it is.

    'Cleaned_ECG' then holds ``nk.ecg_clean(entry)`` for entries that are
    NumPy arrays longer than ``sample_rate / 2`` samples, and NaN for
    everything else. In particular, a frame with one scalar sample per row
    gets an all-NaN 'Cleaned_ECG' column.

    Parameters
    ----------
    participant_data : pandas.DataFrame
        Frame with an 'ECG' column. It is NOT modified; the work happens on a
        copy, which avoids writing into a slice of a larger frame
        (SettingWithCopyWarning).
    sample_rate : int or float
        Sampling rate passed to ``nk.ecg_clean``; also sets the minimum
        signal length.

    Returns
    -------
    pandas.DataFrame
        Copy of the input with the converted 'ECG' column and the added
        'Cleaned_ECG' column.
    """
    def to_float_array(ecg):
        """Parse a comma-separated string into a float array; pass others through."""
        if isinstance(ecg, str):
            try:
                return np.fromstring(ecg, sep=',')
            except Exception:  # was a bare except, which also caught KeyboardInterrupt
                return np.nan  # conversion failed
        return ecg  # already in its final form

    def clean_if_possible(ecg):
        """Clean array entries that are long enough; everything else becomes NaN."""
        if isinstance(ecg, np.ndarray) and len(ecg) > sample_rate / 2:
            return nk.ecg_clean(ecg, sampling_rate=sample_rate)
        return np.nan  # too short, or not an np.ndarray

    cleaned = participant_data.copy()
    cleaned['ECG'] = cleaned['ECG'].apply(to_float_array)
    cleaned['Cleaned_ECG'] = cleaned['ECG'].apply(clean_if_possible)
    return cleaned


def process_ecg_data(df_ecg, sample_rate, segment_seconds=30):
    """Extract PQRST features from consecutive fixed-length ECG windows.

    For every participant in ``df_ecg`` the rows are run through
    ``clean_ecg_data`` and the 'ECG' column is then cut into back-to-back
    windows of ``segment_seconds`` seconds. Each *complete* window is passed to
    ``extract_pqrst_features``; a trailing partial window is discarded.

    The windows are taken from the 'ECG' column, not from the 'Cleaned_ECG'
    column added by ``clean_ecg_data``.

    NOTE(review): windows are taken over all rows of a participant in row
    order, so a window may span two sessions if a participant has more than
    one - confirm this is intended.

    Parameters
    ----------
    df_ecg : pandas.DataFrame
        Must contain the columns 'Participant' and 'ECG'.
    sample_rate : int or float
        Samples per second.
    segment_seconds : int or float, default 30
        Window length in seconds. The default reproduces the original
        half-minute windows.

    Returns
    -------
    pandas.DataFrame
        One row per complete window with the extracted features plus
        'Participant' and 'Half Minute'. 'Half Minute' is the zero-based
        window index; the column name is kept for compatibility even when
        ``segment_seconds`` is not 30.

    Raises
    ------
    ValueError
        If ``sample_rate * segment_seconds`` is less than one sample.
    """
    # Half-minute windows by default (the old name "samples_per_minute" was
    # misleading: the factor has always been 30 seconds).
    samples_per_segment = int(sample_rate * segment_seconds)
    if samples_per_segment <= 0:
        raise ValueError("sample_rate * segment_seconds must be at least one sample")

    all_features = []

    for participant in df_ecg['Participant'].unique():
        # .copy() gives clean_ecg_data an independent frame, so any column
        # assignment it performs never targets a slice of df_ecg.
        participant_data = df_ecg[df_ecg['Participant'] == participant].copy()

        # Preview of the rows before and after the cleaning step.
        print(participant_data.head())

        participant_data = clean_ecg_data(participant_data, sample_rate)

        print(participant_data.head())

        ecg_values = participant_data['ECG'].values

        # Only complete windows are processed; integer division drops the
        # trailing partial one.
        for segment_index in range(len(ecg_values) // samples_per_segment):
            start = segment_index * samples_per_segment
            ecg_segment = ecg_values[start:start + samples_per_segment]

            features = extract_pqrst_features(ecg_segment, sample_rate)
            features.update({'Participant': participant, 'Half Minute': segment_index})
            all_features.append(features)

    return pd.DataFrame(all_features)


import os

# Load the per-sample ECG table (one row per sample, see the 'Participant'
# and 'ECG' columns used below).
df_ecg = pd.read_csv("NCKU-ECGs.csv")

# Drop rows that contain a NaN in any column.
df_ecg = df_ecg.dropna()

# Sampling rate of the recordings, forwarded to the feature extraction.
sample_rate = 400

# The per-participant CSVs are written into this folder; create it up front,
# otherwise every to_csv() call fails and is counted as a participant failure.
os.makedirs("NCKU_features", exist_ok=True)

# Process each participant separately so one failure does not abort the rest.
participants = df_ecg['Participant'].unique()
pcount = len(participants)
print(f"Processing data for {pcount} participants.")
failed = 0
for participant in participants:
    print(f"Processing data for Participant: {participant}")
    participant_data = df_ecg[df_ecg['Participant'] == participant]
    try:
        features = process_ecg_data(participant_data, sample_rate)
        # One feature file per participant.
        features.to_csv(f"NCKU_features/NCKU_features_{participant}.csv", index=False)
        print(f"Extracted features saved to 'NCKU_features/NCKU_features_{participant}.csv'.")
    except Exception as e:
        # Best-effort batch run: report the error, count it, move on.
        # (Re-raise here instead when debugging a single participant.)
        print(f"Error processing data for Participant: {participant}")
        print(e)
        failed += 1
print(f"Processing completed. {failed} participants failed to process.")

Processing data for 6 participants.
Processing data for Participant: NCKU_1
  Participant                        Session  Sample       ECG  Sampling_Rate
0      NCKU_1  NCKU-CBIC-ECG-Database/1_0100       0 -0.004349            400
1      NCKU_1  NCKU-CBIC-ECG-Database/1_0100       1 -0.010554            400
2      NCKU_1  NCKU-CBIC-ECG-Database/1_0100       2 -0.012562            400
3      NCKU_1  NCKU-CBIC-ECG-Database/1_0100       3 -0.012562            400
4      NCKU_1  NCKU-CBIC-ECG-Database/1_0100       4 -0.012562            400
  Participant                        Session  Sample       ECG  Sampling_Rate  \
0      NCKU_1  NCKU-CBIC-ECG-Database/1_0100       0 -0.004349            400   
1      NCKU_1  NCKU-CBIC-ECG-Database/1_0100       1 -0.010554            400   
2      NCKU_1  NCKU-CBIC-ECG-Database/1_0100       2 -0.012562            400   
3      NCKU_1  NCKU-CBIC-ECG-Database/1_0100       3 -0.012562            400   
4      NCKU_1  NCKU-CBIC-ECG-Database/1_0100       