### PPG_DALIA_HR

In [None]:
import pickle
import numpy as np
import os
import glob

# --- 1. Configuration ---

# Directory where the input data is located
# NOTE(review): no value is assigned in this copy of the script; a path must be
# filled in before the cell can run.
data_dir = 
# Output directory for the processed .npy files
# NOTE(review): value is likewise missing and must be filled in.
output_dir = 
os.makedirs(output_dir, exist_ok=True)

# File matching pattern to find subject data: <data_dir>/S*/S*.pkl
file_pattern = os.path.join(data_dir, 'S*', 'S*.pkl')

# Dataset Specifications
FS_PPG = 64          # Sampling frequency of the PPG signal
WINDOW_SECONDS = 8   # Duration of each data window in seconds
SHIFT_SECONDS = 2    # Step between the starts of consecutive windows in seconds (the stride, not the overlap)

# Split ratios and random seed for reproducibility.
# The test share is whatever remains after the train and validation subjects.
TRAIN_RATIO = 0.8
VAL_RATIO = 0.1
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

# --- 2. Find all data files ---
# Sorted so the processing order (and therefore the seeded shuffle below) is
# the same on every run.
file_paths = sorted(glob.glob(file_pattern))

if not file_paths:
    raise FileNotFoundError(f"Error: No matching files found in '{data_dir}'. Pattern: '{file_pattern}'")

print(f"Found {len(file_paths)} subject files to process:")
for path in file_paths:
    print(f"- {path}")
print("-" * 40)

# --- 3. Process data and group by subject ID ---
# Maps subject ID -> {'features': 2-D array of windows, 'labels': 1-D array}.
data_by_subject = {}

for file_path in file_paths:
    # Extract subject ID from the file path (e.g., 'S5' from '/path/to/S5/S5.pkl')
    subject_id = os.path.basename(os.path.dirname(file_path))
    print(f"Processing subject: {subject_id}...")

    try:
        # NOTE(review): pickle.load can execute arbitrary code contained in the
        # file; only run this on data from a trusted source.
        with open(file_path, 'rb') as file:
            data = pickle.load(file, encoding='latin1')

        # Extract PPG signal and corresponding heart rate labels
        ppg_signal = data['signal']['wrist']['BVP'].ravel()
        labels = data['label'].ravel()

        # Calculate window and shift lengths in samples
        win_len_samples = int(WINDOW_SECONDS * FS_PPG)
        win_shift_samples = int(SHIFT_SECONDS * FS_PPG)

        current_subject_features = []
        current_subject_labels = []

        # Create sliding windows: label i is paired with the window that starts
        # at sample i * win_shift_samples.
        # NOTE(review): assumes data['label'] holds exactly one value per
        # window with this window/shift configuration - confirm against the
        # dataset documentation.
        num_windows = len(labels)
        for i in range(num_windows):
            start_idx = i * win_shift_samples
            end_idx = start_idx + win_len_samples

            # Ensure the window does not exceed the signal bounds; any
            # remaining labels are dropped.
            if end_idx > len(ppg_signal):
                break

            feature_segment = ppg_signal[start_idx:end_idx]
            label_value = labels[i]

            current_subject_features.append(feature_segment)
            current_subject_labels.append(label_value)

        # If data was successfully extracted, store it in the dictionary
        if current_subject_features:
            data_by_subject[subject_id] = {
                'features': np.array(current_subject_features),
                'labels': np.array(current_subject_labels)
            }
            print(f"  -> Successfully extracted {len(current_subject_features)} samples.")
        else:
            print(f"  -> Failed to extract any samples from {subject_id}.")

    # Best effort: any failure for one subject is reported and that subject is
    # left out of data_by_subject; processing continues with the next file.
    except Exception as e:
        print(f"  -> Error processing file '{file_path}': {e}")

print("-" * 40)
print("All files preprocessed. Data is now grouped by subject ID.")

# --- 4. Perform subject-based data splitting ---

# Get all unique subject IDs and shuffle them for random splitting
# (uses the global NumPy RNG seeded above).
subject_ids = list(data_by_subject.keys())
np.random.shuffle(subject_ids)
num_subjects = len(subject_ids)

if num_subjects < 3:
    raise ValueError("Number of subjects is less than 3, cannot split into train, validation, and test sets.")

# Split subject IDs based on the defined ratios.
# NOTE(review): int() truncates, so with VAL_RATIO = 0.1 fewer than 10 subjects
# gives int(num_subjects * VAL_RATIO) == 0 and an empty validation set.
train_split_idx = int(num_subjects * TRAIN_RATIO)
val_split_idx = train_split_idx + int(num_subjects * VAL_RATIO)

train_subject_ids = subject_ids[:train_split_idx]
val_subject_ids = subject_ids[train_split_idx:val_split_idx]
test_subject_ids = subject_ids[val_split_idx:]

print("Subject ID Split Results:")
print(f"Training set subjects ({len(train_subject_ids)}): {train_subject_ids}")
print(f"Validation set subjects ({len(val_subject_ids)}): {val_subject_ids}")
print(f"Test set subjects ({len(test_subject_ids)}): {test_subject_ids}")
print("-" * 40)

# --- 5. Combine data into final sets based on the split IDs ---

def combine_data_from_ids(ids_list, data_source):
    """
    Stack the arrays of the requested subjects into one feature/label pair.

    Args:
        ids_list: Subject IDs to include, in the order they should be stacked.
        data_source: Mapping of subject ID to a dict with a 2-D 'features'
            array and a 1-D 'labels' array.

    Returns:
        A (features, labels) tuple. When ids_list is empty, both arrays have
        zero rows; the feature width is taken from any entry of data_source,
        or from WINDOW_SECONDS * FS_PPG when data_source is empty too.
    """
    if ids_list:
        stacked_features = np.concatenate(
            [data_source[subject]['features'] for subject in ids_list], axis=0
        )
        stacked_labels = np.concatenate(
            [data_source[subject]['labels'] for subject in ids_list], axis=0
        )
        return stacked_features, stacked_labels

    # Nothing requested: build empty arrays whose second dimension still
    # matches the real data so downstream shape checks keep working.
    if data_source:
        any_entry = data_source[next(iter(data_source))]
        width = any_entry['features'].shape[1]
    else:
        width = WINDOW_SECONDS * FS_PPG

    return np.empty((0, width)), np.empty((0,))

# Create the final training, validation, and test sets
train_features, train_labels = combine_data_from_ids(train_subject_ids, data_by_subject)
val_features, val_labels = combine_data_from_ids(val_subject_ids, data_by_subject)
test_features, test_labels = combine_data_from_ids(test_subject_ids, data_by_subject)

# --- 6. Print final shapes and perform validation ---
print("Final Dataset Shapes (by sample count):")
print(f"Training set (Train)  -> Features: {train_features.shape}, Labels: {train_labels.shape}")
print(f"Validation set (Val)  -> Features: {val_features.shape}, Labels: {val_labels.shape}")
print(f"Test set (Test)       -> Features: {test_features.shape}, Labels: {test_labels.shape}")

# Verify that the total number of samples matches before and after the split.
# Raised explicitly (rather than via `assert`) so the check is not stripped
# when Python runs with -O; the exception type is unchanged.
total_samples_in_dict = sum(len(d['labels']) for d in data_by_subject.values())
total_samples_after_split = len(train_features) + len(val_features) + len(test_features)
if total_samples_in_dict != total_samples_after_split:
    raise AssertionError("Total number of samples does not match after splitting!")
print(f"\nSample count validation successful: {total_samples_after_split}")
print("-" * 40)

# --- 7. Save the processed data to .npy files ---
print(f"Saving files to directory: '{output_dir}'")

# Writes <split>_features.npy and <split>_labels.npy for each split.
for split_name, split_features, split_labels in (
    ('train', train_features, train_labels),
    ('val', val_features, val_labels),
    ('test', test_features, test_labels),
):
    np.save(os.path.join(output_dir, f'{split_name}_features.npy'), split_features)
    np.save(os.path.join(output_dir, f'{split_name}_labels.npy'), split_labels)

print("\nAll files saved successfully!")

### PPG_DALIA_RR

In [None]:
import os
import pickle
import numpy as np
from scipy.signal import butter, filtfilt, welch
from tqdm import tqdm

# --- 1. Configuration Parameters ---
# The PPG and respiration signals are sampled at different rates, so every
# window length and step is defined in seconds and converted per signal.
PPG_SAMPLING_RATE = 64
RESP_SAMPLING_RATE = 700
PPG_SEGMENT_LENGTH = 1250  # Samples per PPG window; 1250 / 64 = 19.53125 seconds
SEGMENT_DURATION_S = PPG_SEGMENT_LENGTH / PPG_SAMPLING_RATE
# int() truncates 19.53125 * 700 = 13671.875 down to 13671 samples.
RESP_SEGMENT_LENGTH = int(SEGMENT_DURATION_S * RESP_SAMPLING_RATE)
STEP_DURATION_S = 1.0  # Sliding window step
PPG_STEP_SIZE = int(STEP_DURATION_S * PPG_SAMPLING_RATE)
RESP_STEP_SIZE = int(STEP_DURATION_S * RESP_SAMPLING_RATE)
# Frequency band searched for the respiration peak.
RESP_RATE_MIN_HZ = 0.1  # 6 breaths per minute
RESP_RATE_MAX_HZ = 0.5  # 30 breaths per minute

# --- 2. Respiration Rate Estimation Function ---
def estimate_respiration_rate(resp_segment, fs, min_hz=None, max_hz=None):
    """
    Estimate the dominant respiration frequency of a signal segment.

    The segment is band-pass filtered (0.08-2.0 Hz, 2nd-order Butterworth
    design applied forward and backward with filtfilt) and its power spectral
    density is computed with Welch's method using one Hamming-windowed segment
    spanning the whole input. The frequency with the highest power inside
    [min_hz, max_hz] is returned.

    Args:
        resp_segment: 1-D sequence of respiration samples.
        fs: Sampling frequency of resp_segment in Hz.
        min_hz: Lower bound of the search band in Hz. Defaults to the
            module-level RESP_RATE_MIN_HZ.
        max_hz: Upper bound of the search band in Hz. Defaults to the
            module-level RESP_RATE_MAX_HZ.

    Returns:
        The peak frequency in Hz (multiply by 60 for breaths per minute), or
        np.nan when no estimate can be made: empty or non-finite input, a
        segment too short to filter, no spectral bin inside the band, or no
        positive power inside the band.
    """
    if min_hz is None:
        min_hz = RESP_RATE_MIN_HZ
    if max_hz is None:
        max_hz = RESP_RATE_MAX_HZ

    segment = np.asarray(resp_segment, dtype=float)

    # filtfilt and welch do not reject NaN/inf; they propagate them, and
    # argmax over an all-NaN spectrum returns index 0, which would silently
    # produce the lowest in-band frequency as a label.
    if segment.size == 0 or not np.all(np.isfinite(segment)):
        return np.nan

    lowcut, highcut = 0.08, 2.0
    nyquist = 0.5 * fs

    try:
        b, a = butter(2, [lowcut / nyquist, highcut / nyquist], btype='band')
        filtered_segment = filtfilt(b, a, segment)
    except ValueError:
        # Raised when the segment is shorter than filtfilt's edge padding or
        # when the normalized cut-offs are not valid for this fs.
        return np.nan

    freqs, psd = welch(filtered_segment, fs, nperseg=len(filtered_segment), window='hamming')

    # Restrict the search to the valid respiration band.
    in_band = (freqs >= min_hz) & (freqs <= max_hz)
    if not in_band.any():
        return np.nan

    band_freqs = freqs[in_band]
    band_psd = psd[in_band]
    peak_index = int(np.argmax(band_psd))

    # A flat segment has no spectral peak; argmax would just pick the first
    # bin. The comparison is also False for NaN power.
    if not band_psd[peak_index] > 0:
        return np.nan

    return band_freqs[peak_index]

# --- 3. Data Processing Function ---
def process_subject_file(file_path):
    """
    Turn one subject's .pkl file into PPG windows with respiration labels.

    The wrist BVP signal and the chest respiration signal are cut into
    time-aligned sliding windows. Each respiration window is passed to
    estimate_respiration_rate; windows for which it returns NaN are dropped
    together with their PPG counterpart.

    Args:
        file_path: Path to the subject's pickle file. The name of its parent
            directory is used as the subject ID in the progress bar.

    Returns:
        (features, labels) as NumPy arrays, or (None, None) when the file
        cannot be loaded, a signal is shorter than one window, or no window
        yields a valid label.
    """
    subject_id = os.path.basename(os.path.dirname(file_path))

    try:
        with open(file_path, 'rb') as handle:
            record = pickle.load(handle, encoding='latin1')
        bvp_signal = record['signal']['wrist']['BVP'].flatten()
        resp_signal = record['signal']['chest']['Resp'].flatten()
    except (FileNotFoundError, KeyError, EOFError):
        return None, None

    # Largest valid start index of a full window in each signal.
    last_ppg_start = len(bvp_signal) - PPG_SEGMENT_LENGTH
    last_resp_start = len(resp_signal) - RESP_SEGMENT_LENGTH
    if last_ppg_start < 0 or last_resp_start < 0:
        return None, None

    # The shorter of the two signals limits how many aligned windows exist.
    window_count = 1 + min(
        last_ppg_start // PPG_STEP_SIZE,
        last_resp_start // RESP_STEP_SIZE,
    )

    kept_segments = []
    kept_rates = []

    for window_no in tqdm(range(window_count), desc=f"Processing {subject_id}", leave=False, ncols=100):
        ppg_lo = window_no * PPG_STEP_SIZE
        resp_lo = window_no * RESP_STEP_SIZE

        rate = estimate_respiration_rate(
            resp_signal[resp_lo:resp_lo + RESP_SEGMENT_LENGTH],
            RESP_SAMPLING_RATE,
        )
        if np.isnan(rate):
            continue

        kept_segments.append(bvp_signal[ppg_lo:ppg_lo + PPG_SEGMENT_LENGTH])
        kept_rates.append(rate)

    if kept_segments:
        return np.array(kept_segments), np.array(kept_rates)
    return None, None

# --- 4. Main Execution Block ---
if __name__ == "__main__":
    # Script entry point: processes S1..S{NUM_SUBJECTS}, splits the subjects
    # (not the individual windows) into train/validation/test, and writes six
    # .npy files.

    # --- Configuration ---
    # NOTE(review): BASE_DIR and OUTPUT_DIR have no value in this copy of the
    # script; both paths must be filled in before it can run.
    BASE_DIR = 
    OUTPUT_DIR = 
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    NUM_SUBJECTS = 15
    # The test share is whatever remains after train and validation.
    TRAIN_RATIO = 0.8
    VAL_RATIO = 0.1
    RANDOM_SEED = 42
    np.random.seed(RANDOM_SEED)

    # --- Step 1: Process data and store it grouped by subject ---
    # Maps subject ID -> {'features': array, 'labels': array}.
    data_by_subject = {}

    print("-" * 50)
    print(f"Starting batch processing for {NUM_SUBJECTS} subjects...")
    print("-" * 50)

    for i in range(1, NUM_SUBJECTS + 1):
        subject_id = f'S{i}'
        # Expected layout: <BASE_DIR>/S<i>/S<i>.pkl
        file_path = os.path.join(BASE_DIR, subject_id, f'{subject_id}.pkl')

        subject_features, subject_labels = process_subject_file(file_path)

        # process_subject_file returns (None, None) for unreadable or
        # unusable subjects; those are simply left out.
        if subject_features is not None and subject_labels is not None:
            data_by_subject[subject_id] = {
                'features': subject_features,
                'labels': subject_labels
            }
            print(f"-> Collected {len(subject_features)} samples from {subject_id}.")
        else:
            print(f"-> No valid data collected from {subject_id}.")

    # --- Step 2: Split data based on subject IDs ---
    print("\n" + "-" * 50)
    print("All subjects processed. Splitting data by subject ID...")
    print("-" * 50)

    # Shuffled with the global NumPy RNG seeded above, so the split is
    # reproducible for the same set of subjects.
    subject_ids = list(data_by_subject.keys())
    np.random.shuffle(subject_ids)
    num_subjects_found = len(subject_ids)

    if num_subjects_found < 3:
        raise ValueError(f"Found only {num_subjects_found} subjects with valid data. Cannot split into train/val/test sets.")

    # NOTE(review): int() truncates, so with VAL_RATIO = 0.1 fewer than 10
    # usable subjects yields an empty validation set.
    train_split_idx = int(num_subjects_found * TRAIN_RATIO)
    val_split_idx = train_split_idx + int(num_subjects_found * VAL_RATIO)

    train_subject_ids = subject_ids[:train_split_idx]
    val_subject_ids = subject_ids[train_split_idx:val_split_idx]
    test_subject_ids = subject_ids[val_split_idx:]

    print("Subject ID split results:")
    print(f"Train subjects ({len(train_subject_ids)}): {train_subject_ids}")
    print(f"Validation subjects ({len(val_subject_ids)}): {val_subject_ids}")
    print(f"Test subjects ({len(test_subject_ids)}): {test_subject_ids}")

    # --- Step 3: Combine data into sets based on the split IDs ---
    def combine_data_from_ids(ids_list, data_source):
        """
        Concatenate the 'features' and 'labels' arrays of the given subject IDs.

        Returns a (features, labels) tuple. For an empty ids_list both arrays
        have zero rows; the feature width is read from an entry of
        data_source, or falls back to PPG_SEGMENT_LENGTH when data_source is
        empty.
        """
        if not ids_list:
            # If the ID list is empty, return correctly shaped empty arrays.
            if not data_source:
                return np.empty((0, PPG_SEGMENT_LENGTH)), np.empty((0,))
            sample_key = next(iter(data_source))
            feature_dim = data_source[sample_key]['features'].shape[1]
            return np.empty((0, feature_dim)), np.empty((0,))

        features_list = [data_source[pid]['features'] for pid in ids_list]
        labels_list = [data_source[pid]['labels'] for pid in ids_list]
        return np.concatenate(features_list, axis=0), np.concatenate(labels_list, axis=0)

    train_features, train_labels = combine_data_from_ids(train_subject_ids, data_by_subject)
    val_features, val_labels = combine_data_from_ids(val_subject_ids, data_by_subject)
    test_features, test_labels = combine_data_from_ids(test_subject_ids, data_by_subject)

    # --- Step 4: Validate and save the final datasets ---
    print("\n--- Final Aggregated Datasets ---")
    print(f"Train Set      -> Features: {train_features.shape}, Labels: {train_labels.shape}")
    print(f"Validation Set -> Features: {val_features.shape}, Labels: {val_labels.shape}")
    print(f"Test Set       -> Features: {test_features.shape}, Labels: {test_labels.shape}")

    # Verify total sample count: no window may be lost or duplicated by the split.
    # NOTE(review): `assert` is skipped when Python runs with -O.
    total_samples = sum(len(d['labels']) for d in data_by_subject.values())
    split_samples = len(train_labels) + len(val_labels) + len(test_labels)
    assert total_samples == split_samples, "Sample count mismatch after split!"
    print(f"\nTotal sample count verified: {total_samples}")

    print("\n" + "-" * 50)
    print(f"Saving final files to '{OUTPUT_DIR}'...")

    np.save(os.path.join(OUTPUT_DIR, 'train_features.npy'), train_features)
    np.save(os.path.join(OUTPUT_DIR, 'train_labels.npy'), train_labels)
    np.save(os.path.join(OUTPUT_DIR, 'val_features.npy'), val_features)
    np.save(os.path.join(OUTPUT_DIR, 'val_labels.npy'), val_labels)
    np.save(os.path.join(OUTPUT_DIR, 'test_features.npy'), test_features)
    np.save(os.path.join(OUTPUT_DIR, 'test_labels.npy'), test_labels)

    print("All files saved successfully.")
    print("-" * 50)

### BIDMC_RR

In [None]:
import pandas as pd
import numpy as np
import os

# --- 1. Configuration ---
# NOTE(review): DATASET_PATH and OUTPUT_PATH have no value in this copy of the
# script; the CSV input directory and the output directory must be filled in.
DATASET_PATH = 
OUTPUT_PATH = 
NUM_SUBJECTS = 53    # Subjects are addressed as 1..NUM_SUBJECTS
SAMPLING_RATE = 125  # Samples per second; converts the window/stride seconds to sample counts
# Column of the *_Breaths.csv file used as breath annotations (after
# stripping whitespace from the header names).
ANNOTATOR_COLUMN = 'breaths ann1 [signal sample no]'

# Windowing parameters
WINDOW_SEC = 10  # Window length in seconds
STRIDE_SEC = 2   # Step between the starts of consecutive windows in seconds

# Splitting parameters (the test share is whatever remains)
TRAIN_RATIO = 0.8
VAL_RATIO = 0.1
RANDOM_SEED = 42 # For reproducible splits

# --- 2. Single Subject Processing Function ---
def create_windows_and_labels_for_subject(subject_id):
    """
    Build sliding PPG windows and a respiration-rate label for each window.

    Reads bidmc_<id>_Signals.csv and bidmc_<id>_Breaths.csv from DATASET_PATH.
    The label of a window is the number of annotated breaths whose sample
    index falls inside the window, scaled from WINDOW_SEC to one minute.

    Args:
        subject_id: Integer subject number (zero-padded to two digits in the
            file names).

    Returns:
        (windows, labels) as NumPy arrays, or (None, None) when a file is
        missing, a required column is absent, or the signal is shorter than
        one window.
    """
    signals_file = os.path.join(DATASET_PATH, f'bidmc_{subject_id:02}_Signals.csv')
    breaths_file = os.path.join(DATASET_PATH, f'bidmc_{subject_id:02}_Breaths.csv')

    if not (os.path.exists(signals_file) and os.path.exists(breaths_file)):
        print(f"Warning: Files for subject {subject_id} are missing. Skipping.")
        return None, None

    try:
        # Header names are stripped before the column lookups below.
        signals_df = pd.read_csv(signals_file)
        signals_df.columns = signals_df.columns.str.strip()

        breaths_df = pd.read_csv(breaths_file)
        breaths_df.columns = breaths_df.columns.str.strip()

        ppg_signal = signals_df['PLETH'].values
        breath_starts = breaths_df[ANNOTATOR_COLUMN].values
    except (FileNotFoundError, KeyError) as e:
        print(f"Error: Could not read or process files for subject {subject_id}: {e}")
        return None, None

    window_samples = WINDOW_SEC * SAMPLING_RATE
    stride_samples = STRIDE_SEC * SAMPLING_RATE

    ppg_windows, rr_labels = [], []

    # Every start index for which a full window still fits in the signal.
    for lo in range(0, len(ppg_signal) - window_samples + 1, stride_samples):
        hi = lo + window_samples

        # Breath annotations are sample indices; count those in [lo, hi).
        inside_window = (breath_starts >= lo) & (breath_starts < hi)
        breaths_per_minute = (np.sum(inside_window) / WINDOW_SEC) * 60

        ppg_windows.append(ppg_signal[lo:hi])
        rr_labels.append(breaths_per_minute)

    if ppg_windows:
        return np.array(ppg_windows), np.array(rr_labels)
    return None, None


# --- 3. Main Execution Function ---
def main():
    """
    Build the train/validation/test arrays and save them as .npy files.

    Every subject is windowed with create_windows_and_labels_for_subject, the
    subject IDs are shuffled with a fixed seed and split into three groups
    (so no subject contributes to more than one split), each group's windows
    are concatenated, the total sample count is verified, and six .npy files
    are written to OUTPUT_PATH.

    Returns early, after printing an error, when no subject yields any window.

    Raises:
        ValueError: If fewer than 3 subjects produced data.
        AssertionError: If the split sample counts do not add up to the total.
    """
    np.random.seed(RANDOM_SEED)
    os.makedirs(OUTPUT_PATH, exist_ok=True)

    # --- Step 1: Process and group data by subject ---
    data_by_subject = {}
    print("Starting dataset processing...")

    for subject_id in range(1, NUM_SUBJECTS + 1):
        features, labels = create_windows_and_labels_for_subject(subject_id)

        if features is not None and labels is not None:
            data_by_subject[subject_id] = {
                'features': features,
                'labels': labels
            }
            print(f"Processed subject {subject_id:02} - Generated {len(features)} window samples.")

    if not data_by_subject:
        print("Error: Failed to generate any samples from the dataset. Please check paths and files.")
        return

    # --- Step 2: Split subjects into train, validation, and test sets ---
    print("\nAll subjects processed. Splitting dataset by subject ID...")

    subject_ids = list(data_by_subject.keys())
    np.random.shuffle(subject_ids)
    num_subjects_found = len(subject_ids)

    if num_subjects_found < 3:
        raise ValueError(f"Found only {num_subjects_found} subjects with valid data. Cannot split into train/val/test sets.")

    # Plain int() truncation leaves the validation set empty for small
    # cohorts (e.g. int(5 * 0.1) == 0). Clamp the counts so that each of the
    # three splits gets at least one subject; whenever truncation already
    # yields three non-empty splits the result is unchanged.
    num_val = min(max(1, int(num_subjects_found * VAL_RATIO)), num_subjects_found - 2)
    num_train = min(max(1, int(num_subjects_found * TRAIN_RATIO)), num_subjects_found - num_val - 1)

    train_split_idx = num_train
    val_split_idx = num_train + num_val

    train_subject_ids = subject_ids[:train_split_idx]
    val_subject_ids = subject_ids[train_split_idx:val_split_idx]
    test_subject_ids = subject_ids[val_split_idx:]

    print("\nSubject ID Split Results:")
    print(f"Training set subjects ({len(train_subject_ids)}): {sorted(train_subject_ids)}")
    print(f"Validation set subjects ({len(val_subject_ids)}): {sorted(val_subject_ids)}")
    print(f"Test set subjects ({len(test_subject_ids)}): {sorted(test_subject_ids)}")

    # --- Step 3: Combine data based on the split subject IDs ---
    def combine_data_from_ids(ids_list, data_source):
        """Concatenate the feature and label arrays of the given subject IDs."""
        if not ids_list:
            # Zero-row arrays that keep the regular window width.
            return np.empty((0, WINDOW_SEC * SAMPLING_RATE)), np.empty((0,))

        features_list = [data_source[sid]['features'] for sid in ids_list]
        labels_list = [data_source[sid]['labels'] for sid in ids_list]
        return np.concatenate(features_list, axis=0), np.concatenate(labels_list, axis=0)

    train_features, train_labels = combine_data_from_ids(train_subject_ids, data_by_subject)
    val_features, val_labels = combine_data_from_ids(val_subject_ids, data_by_subject)
    test_features, test_labels = combine_data_from_ids(test_subject_ids, data_by_subject)

    # --- Step 4: Validate and save the final datasets ---
    print("\n--- Final Dataset Shapes ---")
    print(f"Training set (Train)  -> Features: {train_features.shape}, Labels: {train_labels.shape}")
    print(f"Validation set (Val)  -> Features: {val_features.shape}, Labels: {val_labels.shape}")
    print(f"Test set (Test)       -> Features: {test_features.shape}, Labels: {test_labels.shape}")

    # Raised explicitly (rather than via `assert`) so the check is not
    # stripped when Python runs with -O; the exception type is unchanged.
    total_samples_in_dict = sum(len(d['labels']) for d in data_by_subject.values())
    total_samples_after_split = len(train_labels) + len(val_labels) + len(test_labels)
    if total_samples_in_dict != total_samples_after_split:
        raise AssertionError("Total number of samples does not match after splitting!")
    print(f"\nSample count validation successful: {total_samples_after_split}")

    print(f"\nSaving files to: '{OUTPUT_PATH}'")

    # Writes <split>_features.npy and <split>_labels.npy for each split.
    for split_name, split_features, split_labels in (
        ('train', train_features, train_labels),
        ('val', val_features, val_labels),
        ('test', test_features, test_labels),
    ):
        np.save(os.path.join(OUTPUT_PATH, f'{split_name}_features.npy'), split_features)
        np.save(os.path.join(OUTPUT_PATH, f'{split_name}_labels.npy'), split_labels)

    print("All files saved successfully!")


# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()

### BIDMC_HR

In [None]:
import pandas as pd
import numpy as np
import os

# --- 1. Global parameter definitions (partially modified) ---
# NOTE(review): 'wokrspace' in both paths below looks like a typo for
# 'workspace' - confirm the directories really exist under this name.
DATASET_PATH = '/home/ubuntu/wokrspace/Data/physionet.org/files/bidmc/1.0.0/bidmc_csv/'
NUM_SUBJECTS = 53    # Subjects are addressed as 1..NUM_SUBJECTS
SAMPLING_RATE = 125  # Samples per second; converts the window/stride seconds to sample counts

WINDOW_SEC = 10  # Window length in seconds
STRIDE_SEC = 2   # Step between the starts of consecutive windows in seconds

# --- Added / modified configuration ---
OUTPUT_PATH = '/home/ubuntu/wokrspace/Finetuning_tasks/bidmc_hr' # New output folder; data is split by subject
TRAIN_RATIO = 0.8
VAL_RATIO = 0.1
RANDOM_SEED = 42 # Ensures the split is reproducible

# --- 2. 单个受试者处理函数 (保持不变) ---
def create_windows_and_labels_for_subject(subject_id):
    """
    Build sliding PPG windows and a mean heart-rate (HR) label for each window.

    Reads bidmc_<id>_Signals.csv and bidmc_<id>_Numerics.csv from
    DATASET_PATH. The label of a window is the mean of the 'HR' values whose
    'Time [s]' falls inside the window's time span [start, end). Windows with
    no numerics rows, or whose mean HR is NaN, are dropped.

    Args:
        subject_id: Integer subject number (zero-padded to two digits in the
            file names).

    Returns:
        (windows, labels) as NumPy arrays, or (None, None) when a file is
        missing, a required column is absent, or no window gets a valid label.
    """
    signals_file = os.path.join(DATASET_PATH, f'bidmc_{subject_id:02}_Signals.csv')
    numerics_file = os.path.join(DATASET_PATH, f'bidmc_{subject_id:02}_Numerics.csv')

    if not os.path.exists(signals_file) or not os.path.exists(numerics_file):
        print(f"警告: 受试者 {subject_id:02} 的文件缺失，已跳过。")
        return None, None

    try:
        # Header names are stripped before the column lookups below.
        signals_df = pd.read_csv(signals_file)
        signals_df.columns = signals_df.columns.str.strip()

        numerics_df = pd.read_csv(numerics_file)
        numerics_df.columns = numerics_df.columns.str.strip()

        ppg_signal = signals_df['PLETH'].values
        # Look the numerics columns up here as well, so that a missing column
        # is reported and the subject skipped instead of a KeyError escaping
        # from the windowing loop.
        numerics_time = numerics_df['Time [s]']
        numerics_hr = numerics_df['HR']

    except (FileNotFoundError, KeyError) as e:
        print(f"错误: 读取或处理受试者 {subject_id:02} 的文件时出错: {e}")
        return None, None

    window_samples = WINDOW_SEC * SAMPLING_RATE
    stride_samples = STRIDE_SEC * SAMPLING_RATE

    ppg_windows = []
    hr_labels = []

    # Every start index for which a full window still fits in the signal.
    for start_idx in range(0, len(ppg_signal) - window_samples + 1, stride_samples):
        end_idx = start_idx + window_samples

        # Time span covered by this window, in seconds.
        start_time_sec = start_idx / SAMPLING_RATE
        end_time_sec = end_idx / SAMPLING_RATE

        in_window = (numerics_time >= start_time_sec) & (numerics_time < end_time_sec)
        window_hr = numerics_hr[in_window]
        if window_hr.empty:
            continue

        average_hr = window_hr.mean()
        if not np.isnan(average_hr):  # keep only windows with a valid mean
            ppg_windows.append(ppg_signal[start_idx:end_idx])
            hr_labels.append(average_hr)

    if not ppg_windows:
        return None, None

    return np.array(ppg_windows), np.array(hr_labels)


# --- 3. 主执行函数 (*** 这里是主要修改的地方 ***) ---
def main():
    """
    Build the train/validation/test arrays and save them as .npy files.

    Every subject is windowed with create_windows_and_labels_for_subject, the
    subject IDs are shuffled with a fixed seed and split into three groups
    (so no subject contributes to more than one split), each group's windows
    are concatenated, the total sample count is verified, and six .npy files
    are written to OUTPUT_PATH.

    Returns early, after printing an error, when no subject yields any window.

    Raises:
        ValueError: If fewer than 3 subjects produced data.
        AssertionError: If the split sample counts do not add up to the total.
    """
    np.random.seed(RANDOM_SEED)
    os.makedirs(OUTPUT_PATH, exist_ok=True)

    # --- Step 1: Process and group data by subject ---
    data_by_subject = {}
    print("开始处理数据集，提取PPG窗口和对应的平均HR...")

    for subject_id in range(1, NUM_SUBJECTS + 1):
        features, labels = create_windows_and_labels_for_subject(subject_id)

        if features is not None and labels is not None:
            data_by_subject[subject_id] = {
                'features': features,
                'labels': labels
            }
            print(f"处理完成: 受试者 {subject_id:02} - 生成了 {len(features)} 个样本。")

    if not data_by_subject:
        print("错误: 未能从数据集中生成任何样本。请检查路径和文件。")
        return

    # --- Step 2: Split by subject ID ---
    print("\n所有受试者处理完毕，正在按ID划分数据集...")

    subject_ids = list(data_by_subject.keys())
    np.random.shuffle(subject_ids)
    num_subjects = len(subject_ids)

    if num_subjects < 3:
        raise ValueError("有效受试者数量少于3，无法进行划分。")

    # Plain int() truncation leaves the validation set empty for small
    # cohorts (e.g. int(5 * 0.1) == 0). Clamp the counts so that each of the
    # three splits gets at least one subject; whenever truncation already
    # yields three non-empty splits the result is unchanged.
    num_val = min(max(1, int(num_subjects * VAL_RATIO)), num_subjects - 2)
    num_train = min(max(1, int(num_subjects * TRAIN_RATIO)), num_subjects - num_val - 1)

    train_split_idx = num_train
    val_split_idx = num_train + num_val

    train_subject_ids = subject_ids[:train_split_idx]
    val_subject_ids = subject_ids[train_split_idx:val_split_idx]
    test_subject_ids = subject_ids[val_split_idx:]

    print("\n受试者ID划分结果:")
    print(f"训练集受试者 ({len(train_subject_ids)}): {sorted(train_subject_ids)}")
    print(f"验证集受试者 ({len(val_subject_ids)}): {sorted(val_subject_ids)}")
    print(f"测试集受试者 ({len(test_subject_ids)}): {sorted(test_subject_ids)}")

    # --- Step 3: Combine data according to the ID split ---
    def combine_data_from_ids(ids_list, data_source):
        """Concatenate the feature and label arrays of the given subject IDs."""
        if not ids_list:
            # Zero-row arrays; take the feature width from real data when
            # available, otherwise fall back to the configured window size
            # (avoids StopIteration on an empty data_source).
            if not data_source:
                return np.empty((0, WINDOW_SEC * SAMPLING_RATE)), np.empty((0,))
            sample_key = next(iter(data_source))
            feature_dim = data_source[sample_key]['features'].shape[1]
            return np.empty((0, feature_dim)), np.empty((0,))

        features_list = [data_source[sid]['features'] for sid in ids_list]
        labels_list = [data_source[sid]['labels'] for sid in ids_list]
        return np.concatenate(features_list, axis=0), np.concatenate(labels_list, axis=0)

    train_features, train_labels = combine_data_from_ids(train_subject_ids, data_by_subject)
    val_features, val_labels = combine_data_from_ids(val_subject_ids, data_by_subject)
    test_features, test_labels = combine_data_from_ids(test_subject_ids, data_by_subject)

    # --- Step 4: Validate and save ---
    print("\n--- 最终数据集形状 ---")
    print(f"训练集 (Train)  -> 特征: {train_features.shape}, 标签: {train_labels.shape}")
    print(f"验证集 (Val)    -> 特征: {val_features.shape}, 标签: {val_labels.shape}")
    print(f"测试集 (Test)   -> 特征: {test_features.shape}, 标签: {test_labels.shape}")

    # Raised explicitly (rather than via `assert`) so the check is not
    # stripped when Python runs with -O; the exception type is unchanged.
    total_samples = sum(len(d['labels']) for d in data_by_subject.values())
    split_samples = len(train_labels) + len(val_labels) + len(test_labels)
    if total_samples != split_samples:
        raise AssertionError("样本总数在划分后不匹配！")
    print(f"\n样本总数验证成功: {total_samples}")

    print(f"\n正在保存文件至: '{OUTPUT_PATH}'")

    # Writes <split>_features.npy and <split>_labels.npy for each split.
    for split_name, split_features, split_labels in (
        ('train', train_features, train_labels),
        ('val', val_features, val_labels),
        ('test', test_features, test_labels),
    ):
        np.save(os.path.join(OUTPUT_PATH, f'{split_name}_features.npy'), split_features)
        np.save(os.path.join(OUTPUT_PATH, f'{split_name}_labels.npy'), split_labels)

    print("所有文件保存成功！")


# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()

### UCI

In [None]:
import pandas as pd
import numpy as np
import os

# --- 1. Global Parameters ---
# NOTE(review): DATASET_PATH and OUTPUT_PATH have no value in this copy of the
# script; the CSV input directory and the output directory must be filled in.
# NOTE(review): this cell is titled "UCI", but the processing function below
# reads 'bidmc_XX_Signals.csv' / 'bidmc_XX_Numerics.csv' files - confirm which
# dataset is intended.
DATASET_PATH = 
OUTPUT_PATH = 
NUM_SUBJECTS = 53    # Subjects are addressed as 1..NUM_SUBJECTS
SAMPLING_RATE = 125  # Samples per second; converts the window/stride seconds to sample counts

# Windowing parameters
WINDOW_SEC = 10  # Window length in seconds
STRIDE_SEC = 2   # Step between the starts of consecutive windows in seconds

# Splitting parameters (the test share is whatever remains)
TRAIN_RATIO = 0.8
VAL_RATIO = 0.1
RANDOM_SEED = 42 # For reproducible splits

# --- 2. Single Subject Processing Function ---
def create_windows_and_labels_for_subject(subject_id):
    """
    Build sliding PPG windows and a mean heart-rate (HR) label for each window.

    Reads bidmc_<id>_Signals.csv and bidmc_<id>_Numerics.csv from
    DATASET_PATH. The label of a window is the mean of the 'HR' values whose
    'Time [s]' falls inside the window's time span [start, end). Windows with
    no numerics rows, or whose mean HR is NaN, are dropped.

    Args:
        subject_id: Integer subject number (zero-padded to two digits in the
            file names).

    Returns:
        (windows, labels) as NumPy arrays, or (None, None) when a file is
        missing, a required column is absent, or no window gets a valid label.
    """
    signals_file = os.path.join(DATASET_PATH, f'bidmc_{subject_id:02}_Signals.csv')
    numerics_file = os.path.join(DATASET_PATH, f'bidmc_{subject_id:02}_Numerics.csv')

    if not os.path.exists(signals_file) or not os.path.exists(numerics_file):
        print(f"Warning: Files for subject {subject_id:02} are missing. Skipping.")
        return None, None

    try:
        # Header names are stripped before the column lookups below.
        signals_df = pd.read_csv(signals_file)
        signals_df.columns = signals_df.columns.str.strip()

        numerics_df = pd.read_csv(numerics_file)
        numerics_df.columns = numerics_df.columns.str.strip()

        ppg_signal = signals_df['PLETH'].values
        # Look the numerics columns up here as well, so that a missing column
        # is reported and the subject skipped instead of a KeyError escaping
        # from the windowing loop.
        numerics_time = numerics_df['Time [s]']
        numerics_hr = numerics_df['HR']

    except (FileNotFoundError, KeyError) as e:
        print(f"Error: Could not read or process files for subject {subject_id:02}: {e}")
        return None, None

    window_samples = WINDOW_SEC * SAMPLING_RATE
    stride_samples = STRIDE_SEC * SAMPLING_RATE

    ppg_windows = []
    hr_labels = []

    # Every start index for which a full window still fits in the signal.
    for start_idx in range(0, len(ppg_signal) - window_samples + 1, stride_samples):
        end_idx = start_idx + window_samples

        # Time span covered by this window, in seconds.
        start_time_sec = start_idx / SAMPLING_RATE
        end_time_sec = end_idx / SAMPLING_RATE

        # HR readings that fall inside this window.
        in_window = (numerics_time >= start_time_sec) & (numerics_time < end_time_sec)
        window_hr = numerics_hr[in_window]
        if window_hr.empty:
            continue

        average_hr = window_hr.mean()
        # Keep only windows whose mean is a valid number.
        if not np.isnan(average_hr):
            ppg_windows.append(ppg_signal[start_idx:end_idx])
            hr_labels.append(average_hr)

    if not ppg_windows:
        return None, None

    return np.array(ppg_windows), np.array(hr_labels)


# --- 3. Main Execution Function ---
def main():
    """
    Build the train/validation/test arrays and save them as .npy files.

    Every subject is windowed with create_windows_and_labels_for_subject, the
    subject IDs are shuffled with a fixed seed and split into three groups
    (so no subject contributes to more than one split), each group's windows
    are concatenated, the total sample count is verified, and six .npy files
    are written to OUTPUT_PATH.

    Returns early, after printing an error, when no subject yields any window.

    Raises:
        ValueError: If fewer than 3 subjects produced data.
        AssertionError: If the split sample counts do not add up to the total.
    """
    np.random.seed(RANDOM_SEED)
    os.makedirs(OUTPUT_PATH, exist_ok=True)

    # --- Step 1: Process and group data by subject ---
    data_by_subject = {}
    print("Starting dataset processing to extract PPG windows and corresponding mean HR...")

    for subject_id in range(1, NUM_SUBJECTS + 1):
        features, labels = create_windows_and_labels_for_subject(subject_id)

        if features is not None and labels is not None:
            data_by_subject[subject_id] = {
                'features': features,
                'labels': labels
            }
            print(f"Processed subject {subject_id:02} - Generated {len(features)} samples.")

    if not data_by_subject:
        print("Error: Failed to generate any samples from the dataset. Please check paths and files.")
        return

    # --- Step 2: Split subjects into train, validation, and test sets ---
    print("\nAll subjects processed. Splitting dataset by subject ID...")

    subject_ids = list(data_by_subject.keys())
    np.random.shuffle(subject_ids)
    num_subjects_found = len(subject_ids)

    if num_subjects_found < 3:
        raise ValueError(f"Found only {num_subjects_found} subjects with valid data. Cannot split into train/val/test sets.")

    # Plain int() truncation leaves the validation set empty for small
    # cohorts (e.g. int(5 * 0.1) == 0). Clamp the counts so that each of the
    # three splits gets at least one subject; whenever truncation already
    # yields three non-empty splits the result is unchanged.
    num_val = min(max(1, int(num_subjects_found * VAL_RATIO)), num_subjects_found - 2)
    num_train = min(max(1, int(num_subjects_found * TRAIN_RATIO)), num_subjects_found - num_val - 1)

    train_split_idx = num_train
    val_split_idx = num_train + num_val

    train_subject_ids = subject_ids[:train_split_idx]
    val_subject_ids = subject_ids[train_split_idx:val_split_idx]
    test_subject_ids = subject_ids[val_split_idx:]

    print("\nSubject ID Split Results:")
    print(f"Training set subjects ({len(train_subject_ids)}): {sorted(train_subject_ids)}")
    print(f"Validation set subjects ({len(val_subject_ids)}): {sorted(val_subject_ids)}")
    print(f"Test set subjects ({len(test_subject_ids)}): {sorted(test_subject_ids)}")

    # --- Step 3: Combine data based on the split subject IDs ---
    def combine_data_from_ids(ids_list, data_source):
        """Concatenate the feature and label arrays of the given subject IDs."""
        if not ids_list:
            # Zero-row arrays; take the feature width from real data when
            # available, otherwise fall back to the configured window size.
            if not data_source:
                return np.empty((0, WINDOW_SEC * SAMPLING_RATE)), np.empty((0,))
            sample_key = next(iter(data_source))
            feature_dim = data_source[sample_key]['features'].shape[1]
            return np.empty((0, feature_dim)), np.empty((0,))

        features_list = [data_source[sid]['features'] for sid in ids_list]
        labels_list = [data_source[sid]['labels'] for sid in ids_list]
        return np.concatenate(features_list, axis=0), np.concatenate(labels_list, axis=0)

    train_features, train_labels = combine_data_from_ids(train_subject_ids, data_by_subject)
    val_features, val_labels = combine_data_from_ids(val_subject_ids, data_by_subject)
    test_features, test_labels = combine_data_from_ids(test_subject_ids, data_by_subject)

    # --- Step 4: Validate and save the final datasets ---
    print("\n--- Final Dataset Shapes ---")
    print(f"Training set (Train)  -> Features: {train_features.shape}, Labels: {train_labels.shape}")
    print(f"Validation set (Val)  -> Features: {val_features.shape}, Labels: {val_labels.shape}")
    print(f"Test set (Test)       -> Features: {test_features.shape}, Labels: {test_labels.shape}")

    # Raised explicitly (rather than via `assert`) so the check is not
    # stripped when Python runs with -O; the exception type is unchanged.
    total_samples_in_dict = sum(len(d['labels']) for d in data_by_subject.values())
    total_samples_after_split = len(train_labels) + len(val_labels) + len(test_labels)
    if total_samples_in_dict != total_samples_after_split:
        raise AssertionError("Total number of samples does not match after splitting!")
    print(f"\nSample count validation successful: {total_samples_after_split}")

    print(f"\nSaving files to: '{OUTPUT_PATH}'")

    # Writes <split>_features.npy and <split>_labels.npy for each split.
    for split_name, split_features, split_labels in (
        ('train', train_features, train_labels),
        ('val', val_features, val_labels),
        ('test', test_features, test_labels),
    ):
        np.save(os.path.join(OUTPUT_PATH, f'{split_name}_features.npy'), split_features)
        np.save(os.path.join(OUTPUT_PATH, f'{split_name}_labels.npy'), split_labels)

    print("All files saved successfully!")


# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()

### CSN

In [None]:
import os
import json
import numpy as np
import wfdb
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer

# --- Configuration ---
# Root directory that is walked recursively for WFDB header (.hea) files.
DATA_ROOT = 'path/to/a-large-scale-12-lead-electrocardiogram-database-for-arrhythmia-study-1.0.0/WFDBRecords/'
# Directory that receives the .npy splits and the label-class JSON mapping.
OUTPUT_DIR = 'path/to/output/csn/'
# Fraction of all records held out as the test set.
TEST_SIZE = 0.1
# Fraction of all records used for validation. Because the validation set is
# carved out of the train+validation pool, the second split uses
# VAL_SIZE / (1 - TEST_SIZE) as its test_size.
VAL_SIZE = 0.1
# Seed passed to both train_test_split calls so the splits are reproducible.
RANDOM_STATE = 42

# --- Main Script ---

def parse_header_comments_manual(hea_filepath):
    """
    Extract the diagnosis codes from the '#Dx:' comment line of a .hea file.

    The header is read as plain text rather than through wfdb; per the original
    author this bypasses an issue in the wfdb library's internal date parsing
    logic for this specific dataset.

    Only the first line that starts with '#Dx:' (ignoring surrounding
    whitespace) is considered. The text between the first and second colon is
    split on commas; blank entries, such as those produced by a trailing or
    doubled comma, are dropped so they can never become a label of their own.

    Args:
        hea_filepath (str): The full path to the .hea file.

    Returns:
        list: The diagnosis codes as strings, in file order. Empty when the
        file cannot be opened, has no '#Dx:' line, or that line lists no codes.

    Raises:
        UnicodeDecodeError: If the file is not valid UTF-8 (only I/O errors
        are handled here).
    """
    try:
        with open(hea_filepath, 'r', encoding='utf-8') as f:
            for line in f:
                stripped = line.strip()
                if not stripped.startswith('#Dx:'):
                    continue
                # The line starts with '#Dx:', so index 1 always exists.
                codes_str = stripped.split(':')[1]
                # Stop at the first Dx line; filter out empty fragments so
                # '1,2,' yields ['1', '2'] instead of ['1', '2', ''].
                return [
                    code.strip()
                    for code in codes_str.split(',')
                    if code.strip()
                ]
    except IOError:
        # Return an empty list if the file cannot be opened
        return []
    return []

def process_ecg_data():
    """
    Find, load, label-encode, split, and save the ECG records under DATA_ROOT.

    The pipeline runs in six printed steps:

    1. Recursively collect every '.hea' file below DATA_ROOT.
    2. For each record, parse the '#Dx' codes from the header and read the
       signal with wfdb.rdsamp. Records without diagnosis codes, records whose
       signal does not have 12 channels, and records that raise while being
       processed are skipped and counted as failures; a per-reason breakdown
       of the skips is printed after the loop.
    3. Multi-hot encode the diagnosis codes and write the class order to
       'snomed_ct_classes.json' in OUTPUT_DIR.
    4. Stack the signals into one float32 array of shape
       (records, channels, length), truncating every record to the shortest
       length if the lengths differ.
    5. Randomly split the records into train/validation/test sets according
       to TEST_SIZE, VAL_SIZE and RANDOM_STATE.
    6. Save features and labels of each split as .npy files in OUTPUT_DIR.

    Returns:
        None. An error is printed and the function returns early when no
        header files are found or when no record passes validation.
    """
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    print("--- Step 1: Discovering all ECG records ---")
    data_path = os.path.abspath(DATA_ROOT)

    # Find all .hea files recursively and store their full paths
    all_hea_files = sorted([
        os.path.join(root, file)
        for root, _, files in os.walk(data_path)
        for file in files if file.endswith('.hea')
    ])

    if not all_hea_files:
        print(f"Error: No .hea files were found in the directory '{data_path}'.")
        return

    print(f"Found a total of {len(all_hea_files)} ECG records.")

    print("\n--- Step 2: Extracting signals and labels (using manual header parsing) ---")
    all_signals = []
    all_raw_labels = []

    successful_reads = 0
    failed_reads = 0
    # Reason -> count, so that skipped records are not silently lost.
    skip_reasons = {}

    for hea_filepath in tqdm(all_hea_files, desc="Reading records"):
        # Construct the record path base (without extension) from the .hea file path
        # Example: '/path/to/JS00001.hea' -> '/path/to/JS00001'
        record_path_base = hea_filepath[:-4]

        try:
            # 1. Parse the header first: it is cheap, and a record without
            #    diagnosis codes is skipped without reading its signal.
            dx_codes = parse_header_comments_manual(hea_filepath)
            if not dx_codes:
                failed_reads += 1
                skip_reasons['no diagnosis codes'] = skip_reasons.get('no diagnosis codes', 0) + 1
                continue

            # 2. Read the signal using the absolute record path.
            signal, _ = wfdb.rdsamp(record_path_base)

            # 3. Validate the signal.
            if signal is None or signal.shape[1] != 12:
                failed_reads += 1
                skip_reasons['signal is not 12-lead'] = skip_reasons.get('signal is not 12-lead', 0) + 1
                continue

            # Transpose signal from (length, channels) to (channels, length)
            # which is a common convention for time-series models.
            all_signals.append(signal.T)
            all_raw_labels.append(dx_codes)
            successful_reads += 1

        except Exception as exc:
            # Best effort: one unreadable record must not abort the whole run,
            # but the exception type is recorded so failures stay visible.
            failed_reads += 1
            reason = f"{type(exc).__name__} while reading record"
            skip_reasons[reason] = skip_reasons.get(reason, 0) + 1

    print("\n--- Reading complete ---")
    print(f"Successfully processed {successful_reads} records.")
    print(f"Failed or skipped {failed_reads} records.")
    for reason, count in sorted(skip_reasons.items()):
        print(f"  - {reason}: {count}")

    if not all_signals:
        print("\nError: Failed to load any valid ECG data. Halting execution.")
        return

    print("\n--- Step 3: Multi-hot encoding labels ---")
    mlb = MultiLabelBinarizer()
    y = mlb.fit_transform(all_raw_labels)
    num_classes = len(mlb.classes_)
    print(f"Found {num_classes} unique diagnosis codes in the dataset.")

    mapping_filepath = os.path.join(OUTPUT_DIR, 'snomed_ct_classes.json')
    with open(mapping_filepath, 'w') as f:
        json.dump(mlb.classes_.tolist(), f, indent=4)
    print(f"Label class mapping saved to: {mapping_filepath}")

    print("\n--- Step 4: Assembling feature matrix ---")
    # Check the record lengths explicitly instead of relying on np.array
    # raising ValueError for ragged input.
    signal_lengths = {s.shape[1] for s in all_signals}
    if len(signal_lengths) == 1:
        X = np.array(all_signals, dtype=np.float32)
    else:
        print("Warning: ECG records have inconsistent lengths. Truncating to the shortest length.")
        min_len = min(signal_lengths)
        print(f"All records will be standardized to the minimum length found: {min_len}")

        # Pre-allocate numpy array for efficiency
        X = np.zeros((len(all_signals), all_signals[0].shape[0], min_len), dtype=np.float32)
        for i, s in enumerate(all_signals):
            X[i] = s[:, :min_len]

    print(f"Feature matrix (X) shape: {X.shape}")
    print(f"Label matrix (y) shape: {y.shape}")

    print("\n--- Step 5: Splitting the dataset ---")
    # Adjust validation size to be a proportion of the train+val set
    val_size_adjusted = VAL_SIZE / (1 - TEST_SIZE)

    # First, split into training+validation and test sets
    X_train_val, X_test, y_train_val, y_test = train_test_split(
        X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE, shuffle=True
    )
    # Then, split the training+validation set into final training and validation sets
    X_train, X_val, y_train, y_val = train_test_split(
        X_train_val, y_train_val, test_size=val_size_adjusted, random_state=RANDOM_STATE, shuffle=True
    )

    print(f"Training set size:   {X_train.shape[0]}")
    print(f"Validation set size: {X_val.shape[0]}")
    print(f"Test set size:       {X_test.shape[0]}")

    print("\n--- Step 6: Saving processed files ---")
    np.save(os.path.join(OUTPUT_DIR, 'train_features.npy'), X_train)
    np.save(os.path.join(OUTPUT_DIR, 'train_labels.npy'), y_train)
    np.save(os.path.join(OUTPUT_DIR, 'val_features.npy'), X_val)
    np.save(os.path.join(OUTPUT_DIR, 'val_labels.npy'), y_val)
    np.save(os.path.join(OUTPUT_DIR, 'test_features.npy'), X_test)
    np.save(os.path.join(OUTPUT_DIR, 'test_labels.npy'), y_test)

    print(f"\nTask complete! All files have been saved in '{OUTPUT_DIR}'.")

# Entry point: run the CSN preprocessing pipeline only when this code runs
# as __main__ (executed as a script), not when it is imported as a module.
if __name__ == '__main__':
    process_ecg_data()

### PTB-XL

In [None]:
import os
import pandas as pd
import numpy as np
import wfdb
import ast
from sklearn.preprocessing import MultiLabelBinarizer

def load_raw_data(df: pd.DataFrame, sampling_rate: int, path: str) -> np.ndarray:
    """
    Read the ECG waveform of every record listed in the metadata frame.

    Args:
        df: Metadata DataFrame providing the 'filename_lr' and 'filename_hr'
            columns with record paths relative to `path`.
        sampling_rate: 100 selects the 'filename_lr' records; any other value
            selects the 'filename_hr' records.
        path: Root directory that the record filenames are joined onto.

    Returns:
        A NumPy array built from the per-record signals, in the row order
        of `df`.
    """
    record_names = df.filename_lr if sampling_rate == 100 else df.filename_hr

    waveforms = []
    for record_name in record_names:
        # wfdb.rdsamp yields a (signal, metadata) pair; only the signal is kept.
        waveform, _fields = wfdb.rdsamp(os.path.join(path, record_name))
        waveforms.append(waveform)

    return np.array(waveforms)

def aggregate_diagnostic(scp_codes_dict: dict, agg_df: pd.DataFrame) -> list:
    """
    Map the SCP codes of one record to their diagnostic superclasses.

    Codes that are not present in the index of `agg_df` are ignored. The
    result is de-duplicated and sorted, so the same input always produces the
    same list regardless of dict ordering or string hash randomization.

    Args:
        scp_codes_dict: A dictionary whose keys are the SCP codes of a single
            record (the values are not used).
        agg_df: DataFrame indexed by SCP code with a 'diagnostic_class'
            column holding the superclass of each code.

    Returns:
        A sorted list of the unique diagnostic superclasses for the record;
        empty when none of the codes appear in `agg_df`.
    """
    known_codes = agg_df.index
    superclasses = {
        agg_df.loc[code, 'diagnostic_class']
        for code in scp_codes_dict
        if code in known_codes
    }
    # sorted() rather than list(): a bare set has no stable iteration order.
    return sorted(superclasses)

def process_and_save_ptbxl(data_path: str, output_path: str, sampling_rate: int = 100):
    """
    Convert the PTB-XL dataset into train/validation/test .npy files.

    Reads 'ptbxl_database.csv' and 'scp_statements.csv' from `data_path`,
    derives multi-hot diagnostic-superclass labels, loads the ECG signals,
    splits them by the dataset's 'strat_fold' column and writes features and
    labels of every split to `output_path`.

    Args:
        data_path: Path to the root directory of the PTB-XL dataset.
        output_path: Directory that receives the processed .npy files; it is
            created if it does not exist.
        sampling_rate: The desired sampling rate (100 or 500).
    """
    print("Starting the processing of the PTB-XL dataset...")

    # --- 1. Metadata -------------------------------------------------------
    print("Step 1/7: Loading metadata...")
    metadata_df = pd.read_csv(
        os.path.join(data_path, 'ptbxl_database.csv'), index_col='ecg_id'
    )
    # The scp_codes column stores dict literals as text; turn them into dicts.
    metadata_df.scp_codes = metadata_df.scp_codes.apply(ast.literal_eval)

    # --- 2. Superclass labels ----------------------------------------------
    print("Step 2/7: Generating diagnostic superclass labels...")
    scp_table = pd.read_csv(os.path.join(data_path, 'scp_statements.csv'), index_col=0)
    # Keep only the statements flagged as diagnostic.
    agg_df = scp_table[scp_table.diagnostic == 1]

    def to_superclasses(record_codes):
        return aggregate_diagnostic(record_codes, agg_df)

    metadata_df['diagnostic_superclass'] = metadata_df.scp_codes.apply(to_superclasses)

    # --- 3. Raw signals ----------------------------------------------------
    print(f"Step 3/7: Loading {sampling_rate}Hz ECG signal data...")
    raw_signals = load_raw_data(metadata_df, sampling_rate, data_path)

    # --- 4. (n_samples, n_length, n_channels) -> (n_samples, n_channels, n_length)
    print("Step 4/7: Reshaping signal data...")
    X = np.transpose(raw_signals, (0, 2, 1))

    # --- 5. Multi-hot labels -----------------------------------------------
    print("Step 5/7: Performing multi-hot encoding on labels...")
    class_names = sorted(agg_df.diagnostic_class.unique())
    mlb = MultiLabelBinarizer(classes=class_names)
    y = mlb.fit_transform(metadata_df['diagnostic_superclass'])
    print(f"Label classes: {mlb.classes_}")

    # --- 6. Fold-based split -----------------------------------------------
    print("Step 6/7: Splitting data into train, validation, and test sets...")

    # PTB-XL ships a 'strat_fold' column describing a 10-fold stratified,
    # patient-wise split: every record of a patient lives in a single fold,
    # so splitting on folds keeps patients disjoint across sets and avoids
    # leakage. The officially recommended assignment is used here:
    #   folds 1-8 -> training, fold 9 -> validation, fold 10 -> test.
    fold_of_record = metadata_df['strat_fold']
    split_masks = (
        ('train', fold_of_record.isin(range(1, 9))),
        ('val', fold_of_record == 9),
        ('test', fold_of_record == 10),
    )
    splits = {
        split_name: (X[mask], y[mask]) for split_name, mask in split_masks
    }
    X_train, y_train = splits['train']
    X_val, y_val = splits['val']
    X_test, y_test = splits['test']

    print("\nDataset split complete. Final shapes:")
    print(f"  Training set features:   {X_train.shape}, labels: {y_train.shape}")
    print(f"  Validation set features: {X_val.shape}, labels: {y_val.shape}")
    print(f"  Test set features:       {X_test.shape}, labels: {y_test.shape}")

    # --- 7. Persist --------------------------------------------------------
    print(f"\nStep 7/7: Saving files to directory: {output_path}...")
    os.makedirs(output_path, exist_ok=True)

    # Written in train, val, test order; features before labels.
    for split_name, (features, labels) in splits.items():
        np.save(os.path.join(output_path, f'{split_name}_features.npy'), features)
        np.save(os.path.join(output_path, f'{split_name}_labels.npy'), labels)

    print("\nAll files have been saved successfully!")

# Entry point: run the PTB-XL preprocessing only when this code runs as
# __main__ (executed as a script), not when it is imported as a module.
if __name__ == '__main__':
    # --- User Configuration ---
    # Please update these paths to match your local directory structure.
    # DATA_PATH is the dataset root: 'ptbxl_database.csv' and
    # 'scp_statements.csv' are read from it, and the record filenames listed
    # in the metadata are joined onto it.
    DATA_PATH = 'path/to/ptb-xl-a-large-publicly-available-electrocardiography-dataset-1.0.3/'
    # Directory that receives the six .npy files (created if missing).
    OUTPUT_PATH = 'path/to/output/ptb-xl/'
    # 100 loads the 'filename_lr' records; any other value loads 'filename_hr'.
    SAMPLING_RATE = 100  # The dataset supports 100Hz and 500Hz
    
    process_and_save_ptbxl(
        data_path=DATA_PATH, 
        output_path=OUTPUT_PATH, 
        sampling_rate=SAMPLING_RATE
    )

### Physionet2017

In [None]:
import os
import numpy as np
import pandas as pd
import wfdb
from sklearn.model_selection import train_test_split
from tqdm import tqdm

def main():
    """
    Preprocess the PhysioNet 2017 Challenge dataset into .npy splits.

    Reads the record names and labels from the reference CSV, loads each
    record with wfdb, transposes it to (channels, length), zero-pads it at the
    end to MAX_SIGNAL_LENGTH samples, and maps its label character to an
    integer via LABEL_MAP. The data are then split 80/10/10 into training,
    validation and test sets (stratified by label) and saved to OUTPUT_DIR.

    A record is stored only if BOTH its label and its signal were obtained
    successfully, so the feature and label arrays always stay aligned.
    Records that cannot be read, carry a label missing from LABEL_MAP, or are
    longer than MAX_SIGNAL_LENGTH are reported and skipped.

    Returns:
        None. Returns early, after printing an error, if the reference file
        is missing or no record could be loaded.
    """
    # --- 1. Define Paths and Constants ---
    # !!! IMPORTANT: Please modify these paths to match your local file structure !!!
    DATA_DIR = 'path/to/physionet-2017-challenge-data/'
    ECG_DIR = os.path.join(DATA_DIR, 'training2017')
    REF_FILE = os.path.join(DATA_DIR, 'REFERENCE-v3.csv')
    OUTPUT_DIR = 'path/to/output/physionet2017/'

    # The maximum signal duration is 61s, with a sampling rate of 300Hz.
    MAX_SIGNAL_LENGTH = 61 * 300  # 18300
    RANDOM_STATE = 42

    # Label mapping for the four classes: Normal, Atrial Fibrillation, Other, Noisy
    LABEL_MAP = {'N': 0, 'A': 1, 'O': 2, '~': 3}

    print("--- Step 1: Configuration Loaded ---")
    print(f"Data directory:   {ECG_DIR}")
    print(f"Reference file:   {REF_FILE}")
    print(f"Output directory: {OUTPUT_DIR}")
    print(f"Max signal length: {MAX_SIGNAL_LENGTH}")
    print("-" * 36)

    # --- 2. Load Reference Labels ---
    try:
        df_labels = pd.read_csv(REF_FILE, header=None, names=['record', 'label'])
    except FileNotFoundError:
        print(f"Error: Reference file not found at '{REF_FILE}'. Please check the DATA_DIR path.")
        return

    records = df_labels['record'].tolist()
    labels_dict = df_labels.set_index('record')['label'].to_dict()
    print("--- Step 2: Labels Loaded Successfully ---")
    print(f"Found {len(records)} records in the reference file.")
    print("-" * 42)

    # --- 3. Read and Preprocess ECG Signals ---
    print("--- Step 3: Reading and Preprocessing ECG Signals ---")
    all_features = []
    all_labels = []

    for record_name in tqdm(records, desc="Processing ECG Records"):
        record_path = os.path.join(ECG_DIR, record_name)

        try:
            # Resolve the label BEFORE touching the signal: if the label is
            # unknown, nothing has been stored yet and the record is skipped
            # as a whole.
            label_char = labels_dict[record_name]
            label_int = LABEL_MAP[label_char]

            # Read the WFDB record
            record = wfdb.rdrecord(record_path)
            # The signal shape is (length, channels). Transpose to (channels, length).
            signal = record.p_signal.T

            current_len = signal.shape[1]
            if current_len > MAX_SIGNAL_LENGTH:
                # np.pad would fail on a negative pad width with an opaque
                # message; report the real cause instead.
                raise ValueError(
                    f"signal length {current_len} exceeds MAX_SIGNAL_LENGTH ({MAX_SIGNAL_LENGTH})"
                )

            # Use 'constant' mode to pad with zeros at the end of the signal
            pad_len = MAX_SIGNAL_LENGTH - current_len
            padded_signal = np.pad(signal, ((0, 0), (0, pad_len)), 'constant', constant_values=0)

        except FileNotFoundError:
            print(f"Warning: Record file not found for {record_name}, skipping.")
        except Exception as e:
            print(f"An error occurred while processing {record_name}: {e}, skipping.")
        else:
            # Append feature and label together, only after both succeeded,
            # so the two lists can never get out of step.
            all_features.append(padded_signal)
            all_labels.append(label_int)

    if not all_features:
        print("Error: No ECG records could be loaded. Please check the DATA_DIR path.")
        return

    # Convert lists to NumPy arrays
    X = np.array(all_features, dtype=np.float32)
    y = np.array(all_labels, dtype=np.int64)

    print("--- Signal processing complete ---")
    print(f"Feature matrix shape (X): {X.shape}")
    print(f"Label vector shape (y):   {y.shape}")
    print("-" * 33)

    # --- 4. Split the Dataset (80% train, 10% validation, 10% test) ---
    print("--- Step 4: Splitting dataset into train, validation, and test sets ---")

    # First, split off 10% for the test set
    X_train_val, X_test, y_train_val, y_test = train_test_split(
        X, y,
        test_size=0.1,
        random_state=RANDOM_STATE,
        stratify=y  # Ensure class distribution is preserved
    )

    # Then, split the remaining 90% to get 10% of the original data for validation
    # (0.1 / 0.9 is approximately 11.1% of the remaining data)
    X_train, X_val, y_train, y_val = train_test_split(
        X_train_val, y_train_val,
        test_size=(0.1/0.9),
        random_state=RANDOM_STATE,
        stratify=y_train_val # Preserve class distribution in the new split
    )

    print("--- Dataset splitting complete ---")
    print(f"Training set shape:   Features {X_train.shape}, Labels {y_train.shape}")
    print(f"Validation set shape: Features {X_val.shape}, Labels {y_val.shape}")
    print(f"Test set shape:       Features {X_test.shape}, Labels {y_test.shape}")
    print("-" * 33)

    # --- 5. Save Files ---
    print(f"--- Step 5: Saving datasets to '{OUTPUT_DIR}' ---")

    os.makedirs(OUTPUT_DIR, exist_ok=True)

    np.save(os.path.join(OUTPUT_DIR, 'train_features.npy'), X_train)
    np.save(os.path.join(OUTPUT_DIR, 'train_labels.npy'), y_train)
    np.save(os.path.join(OUTPUT_DIR, 'val_features.npy'), X_val)
    np.save(os.path.join(OUTPUT_DIR, 'val_labels.npy'), y_val)
    np.save(os.path.join(OUTPUT_DIR, 'test_features.npy'), X_test)
    np.save(os.path.join(OUTPUT_DIR, 'test_labels.npy'), y_test)

    print("--- All files saved successfully. ---")
    print("Script finished.")

# Entry point: run the PhysioNet 2017 preprocessing only when this code runs
# as __main__ (executed as a script), not when it is imported as a module.
if __name__ == '__main__':
    main()