In [9]:
import numpy as np
import pandas as pd
import neurokit2 as nk
import mne
import os
import hashlib
import wfdb
from pathlib import Path
from tqdm import tqdm

def read_sha256sums(filepath):
    """Read SHA256SUMS.txt into a dictionary"""
    sha256sums = {}
    with open(filepath, 'r') as f:
        for line in f:
            hash_value, filename = line.strip().split()
            sha256sums[filename] = hash_value
    return sha256sums

def calculate_sha256(filepath):
    """Calculate SHA256 hash of a file"""
    sha256_hash = hashlib.sha256()
    with open(filepath, "rb") as f:
        # Read file in chunks for memory efficiency
        for byte_block in iter(lambda: f.read(4096), b""):
            sha256_hash.update(byte_block)
    return sha256_hash.hexdigest()

def verify_file_integrity(filepath, sha256sums, data_dir):
    """Verify a file's integrity against SHA256SUMS"""
    if not os.path.exists(filepath):
        return False, "File does not exist"
    
    calculated_hash = calculate_sha256(filepath)
    relative_path = str(Path(filepath).relative_to(data_dir))
    expected_hash = sha256sums.get(relative_path)
    
    if expected_hash is None:
        return False, "File not found in SHA256SUMS"
        
    return calculated_hash == expected_hash, calculated_hash

def get_all_record_folders(data_dir):
    """Read main RECORDS file and return all folder paths"""
    folders = []
    with open(data_dir / 'RECORDS', 'r') as f:
        folders = [line.strip() for line in f if line.strip()]
    return folders

def read_folder_records(folder_path):
    """Read RECORDS file from a specific folder"""
    records = []
    records_file = folder_path / 'RECORDS'
    if records_file.exists():
        with open(records_file, 'r') as f:
            records = [line.strip() for line in f if line.strip()]
    return records

def load_record(record_path, sha256sums, data_dir, verify=True):
    """Load a single record and optionally verify its integrity"""
    base_path = str(record_path.parent / record_path.stem)
    
    # Files to check
    hea_path = base_path + '.hea'
    mat_path = base_path + '.mat'
    
    # Verify files if requested
    if verify:
        hea_ok, hea_hash = verify_file_integrity(hea_path, sha256sums, data_dir)
        mat_ok, mat_hash = verify_file_integrity(mat_path, sha256sums, data_dir)
        
        if not (hea_ok and mat_ok):
            print(f"Validation failed for {record_path}")
            return None
    
    try:
        # Read the record using wfdb
        record = wfdb.rdrecord(base_path)
        return {
            'data': record.p_signal,
            'fs': record.fs,
            'sig_name': record.sig_name,
            'units': record.units,
            'baseline': record.baseline,
            'adc_gain': record.adc_gain,
            'record_name': record_path.stem
        }
    except Exception as e:
        print(f"Error reading {record_path}: {e}")
        return None

def load_dataset(data_dir, max_records=None, verify=True):
    """
    Load entire dataset with progress bar
    
    Parameters:
    -----------
    data_dir : Path
        Path to dataset root directory
    max_records : int, optional
        Maximum number of records to load (for testing)
    verify : bool
        Whether to verify file integrity
        
    Returns:
    --------
    dict
        Dictionary containing loaded records and metadata
    """
    data_dir = Path(data_dir)
    
    # Read SHA256SUMS if verification is requested
    sha256sums = read_sha256sums(data_dir / 'SHA256SUMS.txt') if verify else {}
    
    # Get all folders
    folders = get_all_record_folders(data_dir)
    
    # Store loaded records
    dataset = {
        'records': {},
        'metadata': {
            'total_records': 0,
            'failed_records': [],
            'verification_enabled': verify
        }
    }
    
    records_loaded = 0
    
    # Process each folder with progress bar
    with tqdm(total=max_records if max_records else None) as pbar:
        for folder in folders:
            folder_path = data_dir / folder
            if not folder_path.exists():
                continue
                
            records = read_folder_records(folder_path)
            
            for record in records:
                if max_records and records_loaded >= max_records:
                    break
                    
                record_path = folder_path / record
                data = load_record(record_path, sha256sums, data_dir, verify)
                
                if data is not None:
                    dataset['records'][data['record_name']] = data
                    records_loaded += 1
                    pbar.update(1)
                    pbar.set_description(f"Loaded {records_loaded} records")
                else:
                    dataset['metadata']['failed_records'].append(str(record_path))
            
            if max_records and records_loaded >= max_records:
                break
    
    dataset['metadata']['total_records'] = records_loaded
    return dataset


In [10]:

data_dir = Path('/Users/teofil/Dev/GitHub/ekg-classification-pipeline/data/a-large-scale-12-lead-electrocardiogram-database-for-arrhythmia-study-1.0.0')

# Add estimate of total records for progress bar
total_records = 10000  # Estimat - se va ajusta automat

print("Starting data loading...")
print("This might take a while depending on your system...")

dataset = load_dataset(
    data_dir=data_dir,
    max_records=None,  
    verify=True  
)

print("\nDataset Summary:")
print("-" * 50)
print(f"Successfully loaded: {dataset['metadata']['total_records']} records")

if dataset['metadata']['failed_records']:
    print(f"Failed records: {len(dataset['metadata']['failed_records'])}")
    # Optional: save failed records list
    with open('failed_records.txt', 'w') as f:
        for record in dataset['metadata']['failed_records']:
            f.write(f"{record}\n")

print("\nData Characteristics:")
print("-" * 50)
if dataset['records']:
    first_record = next(iter(dataset['records'].values()))
    print(f"Signal shape per record: {first_record['data'].shape}")
    print(f"Sampling rate: {first_record['fs']} Hz")
    print(f"Channels: {', '.join(first_record['sig_name'])}")
    
    # Calculate overall statistics
    all_data = np.vstack([record['data'] for record in dataset['records'].values()])
    print("\nOverall Statistics:")
    print("-" * 50)
    print(f"Total samples: {all_data.shape}")
    print(f"Min value: {np.min(all_data):.2f}")
    print(f"Max value: {np.max(all_data):.2f}")
    print(f"Mean value: {np.mean(all_data):.2f}")
    print(f"Std deviation: {np.std(all_data):.2f}")
    
    print("\nSaving dataset to numpy file...")
    np.save('full_dataset.npy', all_data)


    

Starting data loading...
This might take a while depending on your system...


Loaded 1111 records: : 1111it [00:01, 574.53it/s]

Error reading /Users/teofil/Dev/GitHub/ekg-classification-pipeline/data/a-large-scale-12-lead-electrocardiogram-database-for-arrhythmia-study-1.0.0/WFDBRecords/01/019/JS01052: time data '/' does not match format '%d/%m/%Y'


Loaded 22787 records: : 22787it [00:41, 550.20it/s]

Error reading /Users/teofil/Dev/GitHub/ekg-classification-pipeline/data/a-large-scale-12-lead-electrocardiogram-database-for-arrhythmia-study-1.0.0/WFDBRecords/23/236/JS23074: list index out of range


Loaded 45150 records: : 45150it [01:24, 532.61it/s]



Dataset Summary:
--------------------------------------------------
Successfully loaded: 45150 records
Failed records: 2

Data Characteristics:
--------------------------------------------------
Signal shape per record: (5000, 12)
Sampling rate: 500 Hz
Channels: I, II, III, aVR, aVL, aVF, V1, V2, V3, V4, V5, V6

Overall Statistics:
--------------------------------------------------
Total samples: (225750000, 12)
Min value: nan
Max value: nan
Mean value: nan
Std deviation: nan

Saving dataset to numpy file...


OSError: 2709000000 requested and 1879047680 written