In [None]:
# Flag, per record, whether the path stored in ecg_path exists on disk
existing_files = cleaned_mapping['ecg_path'].map(lambda p: Path(p).exists())
n_records = len(cleaned_mapping)
n_present = existing_files.sum()
n_missing = (~existing_files).sum()
existence_rate = n_present / n_records

print("\n".join([
    "File Existence Check:",
    f"  Total records: {n_records:,}",
    f"  Files exist: {n_present:,}",
    f"  Files missing: {n_missing:,}",
    f"  Existence rate: {existence_rate:.2%}",
]))

# Stop the notebook here if fewer than 95% of the mapped files are present
assert existence_rate >= 0.95, f"Only {existence_rate:.2%} of files exist (expected ≥95%)"
print("  ✓ Assertion passed: ≥95% of files exist")

# Per-class totals, taken as the column sums of the two label columns
print("\nLabel Distribution:")
print(f"  AFIB records: {cleaned_mapping['_AFIB'].sum():,}")
print(f"  SR records:   {cleaned_mapping['_SR'].sum():,}")

# Reproducible random preview (at most 10 rows, fixed seed)
sample_size = min(10, n_records)
print("\nSample rows from cleaned_mapping:")
print(cleaned_mapping.sample(n=sample_size, random_state=42))

banner = '=' * 60
print(f"\n{banner}")
print("✓ Data handling complete!")
print(banner)

## 7. Sanity Checks

Verify that at least 95% of the files referenced in `ecg_path` exist, then print the label distribution and a random sample of rows.

In [None]:
# Keep only the record identifier, the file path and the two label columns
cleaned_mapping = balanced_df.loc[:, ['record_id', 'ecg_path', '_AFIB', '_SR']].copy()

# Store paths as plain strings rather than Path objects before writing the CSV
cleaned_mapping['ecg_path'] = cleaned_mapping['ecg_path'].map(str)

# Write the mapping without the DataFrame index
cleaned_mapping.to_csv(OUT_MAPPING_CSV, index=False)

print(
    f"✓ Saved cleaned mapping to: {OUT_MAPPING_CSV}\n"
    f"  Total records: {len(cleaned_mapping):,}\n"
    "\nFirst 5 rows of cleaned_mapping:"
)
cleaned_mapping.head()

## 6. Create Final Mapping & Save

Select only the required columns and save to CSV.

In [None]:
# Process all records: construct paths and copy files
ecg_paths = []
copied_count = 0
failed_count = 0
total_records = len(balanced_df)

print("Processing records and copying files...\n")

# Count processed records with enumerate instead of reading the DataFrame
# index: the progress message then stays correct even if the index is not
# 0..n-1. Iterating the single column also avoids building a row Series
# per record, which iterrows() does.
for processed, record_id in enumerate(balanced_df['record_id'], start=1):
    # Construct output path
    output_path = construct_output_path(record_id, CLEANED_WFDB_DIR)
    # The path is recorded even when the copy fails, so ecg_paths stays
    # aligned row-for-row with balanced_df.
    ecg_paths.append(output_path)

    # Copy or generate the .npy file
    if copy_or_generate_npy(record_id, RAW_WFDB_DIR, output_path):
        copied_count += 1
    else:
        failed_count += 1

    # Progress update every 500 records
    if processed % 500 == 0:
        print(f"Processed {processed:,}/{total_records:,} records (copied: {copied_count}, failed: {failed_count})")

print(f"\n✓ Processing complete")
print(f"  Successfully copied/generated: {copied_count:,} files")
print(f"  Failed: {failed_count:,} files")

# Add ecg_path column to the DataFrame
balanced_df['ecg_path'] = ecg_paths

In [None]:
def construct_output_path(record_id: str, cleaned_wfdb_dir: Path) -> Path:
    """
    Return the bucketed .npy destination for a record.

    Layout: cleaned_wfdb_dir / <first two characters of record_id> / record_id / <record_id>.npy
    Example: record_id='120' -> cleaned_wfdb_dir/12/120/120.npy
    """
    return cleaned_wfdb_dir.joinpath(record_id[:2], record_id, f"{record_id}.npy")


def _write_atomically(output_path: Path, write_to) -> None:
    """
    Produce output_path via a sibling temporary file.

    write_to(tmp_path) must create tmp_path, which is then renamed onto
    output_path. If anything fails (including an interrupt), the temporary
    file is removed and output_path is left untouched.
    """
    tmp_path = output_path.with_name(output_path.name + ".tmp")
    try:
        write_to(tmp_path)
        tmp_path.replace(output_path)
    finally:
        if tmp_path.exists():
            tmp_path.unlink()


def _save_array(array, tmp_path: Path) -> None:
    """Serialize array in .npy format to exactly tmp_path."""
    # A file handle is used because np.save appends ".npy" to path
    # arguments that do not already end with that extension.
    with open(tmp_path, "wb") as handle:
        np.save(handle, array)


def copy_or_generate_npy(
    record_id: str,
    raw_wfdb_dir: Path,
    output_path: Path
) -> bool:
    """
    Copy or generate the .npy file for one record from the raw data tree.

    Source types are tried in order: an existing .npy (copied), a WFDB
    header .hea (read with wfdb and saved as an array), then a .csv (read
    with pandas and saved as an array). If one source type fails, a warning
    is printed and the next type is tried.

    Args:
        record_id: Record identifier searched for under raw_wfdb_dir.
        raw_wfdb_dir: Root directory that is searched recursively.
        output_path: Destination .npy file; its parent directory is created.

    Returns:
        True if output_path already existed or was written, False otherwise.

    Note:
        Every call walks the whole raw_wfdb_dir tree (rglob), so the cost of
        one call grows with the size of that tree.
    """
    output_path.parent.mkdir(parents=True, exist_ok=True)

    # An existing output is trusted as-is. Outputs are written through a
    # temporary file, so an interrupted run cannot leave a truncated file
    # here that would later be mistaken for a finished one.
    if output_path.exists():
        return True

    # The pattern also matches files of OTHER records whose name merely ends
    # with record_id (e.g. '120' matches 'JS00120.hea'), and rglob yields
    # entries in arbitrary order. Sort for determinism and prefer files whose
    # stem is exactly record_id; fall back to the looser matches only when no
    # exact match exists.
    candidates = sorted(
        p for p in raw_wfdb_dir.rglob(f"*{record_id}.*") if p.is_file()
    )
    exact_matches = [p for p in candidates if p.stem == record_id]
    possible_sources = exact_matches or candidates

    # First candidate per file extension
    by_suffix = {}
    for source in possible_sources:
        by_suffix.setdefault(source.suffix, source)

    # 1) Already-converted .npy: copy it
    npy_source = by_suffix.get('.npy')
    if npy_source is not None:
        try:
            _write_atomically(output_path, lambda tmp: shutil.copy2(npy_source, tmp))
            return True
        except OSError as e:
            print(f"  Warning: Failed to copy .npy for {record_id}: {e}")

    # 2) WFDB record (.hea plus its signal files)
    hea_source = by_suffix.get('.hea')
    if hea_source is not None:
        try:
            # wfdb expects the record path without extension
            record_path = str(hea_source.parent / hea_source.stem)
            record = wfdb.rdrecord(record_path)

            ecg_data = record.p_signal  # Shape: (n_samples, n_leads)
            if ecg_data is None:
                # Saving None would write a useless placeholder file
                raise ValueError("record contains no physical signal")
            _write_atomically(output_path, lambda tmp: _save_array(ecg_data, tmp))
            return True
        except Exception as e:
            print(f"  Warning: Failed to read WFDB for {record_id}: {e}")

    # 3) CSV export of the signal
    csv_source = by_suffix.get('.csv')
    if csv_source is not None:
        try:
            ecg_array = pd.read_csv(csv_source).values
            _write_atomically(output_path, lambda tmp: _save_array(ecg_array, tmp))
            return True
        except Exception as e:
            print(f"  Warning: Failed to read CSV for {record_id}: {e}")

    # Reached only when nothing could be written. Report "not found" only if
    # no usable source type existed; failures were already reported above.
    if npy_source is None and hea_source is None and csv_source is None:
        print(f"  Warning: No source file found for {record_id}")
    return False


# Visible confirmation that this cell ran to the end
print('Functions defined successfully')

## 5. Path Construction & File Copying

For each record, construct the output path using the bucket structure:
- Bucket = first 2 characters of `record_id` (e.g. `JS` for `JS00001`)
- Path = `CLEANED_WFDB_DIR / bucket / record_id / <record_id>.npy`

We'll copy/generate .npy files from the raw WFDB data.

In [None]:
# Integer copies of the two rhythm labels, stored under underscore-prefixed names
for _label in ('AFIB', 'SR'):
    mapping_df[f'_{_label}'] = mapping_df[_label].astype(int)

# Keep rows carrying exactly one of the two labels (drops multi-label and unlabeled rows)
is_afib_only = mapping_df['_AFIB'].eq(1) & mapping_df['_SR'].eq(0)
is_sr_only = mapping_df['_AFIB'].eq(0) & mapping_df['_SR'].eq(1)
valid_records = mapping_df.loc[is_afib_only | is_sr_only].copy()

print(f"Original dataset: {len(mapping_df):,} records")
print(f"Valid AFIB-only or SR-only records: {len(valid_records):,} records")

# Split by class to see the imbalance before downsampling
afib_records = valid_records.loc[valid_records['_AFIB'] == 1]
sr_records = valid_records.loc[valid_records['_SR'] == 1]

print("\n--- Before Balancing ---")
print(f"AFIB records: {len(afib_records):,}")
print(f"SR records:   {len(sr_records):,}")

# Draw the same number of rows from each class: the size of the smaller one
min_count = min(len(afib_records), len(sr_records))
afib_balanced, sr_balanced = (
    group.sample(n=min_count, random_state=42)
    for group in (afib_records, sr_records)
)

# Stack both classes, shuffle with a fixed seed, and renumber the rows
balanced_df = (
    pd.concat([afib_balanced, sr_balanced], ignore_index=True)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

print("\n--- After Balancing ---")
print(f"AFIB records: {len(afib_balanced):,}")
print(f"SR records:   {len(sr_balanced):,}")
print(f"Total balanced records: {len(balanced_df):,}")

print("\n✓ Dataset balanced successfully")

## 4. Label Engineering & Balancing

Create binary columns `_AFIB` and `_SR`, keep only records labelled with exactly one of the two, then balance the dataset by downsampling the majority class.

In [None]:
# Read the raw mapping table into a DataFrame
mapping_df = pd.read_csv(RAW_MAPPING_CSV)

n_loaded = len(mapping_df)
first_columns = list(mapping_df.columns)[:10]

print(f"Loaded {n_loaded:,} records from {RAW_MAPPING_CSV.name}")
print(f"\nColumns available: {first_columns}... (showing first 10)")
print(f"\nDataFrame shape: {mapping_df.shape}")
print("\nFirst few rows:")
mapping_df.head()

## 3. Load Original File Mapping

We expect the raw mapping CSV at: `data/ecg_arrhythmia_dataset_CSV/file_mapping.csv`

This CSV contains:
- `record_id`: Unique identifier (e.g., JS00001)
- `AFIB`: Binary column (1 = atrial fibrillation present)
- `SR`: Binary column (1 = sinus rhythm present)
- Other diagnostic and demographic columns

In [None]:
# Every path hangs off the directory the notebook is launched from
PROJECT_ROOT = Path.cwd()
DATA_DIR = PROJECT_ROOT.joinpath("data")

# Inputs: the raw dataset
RAW_WFDB_DIR = DATA_DIR.joinpath("ecg_arrhythmia_dataset_CSV", "WFDBRecords")
RAW_MAPPING_CSV = DATA_DIR.joinpath("ecg_arrhythmia_dataset_CSV", "file_mapping.csv")

# Outputs: the cleaned, balanced dataset
CLEANED_ROOT = DATA_DIR.joinpath("cleaned_balanced_AFIB_SR")
CLEANED_WFDB_DIR = CLEANED_ROOT.joinpath("WFDBRecords")
OUT_MAPPING_CSV = CLEANED_ROOT.joinpath("file_mapping_cleaned.csv")

# Make sure both output directories exist before anything is written
for _out_dir in (CLEANED_ROOT, CLEANED_WFDB_DIR):
    _out_dir.mkdir(parents=True, exist_ok=True)

print(
    f"PROJECT_ROOT: {PROJECT_ROOT}",
    f"RAW_MAPPING_CSV: {RAW_MAPPING_CSV}",
    f"RAW_WFDB_DIR: {RAW_WFDB_DIR}",
    f"OUT_MAPPING_CSV: {OUT_MAPPING_CSV}",
    f"CLEANED_WFDB_DIR: {CLEANED_WFDB_DIR}",
    sep="\n",
)
print("\n✓ Paths configured (all relative to project root)")

## 2. Path Setup

Define all paths relative to the project root (taken to be the current working directory, so launch the notebook from the project root)—no hardcoded C:\ paths.

In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
import shutil
from typing import Dict, List, Tuple
import wfdb

print("✓ All imports successful")

## 1. Imports

# ECG Data Handling: Cleaned & Balanced AFIB/SR Dataset

This notebook creates a cleaned and balanced subset of AFIB vs SR records from the original ECG arrhythmia database.

**Steps:**
1. Define project-relative paths
2. Load original file mapping
3. Engineer binary labels (_AFIB, _SR) and balance the dataset
4. Construct output paths and copy/generate .npy files
5. Save cleaned mapping CSV with sanity checks