# Master ECG Pipeline

This notebook combines all project scripts and modules into one single runnable file.

**Usage:** run cells top-to-bottom. For headless execution on Windows use:

```python
import asyncio, sys
if sys.platform == 'win32':
    asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
```



In [1]:

# Environment & imports - idempotent
import os, sys, json, time, math, asyncio
from pathlib import Path
import numpy as np
import random
import torch
print('Python:', sys.executable)
print('Torch:', getattr(torch, '__version__', 'n/a'))
# Windows asyncio fix for nbconvert headless runs
import platform
if platform.system() == 'Windows':
    try:
        import asyncio, sys
        asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
    except Exception:
        pass

# Project root detection
ROOT = Path(os.environ.get('ECG_ROOT', Path.cwd().resolve()))
DATASET_DIR = ROOT / 'Dataset'
ARTIFACTS_DIR = ROOT / 'artifacts'
PROCESSED_DIR = ARTIFACTS_DIR / 'processed'
LOGS_DIR = ROOT / 'logs'
NOTEBOOKS_DIR = ROOT / 'notebooks'
for p in (ARTIFACTS_DIR, PROCESSED_DIR, PROCESSED_DIR/'records', LOGS_DIR):
    p.mkdir(parents=True, exist_ok=True)

# seeds for reproducibility
SEED = int(os.environ.get('ECG_SEED', '42'))
random.seed(SEED); np.random.seed(SEED); torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('DEVICE', DEVICE)


Python: D:\ecg-research\.venv1\Scripts\python.exe
Torch: 2.9.1+cpu
DEVICE cpu


## Inlined: Source modules (src/)

Files: __init__.py, dataloaders.py, eval.py, generate_candidate_unified_mapping.py, model.py, preprocessing.py, saver.py, serving.py, training.py, utils.py


In [2]:
# --- __init__.py (inlined) ---
try:
    """
    ECG Research Pipeline
    A unified multi-dataset ECG preprocessing, training, and serving pipeline.
    """
    
    __version__ = "0.1.0"
    __author__ = "ECG Research Team"
    
    from . import dataloaders, eval, model, preprocessing, saver, training, utils
    
    __all__ = ['utils', 'preprocessing', 'dataloaders', 'model', 'training', 'eval', 'saver']
    
except Exception as _e:
    print('Warning: inlined module', '__init__.py', 'raised', _e)




In [3]:
# --- dataloaders.py (inlined) ---
try:
    """
    ECG Dataloaders Module
    Implements PyTorch Dataset and DataLoader creation with lazy loading.
    """
    
    import logging
    from pathlib import Path
    from typing import Dict, Optional, Tuple
    
    import numpy as np
    import torch
    from torch.utils.data import DataLoader, Dataset, WeightedRandomSampler
    
    from .utils import read_json
    
    logger = logging.getLogger(__name__)
    
    
    class ECGDataset(Dataset):
        """
        PyTorch Dataset for lazy loading of preprocessed ECG records.
        Reads individual compressed .npz files on demand.
        """
    
        def __init__(
            self,
            manifest: list,
            processed_dir: Path,
            augment: bool = False,
            noise_std: float = 0.01
        ):
            """
            Initialize ECG Dataset.
    
            Args:
                manifest: List of dicts with 'path' and 'label' keys
                processed_dir: Root directory containing processed records
                augment: Whether to apply augmentation
                noise_std: Standard deviation for gaussian noise augmentation
            """
            self.manifest = manifest
            self.processed_dir = Path(processed_dir)
            self.augment = augment
            self.noise_std = noise_std
    
        def __len__(self) -> int:
            return len(self.manifest)
    
        def __getitem__(self, idx: int) -> Tuple[torch.Tensor, torch.Tensor]:
            """
            Load and return a single record.
    
            Args:
                idx: Index of record
    
            Returns:
                Tuple of (signal_tensor, label_tensor)
            """
            item = self.manifest[idx]
            record_path = self.processed_dir / item['path']
    
            try:
                data = np.load(record_path)
                signal = data['signal'].astype(np.float32)
                label = int(data['label'])
            except Exception as e:
                logger.error(f"Failed to load {record_path}: {e}")
                # Return zero signal and label
                signal = np.zeros(5000, dtype=np.float32)
                label = 0
    
            # Optional augmentation (on CPU for reproducibility)
            if self.augment:
                # Add small gaussian noise
                if self.noise_std > 0:
                    noise = np.random.normal(0, self.noise_std, signal.shape).astype(np.float32)
                    signal = signal + noise
    
                # Random crop (if longer than needed, take random segment)
                # Here we assume signal is already fixed length, so this is optional placeholder
    
            # Convert to tensor
            signal_tensor = torch.from_numpy(signal).float()
            label_tensor = torch.tensor(label, dtype=torch.long)
    
            return signal_tensor, label_tensor
    
    
    def collate_fn(batch: list) -> Tuple[torch.Tensor, torch.Tensor]:
        """
        Collate function to reshape signals to (batch, 1, samples).
    
        Args:
            batch: List of (signal, label) tuples
    
        Returns:
            Tuple of (signals_batch, labels_batch)
        """
        signals = []
        labels = []
    
        for signal, label in batch:
            signals.append(signal.unsqueeze(0))  # Add channel dimension
            labels.append(label)
    
        signals_batch = torch.stack(signals, dim=0)  # Shape: (batch, 1, samples)
        labels_batch = torch.stack(labels, dim=0)
    
        return signals_batch, labels_batch
    
    
    def create_dataloaders(
        splits_json: Path,
        processed_dir: Path,
        batch_size: int = 32,
        num_workers: int = 4,
        pin_memory: bool = True,
        augment_train: bool = False
    ) -> Dict[str, DataLoader]:
        """
        Create train/val/test DataLoaders from splits.json.
    
        Args:
            splits_json: Path to splits.json file
            processed_dir: Root directory containing processed records
            batch_size: Batch size for DataLoaders
            num_workers: Number of worker processes
            pin_memory: Whether to pin memory for faster GPU transfer
            augment_train: Whether to augment training data
    
        Returns:
            Dictionary with 'train', 'val', 'test' DataLoaders
        """
        splits = read_json(splits_json)
    
        train_manifest = splits['train']
        val_manifest = splits.get('val', [])
        test_manifest = splits.get('test', [])
    
        logger.info(f"Creating dataloaders: train={len(train_manifest)}, "
                    f"val={len(val_manifest)}, test={len(test_manifest)}")
    
        # Create datasets
        train_dataset = ECGDataset(train_manifest, processed_dir, augment=augment_train)
        val_dataset = ECGDataset(val_manifest, processed_dir, augment=False)
        test_dataset = ECGDataset(test_manifest, processed_dir, augment=False)
    
        # Optional: Create weighted sampler for imbalanced classes
        train_labels = [item['label'] for item in train_manifest]
        label_counts = np.bincount(train_labels)
        weights = 1.0 / (label_counts + 1e-6)
        sample_weights = weights[train_labels]
        sampler = WeightedRandomSampler(
            weights=sample_weights,
            num_samples=len(sample_weights),
            replacement=True
        )
    
        # Create dataloaders
        train_loader = DataLoader(
            train_dataset,
            batch_size=batch_size,
            sampler=sampler,
            num_workers=num_workers,
            pin_memory=pin_memory,
            collate_fn=collate_fn,
            drop_last=True
        )
    
        val_loader = DataLoader(
            val_dataset,
            batch_size=batch_size,
            shuffle=False,
            num_workers=num_workers,
            pin_memory=pin_memory,
            collate_fn=collate_fn
        ) if len(val_manifest) > 0 else None
    
        test_loader = DataLoader(
            test_dataset,
            batch_size=batch_size,
            shuffle=False,
            num_workers=num_workers,
            pin_memory=pin_memory,
            collate_fn=collate_fn
        ) if len(test_manifest) > 0 else None
    
        return {
            'train': train_loader,
            'val': val_loader,
            'test': test_loader
        }
    
except Exception as _e:
    print('Warning: inlined module', 'dataloaders.py', 'raised', _e)




In [4]:
# --- eval.py (inlined) ---
try:
    """
    ECG Evaluation Module
    Implements evaluation metrics and visualization for trained models.
    """
    
    import logging
    from pathlib import Path
    from typing import Dict, List, Optional, Tuple
    
    import matplotlib
    matplotlib.use('Agg')  # Non-interactive backend
    import matplotlib.pyplot as plt
    import numpy as np
    import torch
    import torch.nn as nn
    from sklearn.metrics import (
        classification_report,
        confusion_matrix,
        f1_score,
        precision_recall_fscore_support,
    )
    from tqdm import tqdm
    
    from .utils import safe_makedirs, write_json
    
    logger = logging.getLogger(__name__)
    
    
    def predict_on_split(
        model: nn.Module,
        dataloader: torch.utils.data.DataLoader,
        device: str,
        use_amp: bool = True
    ) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
        """
        Generate predictions and probabilities for a data split.
    
        Args:
            model: Trained model
            dataloader: DataLoader for the split
            device: Device string
            use_amp: Whether to use automatic mixed precision
    
        Returns:
            Tuple of (predictions, probabilities, true_labels)
        """
        model.eval()
        all_preds = []
        all_probs = []
        all_labels = []
    
        with torch.no_grad():
            for signals, labels in tqdm(dataloader, desc="Predicting"):
                signals = signals.to(device, non_blocking=True)
    
                if use_amp and device == 'cuda':
                    with torch.cuda.amp.autocast():
                        outputs = model(signals)
                else:
                    outputs = model(signals)
    
                probs = torch.softmax(outputs, dim=1)
                _, preds = outputs.max(1)
    
                all_preds.extend(preds.cpu().numpy())
                all_probs.extend(probs.cpu().numpy())
                all_labels.extend(labels.numpy())
    
        return (
            np.array(all_preds),
            np.array(all_probs),
            np.array(all_labels)
        )
    
    
    def compute_metrics(
        y_true: np.ndarray,
        y_pred: np.ndarray,
        label_names: Optional[List[str]] = None
    ) -> Dict[str, any]:
        """
        Compute classification metrics.
    
        Args:
            y_true: True labels
            y_pred: Predicted labels
            label_names: List of label names
    
        Returns:
            Dictionary of metrics
        """
        if label_names is None:
            label_names = [f"Class_{i}" for i in range(max(y_true.max(), y_pred.max()) + 1)]
    
        # Confusion matrix
        cm = confusion_matrix(y_true, y_pred)
    
        # Per-class metrics
        precision, recall, f1, support = precision_recall_fscore_support(
            y_true, y_pred, average=None, zero_division=0
        )
    
        # Macro and micro averages
        f1_macro = f1_score(y_true, y_pred, average='macro', zero_division=0)
        f1_micro = f1_score(y_true, y_pred, average='micro', zero_division=0)
        f1_weighted = f1_score(y_true, y_pred, average='weighted', zero_division=0)
    
        # Per-class results
        per_class = []
        for i, label in enumerate(label_names):
            per_class.append({
                'label': label,
                'precision': float(precision[i]) if i < len(precision) else 0.0,
                'recall': float(recall[i]) if i < len(recall) else 0.0,
                'f1': float(f1[i]) if i < len(f1) else 0.0,
                'support': int(support[i]) if i < len(support) else 0
            })
    
        metrics = {
            'confusion_matrix': cm.tolist(),
            'per_class': per_class,
            'f1_macro': float(f1_macro),
            'f1_micro': float(f1_micro),
            'f1_weighted': float(f1_weighted),
            'accuracy': float((y_true == y_pred).mean())
        }
    
        # Classification report
        report = classification_report(
            y_true, y_pred, target_names=label_names, zero_division=0, output_dict=True
        )
        metrics['classification_report'] = report
    
        return metrics
    
    
    def plot_class_f1(
        metrics: Dict[str, any],
        output_path: Path,
        figsize: Tuple[int, int] = (10, 6)
    ) -> None:
        """
        Plot per-class F1 scores as bar chart.
    
        Args:
            metrics: Metrics dictionary from compute_metrics
            output_path: Path to save figure
            figsize: Figure size tuple
        """
        per_class = metrics['per_class']
        labels = [item['label'] for item in per_class]
        f1_scores = [item['f1'] for item in per_class]
    
        fig, ax = plt.subplots(figsize=figsize)
        bars = ax.bar(range(len(labels)), f1_scores, color='steelblue', alpha=0.8)
    
        ax.set_xlabel('Class', fontsize=12)
        ax.set_ylabel('F1 Score', fontsize=12)
        ax.set_title('Per-Class F1 Scores', fontsize=14, fontweight='bold')
        ax.set_xticks(range(len(labels)))
        ax.set_xticklabels(labels, rotation=45, ha='right')
        ax.set_ylim(0, 1.0)
        ax.grid(axis='y', alpha=0.3, linestyle='--')
    
        # Add value labels on bars
        for bar, score in zip(bars, f1_scores):
            height = bar.get_height()
            ax.text(bar.get_x() + bar.get_width()/2., height,
                    f'{score:.3f}',
                    ha='center', va='bottom', fontsize=9)
    
        plt.tight_layout()
        safe_makedirs(output_path.parent)
        plt.savefig(output_path, dpi=150, bbox_inches='tight')
        plt.close()
        logger.info(f"Saved F1 bar plot to {output_path}")
    
    
    def evaluate_model(
        model: nn.Module,
        dataloader: torch.utils.data.DataLoader,
        device: str,
        label_names: List[str],
        output_dir: Path,
        split_name: str = 'test',
        use_amp: bool = True
    ) -> Dict[str, any]:
        """
        Complete evaluation pipeline: predict, compute metrics, save results and plots.
    
        Args:
            model: Trained model
            dataloader: DataLoader for evaluation
            device: Device string
            label_names: List of label names
            output_dir: Directory to save outputs
            split_name: Name of split being evaluated
            use_amp: Whether to use mixed precision
    
        Returns:
            Metrics dictionary
        """
        logger.info(f"Evaluating model on {split_name} split")
    
        # Get predictions
        preds, probs, labels = predict_on_split(model, dataloader, device, use_amp)
    
        # Compute metrics
        metrics = compute_metrics(labels, preds, label_names)
    
        # Add split name
        metrics['split'] = split_name
    
        # Save metrics
        metrics_path = output_dir / f'{split_name}_metrics.json'
        write_json(metrics_path, metrics)
        logger.info(f"Saved metrics to {metrics_path}")
    
        # Plot F1 scores
        figures_dir = output_dir.parent.parent / 'figures'
        plot_path = figures_dir / f'{split_name}_f1_scores.png'
        plot_class_f1(metrics, plot_path)
    
        # Log summary
        logger.info(f"{split_name.capitalize()} Results:")
        logger.info(f"  Accuracy: {metrics['accuracy']:.4f}")
        logger.info(f"  F1 Macro: {metrics['f1_macro']:.4f}")
        logger.info(f"  F1 Micro: {metrics['f1_micro']:.4f}")
        logger.info(f"  F1 Weighted: {metrics['f1_weighted']:.4f}")
    
        return metrics
    
except Exception as _e:
    print('Warning: inlined module', 'eval.py', 'raised', _e)




In [5]:
# --- generate_candidate_unified_mapping.py (inlined) ---
try:
    # generate_candidate_unified_mapping.py
    import csv
    import json
    from pathlib import Path
    import re
    import pandas as pd
    
    ROOT = Path.cwd().resolve()
    DATASET_DIR = ROOT / "dataset"
    LOGS_DIR = ROOT / "logs"
    LOGS_DIR.mkdir(parents=True, exist_ok=True)
    
    OUT = LOGS_DIR / "unified_label_mapping.candidate.csv"
    
    # label heuristics: map various textual signals to our 5 classes
    HEURISTICS = [
        (re.compile(r"\b(myocard|infarct|mi|ischemi|ischemia)\b", re.I), "MI"),
        (re.compile(r"\b(atrial fibrill|afib|a\.fibrill|fibrillation)\b", re.I), "AF"),
        (re.compile(r"\b(bundle branch block|bbb|left bundle|right bundle|bundle-branch)\b", re.I), "BBB"),
        (re.compile(r"\b(normal|norm|sinus rhythm|no abnormalit)\b", re.I), "NORM"),
    ]
    
    def apply_heuristics(text: str):
        if not text:
            return ""
        for pat, lab in HEURISTICS:
            if pat.search(text):
                return lab
        return ""
    
    def extract_ptbxl(ptbxl_dir: Path, rows):
        csv_path = ptbxl_dir / "ptbxl_database.csv"
        if csv_path.exists():
            df = pd.read_csv(csv_path, dtype=str).fillna("")
            for _, r in df.iterrows():
                # filename_lr often like "records100/00000/00001_lr"
                filename_lr = str(r.get("filename_lr", "")).strip()
                if filename_lr:
                    rid = Path(filename_lr).name  # e.g. "00001_lr" but we'll preserve folder-less id
                    rid = filename_lr.replace("\\", "/").strip("/")
                else:
                    rid = str(r.get("ecg_id", "")).strip()
                report = str(r.get("report", "")).strip()
                scp = str(r.get("scp_codes", "")).strip()
                combined = " ".join([report, scp])
                mapped = apply_heuristics(combined)
                rows.append(("PTBXL", rid, combined, mapped))
        else:
            print("PTBXL metadata not found at", csv_path)
    
    def extract_cinc(cinc_dir: Path, rows):
        # CinC 2017 uses REFERENCE files with labels like A,N,O
        # We'll collect training/validation/test reference files if present
        for sub in ["training", "validation", "test", ""]:
            base = cinc_dir if sub == "" else cinc_dir / sub
            if not base.exists():
                continue
            for ref in base.glob("REFERENCE*.csv"):
                try:
                    df = pd.read_csv(ref, header=None, names=["record", "label"], dtype=str)
                except Exception:
                    continue
                for _, r in df.iterrows():
                    record = str(r["record"]).strip()
                    label = str(r["label"]).strip().upper()
                    # map A->AF? O->OTHER? N->NORM? This is heuristic: you may need to adjust
                    mapped = ""
                    if label == "A": mapped = "AF"
                    elif label == "N": mapped = "NORM"
                    elif label == "O": mapped = "OTHER"
                    rows.append(("CinC_2017_AFDB", record, label, mapped))
        # If no REFERENCE files, also sweep .hea/.mat basenames
        for rec in cinc_dir.rglob("*"):
            if rec.suffix.lower() in (".hea", ".mat"):
                rel = rec.relative_to(DATASET_DIR).with_suffix("")
                rid = rel.as_posix()
                rows.append(("CinC_2017_AFDB", rid, "", ""))
    
    def extract_folder_generic(ds_name: str, ds_dir: Path, rows):
        # walk files and try to collect any textual hints from accompanying .hea or .txt files
        for p in ds_dir.rglob("*"):
            if p.suffix.lower() in (".hea", ".mat", ".txt", ".csv"):
                rel = p.relative_to(DATASET_DIR).with_suffix("")
                rid = rel.as_posix()
                hint = ""
                # attempt to read small header or first lines for clues
                if p.suffix.lower() == ".hea":
                    try:
                        text = p.read_text(errors="ignore")
                        hint = " ".join(text.splitlines()[:10])
                    except Exception:
                        hint = ""
                elif p.suffix.lower() == ".csv":
                    hint = ""
                rows.append((ds_name, rid, hint, ""))
    
    def main():
        rows = []
        # PTBXL
        ptbxl_dir = DATASET_DIR / "ptb-xl"
        if ptbxl_dir.exists():
            extract_ptbxl(ptbxl_dir, rows)
        # CinC
        cinc_dir = DATASET_DIR / "CinC2017"
        if cinc_dir.exists():
            extract_cinc(cinc_dir, rows)
        # PTB Diagnostic folder (if present, we'll flatten names)
        ptbdiag = DATASET_DIR / "PTB_Diagnostic"
        if ptbdiag.exists():
            extract_folder_generic("PTB_Diagnostic", ptbdiag, rows)
        # Chapman_Shaoxing
        chap = DATASET_DIR / "Chapman_Shaoxing"
        if chap.exists():
            extract_folder_generic("Chapman_Shaoxing", chap, rows)
    
        # dedupe by dataset+record_id
        seen = set()
        unique = []
        for ds, rid, orig, mapped in rows:
            key = (ds, rid)
            if key in seen:
                continue
            seen.add(key)
            unique.append((ds, rid, orig.replace("\n", " ")[:400], mapped))
    
        # write CSV
        with OUT.open("w", newline="", encoding="utf-8") as fh:
            w = csv.writer(fh)
            w.writerow(["dataset", "record_id", "original_label_text", "mapped_label"])
            for ds, rid, orig, mapped in unique:
                w.writerow([ds, rid, orig, mapped])
        print("Wrote candidate mapping to", OUT)
        print("Rows:", len(unique))
    
    if __name__ == "__main__":
        main()
except Exception as _e:
    print('Warning: inlined module', 'generate_candidate_unified_mapping.py', 'raised', _e)


Wrote candidate mapping to D:\ecg-research\notebooks\logs\unified_label_mapping.candidate.csv
Rows: 0


In [6]:
# --- model.py (inlined) ---
try:
    """
    ECG Model Module
    Defines 1D CNN architecture for ECG classification with residual blocks.
    """
    
    import logging
    from pathlib import Path
    from typing import Optional
    
    import torch
    import torch.nn as nn
    
    logger = logging.getLogger(__name__)
    
    
    class ResidualBlock1D(nn.Module):
        """
        1D Residual block with batch normalization and skip connection.
        """
    
        def __init__(
            self,
            in_channels: int,
            out_channels: int,
            kernel_size: int = 3,
            stride: int = 1,
            dropout: float = 0.1
        ):
            """
            Initialize residual block.
    
            Args:
                in_channels: Number of input channels
                out_channels: Number of output channels
                kernel_size: Convolution kernel size
                stride: Convolution stride
                dropout: Dropout probability
            """
            super().__init__()
    
            self.conv1 = nn.Conv1d(in_channels, out_channels, kernel_size,
                                   stride=stride, padding=kernel_size//2, bias=False)
            self.bn1 = nn.BatchNorm1d(out_channels)
            self.activation = nn.GELU()
    
            self.conv2 = nn.Conv1d(out_channels, out_channels, kernel_size,
                                   stride=1, padding=kernel_size//2, bias=False)
            self.bn2 = nn.BatchNorm1d(out_channels)
    
            self.dropout = nn.Dropout(dropout)
    
            # Skip connection with projection if dimensions change
            self.skip = nn.Sequential()
            if stride != 1 or in_channels != out_channels:
                self.skip = nn.Sequential(
                    nn.Conv1d(in_channels, out_channels, kernel_size=1,
                             stride=stride, bias=False),
                    nn.BatchNorm1d(out_channels)
                )
    
        def forward(self, x: torch.Tensor) -> torch.Tensor:
            """Forward pass through residual block."""
            identity = self.skip(x)
    
            out = self.conv1(x)
            out = self.bn1(out)
            out = self.activation(out)
            out = self.dropout(out)
    
            out = self.conv2(out)
            out = self.bn2(out)
    
            out = out + identity
            out = self.activation(out)
    
            return out
    
    
    class ECGResNet1D(nn.Module):
        """
        1D ResNet for ECG classification.
        Suitable for single-lead or multi-lead flattened ECG signals.
        """
    
        def __init__(
            self,
            input_channels: int = 1,
            n_classes: int = 5,
            base_channels: int = 64,
            dropout: float = 0.2
        ):
            """
            Initialize ECG ResNet model.
    
            Args:
                input_channels: Number of input channels (1 for single-lead)
                n_classes: Number of output classes
                base_channels: Base number of channels (will be scaled in deeper layers)
                dropout: Dropout probability
            """
            super().__init__()
    
            self.input_channels = input_channels
            self.n_classes = n_classes
    
            # Initial convolution
            self.conv1 = nn.Conv1d(input_channels, base_channels, kernel_size=15,
                                   stride=2, padding=7, bias=False)
            self.bn1 = nn.BatchNorm1d(base_channels)
            self.activation = nn.GELU()
            self.maxpool = nn.MaxPool1d(kernel_size=3, stride=2, padding=1)
    
            # Residual blocks
            self.layer1 = self._make_layer(base_channels, base_channels, 2, stride=1, dropout=dropout)
            self.layer2 = self._make_layer(base_channels, base_channels*2, 2, stride=2, dropout=dropout)
            self.layer3 = self._make_layer(base_channels*2, base_channels*4, 2, stride=2, dropout=dropout)
            self.layer4 = self._make_layer(base_channels*4, base_channels*8, 2, stride=2, dropout=dropout)
    
            # Global pooling and classifier
            self.global_pool = nn.AdaptiveAvgPool1d(1)
            self.dropout = nn.Dropout(dropout)
            self.fc = nn.Linear(base_channels*8, n_classes)
    
            self._initialize_weights()
    
        def _make_layer(
            self,
            in_channels: int,
            out_channels: int,
            num_blocks: int,
            stride: int,
            dropout: float
        ) -> nn.Sequential:
            """Create a layer with multiple residual blocks."""
            layers = []
            layers.append(ResidualBlock1D(in_channels, out_channels, stride=stride, dropout=dropout))
            for _ in range(1, num_blocks):
                layers.append(ResidualBlock1D(out_channels, out_channels, stride=1, dropout=dropout))
            return nn.Sequential(*layers)
    
        def _initialize_weights(self):
            """Initialize model weights."""
            for m in self.modules():
                if isinstance(m, nn.Conv1d):
                    nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
                elif isinstance(m, nn.BatchNorm1d):
                    nn.init.constant_(m.weight, 1)
                    nn.init.constant_(m.bias, 0)
                elif isinstance(m, nn.Linear):
                    nn.init.normal_(m.weight, 0, 0.01)
                    nn.init.constant_(m.bias, 0)
    
        def forward(self, x: torch.Tensor) -> torch.Tensor:
            """
            Forward pass.
    
            Args:
                x: Input tensor of shape (batch, channels, samples)
    
            Returns:
                Logits of shape (batch, n_classes)
            """
            x = self.conv1(x)
            x = self.bn1(x)
            x = self.activation(x)
            x = self.maxpool(x)
    
            x = self.layer1(x)
            x = self.layer2(x)
            x = self.layer3(x)
            x = self.layer4(x)
    
            x = self.global_pool(x)
            x = torch.flatten(x, 1)
            x = self.dropout(x)
            x = self.fc(x)
    
            return x
    
    
    def count_parameters(model: nn.Module) -> int:
        """
        Count trainable parameters in model.
    
        Args:
            model: PyTorch model
    
        Returns:
            Number of trainable parameters
        """
        return sum(p.numel() for p in model.parameters() if p.requires_grad)
    
    
    def load_checkpoint(
        model: nn.Module,
        checkpoint_path: Path,
        device: str = 'cpu'
    ) -> nn.Module:
        """
        Load model from checkpoint.
    
        Args:
            model: Model instance
            checkpoint_path: Path to checkpoint file
            device: Device to load model to
    
        Returns:
            Model with loaded weights
        """
        checkpoint = torch.load(checkpoint_path, map_location=device)
        model.load_state_dict(checkpoint['model_state_dict'])
        logger.info(f"Loaded checkpoint from {checkpoint_path}")
        return model
    
    
    def save_checkpoint(
        model: nn.Module,
        optimizer: torch.optim.Optimizer,
        epoch: int,
        metrics: dict,
        checkpoint_path: Path
    ) -> None:
        """
        Save model checkpoint.
    
        Args:
            model: Model instance
            optimizer: Optimizer instance
            epoch: Current epoch
            metrics: Dictionary of metrics
            checkpoint_path: Path to save checkpoint
        """
        checkpoint_path.parent.mkdir(parents=True, exist_ok=True)
        torch.save({
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'metrics': metrics
        }, checkpoint_path)
        logger.info(f"Saved checkpoint to {checkpoint_path}")
    
except Exception as _e:
    print('Warning: inlined module', 'model.py', 'raised', _e)


In [7]:
# --- preprocessing.py (inlined) ---
try:
    """
    ECG Preprocessing Module
    Handles loading, filtering, resampling, and normalization of ECG signals.
    Implements streaming preprocessing with unified label mapping.
    """
    
    import logging
    import re
    from pathlib import Path
    from typing import Dict, List, Optional, Tuple, Union
    
    import numpy as np
    import pandas as pd
    import wfdb
    from scipy import signal as scipy_signal
    from scipy.io import loadmat
    from sklearn.model_selection import train_test_split
    from tqdm import tqdm
    
    from .utils import read_json, robust_path_variants, safe_makedirs, safe_save_npz, write_json
    
    logger = logging.getLogger(__name__)
    
    
    def load_wfdb_record(record_path: Union[str, Path]) -> Tuple[np.ndarray, float]:
        """
        Load WFDB format ECG record (.hea/.dat files).
    
        Args:
            record_path: Path to record (without extension)
    
        Returns:
            Tuple of (signal array, sampling frequency)
    
        Raises:
            ValueError: If record cannot be read
        """
        try:
            record = wfdb.rdrecord(str(Path(record_path).with_suffix('')))
            if record.p_signal is None:
                raise ValueError(f"No signal data in record: {record_path}")
    
            # Flatten multi-lead to single channel (mean across leads)
            signal = record.p_signal.mean(axis=1) if record.p_signal.ndim > 1 else record.p_signal
            return signal.astype(np.float32), float(record.fs)
        except Exception as e:
            raise ValueError(f"Failed to read WFDB record {record_path}: {e}")
    
    
    def load_mat_record(mat_path: Union[str, Path]) -> Tuple[np.ndarray, float]:
        """
        Load MAT format ECG record.
    
        Args:
            mat_path: Path to .mat file
    
        Returns:
            Tuple of (signal array, sampling frequency)
    
        Raises:
            ValueError: If MAT file cannot be read
        """
        try:
            mat_data = loadmat(str(mat_path))
    
            # Common MAT file structures
            signal = None
            fs = 500  # Default fallback
    
            # Try common field names
            for key in ['val', 'data', 'ecg', 'signal']:
                if key in mat_data:
                    signal = mat_data[key]
                    break
    
            if signal is None:
                # Try first non-metadata field
                for key, value in mat_data.items():
                    if not key.startswith('__') and isinstance(value, np.ndarray):
                        signal = value
                        break
    
            if signal is None:
                raise ValueError(f"No signal data found in MAT file: {mat_path}")
    
            # Flatten and ensure 1D
            signal = signal.flatten().astype(np.float32)
    
            # Try to extract sampling frequency
            for key in ['fs', 'Fs', 'freq', 'frequency']:
                if key in mat_data:
                    fs = float(mat_data[key].item())
                    break
    
            return signal, fs
        except Exception as e:
            raise ValueError(f"Failed to read MAT file {mat_path}: {e}")
    
    
    def resample_signal(signal: np.ndarray, original_fs: float, target_fs: float = 500) -> np.ndarray:
        """
        Resample signal to target frequency using scipy.signal.resample.
    
        Args:
            signal: Input signal array
            original_fs: Original sampling frequency
            target_fs: Target sampling frequency
    
        Returns:
            Resampled signal
        """
        if abs(original_fs - target_fs) < 1e-3:
            return signal
    
        num_samples = int(len(signal) * target_fs / original_fs)
        return scipy_signal.resample(signal, num_samples).astype(np.float32)
    
    
    def normalize_signal(signal: np.ndarray) -> np.ndarray:
        """
        Z-score normalize signal.
    
        Args:
            signal: Input signal array
    
        Returns:
            Normalized signal
        """
        mean = np.mean(signal)
        std = np.std(signal)
        if std < 1e-8:
            logger.warning("Signal has near-zero std, skipping normalization")
            return signal
        return ((signal - mean) / std).astype(np.float32)
    
    
    def pad_or_truncate(signal: np.ndarray, target_samples: int) -> np.ndarray:
        """
        Pad with zeros or truncate signal to target length.
    
        Args:
            signal: Input signal array
            target_samples: Desired length
    
        Returns:
            Signal of length target_samples
        """
        current_length = len(signal)
        if current_length < target_samples:
            pad_width = target_samples - current_length
            return np.pad(signal, (0, pad_width), mode='constant').astype(np.float32)
        else:
            return signal[:target_samples].astype(np.float32)
    
    
    def build_mapping_index(unified_csv: Union[str, Path]) -> Dict[str, str]:
        """
        Build robust mapping index from unified label CSV.
    
        Args:
            unified_csv: Path to unified_label_mapping.csv
    
        Returns:
            Dictionary mapping path variants to mapped_label
        """
        try:
            df = pd.read_csv(unified_csv)
            required_cols = ['dataset', 'record_id', 'mapped_label']
            missing = set(required_cols) - set(df.columns)
            if missing:
                logger.warning(f"Missing columns in unified CSV: {missing}, using available columns")
    
            mapping = {}
            for _, row in df.iterrows():
                dataset = row.get('dataset', '')
                record_id = str(row.get('record_id', ''))
                mapped_label = row.get('mapped_label', 'OTHER')
    
                # Build full relative path
                full_path = f"{dataset}/{record_id}" if dataset else record_id
    
                # Generate variants
                variants = robust_path_variants(full_path)
                for variant in variants:
                    mapping[variant] = mapped_label
    
            logger.info(f"Built mapping index with {len(mapping)} path variants from {len(df)} records")
            return mapping
        except FileNotFoundError:
            logger.warning(f"Unified label mapping not found: {unified_csv}, all records will be marked OTHER")
            return {}
        except Exception as e:
            logger.error(f"Error reading unified CSV: {e}")
            return {}
    
    
    def lookup_label(file_path: Path, dataset_dir: Path, mapping_index: Dict[str, str]) -> str:
        """
        Lookup mapped label for a file using robust path matching.
    
        Args:
            file_path: Full path to file
            dataset_dir: Root dataset directory
            mapping_index: Mapping dictionary from build_mapping_index
    
        Returns:
            Mapped label string (default: 'OTHER')
        """
        try:
            rel_path = file_path.relative_to(dataset_dir)
        except ValueError:
            rel_path = file_path
    
        # Generate variants
        variants = robust_path_variants(rel_path)
    
        for variant in variants:
            if variant in mapping_index:
                return mapping_index[variant]
    
        return 'OTHER'
    
    
    def sanitize_filename(path: str) -> str:
        """
        Sanitize path for use as filename.
    
        Args:
            path: Input path string
    
        Returns:
            Sanitized filename safe for filesystem
        """
        # Replace separators and special chars with underscore
        sanitized = re.sub(r'[\\/:*?"<>|]', '_', path)
        # Remove multiple consecutive underscores
        sanitized = re.sub(r'_+', '_', sanitized)
        return sanitized.strip('_')
    
    
    def run_streaming_preprocess(
        dataset_dir: Union[str, Path],
        unified_csv: Union[str, Path],
        out_dir: Union[str, Path],
        target_fs: int = 500,
        target_samples: int = 5000,
        label_order: Optional[List[str]] = None,
        limit: Optional[int] = None
    ) -> Dict[str, any]:
        """
        Run streaming preprocessing on dataset with unified label mapping.
        Processes files one at a time, saves individually, and builds manifest.
    
        Args:
            dataset_dir: Root directory containing datasets
            unified_csv: Path to unified_label_mapping.csv
            out_dir: Output directory for processed records
            target_fs: Target sampling frequency (Hz)
            target_samples: Target number of samples
            label_order: List of label names in order (for encoding)
            limit: Optional limit on number of files to process (for testing)
    
        Returns:
            Dictionary with processing statistics and paths
        """
        if label_order is None:
            label_order = ['MI', 'AF', 'BBB', 'NORM', 'OTHER']
    
        dataset_dir = Path(dataset_dir)
        out_dir = Path(out_dir)
        records_dir = out_dir / 'records'
        safe_makedirs(records_dir)
    
        # Build mapping index
        mapping_index = build_mapping_index(unified_csv)
    
        # Label encoding
        label_to_int = {label: idx for idx, label in enumerate(label_order)}
    
        # Find all ECG files
        hea_files = list(dataset_dir.rglob('*.hea'))
        mat_files = list(dataset_dir.rglob('*.mat'))
    
        all_files = hea_files + mat_files
        if limit:
            all_files = all_files[:limit]
    
        logger.info(f"Found {len(hea_files)} .hea files and {len(mat_files)} .mat files")
        logger.info(f"Processing {len(all_files)} files total")
    
        manifest = []
        label_counts = {label: 0 for label in label_order}
        failed_count = 0
    
        for file_path in tqdm(all_files, desc="Processing records"):
            try:
                # Read signal
                if file_path.suffix == '.hea':
                    signal, fs = load_wfdb_record(file_path.with_suffix(''))
                elif file_path.suffix == '.mat':
                    signal, fs = load_mat_record(file_path)
                else:
                    continue
    
                # Resample if needed
                if abs(fs - target_fs) > 1e-3:
                    signal = resample_signal(signal, fs, target_fs)
    
                # Normalize
                signal = normalize_signal(signal)
    
                # Pad or truncate
                signal = pad_or_truncate(signal, target_samples)
    
                # Lookup label
                mapped_label = lookup_label(file_path, dataset_dir, mapping_index)
                label_int = label_to_int.get(mapped_label, label_to_int['OTHER'])
    
                # Generate output filename
                try:
                    rel_path = file_path.relative_to(dataset_dir)
                except ValueError:
                    rel_path = file_path
    
                dataset_name = rel_path.parts[0] if len(rel_path.parts) > 1 else 'unknown'
                sanitized = sanitize_filename(str(rel_path.with_suffix('')))
                out_filename = f"{dataset_name}__{sanitized}.npz"
                out_path = records_dir / out_filename
    
                # Save
                safe_save_npz(out_path, signal, label_int)
    
                # Add to manifest
                manifest.append({
                    'path': f"records/{out_filename}",
                    'label': int(label_int),
                    'mapped_label': mapped_label,
                    'dataset': dataset_name
                })
    
                label_counts[mapped_label] = label_counts.get(mapped_label, 0) + 1
    
            except Exception as e:
                logger.warning(f"Failed to process {file_path}: {e}")
                failed_count += 1
                continue
    
        logger.info(f"Successfully processed {len(manifest)} records, {failed_count} failures")
        logger.info(f"Label distribution: {label_counts}")
    
        # Create stratified splits
        labels_list = [item['label'] for item in manifest]
        indices = np.arange(len(manifest))
    
        if len(manifest) > 10:
            train_idx, temp_idx = train_test_split(
                indices, test_size=0.2, stratify=labels_list, random_state=42
            )
            val_idx, test_idx = train_test_split(
                temp_idx, test_size=0.5, stratify=[labels_list[i] for i in temp_idx], random_state=42
            )
        else:
            # Too few samples for stratification
            train_idx = indices
            val_idx = np.array([])
            test_idx = np.array([])
    
        splits = {
            'train': [manifest[i] for i in train_idx],
            'val': [manifest[i] for i in val_idx] if len(val_idx) > 0 else [],
            'test': [manifest[i] for i in test_idx] if len(test_idx) > 0 else [],
            'label_order': label_order,
            'label_to_int': label_to_int,
            'counts': label_counts
        }
    
        # Save outputs
        write_json(out_dir / 'splits.json', splits)
        write_json(out_dir / 'label_map.json', {'label_order': label_order, 'label_to_int': label_to_int})
        np.save(out_dir / 'labels.npy', np.array(labels_list))
    
        logger.info(f"Saved splits.json, label_map.json, and labels.npy to {out_dir}")
    
        return {
            'total_processed': len(manifest),
            'total_failed': failed_count,
            'splits': splits,
            'label_counts': label_counts,
            'output_dir': str(out_dir)
        }
    
except Exception as _e:
    print('Warning: inlined module', 'preprocessing.py', 'raised', _e)




In [8]:
# --- saver.py (inlined) ---
try:
    """
    ECG Saver Module
    Utilities for saving models, results, and artifacts.
    """
    
    import logging
    from pathlib import Path
    from typing import Dict, Optional
    
    import numpy as np
    import torch
    import torch.nn as nn
    
    from .utils import safe_makedirs, write_json
    
    logger = logging.getLogger(__name__)
    
    
    def save_model_checkpoint(
        model: nn.Module,
        optimizer: torch.optim.Optimizer,
        epoch: int,
        metrics: Dict,
        save_path: Path,
        scheduler: Optional[torch.optim.lr_scheduler._LRScheduler] = None
    ) -> None:
        """
        Save complete model checkpoint with optimizer and scheduler state.
    
        Args:
            model: Model instance
            optimizer: Optimizer instance
            epoch: Current epoch number
            metrics: Dictionary of metrics
            save_path: Path to save checkpoint
            scheduler: Optional learning rate scheduler
        """
        safe_makedirs(save_path.parent)
    
        checkpoint = {
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'metrics': metrics
        }
    
        if scheduler is not None:
            checkpoint['scheduler_state_dict'] = scheduler.state_dict()
    
        torch.save(checkpoint, save_path)
        logger.info(f"Saved checkpoint to {save_path}")
    
    
    def save_predictions(
        predictions: np.ndarray,
        probabilities: np.ndarray,
        labels: np.ndarray,
        save_path: Path,
        label_names: Optional[list] = None
    ) -> None:
        """
        Save predictions, probabilities, and labels to compressed file.
    
        Args:
            predictions: Predicted class indices
            probabilities: Class probabilities
            labels: True labels
            save_path: Path to save file
            label_names: Optional list of label names
        """
        safe_makedirs(save_path.parent)
    
        save_dict = {
            'predictions': predictions,
            'probabilities': probabilities,
            'labels': labels
        }
    
        if label_names is not None:
            save_dict['label_names'] = np.array(label_names, dtype=object)
    
        np.savez_compressed(save_path, **save_dict)
        logger.info(f"Saved predictions to {save_path}")
    
    
    def save_training_history(
        history: list,
        save_path: Path,
        additional_info: Optional[Dict] = None
    ) -> None:
        """
        Save training history to JSON file.
    
        Args:
            history: List of epoch dictionaries
            save_path: Path to save file
            additional_info: Optional additional metadata
        """
        output = {'history': history}
    
        if additional_info is not None:
            output.update(additional_info)
    
        write_json(save_path, output)
        logger.info(f"Saved training history to {save_path}")
    
    
    def export_model_onnx(
        model: nn.Module,
        save_path: Path,
        input_shape: tuple = (1, 1, 5000),
        device: str = 'cpu'
    ) -> None:
        """
        Export model to ONNX format for deployment.
    
        Args:
            model: Trained model
            save_path: Path to save ONNX file
            input_shape: Example input shape
            device: Device to use for export
        """
        safe_makedirs(save_path.parent)
    
        model.eval()
        model = model.to(device)
    
        dummy_input = torch.randn(*input_shape).to(device)
    
        torch.onnx.export(
            model,
            dummy_input,
            save_path,
            export_params=True,
            opset_version=11,
            do_constant_folding=True,
            input_names=['input'],
            output_names=['output'],
            dynamic_axes={
                'input': {0: 'batch_size'},
                'output': {0: 'batch_size'}
            }
        )
    
        logger.info(f"Exported model to ONNX: {save_path}")
    
    
    def save_evaluation_report(
        metrics: Dict,
        save_path: Path
    ) -> None:
        """
        Save evaluation metrics report to JSON.
    
        Args:
            metrics: Dictionary of evaluation metrics
            save_path: Path to save file
        """
        write_json(save_path, metrics)
        logger.info(f"Saved evaluation report to {save_path}")
    
except Exception as _e:
    print('Warning: inlined module', 'saver.py', 'raised', _e)




In [9]:
# --- serving.py (inlined) ---
try:
    """
    ECG Model Serving Module
    Handles model loading and inference for production use.
    """
    
    import torch
    import mlflow
    import numpy as np
    from pathlib import Path
    
    
    class ECGPredictor:
        """
        ECG inference class for loading trained models and making predictions.
        """
    
        def __init__(self, model_path=None, model_uri=None):
            """
            Initialize predictor with either a local model path or MLflow model URI.
    
            Args:
                model_path: Path to local .pth model file
                model_uri: MLflow model URI (e.g., 'models:/ECGClassifier/Production')
            """
            self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    
            if model_uri:
                self.model = mlflow.pytorch.load_model(model_uri)
            elif model_path:
                self.model = torch.load(model_path, map_location=self.device)
            else:
                raise ValueError("Must provide either model_path or model_uri")
    
            self.model.to(self.device)
            self.model.eval()
    
        def predict(self, ecg_signal):
            """
            Make prediction on a single ECG signal.
    
            Args:
                ecg_signal: numpy array of shape (length, channels)
    
            Returns:
                predictions: numpy array of class probabilities
            """
            # Prepare input
            if isinstance(ecg_signal, np.ndarray):
                ecg_tensor = torch.FloatTensor(ecg_signal).unsqueeze(0)
            else:
                ecg_tensor = ecg_signal.unsqueeze(0)
    
            ecg_tensor = ecg_tensor.to(self.device)
    
            # Run inference
            with torch.no_grad():
                outputs = self.model(ecg_tensor)
                probabilities = torch.softmax(outputs, dim=1)
    
            return probabilities.cpu().numpy()[0]
    
        def predict_batch(self, ecg_signals):
            """
            Make predictions on a batch of ECG signals.
    
            Args:
                ecg_signals: numpy array of shape (batch_size, length, channels)
    
            Returns:
                predictions: numpy array of shape (batch_size, num_classes)
            """
            ecg_tensor = torch.FloatTensor(ecg_signals).to(self.device)
    
            with torch.no_grad():
                outputs = self.model(ecg_tensor)
                probabilities = torch.softmax(outputs, dim=1)
    
            return probabilities.cpu().numpy()
    
    
    def load_mlflow_model(run_id=None, model_name=None, stage='Production'):
        """
        Load model from MLflow.
    
        Args:
            run_id: MLflow run ID
            model_name: Registered model name
            stage: Model stage (Production, Staging, etc.)
    
        Returns:
            predictor: ECGPredictor instance
        """
        if model_name:
            model_uri = f"models:/{model_name}/{stage}"
        elif run_id:
            model_uri = f"runs:/{run_id}/model"
        else:
            raise ValueError("Must provide either run_id or model_name")
    
        return ECGPredictor(model_uri=model_uri)
    
except Exception as _e:
    print('Warning: inlined module', 'serving.py', 'raised', _e)




In [10]:
# --- training.py (inlined) ---
try:
    """
    ECG Training Module
    Implements training loop with mixed precision, checkpointing, and metrics logging.
    """
    
    import logging
    import time
    from pathlib import Path
    from typing import Dict, Optional
    
    import numpy as np
    import torch
    import torch.nn as nn
    from torch.cuda.amp import GradScaler, autocast
    from torch.optim.lr_scheduler import CosineAnnealingLR
    from tqdm import tqdm
    
    from .utils import write_json
    
    logger = logging.getLogger(__name__)
    
    
    class Trainer:
        """
        Trainer class for ECG classification with mixed precision support.
        """
    
        def __init__(
            self,
            model: nn.Module,
            train_loader: torch.utils.data.DataLoader,
            val_loader: Optional[torch.utils.data.DataLoader],
            device: str,
            learning_rate: float = 1e-3,
            weight_decay: float = 1e-4,
            epochs: int = 50,
            checkpoint_dir: Path = Path('artifacts/models'),
            use_amp: bool = True,
            grad_accum_steps: int = 1
        ):
            """
            Initialize Trainer.
    
            Args:
                model: PyTorch model
                train_loader: Training DataLoader
                val_loader: Validation DataLoader
                device: Device string ('cuda' or 'cpu')
                learning_rate: Initial learning rate
                weight_decay: Weight decay for optimizer
                epochs: Number of training epochs
                checkpoint_dir: Directory to save checkpoints
                use_amp: Whether to use automatic mixed precision
                grad_accum_steps: Gradient accumulation steps
            """
            self.model = model.to(device)
            self.train_loader = train_loader
            self.val_loader = val_loader
            self.device = device
            self.epochs = epochs
            self.checkpoint_dir = Path(checkpoint_dir)
            self.checkpoint_dir.mkdir(parents=True, exist_ok=True)
            self.grad_accum_steps = grad_accum_steps
    
            # Loss and optimizer
            self.criterion = nn.CrossEntropyLoss()
            self.optimizer = torch.optim.AdamW(
                model.parameters(),
                lr=learning_rate,
                weight_decay=weight_decay
            )
    
            # Learning rate scheduler
            self.scheduler = CosineAnnealingLR(
                self.optimizer,
                T_max=epochs,
                eta_min=1e-6
            )
    
            # Mixed precision
            self.use_amp = use_amp and device == 'cuda'
            self.scaler = GradScaler() if self.use_amp else None
    
            # Tracking
            self.best_val_f1 = 0.0
            self.history = []
    
            logger.info(f"Initialized Trainer on {device}, AMP={self.use_amp}")
    
        def train_step(self) -> Dict[str, float]:
            """
            Execute one training epoch.
    
            Returns:
                Dictionary of training metrics
            """
            self.model.train()
            total_loss = 0.0
            correct = 0
            total = 0
    
            self.optimizer.zero_grad()
    
            pbar = tqdm(self.train_loader, desc="Training", leave=False)
            for batch_idx, (signals, labels) in enumerate(pbar):
                signals = signals.to(self.device, non_blocking=True)
                labels = labels.to(self.device, non_blocking=True)
    
                # Forward pass with optional mixed precision
                if self.use_amp:
                    with autocast():
                        outputs = self.model(signals)
                        loss = self.criterion(outputs, labels)
                        loss = loss / self.grad_accum_steps
    
                    self.scaler.scale(loss).backward()
    
                    if (batch_idx + 1) % self.grad_accum_steps == 0:
                        self.scaler.step(self.optimizer)
                        self.scaler.update()
                        self.optimizer.zero_grad()
                else:
                    outputs = self.model(signals)
                    loss = self.criterion(outputs, labels)
                    loss = loss / self.grad_accum_steps
                    loss.backward()
    
                    if (batch_idx + 1) % self.grad_accum_steps == 0:
                        self.optimizer.step()
                        self.optimizer.zero_grad()
    
                # Track metrics
                total_loss += loss.item() * self.grad_accum_steps
                _, predicted = outputs.max(1)
                total += labels.size(0)
                correct += predicted.eq(labels).sum().item()
    
                pbar.set_postfix({
                    'loss': total_loss / (batch_idx + 1),
                    'acc': 100. * correct / total
                })
    
            avg_loss = total_loss / len(self.train_loader)
            accuracy = 100. * correct / total
    
            return {
                'loss': avg_loss,
                'accuracy': accuracy
            }
    
        def val_step(self) -> Dict[str, float]:
            """
            Execute one validation epoch.
    
            Returns:
                Dictionary of validation metrics
            """
            if self.val_loader is None:
                return {}
    
            self.model.eval()
            total_loss = 0.0
            correct = 0
            total = 0
    
            all_preds = []
            all_labels = []
    
            with torch.no_grad():
                pbar = tqdm(self.val_loader, desc="Validation", leave=False)
                for signals, labels in pbar:
                    signals = signals.to(self.device, non_blocking=True)
                    labels = labels.to(self.device, non_blocking=True)
    
                    if self.use_amp:
                        with autocast():
                            outputs = self.model(signals)
                            loss = self.criterion(outputs, labels)
                    else:
                        outputs = self.model(signals)
                        loss = self.criterion(outputs, labels)
    
                    total_loss += loss.item()
                    _, predicted = outputs.max(1)
                    total += labels.size(0)
                    correct += predicted.eq(labels).sum().item()
    
                    all_preds.extend(predicted.cpu().numpy())
                    all_labels.extend(labels.cpu().numpy())
    
            avg_loss = total_loss / len(self.val_loader)
            accuracy = 100. * correct / total
    
            # Calculate F1 macro (simple approximation)
            # For full metrics, use eval.py functions
            from sklearn.metrics import f1_score
            f1_macro = f1_score(all_labels, all_preds, average='macro', zero_division=0)
    
            return {
                'loss': avg_loss,
                'accuracy': accuracy,
                'f1_macro': f1_macro
            }
    
        def fit(self) -> Dict[str, list]:
            """
            Train model for specified number of epochs.
    
            Returns:
                Training history dictionary
            """
            logger.info(f"Starting training for {self.epochs} epochs")
    
            for epoch in range(self.epochs):
                start_time = time.time()
    
                # Train
                train_metrics = self.train_step()
    
                # Validate
                val_metrics = self.val_step()
    
                # Step scheduler
                self.scheduler.step()
    
                # Log metrics
                epoch_time = time.time() - start_time
                lr = self.optimizer.param_groups[0]['lr']
    
                log_str = f"Epoch {epoch+1}/{self.epochs} ({epoch_time:.1f}s) - "
                log_str += f"LR: {lr:.6f} - "
                log_str += f"Train Loss: {train_metrics['loss']:.4f}, Acc: {train_metrics['accuracy']:.2f}%"
    
                if val_metrics:
                    log_str += f" - Val Loss: {val_metrics['loss']:.4f}, "
                    log_str += f"Acc: {val_metrics['accuracy']:.2f}%, "
                    log_str += f"F1: {val_metrics['f1_macro']:.4f}"
    
                logger.info(log_str)
    
                # Save history
                epoch_record = {
                    'epoch': epoch + 1,
                    'train': train_metrics,
                    'val': val_metrics,
                    'lr': lr,
                    'time': epoch_time
                }
                self.history.append(epoch_record)
    
                # Save checkpoints
                val_f1 = val_metrics.get('f1_macro', 0.0)
    
                # Save latest
                latest_path = self.checkpoint_dir / 'latest.pth'
                torch.save({
                    'epoch': epoch + 1,
                    'model_state_dict': self.model.state_dict(),
                    'optimizer_state_dict': self.optimizer.state_dict(),
                    'scheduler_state_dict': self.scheduler.state_dict(),
                    'metrics': val_metrics
                }, latest_path)
    
                # Save best
                if val_f1 > self.best_val_f1:
                    self.best_val_f1 = val_f1
                    best_path = self.checkpoint_dir / 'best.pth'
                    torch.save({
                        'epoch': epoch + 1,
                        'model_state_dict': self.model.state_dict(),
                        'optimizer_state_dict': self.optimizer.state_dict(),
                        'metrics': val_metrics
                    }, best_path)
                    logger.info(f"Saved new best model with F1={val_f1:.4f}")
    
            # Save training log
            log_path = self.checkpoint_dir.parent / 'training_log.json'
            write_json(log_path, {'history': self.history, 'best_val_f1': self.best_val_f1})
            logger.info(f"Saved training log to {log_path}")
    
            return self.history
    
except Exception as _e:
    print('Warning: inlined module', 'training.py', 'raised', _e)




In [11]:
# --- utils.py (inlined) ---
try:
    """
    Utility functions for ECG data I/O and common operations.
    """
    
    import os
    import json
    import logging
    import random
    import numpy as np
    from pathlib import Path
    from typing import Any, Dict, Optional, Union
    
    logger = logging.getLogger(__name__)
    
    
    def safe_makedirs(path: Union[str, Path]) -> None:
        """
        Create directory safely if it does not exist.
    
        Args:
            path: Directory path to create
        """
        Path(path).mkdir(parents=True, exist_ok=True)
    
    
    def read_json(path: Union[str, Path]) -> Dict[str, Any]:
        """
        Read JSON file safely.
    
        Args:
            path: Path to JSON file
    
        Returns:
            Dictionary containing JSON data
        """
        with open(path, 'r') as f:
            return json.load(f)
    
    
    def write_json(path: Union[str, Path], data: Dict[str, Any], indent: int = 2) -> None:
        """
        Write dictionary to JSON file.
    
        Args:
            path: Output path
            data: Dictionary to serialize
            indent: JSON indentation level
        """
        safe_makedirs(Path(path).parent)
        with open(path, 'w') as f:
            json.dump(data, f, indent=indent)
    
    
    def set_seed(seed: int = 42) -> None:
        """
        Set random seed for reproducibility across numpy and torch.
    
        Args:
            seed: Random seed value
        """
        random.seed(seed)
        np.random.seed(seed)
        try:
            import torch
            torch.manual_seed(seed)
            if torch.cuda.is_available():
                torch.cuda.manual_seed_all(seed)
                torch.backends.cudnn.deterministic = True
                torch.backends.cudnn.benchmark = False
        except ImportError:
            pass
    
    
    def print_device_info() -> str:
        """
        Print and return device information (CUDA GPU or CPU).
    
        Returns:
            Device string ('cuda' or 'cpu')
        """
        try:
            import torch
            if torch.cuda.is_available():
                device = 'cuda'
                gpu_name = torch.cuda.get_device_name(0)
                gpu_memory = torch.cuda.get_device_properties(0).total_memory / 1e9
                logger.info(f"Using CUDA device: {gpu_name}")
                logger.info(f"GPU memory: {gpu_memory:.2f} GB")
                return device
            else:
                logger.info("CUDA not available, using CPU")
                return 'cpu'
        except ImportError:
            logger.warning("PyTorch not installed, defaulting to CPU")
            return 'cpu'
    
    
    def safe_save_npz(path: Union[str, Path], signal: np.ndarray, label: int) -> None:
        """
        Save signal and label to compressed NPZ file safely.
    
        Args:
            path: Output file path
            signal: Signal array
            label: Integer label
        """
        safe_makedirs(Path(path).parent)
        np.savez_compressed(path, signal=signal, label=label)
    
    
    def robust_path_variants(path: Union[str, Path]) -> list:
        """
        Generate multiple path variants for robust matching.
        Handles different path separators and relative path formats.
    
        Args:
            path: Input path
    
        Returns:
            List of path string variants
        """
        p = Path(path)
        variants = []
    
        # Normalize separators to forward slash
        normalized = str(p).replace('\\', '/')
        variants.append(normalized)
    
        # Without dataset prefix (assumes dataset is first component)
        parts = Path(normalized).parts
        if len(parts) > 1:
            variants.append('/'.join(parts[1:]))
    
        # Last two components
        if len(parts) >= 2:
            variants.append('/'.join(parts[-2:]))
    
        # Basename without extension
        stem = p.stem
        variants.append(stem)
    
        # Full relative path without extension
        variants.append(str(p.with_suffix('')).replace('\\', '/'))
    
        return list(set(variants))
    
    
    def get_project_root() -> Path:
        """
        Get the project root directory.
    
        Returns:
            Path to project root
        """
        return Path(__file__).parent.parent
    
    
except Exception as _e:
    print('Warning: inlined module', 'utils.py', 'raised', _e)


## Inlined: Utility scripts (scripts/)

Files: apply_mapping_improvements.py, create_master_notebook.py, create_notebook.py, generate_unified_mapping.py, improve_mapping.py, model_smoke_test.py, preprocess_streaming.py, run_full_automation.py, validate_mapping.py, verify_smoke_test.py


In [12]:
# --- apply_mapping_improvements.py (inlined) ---
try:
    """
    Apply mapping improvements to create an updated unified_label_mapping.csv
    """
    import pandas as pd
    from pathlib import Path
    from datetime import datetime
    
    ROOT = Path(__file__).parent.parent
    LOGS_DIR = ROOT / "logs"
    
    # Load original mapping
    mapping_file = LOGS_DIR / "unified_label_mapping.csv"
    improvements_file = LOGS_DIR / "mapping_improvements_suggested.csv"
    
    if not improvements_file.exists():
        print("Error: No improvements file found. Run improve_mapping.py first.")
        exit(1)
    
    # Backup original
    backup_file = LOGS_DIR / f"unified_label_mapping.backup.{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv"
    print(f"Creating backup: {backup_file.name}")
    
    df = pd.read_csv(mapping_file, dtype=str).fillna("")
    df.to_csv(backup_file, index=False)
    
    # Load improvements
    improvements_df = pd.read_csv(improvements_file, dtype=str)
    print(f"\nLoaded {len(improvements_df)} improvements")
    
    # Apply improvements
    applied = 0
    for _, imp in improvements_df.iterrows():
        idx = int(imp['index'])
        suggested_label = imp['suggested_label']
    
        if idx < len(df):
            df.at[idx, 'mapped_label'] = suggested_label
            applied += 1
    
    print(f"Applied {applied} improvements")
    
    # Save updated mapping
    df.to_csv(mapping_file, index=False)
    print(f"\n✓ Updated mapping saved to: {mapping_file}")
    
    # Print new distribution
    print(f"\nUpdated label distribution:")
    label_counts = df['mapped_label'].value_counts()
    for label, count in label_counts.items():
        pct = count / len(df) * 100
        label_name = label if label else "(unmapped)"
        print(f"  {label_name}: {count:,} ({pct:.1f}%)")
    
    unmapped = (df['mapped_label'] == '').sum()
    print(f"\nRemaining unmapped: {unmapped:,} ({unmapped/len(df)*100:.1f}%)")
    
    print("\n✓ Mapping update complete!")
    print("\nYou can now run preprocessing with the improved mapping.")
    
except Exception as _e:
    print('Warning: inlined module', 'apply_mapping_improvements.py', 'raised', _e)




In [13]:
# --- create_master_notebook.py (inlined) ---
try:
    #!/usr/bin/env python3
    """
    scripts/create_master_notebook.py
    Builds notebooks/master_pipeline.ipynb by inlining .py files and creating runnable cells.
    """
    from pathlib import Path
    import json
    import textwrap
    
    ROOT = Path.cwd()
    SCRIPTS = ROOT / "scripts"
    SRC = ROOT / "src"
    NOTEBOOKS = ROOT / "notebooks"
    TARGET = NOTEBOOKS / "master_pipeline.ipynb"
    NOTEBOOKS.mkdir(parents=True, exist_ok=True)
    
    def read_file(path: Path) -> str:
        return path.read_text(encoding="utf-8")
    
    def code_cell(source: str):
        return {
            "cell_type": "code",
            "metadata": {},
            "execution_count": None,
            "outputs": [],
            "source": source.splitlines(keepends=True),
        }
    
    def md_cell(text: str):
        return {
            "cell_type": "markdown",
            "metadata": {},
            "source": (text + "\n").splitlines(keepends=True),
        }
    
    cells = []
    
    # 0. Title + run-headless note
    cells.append(md_cell(
        "# Master ECG Pipeline\n\n"
        "This notebook combines all project scripts and modules into one single runnable file.\n\n"
        "**Usage:** run cells top-to-bottom. For headless execution on Windows use:\n\n"
        "```python\n"
        "import asyncio, sys\n"
        "if sys.platform == 'win32':\n"
        "    asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())\n"
        "```\n"
    ))
    
    # 1. Environment & imports (idempotent)
    env_code = textwrap.dedent("""
    # Environment & imports - idempotent
    import os, sys, json, time, math, asyncio
    from pathlib import Path
    import numpy as np
    import random
    import torch
    print('Python:', sys.executable)
    print('Torch:', getattr(torch, '__version__', 'n/a'))
    # Windows asyncio fix for nbconvert headless runs
    import platform
    if platform.system() == 'Windows':
        try:
            import asyncio, sys
            asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
        except Exception:
            pass
    
    # Project root detection
    ROOT = Path(os.environ.get('ECG_ROOT', Path.cwd().resolve()))
    DATASET_DIR = ROOT / 'Dataset'
    ARTIFACTS_DIR = ROOT / 'artifacts'
    PROCESSED_DIR = ARTIFACTS_DIR / 'processed'
    LOGS_DIR = ROOT / 'logs'
    NOTEBOOKS_DIR = ROOT / 'notebooks'
    for p in (ARTIFACTS_DIR, PROCESSED_DIR, PROCESSED_DIR/'records', LOGS_DIR):
        p.mkdir(parents=True, exist_ok=True)
    
    # seeds for reproducibility
    SEED = int(os.environ.get('ECG_SEED', '42'))
    random.seed(SEED); np.random.seed(SEED); torch.manual_seed(SEED)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(SEED)
    DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print('DEVICE', DEVICE)
    """)
    cells.append(code_cell(env_code))
    
    # 2. Inline src modules (each file -> markdown header + code cell)
    def inline_dir(path: Path, title: str):
        py_files = sorted([p for p in path.glob('*.py') if p.is_file()])
        if not py_files:
            return
        cells.append(md_cell(f"## Inlined: {title}\n\nFiles: {', '.join(p.name for p in py_files)}"))
        for p in py_files:
            source = read_file(p)
            header = f"# --- {p.name} (inlined) ---\n"
            # wrap in a try/except to prevent name collisions stopping the notebook
            wrapped = (
                header +
                "try:\n" +
                "\n".join("    " + line for line in source.splitlines()) +
                "\nexcept Exception as _e:\n    print('Warning: inlined module', '{}', 'raised', _e)\n".format(p.name)
            )
            cells.append(code_cell(wrapped))
    
    # Inline src and scripts
    inline_dir(SRC, "Source modules (src/)")
    inline_dir(SCRIPTS, "Utility scripts (scripts/)")
    
    # 3. Operational cells: run mapping generation, optional improvement, preprocessing stub
    cells.append(md_cell("## Quick: Generate/Load unified mapping (run this cell)"))
    cells.append(code_cell(textwrap.dedent("""
    # Generate unified mapping (if you have script)
    candidate = Path('logs/unified_label_mapping.candidate.csv')
    prod = Path('logs/unified_label_mapping.csv')
    if (Path('scripts/generate_unified_mapping.py')).exists() and not candidate.exists():
        print('Generating candidate mapping...')
        os.system(f'python \"{str(Path("scripts/generate_unified_mapping.py"))}\"')
    else:
        print('Candidate mapping exists:', candidate.exists(), 'Prod file exists:', prod.exists())
    # If you have a candidate and want to promote it, uncomment:
    # if candidate.exists(): candidate.replace(prod)
    """)))
    
    cells.append(md_cell("## Preprocessing (streaming, memory-safe)"))
    cells.append(code_cell(textwrap.dedent("""
    # Run streaming preprocessing from scripts/preprocess_streaming.py if present
    proc_script = Path('scripts/preprocess_streaming.py')
    if proc_script.exists():
        print('Launching streaming preprocessing (script)...')
        # recommend using environment var ECG_PREPROCESS_LIMIT to test
        os.system(f'python \"{proc_script}\"')
    else:
        print('No preprocess_streaming.py found. Implement preprocessing in this notebook or inline alternate script.')
    """)))
    
    cells.append(md_cell("## Training (run this after preprocessing finishes)"))
    cells.append(code_cell(textwrap.dedent("""
    # Launch training script if present
    train_script = Path('scripts/train_pipeline.py')  # optional
    if train_script.exists():
        print('Running training script...')
        os.system(f'python \"{train_script}\"')
    else:
        print('No training script detected. Use in-notebook training cells or create scripts/training.py and link it.')
    """)))
    
    cells.append(md_cell("## Evaluation and Visuals"))
    cells.append(code_cell(textwrap.dedent("""
    # Run evaluation if script exists
    eval_script = Path('scripts/evaluate.py')
    if eval_script.exists():
        os.system(f'python \"{eval_script}\"')
    else:
        print('No evaluate.py. Use notebook cells to visualize artifacts/figures/')
    """)))
    
    cells.append(md_cell("## Smoke tests and quick validation"))
    cells.append(code_cell(textwrap.dedent("""
    # Run smoke tests
    smoke = Path('scripts/verify_smoke_test.py')
    if smoke.exists():
        os.system(f'python \"{smoke}\"')
    else:
        print('No smoke-test script. Manual checks:')
        print(' - Count files:', len(list((PROCESSED_DIR/'records').glob('*.npz'))))
        print(' - Check splits:', (PROCESSED_DIR/'splits.json').exists())
    """)))
    
    cells.append(md_cell("## Final: Notebook control\nYou can now run cells in order. Long-running steps are executed as external scripts to avoid kernel timeouts."))
    
    nb = {
        "cells": cells,
        "metadata": {
            "kernelspec": {"display_name": "Python 3", "language": "python", "name": "python3"},
            "language_info": {"name": "python"}
        },
        "nbformat": 4,
        "nbformat_minor": 5
    }
    
    TARGET.write_text(json.dumps(nb, indent=2), encoding="utf-8")
    print("Wrote notebook to", TARGET)
except Exception as _e:
    print('Warning: inlined module', 'create_master_notebook.py', 'raised', _e)


Wrote notebook to D:\ecg-research\notebooks\notebooks\master_pipeline.ipynb


In [14]:
# --- create_notebook.py (inlined) ---
try:
    """
    Create a complete Jupyter notebook for ECG tensor pipeline.
    
    This script programmatically builds notebooks/ecg_tensor_pipeline.ipynb
    using nbformat v4 with all necessary cells for preprocessing, training, and evaluation.
    """
    
    import json
    from pathlib import Path
    
    try:
        import nbformat
        from nbformat.v4 import new_notebook, new_code_cell, new_markdown_cell
    except ImportError:
        print("Error: nbformat not installed. Run: pip install nbformat")
        exit(1)
    
    # Project paths
    ROOT = Path(__file__).resolve().parent.parent
    NOTEBOOKS_DIR = ROOT / "notebooks"
    OUTPUT_NOTEBOOK = NOTEBOOKS_DIR / "ecg_tensor_pipeline.ipynb"
    
    # Ensure notebooks directory exists
    NOTEBOOKS_DIR.mkdir(parents=True, exist_ok=True)
    
    # Create a new notebook
    nb = new_notebook()
    
    # ============================================================================
    # CELL 1: Title and Instructions (Markdown)
    # ============================================================================
    nb.cells.append(new_markdown_cell("""# ECG Tensor Pipeline — Preprocessing, Training, Evaluation
    
    ## Overview
    
    This notebook provides an end-to-end pipeline for ECG signal classification:
    1. **Preprocessing**: Load raw ECG datasets, resample, normalize, and save per-record `.npz` files
    2. **Dataset & DataLoader**: Lazy loading with PyTorch for memory efficiency
    3. **Model**: Compact 1D CNN for ECG classification
    4. **Training**: GPU-accelerated training with mixed precision
    5. **Evaluation**: Metrics, confusion matrix, ROC curves, and visualizations
    
    ## Prerequisites
    
    - Python 3.10+
    - Required packages: `numpy`, `scipy`, `pandas`, `wfdb`, `torch`, `scikit-learn`, `matplotlib`, `tqdm`, `nbformat`
    - Datasets should be in `Dataset/` folder
    - Unified label mapping CSV at `logs/unified_label_mapping.csv`
    
    ## Running Headless
    
    To execute this notebook from the command line:
    
    ```powershell
    jupyter nbconvert --to notebook --execute notebooks/ecg_tensor_pipeline.ipynb --output ecg_tensor_pipeline_executed.ipynb
    ```
    
    **Note**: On Windows, if you see `RuntimeError: There is no current event loop in thread`, add this at the top of your script:
    
    ```python
    import asyncio
    import sys
    if sys.platform == 'win32':
        asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
    ```
    
    ## GPU-Intensive Tasks
    
    The following cells will utilize GPU heavily when CUDA is available:
    - **Training loop**: Forward/backward passes, gradient updates
    - **Large batch evaluation**: Model inference on test set
    - **Mixed precision training**: Uses `torch.cuda.amp` for speedup
    """))
    
    # ============================================================================
    # CELL 2: Environment Setup and Imports (Code)
    # ============================================================================
    nb.cells.append(new_code_cell("""# Environment checks, imports, seeds, and directory setup
    import os
    import sys
    import random
    import json
    from pathlib import Path
    from collections import defaultdict, Counter
    
    import numpy as np
    import pandas as pd
    from scipy import signal
    from scipy.signal import resample
    from scipy.io import loadmat
    import matplotlib.pyplot as plt
    from tqdm import tqdm
    
    import torch
    import torch.nn as nn
    import torch.nn.functional as F
    from torch.utils.data import Dataset, DataLoader
    
    from sklearn.metrics import (
        confusion_matrix, classification_report, 
        f1_score, precision_recall_fscore_support,
        roc_curve, auc, precision_recall_curve
    )
    from sklearn.preprocessing import label_binarize
    
    # Print environment info
    print(f"Python executable: {sys.executable}")
    print(f"Python version: {sys.version}")
    print(f"NumPy version: {np.__version__}")
    print(f"PyTorch version: {torch.__version__}")
    
    # Check CUDA availability
    DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"\\nDevice: {DEVICE}")
    
    if torch.cuda.is_available():
        print(f"GPU: {torch.cuda.get_device_name(0)}")
        print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")
    else:
        print("Running on CPU - training will be slower")
    
    # Set deterministic seeds for reproducibility
    DEFAULT_SEED = 42
    random.seed(DEFAULT_SEED)
    np.random.seed(DEFAULT_SEED)
    torch.manual_seed(DEFAULT_SEED)
    if DEVICE.type == 'cuda':
        torch.cuda.manual_seed_all(DEFAULT_SEED)
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False
    
    print(f"\\nRandom seed set to: {DEFAULT_SEED}")
    
    # Define project paths
    # Try to detect if running inside notebooks/ or at project root
    CANDIDATES = [
        Path.cwd().parent,                 # when running inside notebooks/
        Path.cwd(),                        # when running at project root
        Path("D:/ecg-research").resolve()  # explicit fallback for this project
    ]
    ROOT = next((p.resolve() for p in CANDIDATES if (p / "Dataset").exists()), Path.cwd().resolve())
    
    DATASET_DIR = ROOT / "Dataset"
    ARTIFACTS_DIR = ROOT / "artifacts"
    PROCESSED_DIR = ARTIFACTS_DIR / "processed"
    RECORDS_DIR = PROCESSED_DIR / "records"
    CHECKPOINTS_DIR = PROCESSED_DIR / "checkpoints"
    FIGURES_DIR = ARTIFACTS_DIR / "figures"
    LOGS_DIR = ROOT / "logs"
    
    # Create directories
    for p in [ARTIFACTS_DIR, PROCESSED_DIR, RECORDS_DIR, CHECKPOINTS_DIR, FIGURES_DIR, LOGS_DIR]:
        p.mkdir(parents=True, exist_ok=True)
    
    print(f"\\nProject Paths:")
    print(f"  ROOT: {ROOT}")
    print(f"  DATASET_DIR: {DATASET_DIR} (exists: {DATASET_DIR.exists()})")
    print(f"  PROCESSED_DIR: {PROCESSED_DIR}")
    print(f"  FIGURES_DIR: {FIGURES_DIR}")
    
    # List available datasets
    if DATASET_DIR.exists():
        datasets = [p.name for p in sorted(DATASET_DIR.iterdir()) if p.is_dir()]
        print(f"\\nAvailable datasets: {datasets}")
    else:
        print(f"\\nWarning: Dataset directory not found at {DATASET_DIR}")
    
    print("\\n" + "="*80)
    print("Environment setup complete!")
    print("="*80)
    """))
    
    # ============================================================================
    # CELL 3: Configuration Constants (Code)
    # ============================================================================
    nb.cells.append(new_code_cell("""# Configuration constants and hyperparameters
    
    # Signal processing parameters
    TARGET_FS = 500           # Target sampling frequency (Hz)
    TARGET_SAMPLES = 5000     # Target number of samples per record (10 seconds at 500 Hz)
    
    # Label configuration
    LABEL_ORDER = ['MI', 'AF', 'BBB', 'NORM', 'OTHER']
    LABEL_TO_INT = {name: i for i, name in enumerate(LABEL_ORDER)}
    INT_TO_LABEL = {i: name for name, i in LABEL_TO_INT.items()}
    
    # Training hyperparameters
    BATCH_SIZE = 32           # Adjust based on GPU memory
    EPOCHS = 10               # Number of training epochs
    LR = 1e-3                 # Learning rate
    WEIGHT_DECAY = 1e-4       # L2 regularization
    NUM_WORKERS = 0           # DataLoader workers (set to 0 for Windows stability)
    PIN_MEMORY = True         # Pin memory for faster GPU transfer
    
    # Mixed precision training (GPU only)
    USE_MIXED_PRECISION = torch.cuda.is_available()
    
    # Adjust batch size for CPU
    if DEVICE.type == 'cpu' and BATCH_SIZE > 8:
        print(f"CPU detected - reducing batch size from {BATCH_SIZE} to 8")
        BATCH_SIZE = 8
        PIN_MEMORY = False
    
    print(f"\\nConfiguration:")
    print(f"  Target FS: {TARGET_FS} Hz")
    print(f"  Target Samples: {TARGET_SAMPLES}")
    print(f"  Label Mapping: {LABEL_TO_INT}")
    print(f"  Batch Size: {BATCH_SIZE}")
    print(f"  Learning Rate: {LR}")
    print(f"  Epochs: {EPOCHS}")
    print(f"  Mixed Precision: {USE_MIXED_PRECISION}")
    print(f"  Device: {DEVICE}")
    
    print(f"\\nGPU-Intensive Tasks:")
    print("  - Preprocessing: Moderate (CPU-bound mostly)")
    print("  - DataLoader: Low (lazy loading)")
    print("  - Model Training: HIGH (forward + backward passes)")
    print("  - Model Evaluation: Medium (inference only)")
    """))
    
    # ============================================================================
    # CELL 4: Utility Functions (Code)
    # ============================================================================
    nb.cells.append(new_code_cell("""# Utility functions for signal processing and file I/O
    
    def zscore_normalize(arr: np.ndarray) -> np.ndarray:
        \"\"\"Z-score normalization: (x - mean) / std\"\"\"
        arr = arr.astype(np.float32)
        mean = arr.mean()
        std = arr.std()
        if std < 1e-8:
            std = 1.0  # Prevent division by zero
        return ((arr - mean) / std).astype(np.float32)
    
    
    def pad_or_truncate(x: np.ndarray, target_length: int) -> np.ndarray:
        \"\"\"Pad with zeros or truncate to target length\"\"\"
        if x.size >= target_length:
            return x[:target_length]
        pad_width = target_length - x.size
        return np.pad(x, (0, pad_width), mode='constant', constant_values=0).astype(np.float32)
    
    
    def resample_signal(x: np.ndarray, original_fs: float, target_fs: float) -> np.ndarray:
        \"\"\"Resample signal to target sampling frequency\"\"\"
        if original_fs is None or np.isclose(original_fs, target_fs):
            return x
        new_length = int(round(x.size * target_fs / original_fs))
        if new_length <= 0:
            return x
        return resample(x, new_length).astype(np.float32)
    
    
    def safe_save_npz(path: Path, signal: np.ndarray, label: int):
        \"\"\"Save signal and label as compressed npz\"\"\"
        np.savez_compressed(path, signal=signal.astype(np.float32), label=int(label))
    
    
    def load_npz_signal(path: Path):
        \"\"\"Load signal and label from npz file\"\"\"
        with np.load(path, allow_pickle=False) as data:
            signal = data['signal']
            label = int(data['label'])
        return signal, label
    
    
    def read_wfdb(hea_path: Path):
        \"\"\"Read WFDB format (.hea/.dat) and return 1D signal and sampling frequency\"\"\"
        try:
            import wfdb
            record_path = str(hea_path.with_suffix(''))  # wfdb expects path without extension
            record = wfdb.rdsamp(record_path)
            signal = np.asarray(record[0], dtype=np.float32)
            fs = float(record[1].get('fs', TARGET_FS))
            
            # Convert to 1D: average across leads or take first lead
            if signal.ndim == 2:
                signal_1d = signal.mean(axis=1) if signal.shape[1] > 1 else signal[:, 0]
            else:
                signal_1d = signal.reshape(-1)
            
            return signal_1d.astype(np.float32), fs
        except Exception as e:
            raise RuntimeError(f"Failed to read WFDB file {hea_path.name}: {e}")
    
    
    def read_mat(mat_path: Path):
        \"\"\"Read MATLAB .mat file and return 1D signal and sampling frequency\"\"\"
        try:
            mat_data = loadmat(str(mat_path))
            
            # Try common keys for signal data
            signal = None
            for key in ['val', 'data', 'signal', 'ecg']:
                if key in mat_data:
                    signal = np.asarray(mat_data[key], dtype=np.float32)
                    break
            
            # Fallback: find first ndarray
            if signal is None:
                for value in mat_data.values():
                    if isinstance(value, np.ndarray) and value.size > 100:
                        signal = value.astype(np.float32)
                        break
            
            if signal is None:
                raise RuntimeError("No signal array found in MAT file")
            
            # Convert to 1D
            if signal.ndim == 2:
                signal_1d = signal.mean(axis=0) if signal.shape[0] > 1 else signal.reshape(-1)
            else:
                signal_1d = signal.reshape(-1)
            
            # MAT files rarely contain fs info; return None
            fs = None
            return signal_1d.astype(np.float32), fs
        except Exception as e:
            raise RuntimeError(f"Failed to read MAT file {mat_path.name}: {e}")
    
    
    print("Utility functions defined successfully.")
    """))
    
    # ============================================================================
    # CELL 5: Load Unified Label Mapping (Code)
    # ============================================================================
    nb.cells.append(new_code_cell("""# Load unified label mapping from CSV
    
    UNIFIED_CSV = LOGS_DIR / "unified_label_mapping.csv"
    mapping_index = {}
    
    if UNIFIED_CSV.exists():
        print(f"Loading unified label mapping from {UNIFIED_CSV}...")
        df_mapping = pd.read_csv(UNIFIED_CSV, dtype=str).fillna("")
        
        # Verify required columns
        required_cols = {"dataset", "record_id", "mapped_label"}
        if not required_cols.issubset(set(df_mapping.columns)):
            print(f"Warning: Missing required columns. Found: {df_mapping.columns.tolist()}")
            print(f"Expected: {list(required_cols)}")
        else:
            # Build mapping index with multiple key variants for robust lookup
            for _, row in df_mapping.iterrows():
                dataset = str(row.get("dataset", "")).strip()
                record_id = str(row.get("record_id", "")).strip().replace("\\\\", "/").strip("/")
                mapped_label = str(row.get("mapped_label", "")).strip().upper()
                
                if not dataset or not record_id:
                    continue
                
                if dataset not in mapping_index:
                    mapping_index[dataset] = {}
                
                # Add full path
                mapping_index[dataset][record_id] = mapped_label
                
                # Add variants for robust matching
                parts = record_id.split("/")
                if len(parts) >= 1:
                    mapping_index[dataset][parts[-1]] = mapped_label  # basename
                if len(parts) >= 2:
                    mapping_index[dataset]["/".join(parts[-2:])] = mapped_label  # last two components
            
            print(f"Loaded {len(df_mapping)} mappings from {len(mapping_index)} datasets")
            
            # Count mappings per label
            label_counts = Counter(df_mapping['mapped_label'].str.upper())
            print(f"\\nLabel distribution in mapping:")
            for label in LABEL_ORDER:
                count = label_counts.get(label, 0)
                print(f"  {label}: {count:,}")
            unmapped = label_counts.get('', 0)
            print(f"  (unmapped): {unmapped:,}")
    else:
        print(f"Warning: Unified label mapping not found at {UNIFIED_CSV}")
        print("All records will be labeled as OTHER")
    
    
    def lookup_mapped_label(path: Path) -> str:
        \"\"\"Look up mapped label for a given file path\"\"\"
        try:
            rel_path = path.relative_to(DATASET_DIR).with_suffix("")
        except Exception:
            rel_path = path.with_suffix("")
        
        parts = rel_path.as_posix().split("/")
        dataset = parts[0] if parts else ""
        
        if dataset not in mapping_index:
            return "OTHER"
        
        index = mapping_index[dataset]
        
        # Try multiple key variants
        candidates = [
            rel_path.as_posix(),                              # full path
            "/".join(parts[1:]) if len(parts) > 1 else "",   # without dataset prefix
            "/".join(parts[-2:]) if len(parts) >= 2 else "", # last two components
            rel_path.name                                     # basename only
        ]
        
        for key in candidates:
            if key and key in index:
                label = index[key].upper()
                return label if label in LABEL_TO_INT else "OTHER"
        
        # CinC2017 special case
        if "CinC" in dataset and len(parts) >= 3:
            alt_key = "/".join(parts[2:])
            if alt_key in index:
                label = index[alt_key].upper()
                return label if label in LABEL_TO_INT else "OTHER"
        
        return "OTHER"
    
    
    print("Label lookup function ready.")
    """))
    
    # ============================================================================
    # CELL 6: Streaming Preprocessing (Code)
    # ============================================================================
    nb.cells.append(new_code_cell("""# Streaming preprocessing: scan datasets, process, and save per-record npz files
    
    print("="*80)
    print("STARTING PREPROCESSING")
    print("="*80)
    
    # Find all .hea and .mat files
    print(f"\\nScanning {DATASET_DIR} for ECG files...")
    hea_files = sorted(DATASET_DIR.rglob("*.hea"))
    mat_files = sorted(DATASET_DIR.rglob("*.mat"))
    all_files = hea_files + mat_files
    
    print(f"Found {len(hea_files)} .hea files and {len(mat_files)} .mat files")
    print(f"Total files to process: {len(all_files)}")
    
    if not all_files:
        print("\\nNo dataset files found. Generating synthetic records for testing...")
        # Generate synthetic signals
        for i in range(20):
            t = np.linspace(0, TARGET_SAMPLES / TARGET_FS, TARGET_SAMPLES, dtype=np.float32)
            freq = 0.5 + 0.1 * i
            signal = np.sin(2 * np.pi * freq * t).astype(np.float32)
            signal = signal[np.newaxis, :]  # Shape: (1, TARGET_SAMPLES)
            
            label = i % len(LABEL_ORDER)
            out_file = RECORDS_DIR / f"SYNTH_{i:04d}.npz"
            safe_save_npz(out_file, signal, label)
        
        print(f"Generated 20 synthetic records in {RECORDS_DIR}")
    else:
        # Process real dataset files
        manifest = []
        label_counts = Counter()
        skipped = 0
        
        print(f"\\nProcessing files...")
        progress_bar = tqdm(all_files, desc="Processing", unit="file")
        
        for file_path in progress_bar:
            try:
                # Read signal
                if file_path.suffix.lower() == '.hea':
                    signal, fs = read_wfdb(file_path)
                else:
                    signal, fs = read_mat(file_path)
                
                # Resample if needed
                if fs is not None and not np.isclose(fs, TARGET_FS):
                    signal = resample_signal(signal, fs, TARGET_FS)
                
                # Normalize
                signal = zscore_normalize(signal)
                
                # Pad or truncate
                signal = pad_or_truncate(signal, TARGET_SAMPLES)
                
                # Add channel dimension: (1, TARGET_SAMPLES)
                signal = signal[np.newaxis, :]
                
                # Lookup label
                mapped_label = lookup_mapped_label(file_path)
                label_int = LABEL_TO_INT.get(mapped_label, LABEL_TO_INT["OTHER"])
                
                # Generate safe filename
                try:
                    rel_path = file_path.relative_to(DATASET_DIR).with_suffix("")
                    record_id = rel_path.as_posix().replace("/", "__")
                except Exception:
                    record_id = file_path.stem
                
                # Save processed record
                out_file = RECORDS_DIR / f"{record_id}.npz"
                safe_save_npz(out_file, signal, label_int)
                
                # Update manifest
                manifest.append({
                    "path": f"records/{out_file.name}",
                    "label": int(label_int)
                })
                label_counts[label_int] += 1
                
            except Exception as e:
                skipped += 1
                if skipped <= 10:  # Only print first 10 errors
                    tqdm.write(f"Error processing {file_path.name}: {e}")
                progress_bar.set_postfix({"skipped": skipped})
        
        progress_bar.close()
        
        print(f"\\n" + "="*80)
        print("PREPROCESSING SUMMARY")
        print("="*80)
        print(f"Total files processed: {len(manifest):,}")
        print(f"Files skipped (errors): {skipped:,}")
        print(f"\\nLabel distribution:")
        for idx, label_name in enumerate(LABEL_ORDER):
            count = label_counts[idx]
            pct = (count / len(manifest) * 100) if manifest else 0
            print(f"  {idx}={label_name:5s}: {count:6,d} ({pct:5.1f}%)")
        
        # Save manifest and create splits
        print(f"\\nCreating stratified train/val/test splits (80/10/10)...")
        
        # Group by label
        by_label = defaultdict(list)
        for entry in manifest:
            by_label[entry['label']].append(entry)
        
        # Split each label class
        train_list, val_list, test_list = [], [], []
        rng = np.random.default_rng(seed=DEFAULT_SEED)
        
        for label, entries in by_label.items():
            rng.shuffle(entries)
            n = len(entries)
            n_train = int(n * 0.80)
            n_val = int(n * 0.10)
            
            train_list.extend(entries[:n_train])
            val_list.extend(entries[n_train:n_train + n_val])
            test_list.extend(entries[n_train + n_val:])
        
        # Save splits.json
        splits_data = {
            "timestamp": pd.Timestamp.utcnow().isoformat(),
            "label_order": LABEL_ORDER,
            "label_to_int": LABEL_TO_INT,
            "train": train_list,
            "val": val_list,
            "test": test_list,
            "counts": {
                "train": len(train_list),
                "val": len(val_list),
                "test": len(test_list)
            },
            "class_counts": {int(k): int(v) for k, v in label_counts.items()}
        }
        
        splits_file = PROCESSED_DIR / "splits.json"
        with open(splits_file, 'w', encoding='utf-8') as f:
            json.dump(splits_data, f, indent=2)
        
        # Save label_map.json
        label_map = {
            "label_to_int": LABEL_TO_INT,
            "int_to_label": INT_TO_LABEL
        }
        label_map_file = PROCESSED_DIR / "label_map.json"
        with open(label_map_file, 'w', encoding='utf-8') as f:
            json.dump(label_map, f, indent=2)
        
        # Save labels.npy
        np.save(PROCESSED_DIR / "labels.npy", np.array(LABEL_ORDER, dtype=object))
        
        print(f"\\nSaved:")
        print(f"  - {splits_file}")
        print(f"  - {label_map_file}")
        print(f"  - {PROCESSED_DIR / 'labels.npy'}")
        print(f"\\nSplit sizes:")
        print(f"  Train: {len(train_list):,}")
        print(f"  Val:   {len(val_list):,}")
        print(f"  Test:  {len(test_list):,}")
    
    print("\\n" + "="*80)
    print("PREPROCESSING COMPLETED SUCCESSFULLY")
    print("="*80)
    """))
    
    # ============================================================================
    # CELL 7: PyTorch Dataset and DataLoader (Code)
    # ============================================================================
    nb.cells.append(new_code_cell("""# PyTorch Dataset for lazy loading of per-record npz files
    
    class ECGDataset(Dataset):
        \"\"\"Lazy-loading dataset for preprocessed ECG records\"\"\"
        
        def __init__(self, entries, base_dir):
            \"\"\"
            Args:
                entries: List of dicts with 'path' and 'label' keys
                base_dir: Base directory for processed files
            \"\"\"
            self.entries = entries
            self.base_dir = Path(base_dir)
        
        def __len__(self):
            return len(self.entries)
        
        def __getitem__(self, idx):
            entry = self.entries[idx]
            file_path = self.base_dir / entry['path']
            
            # Load signal and label
            signal, label = load_npz_signal(file_path)
            
            # Convert to tensors
            signal_tensor = torch.from_numpy(signal).float()
            label_tensor = torch.tensor(label, dtype=torch.long)
            
            return signal_tensor, label_tensor
    
    
    def create_dataloaders(splits_file, processed_dir, batch_size, num_workers=0, pin_memory=False):
        \"\"\"Create train, val, and test DataLoaders\"\"\"
        
        with open(splits_file, 'r') as f:
            splits = json.load(f)
        
        train_dataset = ECGDataset(splits['train'], processed_dir)
        val_dataset = ECGDataset(splits['val'], processed_dir)
        test_dataset = ECGDataset(splits['test'], processed_dir)
        
        train_loader = DataLoader(
            train_dataset,
            batch_size=batch_size,
            shuffle=True,
            num_workers=num_workers,
            pin_memory=pin_memory
        )
        
        val_loader = DataLoader(
            val_dataset,
            batch_size=batch_size,
            shuffle=False,
            num_workers=num_workers,
            pin_memory=pin_memory
        )
        
        test_loader = DataLoader(
            test_dataset,
            batch_size=batch_size,
            shuffle=False,
            num_workers=num_workers,
            pin_memory=pin_memory
        )
        
        return train_loader, val_loader, test_loader
    
    
    # Create DataLoaders
    print("Creating DataLoaders...")
    splits_file = PROCESSED_DIR / "splits.json"
    
    if splits_file.exists():
        train_loader, val_loader, test_loader = create_dataloaders(
            splits_file, 
            PROCESSED_DIR,
            batch_size=BATCH_SIZE,
            num_workers=NUM_WORKERS,
            pin_memory=PIN_MEMORY
        )
        
        print(f"DataLoaders created:")
        print(f"  Train batches: {len(train_loader)}")
        print(f"  Val batches:   {len(val_loader)}")
        print(f"  Test batches:  {len(test_loader)}")
        
        # Show example batch
        sample_batch = next(iter(train_loader))
        print(f"\\nExample batch shapes:")
        print(f"  Signals: {sample_batch[0].shape}")
        print(f"  Labels:  {sample_batch[1].shape}")
        print(f"  Label values: {sample_batch[1][:min(10, BATCH_SIZE)].tolist()}")
    else:
        print(f"Error: splits.json not found at {splits_file}")
        print("Please run the preprocessing cell first.")
    """))
    
    # ============================================================================
    # CELL 8: Model Definition (Code)
    # ============================================================================
    nb.cells.append(new_code_cell("""# 1D CNN Model for ECG Classification
    
    class ECGNet1D(nn.Module):
        \"\"\"Compact 1D CNN for ECG signal classification\"\"\"
        
        def __init__(self, n_classes=len(LABEL_ORDER), input_channels=1, base_channels=32, dropout=0.3):
            super(ECGNet1D, self).__init__()
            
            self.conv1 = nn.Conv1d(input_channels, base_channels, kernel_size=7, stride=2, padding=3)
            self.bn1 = nn.BatchNorm1d(base_channels)
            
            self.conv2 = nn.Conv1d(base_channels, base_channels * 2, kernel_size=5, stride=2, padding=2)
            self.bn2 = nn.BatchNorm1d(base_channels * 2)
            
            self.conv3 = nn.Conv1d(base_channels * 2, base_channels * 4, kernel_size=3, stride=2, padding=1)
            self.bn3 = nn.BatchNorm1d(base_channels * 4)
            
            self.conv4 = nn.Conv1d(base_channels * 4, base_channels * 8, kernel_size=3, stride=2, padding=1)
            self.bn4 = nn.BatchNorm1d(base_channels * 8)
            
            self.global_pool = nn.AdaptiveAvgPool1d(1)
            self.dropout = nn.Dropout(dropout)
            self.fc = nn.Linear(base_channels * 8, n_classes)
            
            self.relu = nn.ReLU(inplace=True)
        
        def forward(self, x):
            # x shape: (batch, 1, samples)
            x = self.relu(self.bn1(self.conv1(x)))
            x = self.relu(self.bn2(self.conv2(x)))
            x = self.relu(self.bn3(self.conv3(x)))
            x = self.relu(self.bn4(self.conv4(x)))
            x = self.global_pool(x)
            x = x.view(x.size(0), -1)
            x = self.dropout(x)
            x = self.fc(x)
            return x
    
    
    # Instantiate model
    model = ECGNet1D(n_classes=len(LABEL_ORDER), base_channels=32, dropout=0.3)
    model = model.to(DEVICE)
    
    # Count parameters
    total_params = sum(p.numel() for p in model.parameters())
    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    
    print("="*80)
    print("MODEL")
    print("="*80)
    print(model)
    print(f"\\nTotal parameters: {total_params:,}")
    print(f"Trainable parameters: {trainable_params:,}")
    print(f"Model device: {DEVICE}")
    """))
    
    # ============================================================================
    # CELL 9: Training Loop (Code)
    # ============================================================================
    nb.cells.append(new_code_cell("""# Training loop with mixed precision and metrics tracking
    
    # Loss and optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=LR, weight_decay=WEIGHT_DECAY)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=EPOCHS)
    
    # Mixed precision scaler
    scaler = torch.cuda.amp.GradScaler() if USE_MIXED_PRECISION else None
    
    # Training history
    history = {
        'train_loss': [],
        'train_acc': [],
        'train_f1': [],
        'val_loss': [],
        'val_acc': [],
        'val_f1': [],
        'lr': []
    }
    
    best_val_f1 = 0.0
    best_epoch = 0
    
    print("="*80)
    print("TRAINING")
    print("="*80)
    
    for epoch in range(EPOCHS):
        print(f"\\nEpoch {epoch + 1}/{EPOCHS}")
        print("-" * 80)
        
        # Training phase
        model.train()
        train_loss = 0.0
        train_preds = []
        train_labels = []
        
        train_progress = tqdm(train_loader, desc="Training", leave=False)
        for signals, labels in train_progress:
            signals = signals.to(DEVICE)
            labels = labels.to(DEVICE)
            
            optimizer.zero_grad()
            
            # Mixed precision forward pass
            if USE_MIXED_PRECISION:
                with torch.cuda.amp.autocast():
                    outputs = model(signals)
                    loss = criterion(outputs, labels)
                scaler.scale(loss).backward()
                scaler.step(optimizer)
                scaler.update()
            else:
                outputs = model(signals)
                loss = criterion(outputs, labels)
                loss.backward()
                optimizer.step()
            
            train_loss += loss.item()
            preds = outputs.argmax(dim=1).cpu().numpy()
            train_preds.extend(preds)
            train_labels.extend(labels.cpu().numpy())
            
            train_progress.set_postfix({'loss': f'{loss.item():.4f}'})
        
        train_loss /= len(train_loader)
        train_acc = (np.array(train_preds) == np.array(train_labels)).mean()
        train_f1 = f1_score(train_labels, train_preds, average='macro', zero_division=0)
        
        # Validation phase
        model.eval()
        val_loss = 0.0
        val_preds = []
        val_labels = []
        
        with torch.no_grad():
            val_progress = tqdm(val_loader, desc="Validation", leave=False)
            for signals, labels in val_progress:
                signals = signals.to(DEVICE)
                labels = labels.to(DEVICE)
                
                outputs = model(signals)
                loss = criterion(outputs, labels)
                
                val_loss += loss.item()
                preds = outputs.argmax(dim=1).cpu().numpy()
                val_preds.extend(preds)
                val_labels.extend(labels.cpu().numpy())
        
        val_loss /= len(val_loader)
        val_acc = (np.array(val_preds) == np.array(val_labels)).mean()
        val_f1 = f1_score(val_labels, val_preds, average='macro', zero_division=0)
        
        # Update history
        current_lr = optimizer.param_groups[0]['lr']
        history['train_loss'].append(train_loss)
        history['train_acc'].append(train_acc)
        history['train_f1'].append(train_f1)
        history['val_loss'].append(val_loss)
        history['val_acc'].append(val_acc)
        history['val_f1'].append(val_f1)
        history['lr'].append(current_lr)
        
        # Print epoch summary
        print(f"Train Loss: {train_loss:.4f} | Train Acc: {train_acc:.4f} | Train F1: {train_f1:.4f}")
        print(f"Val Loss:   {val_loss:.4f} | Val Acc:   {val_acc:.4f} | Val F1:   {val_f1:.4f}")
        print(f"LR: {current_lr:.6f}")
        
        # Save best model
        if val_f1 > best_val_f1:
            best_val_f1 = val_f1
            best_epoch = epoch + 1
            best_model_path = CHECKPOINTS_DIR / "best_model.pth"
            torch.save({
                'epoch': epoch,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'val_f1': val_f1,
                'val_acc': val_acc,
                'val_loss': val_loss
            }, best_model_path)
            print(f"✓ Saved best model (F1: {val_f1:.4f})")
        
        # Step scheduler
        scheduler.step()
    
    print("\\n" + "="*80)
    print("TRAINING COMPLETED")
    print("="*80)
    print(f"Best validation F1: {best_val_f1:.4f} (Epoch {best_epoch})")
    
    # Save final model
    final_model_path = CHECKPOINTS_DIR / "final_model.pth"
    torch.save({
        'epoch': EPOCHS,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'history': history
    }, final_model_path)
    print(f"\\nSaved final model to {final_model_path}")
    
    # Save training history
    history_file = PROCESSED_DIR / "training_history.json"
    with open(history_file, 'w') as f:
        json.dump(history, f, indent=2)
    print(f"Saved training history to {history_file}")
    """))
    
    # ============================================================================
    # CELL 10: Evaluation and Plots (Code)
    # ============================================================================
    nb.cells.append(new_code_cell("""# Evaluation on test set with metrics and visualizations
    
    # Load best model
    best_model_path = CHECKPOINTS_DIR / "best_model.pth"
    checkpoint = torch.load(best_model_path, map_location=DEVICE)
    model.load_state_dict(checkpoint['model_state_dict'])
    print(f"Loaded best model from epoch {checkpoint['epoch'] + 1}")
    print(f"Best validation F1: {checkpoint['val_f1']:.4f}")
    
    # Evaluate on test set
    model.eval()
    test_preds = []
    test_labels = []
    test_probs = []
    
    with torch.no_grad():
        test_progress = tqdm(test_loader, desc="Testing")
        for signals, labels in test_progress:
            signals = signals.to(DEVICE)
            labels = labels.to(DEVICE)
            
            outputs = model(signals)
            probs = F.softmax(outputs, dim=1)
            preds = outputs.argmax(dim=1)
            
            test_probs.append(probs.cpu().numpy())
            test_preds.extend(preds.cpu().numpy())
            test_labels.extend(labels.cpu().numpy())
    
    test_probs = np.vstack(test_probs)
    test_preds = np.array(test_preds)
    test_labels = np.array(test_labels)
    
    # Calculate metrics
    test_acc = (test_preds == test_labels).mean()
    test_f1_macro = f1_score(test_labels, test_preds, average='macro', zero_division=0)
    test_f1_weighted = f1_score(test_labels, test_preds, average='weighted', zero_division=0)
    
    print("\\n" + "="*80)
    print("TEST SET EVALUATION")
    print("="*80)
    print(f"Test Accuracy: {test_acc:.4f}")
    print(f"Test F1 (macro): {test_f1_macro:.4f}")
    print(f"Test F1 (weighted): {test_f1_weighted:.4f}")
    
    # Per-class metrics
    print("\\n" + "-"*80)
    print("Per-Class Metrics:")
    print("-"*80)
    precision, recall, f1, support = precision_recall_fscore_support(
        test_labels, test_preds, average=None, zero_division=0
    )
    
    for i, label_name in enumerate(LABEL_ORDER):
        print(f"{label_name:5s}: Precision={precision[i]:.3f}, Recall={recall[i]:.3f}, F1={f1[i]:.3f}, Support={support[i]}")
    
    # Confusion matrix
    cm = confusion_matrix(test_labels, test_preds)
    print("\\n" + "-"*80)
    print("Confusion Matrix:")
    print("-"*80)
    print(cm)
    
    # Save evaluation results
    eval_results = {
        'test_accuracy': float(test_acc),
        'test_f1_macro': float(test_f1_macro),
        'test_f1_weighted': float(test_f1_weighted),
        'per_class_metrics': {
            LABEL_ORDER[i]: {
                'precision': float(precision[i]),
                'recall': float(recall[i]),
                'f1': float(f1[i]),
                'support': int(support[i])
            }
            for i in range(len(LABEL_ORDER))
        },
        'confusion_matrix': cm.tolist()
    }
    
    eval_file = PROCESSED_DIR / "evaluation_results.json"
    with open(eval_file, 'w') as f:
        json.dump(eval_results, f, indent=2)
    print(f"\\nSaved evaluation results to {eval_file}")
    """))
    
    # ============================================================================
    # CELL 11: Visualization Plots (Code)
    # ============================================================================
    nb.cells.append(new_code_cell("""# Generate and save visualization plots
    
    # Set plot style
    plt.style.use('default')
    plt.rcParams['figure.dpi'] = 100
    plt.rcParams['savefig.dpi'] = 150
    
    # 1. Training curves
    fig, axes = plt.subplots(2, 2, figsize=(14, 10))
    
    # Loss
    axes[0, 0].plot(history['train_loss'], label='Train Loss', linewidth=2)
    axes[0, 0].plot(history['val_loss'], label='Val Loss', linewidth=2)
    axes[0, 0].set_xlabel('Epoch')
    axes[0, 0].set_ylabel('Loss')
    axes[0, 0].set_title('Training and Validation Loss')
    axes[0, 0].legend()
    axes[0, 0].grid(True, alpha=0.3)
    
    # Accuracy
    axes[0, 1].plot(history['train_acc'], label='Train Accuracy', linewidth=2)
    axes[0, 1].plot(history['val_acc'], label='Val Accuracy', linewidth=2)
    axes[0, 1].set_xlabel('Epoch')
    axes[0, 1].set_ylabel('Accuracy')
    axes[0, 1].set_title('Training and Validation Accuracy')
    axes[0, 1].legend()
    axes[0, 1].grid(True, alpha=0.3)
    
    # F1 Score
    axes[1, 0].plot(history['train_f1'], label='Train F1', linewidth=2)
    axes[1, 0].plot(history['val_f1'], label='Val F1', linewidth=2)
    axes[1, 0].set_xlabel('Epoch')
    axes[1, 0].set_ylabel('F1 Score (macro)')
    axes[1, 0].set_title('Training and Validation F1 Score')
    axes[1, 0].legend()
    axes[1, 0].grid(True, alpha=0.3)
    
    # Learning Rate
    axes[1, 1].plot(history['lr'], label='Learning Rate', linewidth=2, color='orange')
    axes[1, 1].set_xlabel('Epoch')
    axes[1, 1].set_ylabel('Learning Rate')
    axes[1, 1].set_title('Learning Rate Schedule')
    axes[1, 1].legend()
    axes[1, 1].grid(True, alpha=0.3)
    axes[1, 1].set_yscale('log')
    
    plt.tight_layout()
    training_curves_path = FIGURES_DIR / 'training_curves.png'
    plt.savefig(training_curves_path, bbox_inches='tight')
    print(f"Saved training curves to {training_curves_path}")
    plt.show()
    
    # 2. Confusion Matrix Heatmap
    fig, ax = plt.subplots(figsize=(10, 8))
    im = ax.imshow(cm, interpolation='nearest', cmap='Blues')
    ax.figure.colorbar(im, ax=ax)
    ax.set(xticks=np.arange(cm.shape[1]),
           yticks=np.arange(cm.shape[0]),
           xticklabels=LABEL_ORDER,
           yticklabels=LABEL_ORDER,
           xlabel='Predicted Label',
           ylabel='True Label',
           title='Confusion Matrix')
    
    # Add text annotations
    thresh = cm.max() / 2.0
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            ax.text(j, i, format(cm[i, j], 'd'),
                    ha="center", va="center",
                    color="white" if cm[i, j] > thresh else "black",
                    fontsize=12)
    
    plt.tight_layout()
    cm_path = FIGURES_DIR / 'confusion_matrix.png'
    plt.savefig(cm_path, bbox_inches='tight')
    print(f"Saved confusion matrix to {cm_path}")
    plt.show()
    
    # 3. Per-class F1 Score Bar Plot
    fig, ax = plt.subplots(figsize=(10, 6))
    x = np.arange(len(LABEL_ORDER))
    bars = ax.bar(x, f1, color=['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd'])
    ax.set_xlabel('Class', fontsize=12)
    ax.set_ylabel('F1 Score', fontsize=12)
    ax.set_title('Per-Class F1 Score on Test Set', fontsize=14, fontweight='bold')
    ax.set_xticks(x)
    ax.set_xticklabels(LABEL_ORDER)
    ax.set_ylim([0, 1.0])
    ax.grid(True, alpha=0.3, axis='y')
    
    # Add value labels on bars
    for i, (bar, value) in enumerate(zip(bars, f1)):
        height = bar.get_height()
        ax.text(bar.get_x() + bar.get_width() / 2.0, height + 0.02,
                f'{value:.3f}',
                ha='center', va='bottom', fontsize=10, fontweight='bold')
    
    plt.tight_layout()
    f1_bars_path = FIGURES_DIR / 'per_class_f1.png'
    plt.savefig(f1_bars_path, bbox_inches='tight')
    print(f"Saved per-class F1 plot to {f1_bars_path}")
    plt.show()
    
    print("\\nAll visualizations saved to:", FIGURES_DIR)
    """))
    
    # ============================================================================
    # CELL 12: Smoke Tests (Code)
    # ============================================================================
    nb.cells.append(new_code_cell("""# Automated smoke tests to verify pipeline integrity
    
    print("="*80)
    print("SMOKE TESTS")
    print("="*80)
    
    # Test 1: Load a single record
    print("\\n1. Testing record loading...")
    try:
        test_files = list(RECORDS_DIR.glob("*.npz"))
        if test_files:
            test_file = test_files[0]
            signal, label = load_npz_signal(test_file)
            print(f"   ✓ Loaded {test_file.name}")
            print(f"     Signal shape: {signal.shape}")
            print(f"     Label: {label} ({LABEL_ORDER[label]})")
            assert signal.shape[1] == TARGET_SAMPLES, "Signal length mismatch"
            assert 0 <= label < len(LABEL_ORDER), "Invalid label"
            print("   ✓ Record validation passed")
        else:
            print("   ✗ No records found")
    except Exception as e:
        print(f"   ✗ Failed: {e}")
    
    # Test 2: Model forward pass
    print("\\n2. Testing model forward pass...")
    try:
        dummy_input = torch.randn(1, 1, TARGET_SAMPLES).to(DEVICE)
        with torch.no_grad():
            output = model(dummy_input)
        print(f"   ✓ Input shape: {dummy_input.shape}")
        print(f"   ✓ Output shape: {output.shape}")
        assert output.shape == (1, len(LABEL_ORDER)), "Output shape mismatch"
        print("   ✓ Model forward pass successful")
    except Exception as e:
        print(f"   ✗ Failed: {e}")
    
    # Test 3: Checkpoint loading
    print("\\n3. Testing checkpoint loading...")
    try:
        best_checkpoint = CHECKPOINTS_DIR / "best_model.pth"
        if best_checkpoint.exists():
            checkpoint = torch.load(best_checkpoint, map_location=DEVICE)
            print(f"   ✓ Loaded checkpoint from epoch {checkpoint.get('epoch', 'unknown')}")
            print(f"   ✓ Val F1: {checkpoint.get('val_f1', 0):.4f}")
            print("   ✓ Checkpoint loading successful")
        else:
            print(f"   ✗ Checkpoint not found at {best_checkpoint}")
    except Exception as e:
        print(f"   ✗ Failed: {e}")
    
    # Test 4: Dataset integrity
    print("\\n4. Testing dataset integrity...")
    try:
        splits_file = PROCESSED_DIR / "splits.json"
        if splits_file.exists():
            with open(splits_file, 'r') as f:
                splits = json.load(f)
            n_train = len(splits.get('train', []))
            n_val = len(splits.get('val', []))
            n_test = len(splits.get('test', []))
            n_total = n_train + n_val + n_test
            print(f"   ✓ Total records: {n_total}")
            print(f"   ✓ Train: {n_train}, Val: {n_val}, Test: {n_test}")
            print("   ✓ Dataset integrity check passed")
        else:
            print(f"   ✗ Splits file not found")
    except Exception as e:
        print(f"   ✗ Failed: {e}")
    
    print("\\n" + "="*80)
    print("SMOKE TESTS COMPLETED")
    print("="*80)
    """))
    
    # ============================================================================
    # CELL 13: Final Summary (Markdown)
    # ============================================================================
    nb.cells.append(new_markdown_cell("""---
    
    ## Pipeline Complete!
    
    This notebook has successfully completed the full ECG classification pipeline:
    
    ✓ **Preprocessing**: Loaded, resampled, normalized, and saved ECG records  
    ✓ **Dataset**: Created stratified train/val/test splits  
    ✓ **Model**: Trained 1D CNN classifier  
    ✓ **Evaluation**: Generated metrics and visualizations  
    ✓ **Artifacts**: Saved models, checkpoints, and results  
    
    ### Output Files
    
    - **Processed Data**: `artifacts/processed/records/*.npz`
    - **Splits**: `artifacts/processed/splits.json`
    - **Best Model**: `artifacts/processed/checkpoints/best_model.pth`
    - **Training History**: `artifacts/processed/training_history.json`
    - **Evaluation Results**: `artifacts/processed/evaluation_results.json`
    - **Figures**: `artifacts/figures/*.png`
    
    ### Next Steps
    
    1. **Improve Model**: Experiment with deeper architectures (ResNet1D, attention mechanisms)
    2. **Hyperparameter Tuning**: Use grid search or Bayesian optimization
    3. **Data Augmentation**: Add noise, scaling, time-warping
    4. **Multi-Lead Models**: Process all 12 leads instead of averaging
    5. **Ensemble Methods**: Combine multiple models for better performance
    6. **Deployment**: Export to ONNX or TorchScript for production
    
    ### Inference Example
    
    To use the trained model for inference:
    
    ```python
    # Load model
    checkpoint = torch.load('artifacts/processed/checkpoints/best_model.pth')
    model.load_state_dict(checkpoint['model_state_dict'])
    model.eval()
    
    # Load a signal
    signal, label = load_npz_signal('path/to/record.npz')
    signal_tensor = torch.from_numpy(signal).float().unsqueeze(0).to(DEVICE)
    
    # Predict
    with torch.no_grad():
        output = model(signal_tensor)
        probabilities = F.softmax(output, dim=1)
        predicted_class = output.argmax(dim=1).item()
        predicted_label = LABEL_ORDER[predicted_class]
    
    print(f"Predicted: {predicted_label} (confidence: {probabilities[0, predicted_class]:.2%})")
    ```
    """))
    
    # ============================================================================
    # Write the notebook to file
    # ============================================================================
    print(f"Writing notebook to {OUTPUT_NOTEBOOK}...")
    with open(OUTPUT_NOTEBOOK, 'w', encoding='utf-8') as f:
        nbformat.write(nb, f)
    
    print(f"✓ Notebook created successfully!")
    print(f"  Location: {OUTPUT_NOTEBOOK}")
    print(f"  Total cells: {len(nb.cells)}")
    print(f"\nTo run the notebook:")
    print(f"  jupyter notebook {OUTPUT_NOTEBOOK}")
    print(f"\nOr execute headless:")
    print(f"  jupyter nbconvert --to notebook --execute {OUTPUT_NOTEBOOK.name}")
    
except Exception as _e:
    print('Warning: inlined module', 'create_notebook.py', 'raised', _e)




In [15]:
# --- generate_unified_mapping.py (inlined) ---
try:
    """
    Generate unified label mapping for ECG datasets.
    
    This script scans dataset/ folder for known ECG datasets and creates a unified
    label mapping CSV with columns: dataset, record_id, original_label_text, mapped_label.
    
    Supported datasets:
    - ptb-xl: Parses ptbxl_database.csv
    - CinC2017: Parses REFERENCE*.csv files
    - PTB_Diagnostic: Lists WFDB records
    - Chapman_Shaoxing: Lists WFDB records
    
    Output: logs/unified_label_mapping.candidate.csv
    """
    
    import re
    import csv
    import sys
    from pathlib import Path
    from collections import Counter
    from typing import List, Dict
    
    # Define project root - adjust if running from different locations
    ROOT = Path(__file__).resolve().parent.parent
    DATASET_DIR = ROOT / "Dataset"
    LOGS_DIR = ROOT / "logs"
    OUTPUT_CSV = LOGS_DIR / "unified_label_mapping.candidate.csv"
    
    # Target label categories (consistent with pipeline)
    TARGET_LABELS = ["MI", "AF", "BBB", "NORM", "OTHER"]
    
    
    def map_label_heuristic(original_text: str) -> str:
        """
        Apply simple heuristic mapping based on keyword patterns.
        Returns one of: MI, AF, BBB, NORM, OTHER, or empty string if ambiguous.
    
        Args:
            original_text: Original label text from dataset
    
        Returns:
            Mapped label or empty string if uncertain
        """
        if not original_text:
            return ""
    
        text_upper = original_text.upper()
    
        # Myocardial Infarction patterns
        mi_patterns = [
            r'\bMI\b', r'MYOCARDIAL\s+INFARC', r'\bIMI\b', r'\bAMI\b',
            r'\bSTEMI\b', r'\bNSTEMI\b', r'\bPMI\b', r'\bLMI\b',
            r'INFARCT', r'Q\s*WAVE', r'PATHOLOGICAL\s+Q'
        ]
        if any(re.search(pat, text_upper) for pat in mi_patterns):
            return "MI"
    
        # Atrial Fibrillation patterns (including flutter)
        af_patterns = [
            r'\bAF\b', r'ATRIAL\s+FIB', r'A[\s-]?FIB', r'\bAFIB\b',
            r'AFIB', r'A\.FIB', r'\bAFLT\b', r'ATRIAL\s+FLUTTER',
            r'A[\s-]?FLUTTER'
        ]
        if any(re.search(pat, text_upper) for pat in af_patterns):
            return "AF"
    
        # Bundle Branch Block patterns (including fascicular blocks and incomplete blocks)
        bbb_patterns = [
            r'\bBBB\b', r'BUNDLE\s+BRANCH\s+BLOCK',
            r'\bLBBB\b', r'LEFT\s+BUNDLE\s+BRANCH',
            r'\bRBBB\b', r'RIGHT\s+BUNDLE\s+BRANCH',
            r'\bIRBBB\b', r'INCOMPLETE.*BUNDLE',
            r'\bILBBB\b',
            r'IVCD', r'INTRAVENTRICULAR\s+CONDUCTION',
            r'\bLAFB\b', r'\bLPFB\b', r'FASCICULAR\s+BLOCK',
            r'LEFT\s+ANTERIOR\s+FASCICULAR', r'LEFT\s+POSTERIOR\s+FASCICULAR'
        ]
        if any(re.search(pat, text_upper) for pat in bbb_patterns):
            return "BBB"
    
        # Normal patterns (be careful - only if explicitly stated)
        norm_patterns = [
            r'\bNORM\b', r'NORMAL\s+ECG', r'SINUSRHYTHMUS\s+NORMALES\s+EKG',
            r'^N$', r'^NORMAL$', r'NO\s+ABNORMAL', r'SINUS\s+RHYTHM.*NORMAL'
        ]
        if any(re.search(pat, text_upper) for pat in norm_patterns):
            # Additional safety: if text also contains pathological terms, don't map to NORM
            pathological_terms = ['INFARCT', 'ISCHEMI', 'HYPERTROPHY', 'BLOCK',
                                 'FIBRILLATION', 'FLUTTER', 'TACHYCARDIA', 'BRADYCARDIA']
            if not any(term in text_upper for term in pathological_terms):
                return "NORM"
    
        # If we can't confidently map, return empty (will be OTHER in preprocessing)
        return ""
    
    
    def parse_ptbxl(dataset_path: Path) -> List[Dict[str, str]]:
        """
        Parse PTB-XL dataset from ptbxl_database.csv.
    
        Expected columns: filename_lr (or similar), report, scp_codes
    
        Returns:
            List of dicts with keys: dataset, record_id, original_label_text, mapped_label
        """
        csv_path = dataset_path / "ptbxl_database.csv"
        if not csv_path.exists():
            print(f"Warning: {csv_path} not found. Skipping PTB-XL.")
            return []
    
        records = []
        try:
            with open(csv_path, 'r', encoding='utf-8', errors='replace') as f:
                reader = csv.DictReader(f)
                for row in reader:
                    # Try multiple possible column names
                    filename_lr = row.get('filename_lr') or row.get('filename') or row.get('ecg_id')
                    if not filename_lr:
                        continue
    
                    # Clean up filename - remove 'records100/' or 'records500/' prefix
                    record_id = filename_lr.replace('records100/', '').replace('records500/', '')
                    record_id = record_id.strip()
    
                    # Get label information (prefer scp_codes, fall back to report)
                    scp_codes = row.get('scp_codes', '')
                    report = row.get('report', '')
                    original_label = scp_codes if scp_codes else report
    
                    # Map label
                    mapped = map_label_heuristic(original_label)
    
                    records.append({
                        'dataset': 'ptb-xl',
                        'record_id': record_id,
                        'original_label_text': original_label.strip() if original_label else '',
                        'mapped_label': mapped
                    })
    
            print(f"✓ PTB-XL: Parsed {len(records)} records from {csv_path.name}")
        except Exception as e:
            print(f"Error parsing PTB-XL: {e}")
    
        return records
    
    
    def parse_cinc2017(dataset_path: Path) -> List[Dict[str, str]]:
        """
        Parse CinC2017 dataset from REFERENCE*.csv files in training/validation subdirs.
    
        CinC2017 labels: N (normal), A (AF), O (other), ~ (noisy)
    
        Returns:
            List of dicts with keys: dataset, record_id, original_label_text, mapped_label
        """
        records = []
    
        # CinC2017 label mapping
        cinc_label_map = {
            'N': 'NORM',
            'A': 'AF',
            'O': '',  # Other - leave empty for heuristic or default to OTHER
            '~': ''   # Noisy - leave empty
        }
    
        # Look in training, validation, and test subdirectories
        subdirs = ['training', 'validation', 'test']
        reference_files = []
    
        for subdir in subdirs:
            subdir_path = dataset_path / subdir
            if subdir_path.exists():
                # Find REFERENCE*.csv files (prefer REFERENCE.csv, then latest version)
                ref_files = sorted(subdir_path.glob("REFERENCE*.csv"))
                if ref_files:
                    # Use the first one (typically REFERENCE.csv or highest version)
                    reference_files.append((subdir, ref_files[0]))
    
        # Also check root directory for REFERENCE files
        root_ref_files = sorted(dataset_path.glob("REFERENCE*.csv"))
        if root_ref_files:
            reference_files.append(('root', root_ref_files[-1]))  # Use latest version
    
        if not reference_files:
            print(f"Warning: No REFERENCE*.csv files found in {dataset_path}. Skipping CinC2017.")
            return []
    
        seen_ids = set()
        for subdir, ref_file in reference_files:
            try:
                with open(ref_file, 'r', encoding='utf-8', errors='replace') as f:
                    for line in f:
                        line = line.strip()
                        if not line:
                            continue
    
                        # Format: record_id,label (e.g., "A00/A00003,N")
                        parts = line.split(',')
                        if len(parts) != 2:
                            continue
    
                        record_id_raw, label = parts[0].strip(), parts[1].strip()
    
                        # Normalize record_id: use forward slashes
                        record_id = record_id_raw.replace('\\', '/')
    
                        # Skip duplicates across files
                        if record_id in seen_ids:
                            continue
                        seen_ids.add(record_id)
    
                        # Map label
                        mapped = cinc_label_map.get(label, '')
    
                        records.append({
                            'dataset': 'CinC2017',
                            'record_id': record_id,
                            'original_label_text': label,
                            'mapped_label': mapped
                        })
            except Exception as e:
                print(f"Error reading {ref_file.name}: {e}")
    
        print(f"✓ CinC2017: Parsed {len(records)} records from {len(reference_files)} reference file(s)")
        return records
    
    
    def parse_wfdb_fallback(dataset_path: Path, dataset_name: str) -> List[Dict[str, str]]:
        """
        Fallback parser for WFDB-format datasets (PTB_Diagnostic, Chapman_Shaoxing).
        Lists all .hea files and uses relative path as record_id.
    
        For these datasets, we don't have direct label info, so original_label_text
        will be empty and mapped_label will be empty (will become OTHER in preprocessing).
    
        Returns:
            List of dicts with keys: dataset, record_id, original_label_text, mapped_label
        """
        records = []
    
        # Check for RECORDS file first
        records_file = dataset_path / "RECORDS"
        if records_file.exists():
            try:
                with open(records_file, 'r', encoding='utf-8', errors='replace') as f:
                    for line in f:
                        line = line.strip()
                        if not line or line.startswith('#'):
                            continue
    
                        # RECORDS file lists paths relative to dataset root
                        record_id = line.replace('\\', '/')
    
                        # For Chapman, RECORDS lists directories; we need to find .hea files
                        if dataset_name == 'Chapman_Shaoxing':
                            record_path = dataset_path / line
                            if record_path.is_dir():
                                # Find .hea files in this directory
                                hea_files = list(record_path.glob("*.hea"))
                                for hea in hea_files:
                                    rel_path = hea.relative_to(dataset_path).as_posix()
                                    # Remove .hea extension for record_id
                                    rel_id = rel_path[:-4] if rel_path.endswith('.hea') else rel_path
    
                                    # Try to infer label from header if available
                                    original_label = extract_label_from_wfdb_header(hea)
                                    mapped = map_label_heuristic(original_label)
    
                                    records.append({
                                        'dataset': dataset_name,
                                        'record_id': rel_id,
                                        'original_label_text': original_label,
                                        'mapped_label': mapped
                                    })
                            else:
                                # Single record entry
                                original_label = ""
                                hea_path = dataset_path / f"{line}.hea"
                                if hea_path.exists():
                                    original_label = extract_label_from_wfdb_header(hea_path)
    
                                mapped = map_label_heuristic(original_label)
                                records.append({
                                    'dataset': dataset_name,
                                    'record_id': record_id,
                                    'original_label_text': original_label,
                                    'mapped_label': mapped
                                })
                        else:
                            # PTB_Diagnostic: RECORDS lists actual record paths
                            original_label = ""
                            hea_path = dataset_path / f"{line}.hea"
                            if hea_path.exists():
                                original_label = extract_label_from_wfdb_header(hea_path)
    
                            mapped = map_label_heuristic(original_label)
                            records.append({
                                'dataset': dataset_name,
                                'record_id': record_id,
                                'original_label_text': original_label,
                                'mapped_label': mapped
                            })
            except Exception as e:
                print(f"Error reading RECORDS file for {dataset_name}: {e}")
    
        # If no RECORDS file or it failed, fall back to globbing
        if not records:
            print(f"Scanning {dataset_name} for .hea files (no RECORDS file)...")
            hea_files = list(dataset_path.rglob("*.hea"))
            for hea in hea_files:
                try:
                    rel_path = hea.relative_to(dataset_path).as_posix()
                    # Remove .hea extension
                    rel_id = rel_path[:-4] if rel_path.endswith('.hea') else rel_path
    
                    original_label = extract_label_from_wfdb_header(hea)
                    mapped = map_label_heuristic(original_label)
    
                    records.append({
                        'dataset': dataset_name,
                        'record_id': rel_id,
                        'original_label_text': original_label,
                        'mapped_label': mapped
                    })
                except Exception as e:
                    print(f"Error processing {hea.name}: {e}")
    
        print(f"✓ {dataset_name}: Found {len(records)} records")
        return records
    
    
    def extract_label_from_wfdb_header(hea_path: Path) -> str:
        """
        Attempt to extract diagnostic label from WFDB .hea file comments.
        Many WFDB headers contain diagnostic info in comment lines.
    
        Returns:
            Extracted label text or empty string
        """
        try:
            with open(hea_path, 'r', encoding='utf-8', errors='replace') as f:
                lines = f.readlines()
    
            # Look for comment lines (start with #)
            comments = []
            for line in lines:
                line = line.strip()
                if line.startswith('#'):
                    # Remove leading # and whitespace
                    comment = line.lstrip('#').strip()
                    comments.append(comment)
    
            # Combine comments
            full_comment = ' '.join(comments)
    
            # Look for common diagnostic keywords
            if any(kw in full_comment.upper() for kw in ['MYOCARDIAL', 'INFARCT', 'MI', 'HEALTHY',
                                                           'NORMAL', 'BUNDLE', 'FIBRILLATION']):
                return full_comment
    
            return ""
        except Exception:
            return ""
    
    
    def deduplicate_records(records: List[Dict[str, str]]) -> List[Dict[str, str]]:
        """
        Remove duplicate entries based on (dataset, record_id) tuple.
        Keep the first occurrence.
    
        Args:
            records: List of record dictionaries
    
        Returns:
            Deduplicated list
        """
        seen = set()
        unique = []
    
        for rec in records:
            key = (rec['dataset'], rec['record_id'])
            if key not in seen:
                seen.add(key)
                unique.append(rec)
    
        duplicates = len(records) - len(unique)
        if duplicates > 0:
            print(f"Removed {duplicates} duplicate entries")
    
        return unique
    
    
    def write_unified_csv(records: List[Dict[str, str]], output_path: Path):
        """
        Write unified label mapping to CSV file.
    
        Args:
            records: List of record dictionaries
            output_path: Output CSV file path
        """
        # Ensure output directory exists
        output_path.parent.mkdir(parents=True, exist_ok=True)
    
        # Write CSV
        fieldnames = ['dataset', 'record_id', 'original_label_text', 'mapped_label']
    
        try:
            with open(output_path, 'w', newline='', encoding='utf-8') as f:
                writer = csv.DictWriter(f, fieldnames=fieldnames)
                writer.writeheader()
                writer.writerows(records)
    
            print(f"\n✓ Wrote {len(records)} records to {output_path}")
        except Exception as e:
            print(f"Error writing CSV: {e}")
            sys.exit(1)
    
    
    def print_summary(records: List[Dict[str, str]]):
        """
        Print summary statistics of the generated mapping.
    
        Args:
            records: List of record dictionaries
        """
        print("\n" + "="*60)
        print("SUMMARY STATISTICS")
        print("="*60)
    
        # Count by dataset
        by_dataset = Counter(r['dataset'] for r in records)
        print("\nRecords per dataset:")
        for ds, count in sorted(by_dataset.items()):
            print(f"  {ds:25s}: {count:6,d}")
    
        # Count by mapped label
        by_label = Counter(r['mapped_label'] for r in records)
        print("\nRecords per mapped label:")
        for label in TARGET_LABELS:
            count = by_label.get(label, 0)
            print(f"  {label:10s}: {count:6,d}")
    
        unmapped_count = by_label.get('', 0)
        print(f"  {'(unmapped)':10s}: {unmapped_count:6,d}")
    
        # Unmapped percentage
        if records:
            unmapped_pct = (unmapped_count / len(records)) * 100
            print(f"\nUnmapped: {unmapped_pct:.1f}%")
            print("(Unmapped records will be assigned to OTHER during preprocessing)")
    
        print("\n" + "="*60)
    
    
    def main():
        """Main execution function."""
        print("="*60)
        print("ECG Unified Label Mapping Generator")
        print("="*60)
        print(f"Project root: {ROOT}")
        print(f"Dataset directory: {DATASET_DIR}")
        print(f"Output: {OUTPUT_CSV}")
        print()
    
        # Check if dataset directory exists
        if not DATASET_DIR.exists():
            print(f"Error: Dataset directory not found: {DATASET_DIR}")
            sys.exit(1)
    
        all_records = []
    
        # Parse PTB-XL
        ptbxl_path = DATASET_DIR / "ptb-xl"
        if ptbxl_path.exists():
            print(f"\nProcessing PTB-XL at {ptbxl_path}...")
            all_records.extend(parse_ptbxl(ptbxl_path))
        else:
            print(f"\nSkipping PTB-XL (not found at {ptbxl_path})")
    
        # Parse CinC2017
        cinc_path = DATASET_DIR / "CinC2017"
        if cinc_path.exists():
            print(f"\nProcessing CinC2017 at {cinc_path}...")
            all_records.extend(parse_cinc2017(cinc_path))
        else:
            print(f"\nSkipping CinC2017 (not found at {cinc_path})")
    
        # Parse PTB_Diagnostic
        ptb_diag_path = DATASET_DIR / "PTB_Diagnostic"
        if ptb_diag_path.exists():
            print(f"\nProcessing PTB_Diagnostic at {ptb_diag_path}...")
            all_records.extend(parse_wfdb_fallback(ptb_diag_path, "PTB_Diagnostic"))
        else:
            print(f"\nSkipping PTB_Diagnostic (not found at {ptb_diag_path})")
    
        # Parse Chapman_Shaoxing
        chapman_path = DATASET_DIR / "Chapman_Shaoxing"
        if chapman_path.exists():
            print(f"\nProcessing Chapman_Shaoxing at {chapman_path}...")
            all_records.extend(parse_wfdb_fallback(chapman_path, "Chapman_Shaoxing"))
        else:
            print(f"\nSkipping Chapman_Shaoxing (not found at {chapman_path})")
    
        # Check if we found any records
        if not all_records:
            print("\nError: No records found in any dataset!")
            sys.exit(1)
    
        print(f"\nTotal records collected: {len(all_records)}")
    
        # Deduplicate
        print("\nDeduplicating records...")
        unique_records = deduplicate_records(all_records)
    
        # Write output
        print(f"\nWriting output to {OUTPUT_CSV}...")
        write_unified_csv(unique_records, OUTPUT_CSV)
    
        # Print summary
        print_summary(unique_records)
    
        print("\n✓ Done!")
        print(f"\nNext steps:")
        print(f"1. Review the generated file: {OUTPUT_CSV}")
        print(f"2. Manually refine any ambiguous mappings if needed")
        print(f"3. Copy/rename to logs/unified_label_mapping.csv when ready")
    
    
    if __name__ == "__main__":
        main()
    
except Exception as _e:
    print('Warning: inlined module', 'generate_unified_mapping.py', 'raised', _e)




In [16]:
# --- improve_mapping.py (inlined) ---
try:
    """
    Improved label mapping using enhanced heuristics.
    Analyzes unmapped records and attempts to assign labels based on pattern matching.
    """
    import pandas as pd
    import re
    from pathlib import Path
    from collections import Counter
    
    ROOT = Path(__file__).parent.parent
    LOGS_DIR = ROOT / "logs"
    
    # Load current mapping
    mapping_file = LOGS_DIR / "unified_label_mapping.csv"
    df = pd.read_csv(mapping_file, dtype=str).fillna("")
    
    print("="*80)
    print("LABEL MAPPING IMPROVEMENT")
    print("="*80)
    
    # Identify unmapped
    unmapped = df[df['mapped_label'] == ''].copy()
    print(f"\nUnmapped records: {len(unmapped):,}")
    
    # Enhanced heuristics
    def enhanced_label_heuristic(original_text: str, dataset: str) -> str:
        """
        Apply enhanced heuristics to guess labels.
        Returns label or empty string if uncertain.
        """
        if not original_text:
            return ""
    
        text_lower = str(original_text).lower()
    
        # MI patterns (more comprehensive)
        mi_patterns = [
            r'\bmi\b', r'myocardial\s*infarction', r'\bstemi\b', r'\bnstemi\b',
            r'anterior\s*infarct', r'inferior\s*infarct', r'lateral\s*infarct',
            r'old\s*infarct', r'recent\s*infarct', r'\bami\b', r'\bpmi\b',
            r'q\s*wave', r'pathologic.*q', r'\bimis?\b'
        ]
    
        # AF patterns
        af_patterns = [
            r'\baf\b', r'atrial\s*fib', r'a[\s-]*fib', r'\bafib\b', r'\baflu?\b',
            r'atrial\s*flutter', r'a[\s-]*flutter', r'\bpaf\b'
        ]
    
        # BBB patterns
        bbb_patterns = [
            r'\bbbb\b', r'bundle\s*branch\s*block', r'\blbbb\b', r'\brbbb\b',
            r'left\s*bundle', r'right\s*bundle', r'bifascicular',
            r'incomplete.*bundle', r'complete.*bundle'
        ]
    
        # NORM patterns
        norm_patterns = [
            r'\bnorm(al)?\b', r'sinus\s*rhythm', r'\bsr\b', r'regular\s*rhythm',
            r'no\s*abnormal', r'within\s*normal', r'unremarkable',
            r'^normal$', r'healthy', r'control'
        ]
    
        # Check patterns (order matters - specific before general)
        for pattern in mi_patterns:
            if re.search(pattern, text_lower):
                return "MI"
    
        for pattern in af_patterns:
            if re.search(pattern, text_lower):
                return "AF"
    
        for pattern in bbb_patterns:
            if re.search(pattern, text_lower):
                return "BBB"
    
        for pattern in norm_patterns:
            if re.search(pattern, text_lower):
                return "NORM"
    
        # Dataset-specific rules
        if dataset == "PTB_Diagnostic":
            # PTB often has "Healthy control" or specific condition names
            if "healthy" in text_lower or "control" in text_lower:
                return "NORM"
            if "myocardial infarction" in text_lower:
                return "MI"
    
        if dataset == "ptb-xl":
            # PTBXL uses SCP codes, check for common patterns
            if "norm" in text_lower:
                return "NORM"
    
        return ""
    
    # Apply enhanced heuristics to unmapped records
    print("\nApplying enhanced heuristics...")
    improvements = []
    
    for idx, row in unmapped.iterrows():
        original = str(row.get('original_label_text', '')).strip()
        dataset = str(row.get('dataset', '')).strip()
    
        new_label = enhanced_label_heuristic(original, dataset)
    
        if new_label:
            improvements.append({
                'index': idx,
                'dataset': dataset,
                'record_id': row.get('record_id', ''),
                'original_text': original,
                'suggested_label': new_label
            })
    
    print(f"Found {len(improvements):,} potential improvements")
    
    # Show distribution of suggested labels
    if improvements:
        suggested_counts = Counter(imp['suggested_label'] for imp in improvements)
        print(f"\nSuggested label distribution:")
        for label, count in suggested_counts.most_common():
            pct = count / len(improvements) * 100
            print(f"  {label}: {count:,} ({pct:.1f}%)")
    
        # Save suggestions to file
        improvements_df = pd.DataFrame(improvements)
        output_file = LOGS_DIR / "mapping_improvements_suggested.csv"
        improvements_df.to_csv(output_file, index=False)
        print(f"\n✓ Saved suggestions to: {output_file}")
    
        # Show samples from each label
        print(f"\nExample suggestions (5 per label):")
        for label in ['MI', 'AF', 'BBB', 'NORM']:
            label_samples = improvements_df[improvements_df['suggested_label'] == label].head(5)
            if not label_samples.empty:
                print(f"\n  {label}:")
                for _, row in label_samples.iterrows():
                    print(f"    - [{row['dataset']}] {row['original_text'][:60]}")
    
        # Option to apply improvements
        print("\n" + "="*80)
        print("TO APPLY THESE IMPROVEMENTS:")
        print("="*80)
        print("Review the suggestions in: logs/mapping_improvements_suggested.csv")
        print("Then run: python scripts/apply_mapping_improvements.py")
        print("\nThis will create an updated unified_label_mapping.csv file.")
    else:
        print("\nNo improvements found with current heuristics.")
    
    print("\n" + "="*80)
    print("ANALYSIS COMPLETE")
    print("="*80)
    
except Exception as _e:
    print('Warning: inlined module', 'improve_mapping.py', 'raised', _e)




In [17]:
# --- model_smoke_test.py (inlined) ---
try:
    """Model smoke test - verify shapes and forward pass"""
    import json
    import numpy as np
    import torch
    import torch.nn as nn
    from pathlib import Path
    
    # Simple 1D CNN for testing
    class ECGNet1D(nn.Module):
        def __init__(self, n_classes=5, input_channels=1, base_channels=32, dropout=0.3):
            super(ECGNet1D, self).__init__()
            
            self.conv1 = nn.Conv1d(input_channels, base_channels, kernel_size=7, stride=2, padding=3)
            self.bn1 = nn.BatchNorm1d(base_channels)
            
            self.conv2 = nn.Conv1d(base_channels, base_channels * 2, kernel_size=5, stride=2, padding=2)
            self.bn2 = nn.BatchNorm1d(base_channels * 2)
            
            self.conv3 = nn.Conv1d(base_channels * 2, base_channels * 4, kernel_size=3, stride=2, padding=1)
            self.bn3 = nn.BatchNorm1d(base_channels * 4)
            
            self.conv4 = nn.Conv1d(base_channels * 4, base_channels * 8, kernel_size=3, stride=2, padding=1)
            self.bn4 = nn.BatchNorm1d(base_channels * 8)
            
            self.global_pool = nn.AdaptiveAvgPool1d(1)
            self.dropout = nn.Dropout(dropout)
            self.fc = nn.Linear(base_channels * 8, n_classes)
            
            self.relu = nn.ReLU(inplace=True)
        
        def forward(self, x):
            x = self.relu(self.bn1(self.conv1(x)))
            x = self.relu(self.bn2(self.conv2(x)))
            x = self.relu(self.bn3(self.conv3(x)))
            x = self.relu(self.bn4(self.conv4(x)))
            x = self.global_pool(x)
            x = x.view(x.size(0), -1)
            x = self.dropout(x)
            x = self.fc(x)
            return x
    
    
    ROOT = Path(__file__).parent.parent
    PROCESSED_DIR = ROOT / "artifacts" / "processed"
    RECORDS_DIR = PROCESSED_DIR / "records"
    CHECKPOINTS_DIR = PROCESSED_DIR / "checkpoints"
    CHECKPOINTS_DIR.mkdir(exist_ok=True)
    
    print("="*80)
    print("MODEL SMOKE TEST")
    print("="*80)
    
    # Device
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"\nDevice: {device}")
    
    # Load splits
    splits_file = PROCESSED_DIR / "splits.json"
    with open(splits_file, 'r') as f:
        splits = json.load(f)
    
    # Load 3 samples from each label (if available)
    print("\nLoading sample records from each label...")
    by_label = {}
    for entry in splits['train'][:100]:  # Check first 100
        label = entry['label']
        if label not in by_label:
            by_label[label] = []
        if len(by_label[label]) < 3:
            by_label[label].append(entry)
    
    print(f"Loaded samples for {len(by_label)} labels")
    
    # Load signals into a batch
    signals = []
    labels = []
    for label, entries in by_label.items():
        for entry in entries:
            npy_file = PROCESSED_DIR / entry['path']
            signal = np.load(npy_file, allow_pickle=False)
            signals.append(signal)
            labels.append(label)
    
    batch_signals = torch.from_numpy(np.stack(signals)).float()
    batch_labels = torch.tensor(labels, dtype=torch.long)
    
    print(f"\nBatch shape: {batch_signals.shape}")
    print(f"Labels shape: {batch_labels.shape}")
    print(f"Label values: {batch_labels.tolist()}")
    
    # Create model
    model = ECGNet1D(n_classes=5, base_channels=32, dropout=0.3)
    model = model.to(device)
    
    total_params = sum(p.numel() for p in model.parameters())
    print(f"\nModel parameters: {total_params:,}")
    
    # Forward pass
    batch_signals = batch_signals.to(device)
    batch_labels = batch_labels.to(device)
    
    model.eval()
    with torch.no_grad():
        outputs = model(batch_signals)
        predictions = outputs.argmax(dim=1)
    
    print(f"\nForward pass successful!")
    print(f"Output shape: {outputs.shape}")
    print(f"Predictions: {predictions.cpu().tolist()}")
    print(f"True labels: {batch_labels.cpu().tolist()}")
    
    # Save skeleton checkpoint
    checkpoint_path = CHECKPOINTS_DIR / "checkpoint_skel.pth"
    torch.save({
        'model_state_dict': model.state_dict(),
        'n_classes': 5,
        'base_channels': 32,
        'dropout': 0.3
    }, checkpoint_path)
    
    print(f"\n✓ Saved skeleton checkpoint to {checkpoint_path}")
    
    print("\n" + "="*80)
    print("MODEL SMOKE TEST PASSED")
    print("="*80)
    
except Exception as _e:
    print('Warning: inlined module', 'model_smoke_test.py', 'raised', _e)




In [18]:
# --- preprocess_streaming.py (inlined) ---
try:
    """
    Memory-safe, idempotent streaming preprocessing for ECG datasets.
    Saves individual .npy files with metadata and creates manifest.jsonl for lazy loading.
    """
    import os
    import sys
    import json
    import time
    import logging
    from pathlib import Path
    from collections import Counter, defaultdict
    from datetime import datetime
    
    import numpy as np
    import pandas as pd
    from scipy.signal import resample
    from scipy.io import loadmat
    from tqdm import tqdm
    
    # Windows asyncio fix
    if sys.platform == "win32":
        import asyncio
        asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
    
    # Configuration
    ROOT = Path(__file__).parent.parent
    DATASET_DIR = ROOT / "Dataset"
    ARTIFACTS_DIR = ROOT / "artifacts"
    PROCESSED_DIR = ARTIFACTS_DIR / "processed"
    RECORDS_DIR = PROCESSED_DIR / "records"
    LOGS_DIR = ROOT / "logs"
    
    # Create directories
    for d in [ARTIFACTS_DIR, PROCESSED_DIR, RECORDS_DIR, LOGS_DIR]:
        d.mkdir(parents=True, exist_ok=True)
    
    # Setup logging
    log_file = LOGS_DIR / "preprocess_automation.log"
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s | %(levelname)s | %(message)s',
        handlers=[
            logging.FileHandler(log_file, mode='a'),
            logging.StreamHandler(sys.stdout)
        ]
    )
    logger = logging.getLogger(__name__)
    
    # Constants
    TARGET_FS = 500
    TARGET_SAMPLES = 5000
    LABEL_ORDER = ['MI', 'AF', 'BBB', 'NORM', 'OTHER']
    LABEL_TO_INT = {name: i for i, name in enumerate(LABEL_ORDER)}
    INT_TO_LABEL = {i: name for name, i in LABEL_TO_INT.items()}
    
    # Environment variable for smoke testing
    PREPROCESS_LIMIT = int(os.getenv('ECG_PREPROCESS_LIMIT', '0'))
    
    logger.info("="*80)
    logger.info("ECG STREAMING PREPROCESSING")
    logger.info("="*80)
    logger.info(f"ROOT: {ROOT}")
    logger.info(f"DATASET_DIR: {DATASET_DIR}")
    logger.info(f"RECORDS_DIR: {RECORDS_DIR}")
    logger.info(f"PREPROCESS_LIMIT: {PREPROCESS_LIMIT if PREPROCESS_LIMIT > 0 else 'UNLIMITED'}")
    
    
    def zscore_normalize(arr: np.ndarray) -> np.ndarray:
        """Z-score normalization"""
        arr = arr.astype(np.float32)
        mean = arr.mean()
        std = arr.std()
        if std < 1e-8:
            std = 1.0
        return ((arr - mean) / std).astype(np.float32)
    
    
    def pad_or_truncate(x: np.ndarray, target_length: int) -> np.ndarray:
        """Pad with zeros or truncate"""
        if x.size >= target_length:
            return x[:target_length]
        pad_width = target_length - x.size
        return np.pad(x, (0, pad_width), mode='constant', constant_values=0).astype(np.float32)
    
    
    def resample_signal(x: np.ndarray, original_fs: float, target_fs: float) -> np.ndarray:
        """Resample signal"""
        if original_fs is None or np.isclose(original_fs, target_fs):
            return x
        new_length = int(round(x.size * target_fs / original_fs))
        if new_length <= 0:
            return x
        return resample(x, new_length).astype(np.float32)
    
    
    def read_wfdb(hea_path: Path):
        """Read WFDB format (.hea/.dat)"""
        try:
            import wfdb
            record_path = str(hea_path.with_suffix(''))
            record = wfdb.rdsamp(record_path)
            signal = np.asarray(record[0], dtype=np.float32)
            fs = float(record[1].get('fs', TARGET_FS))
    
            # Convert to 1D
            if signal.ndim == 2:
                signal_1d = signal.mean(axis=1) if signal.shape[1] > 1 else signal[:, 0]
            else:
                signal_1d = signal.reshape(-1)
    
            return signal_1d.astype(np.float32), fs
        except Exception as e:
            raise RuntimeError(f"Failed to read WFDB {hea_path.name}: {e}")
    
    
    def read_mat(mat_path: Path):
        """Read MATLAB .mat file"""
        try:
            mat_data = loadmat(str(mat_path))
    
            signal = None
            for key in ['val', 'data', 'signal', 'ecg']:
                if key in mat_data:
                    signal = np.asarray(mat_data[key], dtype=np.float32)
                    break
    
            if signal is None:
                for value in mat_data.values():
                    if isinstance(value, np.ndarray) and value.size > 100:
                        signal = value.astype(np.float32)
                        break
    
            if signal is None:
                raise RuntimeError("No signal array found")
    
            # Convert to 1D
            if signal.ndim == 2:
                signal_1d = signal.mean(axis=0) if signal.shape[0] > 1 else signal.reshape(-1)
            else:
                signal_1d = signal.reshape(-1)
    
            return signal_1d.astype(np.float32), None
        except Exception as e:
            raise RuntimeError(f"Failed to read MAT {mat_path.name}: {e}")
    
    
    def load_mapping_index():
        """Load unified label mapping"""
        mapping_file = LOGS_DIR / "unified_label_mapping.csv"
        mapping_index = {}
    
        if not mapping_file.exists():
            logger.warning(f"Mapping file not found: {mapping_file}")
            return mapping_index
    
        df = pd.read_csv(mapping_file, dtype=str).fillna("")
    
        for _, row in df.iterrows():
            dataset = str(row.get("dataset", "")).strip()
            record_id = str(row.get("record_id", "")).strip().replace("\\", "/").strip("/")
            mapped_label = str(row.get("mapped_label", "")).strip().upper()
    
            if not dataset or not record_id:
                continue
    
            if dataset not in mapping_index:
                mapping_index[dataset] = {}
    
            # Add full path
            mapping_index[dataset][record_id] = mapped_label
    
            # Add variants
            parts = record_id.split("/")
            if len(parts) >= 1:
                mapping_index[dataset][parts[-1]] = mapped_label
            if len(parts) >= 2:
                mapping_index[dataset]["/".join(parts[-2:])] = mapped_label
    
        logger.info(f"Loaded mapping for {len(mapping_index)} datasets")
        return mapping_index
    
    
    def lookup_mapped_label(path: Path, mapping_index: dict) -> str:
        """Look up mapped label"""
        try:
            rel_path = path.relative_to(DATASET_DIR).with_suffix("")
        except Exception:
            rel_path = path.with_suffix("")
    
        parts = rel_path.as_posix().split("/")
        dataset = parts[0] if parts else ""
    
        if dataset not in mapping_index:
            return "OTHER"
    
        index = mapping_index[dataset]
    
        candidates = [
            rel_path.as_posix(),
            "/".join(parts[1:]) if len(parts) > 1 else "",
            "/".join(parts[-2:]) if len(parts) >= 2 else "",
            rel_path.name
        ]
    
        for key in candidates:
            if key and key in index:
                label = index[key].upper()
                return label if label in LABEL_TO_INT else "OTHER"
    
        # CinC special case
        if "CinC" in dataset and len(parts) >= 3:
            alt_key = "/".join(parts[2:])
            if alt_key in index:
                label = index[alt_key].upper()
                return label if label in LABEL_TO_INT else "OTHER"
    
        return "OTHER"
    
    
    def load_progress_checkpoint():
        """Load progress checkpoint"""
        checkpoint_file = PROCESSED_DIR / "progress.json"
        if checkpoint_file.exists():
            with open(checkpoint_file, 'r') as f:
                return json.load(f)
        return {"processed_files": set(), "last_index": 0, "processed_count": 0}
    
    
    def save_progress_checkpoint(processed_files: set, last_index: int, processed_count: int):
        """Save progress checkpoint"""
        checkpoint_file = PROCESSED_DIR / "progress.json"
        checkpoint = {
            "processed_files": list(processed_files),
            "last_index": last_index,
            "processed_count": processed_count,
            "timestamp": datetime.utcnow().isoformat()
        }
        with open(checkpoint_file, 'w') as f:
            json.dump(checkpoint, f, indent=2)
    
    
    def main():
        """Main preprocessing pipeline"""
        logger.info("Starting preprocessing...")
    
        # Load mapping
        mapping_index = load_mapping_index()
    
        # Load progress checkpoint
        checkpoint = load_progress_checkpoint()
        processed_files = set(checkpoint.get("processed_files", []))
        logger.info(f"Loaded checkpoint: {len(processed_files)} files already processed")
    
        # Find all files
        logger.info(f"Scanning {DATASET_DIR}...")
        hea_files = sorted(DATASET_DIR.rglob("*.hea"))
        mat_files = sorted(DATASET_DIR.rglob("*.mat"))
        all_files = hea_files + mat_files
    
        logger.info(f"Found {len(hea_files)} .hea and {len(mat_files)} .mat files")
    
        # Apply limit for smoke testing
        if PREPROCESS_LIMIT > 0:
            all_files = all_files[:PREPROCESS_LIMIT]
            logger.info(f"LIMITED TO: {len(all_files)} files (ECG_PREPROCESS_LIMIT={PREPROCESS_LIMIT})")
    
        # Filter out already processed
        all_files = [f for f in all_files if str(f) not in processed_files]
        logger.info(f"Files to process (after skipping existing): {len(all_files)}")
    
        if not all_files:
            logger.info("No new files to process!")
            return
    
        # Open manifest file in append mode
        manifest_file = PROCESSED_DIR / "manifest.jsonl"
        manifest_fp = open(manifest_file, 'a', encoding='utf-8')
    
        # Processing loop
        label_counts = Counter()
        skipped = 0
        start_time = time.time()
        save_times = []
    
        progress_bar = tqdm(all_files, desc="Processing", unit="file")
    
        for idx, file_path in enumerate(progress_bar):
            try:
                # Generate record ID
                try:
                    rel_path = file_path.relative_to(DATASET_DIR).with_suffix("")
                    record_id = rel_path.as_posix().replace("/", "__")
                except Exception:
                    record_id = file_path.stem
    
                # Check if already exists
                npy_file = RECORDS_DIR / f"{record_id}.npy"
                meta_file = RECORDS_DIR / f"{record_id}.meta.json"
                label_file = RECORDS_DIR / f"{record_id}.label"
    
                if npy_file.exists() and meta_file.exists() and label_file.exists():
                    processed_files.add(str(file_path))
                    continue
    
                # Read signal
                read_start = time.time()
                if file_path.suffix.lower() == '.hea':
                    signal, fs = read_wfdb(file_path)
                else:
                    signal, fs = read_mat(file_path)
    
                # Resample
                if fs is not None and not np.isclose(fs, TARGET_FS):
                    signal = resample_signal(signal, fs, TARGET_FS)
    
                # Normalize
                signal = zscore_normalize(signal)
    
                # Pad/truncate
                signal = pad_or_truncate(signal, TARGET_SAMPLES)
    
                # Add channel dimension
                signal = signal[np.newaxis, :]
    
                # Lookup label
                mapped_label = lookup_mapped_label(file_path, mapping_index)
                label_int = LABEL_TO_INT.get(mapped_label, LABEL_TO_INT["OTHER"])
    
                # Save files
                save_start = time.time()
                np.save(npy_file, signal, allow_pickle=False)
    
                with open(label_file, 'w') as f:
                    f.write(str(label_int))
    
                meta = {
                    "dataset": rel_path.parts[0] if len(rel_path.parts) > 0 else "unknown",
                    "source_path": str(file_path.relative_to(DATASET_DIR)),
                    "original_fs": float(fs) if fs is not None else None,
                    "mapped_label": mapped_label,
                    "label_int": int(label_int)
                }
                with open(meta_file, 'w') as f:
                    json.dump(meta, f)
    
                # Append to manifest
                manifest_entry = {
                    "path": f"records/{npy_file.name}",
                    "label": int(label_int)
                }
                manifest_fp.write(json.dumps(manifest_entry) + '\n')
                manifest_fp.flush()
    
                save_times.append(time.time() - save_start)
    
                # Update counters
                label_counts[label_int] += 1
                processed_files.add(str(file_path))
    
                # Periodic checkpoint save
                if (idx + 1) % 1000 == 0:
                    save_progress_checkpoint(processed_files, idx, len(processed_files))
                    elapsed = time.time() - start_time
                    speed = len(processed_files) / elapsed
                    avg_save_time = np.mean(save_times[-100:]) if save_times else 0
                    progress_bar.set_postfix({
                        'rec/s': f'{speed:.1f}',
                        'skip': skipped,
                        'save_ms': f'{avg_save_time*1000:.1f}'
                    })
    
            except Exception as e:
                skipped += 1
                if skipped <= 20:
                    logger.error(f"Error processing {file_path.name}: {e}")
                progress_bar.set_postfix({'skipped': skipped})
    
        progress_bar.close()
        manifest_fp.close()
    
        # Final checkpoint
        save_progress_checkpoint(processed_files, len(all_files), len(processed_files))
    
        # Print summary
        elapsed = time.time() - start_time
        logger.info("="*80)
        logger.info("PREPROCESSING SUMMARY")
        logger.info("="*80)
        logger.info(f"Total processed: {len(processed_files):,}")
        logger.info(f"Skipped (errors): {skipped:,}")
        logger.info(f"Time elapsed: {elapsed:.1f}s ({len(processed_files)/elapsed:.1f} rec/s)")
        logger.info(f"\nLabel distribution:")
        for idx, label_name in enumerate(LABEL_ORDER):
            count = label_counts[idx]
            pct = (count / len(processed_files) * 100) if processed_files else 0
            logger.info(f"  {idx}={label_name:5s}: {count:6,d} ({pct:5.1f}%)")
    
        # Create splits
        create_splits()
    
        logger.info("\nPreprocessing complete!")
    
    
    def create_splits():
        """Create stratified train/val/test splits"""
        logger.info("\nCreating stratified splits...")
    
        # Read manifest
        manifest_file = PROCESSED_DIR / "manifest.jsonl"
        if not manifest_file.exists():
            logger.error("Manifest file not found!")
            return
    
        manifest = []
        with open(manifest_file, 'r') as f:
            for line in f:
                if line.strip():
                    manifest.append(json.loads(line))
    
        logger.info(f"Loaded {len(manifest)} entries from manifest")
    
        # Group by label
        by_label = defaultdict(list)
        for entry in manifest:
            by_label[entry['label']].append(entry)
    
        # Stratified split
        train_list, val_list, test_list = [], [], []
        rng = np.random.default_rng(seed=42)
    
        for label, entries in by_label.items():
            entries_copy = entries.copy()
            rng.shuffle(entries_copy)
            n = len(entries_copy)
            n_train = int(n * 0.80)
            n_val = int(n * 0.10)
    
            train_list.extend(entries_copy[:n_train])
            val_list.extend(entries_copy[n_train:n_train + n_val])
            test_list.extend(entries_copy[n_train + n_val:])
    
        # Save splits
        splits_data = {
            "timestamp": datetime.utcnow().isoformat(),
            "label_order": LABEL_ORDER,
            "label_to_int": LABEL_TO_INT,
            "train": train_list,
            "val": val_list,
            "test": test_list,
            "counts": {
                "train": len(train_list),
                "val": len(val_list),
                "test": len(test_list),
                "total": len(manifest)
            }
        }
    
        splits_file = PROCESSED_DIR / "splits.json"
        with open(splits_file, 'w') as f:
            json.dump(splits_data, f, indent=2)
    
        # Save label map
        label_map = {
            "label_to_int": LABEL_TO_INT,
            "int_to_label": INT_TO_LABEL
        }
        label_map_file = PROCESSED_DIR / "label_map.json"
        with open(label_map_file, 'w') as f:
            json.dump(label_map, f, indent=2)
    
        # Save labels array
        np.save(PROCESSED_DIR / "labels.npy", np.array(LABEL_ORDER, dtype=object))
    
        logger.info(f"Saved splits to {splits_file}")
        logger.info(f"  Train: {len(train_list):,}")
        logger.info(f"  Val:   {len(val_list):,}")
        logger.info(f"  Test:  {len(test_list):,}")
    
    
    if __name__ == "__main__":
        main()
    
except Exception as _e:
    print('Warning: inlined module', 'preprocess_streaming.py', 'raised', _e)




In [19]:
# --- run_full_automation.py (inlined) ---
try:
    """
    Complete preprocessing automation orchestrator.
    Runs the full pipeline with checks and validation.
    """
    import sys
    import time
    import subprocess
    from pathlib import Path
    from datetime import datetime
    
    ROOT = Path(__file__).parent.parent
    LOGS_DIR = ROOT / "logs"
    SCRIPTS_DIR = ROOT / "scripts"
    VENV_PYTHON = ROOT / ".venv1" / "Scripts" / "python.exe"
    
    # Ensure log directory exists
    LOGS_DIR.mkdir(exist_ok=True)
    
    def run_script(script_name: str, description: str, env_vars: dict = None) -> bool:
        """Run a Python script and return success status"""
        print(f"\n{'='*80}")
        print(f"RUNNING: {description}")
        print(f"{'='*80}")
    
        script_path = SCRIPTS_DIR / script_name
        if not script_path.exists():
            print(f"Error: Script not found: {script_path}")
            return False
    
        cmd = [str(VENV_PYTHON), str(script_path)]
    
        # Build environment
        import os
        env = os.environ.copy()
        if env_vars:
            env.update(env_vars)
    
        try:
            result = subprocess.run(
                cmd,
                cwd=str(ROOT),
                env=env,
                capture_output=False,
                text=True
            )
    
            if result.returncode == 0:
                print(f"✓ {description} completed successfully")
                return True
            else:
                print(f"✗ {description} failed with return code {result.returncode}")
                return False
    
        except Exception as e:
            print(f"✗ Error running {description}: {e}")
            return False
    
    
    def main():
        """Main orchestration pipeline"""
        start_time = time.time()
    
        print("="*80)
        print("ECG PREPROCESSING AUTOMATION ORCHESTRATOR")
        print("="*80)
        print(f"Started: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
        print(f"Python: {VENV_PYTHON}")
        print(f"Root: {ROOT}")
    
        # Step 1: Validate mapping
        if not run_script("validate_mapping.py", "Step 1: Validate unified label mapping"):
            print("\n⚠ Warning: Mapping validation had issues, but continuing...")
    
        # Step 2: Improve mapping (optional)
        print("\n" + "="*80)
        print("OPTIONAL: Improve Label Mapping")
        print("="*80)
        print("Would you like to run the mapping improvement heuristics?")
        print("This may increase coverage from 33.8% to ~50-60%")
        response = input("Run improvement? (y/n, default=n): ").strip().lower()
    
        if response == 'y':
            if run_script("improve_mapping.py", "Step 2a: Analyze mapping improvements"):
                print("\nReview suggestions in: logs/mapping_improvements_suggested.csv")
                apply_response = input("Apply these improvements? (y/n, default=n): ").strip().lower()
    
                if apply_response == 'y':
                    if not run_script("apply_mapping_improvements.py", "Step 2b: Apply mapping improvements"):
                        print("\n✗ Failed to apply improvements")
                        return False
    
        # Step 3: Run preprocessing
        print("\n" + "="*80)
        print("PREPROCESSING MODE SELECTION")
        print("="*80)
        print("Choose preprocessing mode:")
        print("  1. Full run (all datasets, ~7 hours)")
        print("  2. Medium test (5,000 files, ~15 minutes)")
        print("  3. Large test (20,000 files, ~1 hour)")
        print("  4. Custom limit")
    
        mode = input("Select mode (1-4, default=2): ").strip() or "2"
    
        env_vars = {}
        if mode == "1":
            print("\nStarting FULL preprocessing...")
            print("This will take approximately 7 hours.")
            confirm = input("Confirm full run? (yes/no): ").strip().lower()
            if confirm != "yes":
                print("Cancelled.")
                return False
        elif mode == "2":
            env_vars["ECG_PREPROCESS_LIMIT"] = "5000"
            print("\nRunning medium test (5,000 files)...")
        elif mode == "3":
            env_vars["ECG_PREPROCESS_LIMIT"] = "20000"
            print("\nRunning large test (20,000 files)...")
        elif mode == "4":
            limit = input("Enter custom limit: ").strip()
            try:
                limit_int = int(limit)
                env_vars["ECG_PREPROCESS_LIMIT"] = str(limit_int)
                print(f"\nRunning with custom limit ({limit_int} files)...")
            except ValueError:
                print("Invalid limit. Defaulting to 5,000.")
                env_vars["ECG_PREPROCESS_LIMIT"] = "5000"
    
        # Run preprocessing
        if not run_script("preprocess_streaming.py", "Step 3: Streaming preprocessing", env_vars):
            print("\n✗ Preprocessing failed!")
            return False
    
        # Step 4: Verify outputs
        if not run_script("verify_smoke_test.py", "Step 4: Verify preprocessing outputs"):
            print("\n⚠ Warning: Verification had issues")
    
        # Step 5: Model smoke test
        if not run_script("model_smoke_test.py", "Step 5: Model compatibility test"):
            print("\n⚠ Warning: Model test had issues")
    
        # Generate final report
        elapsed = time.time() - start_time
        report_file = LOGS_DIR / "preprocess_report.txt"
    
        with open(report_file, 'w') as f:
            f.write("="*80 + "\n")
            f.write("ECG PREPROCESSING AUTOMATION - FINAL REPORT\n")
            f.write("="*80 + "\n")
            f.write(f"Completed: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
            f.write(f"Total time: {elapsed/60:.1f} minutes ({elapsed/3600:.2f} hours)\n")
            f.write("\n")
            f.write("All steps completed successfully!\n")
            f.write("\n")
            f.write("Output locations:\n")
            f.write(f"  - Processed records: artifacts/processed/records/\n")
            f.write(f"  - Manifest: artifacts/processed/manifest.jsonl\n")
            f.write(f"  - Splits: artifacts/processed/splits.json\n")
            f.write(f"  - Label map: artifacts/processed/label_map.json\n")
            f.write(f"  - Checkpoints: artifacts/processed/checkpoints/\n")
            f.write("\n")
            f.write("Next steps:\n")
            f.write("  1. Review preprocessing log: logs/preprocess_automation.log\n")
            f.write("  2. Open notebook: notebooks/ecg_tensor_pipeline.ipynb\n")
            f.write("  3. Begin model training\n")
            f.write("\n")
            f.write("="*80 + "\n")
    
        print("\n" + "="*80)
        print("AUTOMATION COMPLETE!")
        print("="*80)
        print(f"Total time: {elapsed/60:.1f} minutes")
        print(f"Report saved to: {report_file}")
        print("\nYou can now begin model training!")
    
        return True
    
    
    if __name__ == "__main__":
        try:
            success = main()
            sys.exit(0 if success else 1)
        except KeyboardInterrupt:
            print("\n\nInterrupted by user. Progress has been saved.")
            print("You can resume by running this script again.")
            sys.exit(1)
        except Exception as e:
            print(f"\n\n✗ Fatal error: {e}")
            import traceback
            traceback.print_exc()
            sys.exit(1)
    
except Exception as _e:
    print('Warning: inlined module', 'run_full_automation.py', 'raised', _e)




In [20]:
# --- validate_mapping.py (inlined) ---
try:
    """Validate and summarize unified label mapping CSV"""
    import pandas as pd
    from pathlib import Path
    
    ROOT = Path(__file__).parent.parent
    LOGS_DIR = ROOT / "logs"
    mapping_file = LOGS_DIR / "unified_label_mapping.csv"
    
    if not mapping_file.exists():
        print(f"Error: {mapping_file} not found")
        exit(1)
    
    df = pd.read_csv(mapping_file, dtype=str).fillna('')
    
    print("="*80)
    print("UNIFIED LABEL MAPPING VALIDATION")
    print("="*80)
    print(f"Total rows: {len(df):,}")
    print(f"\nColumns: {list(df.columns)}")
    
    # Dataset distribution
    print(f"\nDataset distribution:")
    for dataset, count in df['dataset'].value_counts().items():
        print(f"  {dataset}: {count:,}")
    
    # Label distribution
    print(f"\nLabel distribution:")
    label_counts = df['mapped_label'].value_counts()
    for label, count in label_counts.items():
        pct = count / len(df) * 100
        print(f"  {label if label else '(unmapped)'}: {count:,} ({pct:.1f}%)")
    
    # Count unmapped
    unmapped = (df['mapped_label'] == '').sum()
    print(f"\nUnmapped (blank) records: {unmapped:,} ({unmapped/len(df)*100:.1f}%)")
    
    # If high unmapped rate, create sample file
    if unmapped > 0:
        unmapped_df = df[df['mapped_label'] == ''].copy()
        sample_size = min(200, len(unmapped_df))
        sample = unmapped_df.sample(n=sample_size, random_state=42)
        sample_file = LOGS_DIR / "unmapped_sample.csv"
        sample.to_csv(sample_file, index=False)
        print(f"\nSaved {sample_size} unmapped samples to: {sample_file}")
    
    print("\nValidation complete!")
    
except Exception as _e:
    print('Warning: inlined module', 'validate_mapping.py', 'raised', _e)




In [21]:
# --- verify_smoke_test.py (inlined) ---
try:
    """Verify smoke test outputs"""
    import json
    import numpy as np
    from pathlib import Path
    import random
    
    ROOT = Path(__file__).parent.parent
    PROCESSED_DIR = ROOT / "artifacts" / "processed"
    RECORDS_DIR = PROCESSED_DIR / "records"
    
    print("="*80)
    print("SMOKE TEST VERIFICATION")
    print("="*80)
    
    # Check splits.json
    splits_file = PROCESSED_DIR / "splits.json"
    if splits_file.exists():
        with open(splits_file, 'r') as f:
            splits = json.load(f)
        print(f"\n✓ splits.json exists")
        print(f"  Train: {splits['counts']['train']}")
        print(f"  Val: {splits['counts']['val']}")
        print(f"  Test: {splits['counts']['test']}")
        print(f"  Total: {splits['counts']['total']}")
    else:
        print("\n✗ splits.json missing!")
    
    # Check label_map.json
    label_map_file = PROCESSED_DIR / "label_map.json"
    if label_map_file.exists():
        with open(label_map_file, 'r') as f:
            label_map = json.load(f)
        print(f"\n✓ label_map.json exists")
        print(f"  Labels: {label_map['label_to_int']}")
    else:
        print("\n✗ label_map.json missing!")
    
    # Check labels.npy
    labels_file = PROCESSED_DIR / "labels.npy"
    if labels_file.exists():
        labels = np.load(labels_file, allow_pickle=True)
        print(f"\n✓ labels.npy exists")
        print(f"  Labels: {list(labels)}")
    else:
        print("\n✗ labels.npy missing!")
    
    # Check manifest.jsonl
    manifest_file = PROCESSED_DIR / "manifest.jsonl"
    if manifest_file.exists():
        with open(manifest_file, 'r') as f:
            manifest = [json.loads(line) for line in f if line.strip()]
        print(f"\n✓ manifest.jsonl exists")
        print(f"  Entries: {len(manifest)}")
    else:
        print("\n✗ manifest.jsonl missing!")
    
    # Check record files
    record_files = list(RECORDS_DIR.glob("*.npy"))
    print(f"\n✓ Found {len(record_files)} .npy files in records/")
    
    # Load and verify 5 random records
    if record_files:
        print("\nVerifying 5 random records...")
        samples = random.sample(record_files, min(5, len(record_files)))
    
        for i, npy_file in enumerate(samples, 1):
            try:
                # Load signal
                signal = np.load(npy_file, allow_pickle=False)
    
                # Load label
                label_file = npy_file.with_suffix('.label')
                with open(label_file, 'r') as f:
                    label = int(f.read().strip())
    
                # Load metadata
                meta_file = npy_file.parent / f"{npy_file.stem}.meta.json"
                with open(meta_file, 'r') as f:
                    meta = json.load(f)
    
                print(f"\n  {i}. {npy_file.name}")
                print(f"     Signal shape: {signal.shape}")
                print(f"     Label: {label} ({meta.get('mapped_label', 'unknown')})")
                print(f"     Dataset: {meta.get('dataset', 'unknown')}")
    
                # Verify shape
                assert signal.shape == (1, 5000), f"Expected (1, 5000), got {signal.shape}"
                assert 0 <= label <= 4, f"Invalid label: {label}"
    
                print(f"     ✓ Valid")
    
            except Exception as e:
                print(f"\n  {i}. {npy_file.name}")
                print(f"     ✗ Error: {e}")
    
    print("\n" + "="*80)
    print("VERIFICATION COMPLETE")
    print("="*80)
    
except Exception as _e:
    print('Warning: inlined module', 'verify_smoke_test.py', 'raised', _e)




## Quick: Generate/Load unified mapping (run this cell)


In [22]:

# Generate unified mapping (if you have script)
candidate = Path('logs/unified_label_mapping.candidate.csv')
prod = Path('logs/unified_label_mapping.csv')
if (Path('scripts/generate_unified_mapping.py')).exists() and not candidate.exists():
    print('Generating candidate mapping...')
    os.system(f'python "{str(Path("scripts/generate_unified_mapping.py"))}"')
else:
    print('Candidate mapping exists:', candidate.exists(), 'Prod file exists:', prod.exists())
# If you have a candidate and want to promote it, uncomment:
# if candidate.exists(): candidate.replace(prod)


Candidate mapping exists: True Prod file exists: False


## Preprocessing (streaming, memory-safe)


In [23]:

# Run streaming preprocessing from scripts/preprocess_streaming.py if present
proc_script = Path('scripts/preprocess_streaming.py')
if proc_script.exists():
    print('Launching streaming preprocessing (script)...')
    # recommend using environment var ECG_PREPROCESS_LIMIT to test
    os.system(f'python "{proc_script}"')
else:
    print('No preprocess_streaming.py found. Implement preprocessing in this notebook or inline alternate script.')


No preprocess_streaming.py found. Implement preprocessing in this notebook or inline alternate script.


## Training (run this after preprocessing finishes)


In [24]:

# Launch training script if present
train_script = Path('scripts/train_pipeline.py')  # optional
if train_script.exists():
    print('Running training script...')
    os.system(f'python "{train_script}"')
else:
    print('No training script detected. Use in-notebook training cells or create scripts/training.py and link it.')


No training script detected. Use in-notebook training cells or create scripts/training.py and link it.


## Evaluation and Visuals


In [25]:

# Run evaluation if script exists
eval_script = Path('scripts/evaluate.py')
if eval_script.exists():
    os.system(f'python "{eval_script}"')
else:
    print('No evaluate.py. Use notebook cells to visualize artifacts/figures/')


No evaluate.py. Use notebook cells to visualize artifacts/figures/


## Smoke tests and quick validation


In [26]:

# Run smoke tests
smoke = Path('scripts/verify_smoke_test.py')
if smoke.exists():
    os.system(f'python "{smoke}"')
else:
    print('No smoke-test script. Manual checks:')
    print(' - Count files:', len(list((PROCESSED_DIR/'records').glob('*.npz'))))
    print(' - Check splits:', (PROCESSED_DIR/'splits.json').exists())


No smoke-test script. Manual checks:


 - Count files: 98127
 - Check splits: True


## Final: Notebook control
You can now run cells in order. Long-running steps are executed as external scripts to avoid kernel timeouts.
