### Environment Setup


In [1]:
# Environment Setup (robust torch import)
from pathlib import Path
import os
import sys
import json
import numpy as np

# Attempt torch import with graceful fallback message for DLL issues
try:
    import torch  # noqa: E402
except OSError as e:
    print("Torch import failed (likely CUDA DLL issue). Falling back instructions.")
    print("Original error:\n", e)
    print("If you lack proper NVIDIA drivers or want CPU-only, reinstall with:\n  pip install --force-reinstall --no-cache-dir torch --index-url https://download.pytorch.org/whl/cpu")
    # Retry CPU-only import if already installed
    try:
        import importlib
        torch = importlib.import_module('torch')  # noqa: F401
    except Exception:
        raise

import wfdb
from scipy.signal import resample
from sklearn.model_selection import train_test_split  # retained (may be unused now)
import mlflow
import fastapi  # ensure core library is available
import uvicorn  # ensure core library is available

_ = (os, sys, fastapi, uvicorn)

ROOT = Path('.')
ARTIFACTS_DIR = ROOT / 'artifacts'
MODELS_DIR = ARTIFACTS_DIR / 'models'
PROCESSED_DIR = ARTIFACTS_DIR / 'processed'
MLFLOW_DIR = ARTIFACTS_DIR / 'mlflow'
DATASET_DIR = ROOT / 'dataset'
FIGURES_DIR = ROOT / 'figures'
LOGS_DIR = ROOT / 'logs'

for d in [ARTIFACTS_DIR, MODELS_DIR, PROCESSED_DIR, MLFLOW_DIR, DATASET_DIR, FIGURES_DIR, LOGS_DIR]:
    d.mkdir(parents=True, exist_ok=True)

cuda_available = torch.cuda.is_available() if hasattr(torch, 'cuda') else False
print(f"CUDA available: {cuda_available}")
if cuda_available:
    try:
        print(f"GPU: {torch.cuda.get_device_name(0)}")
    except Exception as e:
        print(f"GPU name unavailable: {e}")

print("Paths configured:")
print(f"- ROOT:          {ROOT.resolve().as_posix()}")
print(f"- DATASET_DIR:   {DATASET_DIR.resolve().as_posix()}")
print(f"- ARTIFACTS_DIR: {ARTIFACTS_DIR.resolve().as_posix()}")
print(f"  - MODELS_DIR:   {MODELS_DIR.resolve().as_posix()}")
print(f"  - PROCESSED_DIR:{PROCESSED_DIR.resolve().as_posix()}")
print(f"  - MLFLOW_DIR:   {MLFLOW_DIR.resolve().as_posix()}")
print(f"- FIGURES_DIR:   {FIGURES_DIR.resolve().as_posix()}")


  import pkg_resources  # noqa: TID251
* 'schema_extra' has been renamed to 'json_schema_extra'


CUDA available: True
GPU: NVIDIA GeForce RTX 2050
Paths configured:
- ROOT:          D:/ecg-research
- DATASET_DIR:   D:/ecg-research/dataset
- ARTIFACTS_DIR: D:/ecg-research/artifacts
  - MODELS_DIR:   D:/ecg-research/artifacts/models
  - PROCESSED_DIR:D:/ecg-research/artifacts/processed
  - MLFLOW_DIR:   D:/ecg-research/artifacts/mlflow
- FIGURES_DIR:   D:/ecg-research/figures


### Preprocessing: Load, Normalize, and Split ECG Datasets


In [3]:
# Preprocessing (memory-safe streaming approach)
from datetime import datetime, timezone
from typing import Optional, Tuple, List, Dict
from scipy.io import loadmat
from scipy.signal import resample  # added: ensure resample available if cell runs first
import pandas as pd
import numpy as np  # added: ensure np available if cell runs first
import wfdb  # added: ensure wfdb available if cell runs first
import torch  # ensure torch is available here as well
from pathlib import Path as _PathGuard
from pathlib import Path  # added: Path symbol used below

# Safety: rehydrate core paths if this cell runs before Environment Setup
try:
    ROOT
except NameError:
    ROOT = _PathGuard('.')

ARTIFACTS_DIR = ARTIFACTS_DIR if 'ARTIFACTS_DIR' in globals() else ROOT / 'artifacts'
PROCESSED_DIR = PROCESSED_DIR if 'PROCESSED_DIR' in globals() else ARTIFACTS_DIR / 'processed'
DATASET_DIR = DATASET_DIR if 'DATASET_DIR' in globals() else ROOT / 'dataset'
FIGURES_DIR = FIGURES_DIR if 'FIGURES_DIR' in globals() else ROOT / 'figures'
LOGS_DIR = LOGS_DIR if 'LOGS_DIR' in globals() else ROOT / 'logs'
for _d in [ARTIFACTS_DIR, PROCESSED_DIR, DATASET_DIR, FIGURES_DIR, LOGS_DIR]:
    _PathGuard(_d).mkdir(parents=True, exist_ok=True)

TARGET_FS = 500
TARGET_SAMPLES = 5000
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Unified label mapping setup
UNIFIED_CSV = LOGS_DIR / 'unified_label_mapping.csv'
LABEL_ORDER = ['MI', 'AF', 'BBB', 'NORM', 'OTHER']
LABEL_TO_INT: Dict[str, int] = {name: idx for idx, name in enumerate(LABEL_ORDER)}

# Build a robust mapping index from unified CSV
mapping_index: Dict[str, Dict[str, str]] = {}
if UNIFIED_CSV.exists():
    umap_df = pd.read_csv(UNIFIED_CSV, dtype=str)
    umap_df.columns = [c.strip() for c in umap_df.columns]
    required_cols = {'dataset', 'record_id', 'mapped_label'}
    missing = required_cols - set(umap_df.columns)
    if missing:
        raise RuntimeError(f"Unified mapping missing columns: {missing}")
    for _, row in umap_df.iterrows():
        ds = str(row['dataset']).strip()
        rid = str(row['record_id']).strip()
        lab = str(row['mapped_label']).strip().upper()
        if not ds or not rid:
            continue
        if ds not in mapping_index:
            mapping_index[ds] = {}
        rid_norm = rid.replace('\\', '/').strip('/')
        mapping_index[ds][rid_norm] = lab
        try:
            p = Path(rid_norm)
            mapping_index[ds][p.name] = lab
            if len(p.parts) >= 2:
                mapping_index[ds]['/'.join(p.parts[-2:])] = lab
            if len(p.parts) >= 2 and p.parts[0] == ds:
                mapping_index[ds]['/'.join(p.parts[1:])] = lab
        except Exception:
            pass
else:
    print(f"WARNING: {UNIFIED_CSV.as_posix()} not found. Unmapped records will default to OTHER.")

print("Scanning dataset directory for WFDB headers (.hea)...")
hea_files = sorted(DATASET_DIR.rglob('*.hea'))
mat_files = sorted(DATASET_DIR.rglob('*.mat')) if not hea_files else []
if not hea_files and not mat_files:
    raise RuntimeError(f"No supported ECG files found under {DATASET_DIR} (expected .hea or .mat)")

# --- Utility functions ---

def zscore_normalize(x: np.ndarray) -> np.ndarray:
    x = np.asarray(x, dtype=np.float32)
    m = float(x.mean())
    s = float(x.std())
    if s < 1e-8:
        s = 1.0
    return (x - m) / s

def pad_or_truncate(x: np.ndarray, length: int) -> np.ndarray:
    if x.size < length:
        return np.pad(x, (0, length - x.size), mode='constant')
    if x.size > length:
        return x[:length]
    return x

def _wfdb_read(record_header_path: Path) -> Tuple[np.ndarray, float]:
    record_path = record_header_path.with_suffix('')
    try:
        sig, fields = wfdb.rdsamp(str(record_path))
        fs = float(fields.get('fs', TARGET_FS))
        if sig.ndim == 2:
            sig_1d = sig[:, 0]
        else:
            sig_1d = sig.reshape(-1)
        return sig_1d.astype(np.float32), fs
    except Exception as e:
        raise RuntimeError(f"WFDB read failed for {record_header_path.name}: {e}")

def _mat_read(mat_path: Path) -> Tuple[np.ndarray, Optional[float]]:
    mat = loadmat(mat_path)
    for key in ('val', 'data', 'signal'):
        if key in mat:
            arr = np.asarray(mat[key])
            break
    else:
        raise RuntimeError(f"Unexpected MAT structure for {mat_path.name}")
    if arr.ndim == 2:
        if arr.shape[0] > 1:
            arr = arr[0]
        else:
            arr = arr.reshape(-1)
    elif arr.ndim > 2:
        arr = arr.reshape(-1)
    arr = np.asarray(arr, dtype=np.float32)
    fs = None
    return arr, fs

def _lookup_mapped_label(path: Path) -> str:
    rel = path.relative_to(DATASET_DIR).with_suffix('')
    parts = rel.parts
    if not parts:
        return 'OTHER'
    ds = parts[0]
    key_full = rel.as_posix()
    key_wo_ds = '/'.join(parts[1:]) if len(parts) > 1 else ''
    key_last2 = '/'.join(parts[-2:]) if len(parts) >= 2 else ''
    key_name = rel.name
    candidates = [key_full, key_wo_ds, key_last2, key_name]
    index = mapping_index.get(ds, {})
    for k in candidates:
        if k and k in index:
            lab = index[k]
            return lab if lab in LABEL_TO_INT else 'OTHER'
    if ds == 'CinC_2017_AFDB':
        if len(parts) >= 3 and parts[1] in {'training', 'validation', 'test'}:
            alt = '/'.join(parts[2:])
            if alt in index:
                lab = index[alt]
                return lab if lab in LABEL_TO_INT else 'OTHER'
    return 'OTHER'

def _resample_if_needed(x: np.ndarray, fs: Optional[float]) -> np.ndarray:
    if fs is None or np.isclose(fs, TARGET_FS):
        return x
    new_len = int(round(x.size * TARGET_FS / float(fs)))
    return resample(x, new_len)

# --- Streaming save implementation ---
RECORDS_DIR = PROCESSED_DIR / 'records'
RECORDS_DIR.mkdir(parents=True, exist_ok=True)
manifest: List[Dict[str, str]] = []
from collections import Counter, defaultdict
class_counts = Counter()

source_files = hea_files if hea_files else mat_files
source_type = 'WFDB' if hea_files else 'MAT'
print(f"Processing {len(source_files)} {source_type} record(s) with streaming save...")

skipped = 0
for i, path in enumerate(source_files):
    try:
        if source_type == 'WFDB':
            raw, fs = _wfdb_read(path)
        else:
            raw, fs = _mat_read(path)
        raw = _resample_if_needed(raw, fs)
        raw = zscore_normalize(raw)
        raw = pad_or_truncate(raw, TARGET_SAMPLES)
        label_name = _lookup_mapped_label(path)
        label_int = LABEL_TO_INT.get(label_name, LABEL_TO_INT['OTHER'])
        # Construct record id without extension (sanitized)
        rel = path.relative_to(DATASET_DIR).with_suffix('')
        record_id = rel.as_posix().replace('/', '__')  # flatten hierarchy into filename
        out_file = RECORDS_DIR / f"{record_id}.npz"
        np.savez_compressed(out_file, signal=raw.astype(np.float32), label=np.int64(label_int))
        manifest.append({'path': f"records/{out_file.name}", 'label': int(label_int)})
        class_counts[label_int] += 1
        if (i + 1) % 1000 == 0:
            print(f"Saved {i+1}/{len(source_files)} records...")
    except Exception as e:
        skipped += 1
        if skipped < 25:  # limit verbose errors
            print(f"Skipping {path.name}: {e}")

if skipped:
    print(f"Skipped {skipped} record(s) due to read errors.")

if not manifest:
    raise RuntimeError("No records successfully processed; aborting streaming pipeline.")

# Stratified split (80/10/10) using per-class grouping for balanced distribution
rng = np.random.default_rng(seed=42)
by_class = defaultdict(list)
for entry in manifest:
    by_class[entry['label']].append(entry)

train_paths, val_paths, test_paths = [], [], []
for label_val, items in by_class.items():
    rng.shuffle(items)
    n = len(items)
    n_train = int(round(0.80 * n))
    n_val = int(round(0.10 * n))
    n_test = n - n_train - n_val
    # Adjust rounding if off by one due to round() usage
    if n_test < 0:
        n_test = 0
        while n_train + n_val > n:
            n_train -= 1
    train_items = items[:n_train]
    val_items = items[n_train:n_train + n_val]
    test_items = items[n_train + n_val:]
    train_paths.extend([x['path'] for x in train_items])
    val_paths.extend([x['path'] for x in val_items])
    test_paths.extend([x['path'] for x in test_items])

splits = {
    'timestamp': datetime.now(timezone.utc).isoformat(timespec='seconds'),
    'label_order': LABEL_ORDER,
    'label_to_int': LABEL_TO_INT,
    'counts': {
        'train': len(train_paths),
        'val': len(val_paths),
        'test': len(test_paths)
    },
    'paths': {
        'train': train_paths,
        'val': val_paths,
        'test': test_paths
    },
    'class_counts': {int(k): int(v) for k, v in class_counts.items()}
}
(PROCESSED_DIR / 'splits.json').write_text(json.dumps(splits, indent=2), encoding='utf-8')

# Save label names and label map (unchanged interface for downstream)
np.save(PROCESSED_DIR / 'labels.npy', np.array(LABEL_ORDER, dtype=object))
label_map = {
    'label_to_int': LABEL_TO_INT,
    'int_to_label': {str(v): k for k, v in LABEL_TO_INT.items()}
}
(PROCESSED_DIR / 'label_map.json').write_text(json.dumps(label_map, indent=2), encoding='utf-8')

print("Per-class counts:")
for idx, name in enumerate(LABEL_ORDER):
    print(f"  {idx}={name}: {class_counts.get(idx, 0)}")
print(f"Total records saved: {len(manifest)}")
print("Chunked preprocessing completed; all signals saved individually and memory-safe.")
print("Preprocessing with unified labels completed successfully.")

# NOTE: Previous in-memory stacking removed for OOM safety.
# np.stack(signals)  # <-- removed / commented out intentionally.


Scanning dataset directory for WFDB headers (.hea)...
Processing 108520 WFDB record(s) with streaming save...
Skipping JS00001.hea: WFDB read failed for JS00001.hea: name 'wfdb' is not defined
Skipping JS00002.hea: WFDB read failed for JS00002.hea: name 'wfdb' is not defined
Skipping JS00004.hea: WFDB read failed for JS00004.hea: name 'wfdb' is not defined
Skipping JS00005.hea: WFDB read failed for JS00005.hea: name 'wfdb' is not defined
Skipping JS00006.hea: WFDB read failed for JS00006.hea: name 'wfdb' is not defined
Skipping JS00007.hea: WFDB read failed for JS00007.hea: name 'wfdb' is not defined
Skipping JS00008.hea: WFDB read failed for JS00008.hea: name 'wfdb' is not defined
Skipping JS00009.hea: WFDB read failed for JS00009.hea: name 'wfdb' is not defined
Skipping JS00010.hea: WFDB read failed for JS00010.hea: name 'wfdb' is not defined
Skipping JS00011.hea: WFDB read failed for JS00011.hea: name 'wfdb' is not defined
Skipping JS00012.hea: WFDB read failed for JS00012.hea: name

RuntimeError: No records successfully processed; aborting streaming pipeline.

### Model Training and Experiment Logging


In [3]:
# Model Training with MLflow (adapted for lazy per-record loading)
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

class RecordDataset(Dataset):
    def __init__(self, split: str, splits_path: Path, processed_root: Path):
        payload = json.loads(splits_path.read_text())
        if split not in payload['paths']:
            raise ValueError(f"Split '{split}' not found in splits.json")
        self.paths = [processed_root / p for p in payload['paths'][split]]
        self.label_to_int = payload['label_to_int']
    def __len__(self) -> int:
        return len(self.paths)
    def __getitem__(self, idx: int):
        p = self.paths[idx]
        data = np.load(p)
        sig = data['signal'].astype(np.float32)  # (5000,)
        lab = int(data['label'])
        x = torch.from_numpy(sig).float().unsqueeze(0).unsqueeze(0)  # (1,1,5000)
        return x, lab

class SmallNet(nn.Module):
    def __init__(self, num_classes: int):
        super().__init__()
        self.features = nn.Sequential(
            nn.Conv2d(1, 16, kernel_size=(1, 7), padding=(0, 3)),
            nn.BatchNorm2d(16),
            nn.ReLU(inplace=True),
            nn.MaxPool2d((1, 2)),
            nn.Conv2d(16, 32, kernel_size=(1, 5), padding=(0, 2)),
            nn.BatchNorm2d(32),
            nn.ReLU(inplace=True),
            nn.MaxPool2d((1, 2))
        )
        with torch.no_grad():
            dummy = torch.zeros(1, 1, 1, TARGET_SAMPLES)
            flattened = self.features(dummy).view(1, -1).size(1)
        self.classifier = nn.Sequential(
            nn.Flatten(),
            nn.Linear(flattened, 128),
            nn.ReLU(inplace=True),
            nn.Dropout(0.3),
            nn.Linear(128, num_classes)
        )
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.classifier(self.features(x))

splits_file = PROCESSED_DIR / 'splits.json'
labels_file = PROCESSED_DIR / 'labels.npy'
if not splits_file.exists():
    raise FileNotFoundError('splits.json not found. Run preprocessing cell first.')
label_names = np.load(labels_file, allow_pickle=True).tolist()
num_classes = len(label_names)

train_ds = RecordDataset('train', splits_file, PROCESSED_DIR)
val_ds = RecordDataset('val', splits_file, PROCESSED_DIR)

batch_size = 32
epochs = 8
learning_rate = 1e-3

train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_ds, batch_size=batch_size, shuffle=False)

model = SmallNet(num_classes=num_classes).to(DEVICE)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

tracking_uri = f"sqlite:///{(MLFLOW_DIR / 'mlflow.db').as_posix()}"
mlflow.set_tracking_uri(tracking_uri)
mlflow.set_experiment('ECG_Tensor_Research')


def evaluate(loader: DataLoader) -> Tuple[float, float]:
    model.eval()
    total_loss = 0.0
    correct = 0
    total = 0
    with torch.no_grad():
        for xb, yb in loader:
            xb = xb.to(DEVICE)
            yb = yb.to(DEVICE)
            logits = model(xb)
            loss = criterion(logits, yb)
            preds = logits.argmax(dim=1)
            total_loss += loss.item() * xb.size(0)
            correct += int(np.count_nonzero(preds.cpu().numpy() == yb.cpu().numpy()))
            total += xb.size(0)
    return (total_loss / max(total, 1)), (correct / max(total, 1))

history = []
best_val_loss = float('inf')
best_state = None
best_epoch = -1

with mlflow.start_run(run_name='notebook-training'):
    mlflow.log_params({
        'batch_size': batch_size,
        'epochs': epochs,
        'learning_rate': learning_rate,
        'architecture': 'SmallNet-2xConv2D',
        'streaming_preprocessing': True
    })
    for epoch in range(1, epochs + 1):
        model.train()
        running = 0.0
        for xb, yb in train_loader:
            xb = xb.to(DEVICE)
            yb = yb.to(DEVICE)
            optimizer.zero_grad(set_to_none=True)
            logits = model(xb)
            loss = criterion(logits, yb)
            loss.backward()
            optimizer.step()
            running += loss.item() * xb.size(0)
        train_loss = running / len(train_ds)
        val_loss, val_acc = evaluate(val_loader)
        history.append({'epoch': epoch, 'train_loss': train_loss, 'val_loss': val_loss, 'val_acc': val_acc})
        mlflow.log_metric('train_loss', train_loss, step=epoch)
        mlflow.log_metric('val_loss', val_loss, step=epoch)
        mlflow.log_metric('val_accuracy', val_acc, step=epoch)
        print(f"Epoch {epoch:02d} | train_loss={train_loss:.4f} | val_loss={val_loss:.4f} | val_acc={val_acc:.3f}")
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            best_state = model.state_dict()
            best_epoch = epoch

    if best_state is not None:
        model.load_state_dict(best_state)
        print(f"Loaded best model from epoch {best_epoch}")
    best_model_path = MODELS_DIR / 'best_model.pt'
    torch.save({
        'state_dict': model.state_dict(),
        'labels': label_names,
        'config': {
            'target_fs': TARGET_FS,
            'target_samples': TARGET_SAMPLES,
            'architecture': 'SmallNet-2xConv2D',
            'streaming_preprocessing': True
        }
    }, best_model_path)
    mlflow.log_artifact(str(best_model_path), artifact_path='models')
    history_path = MODELS_DIR / 'training_history.json'
    history_path.write_text(json.dumps(history, indent=2), encoding='utf-8')
    mlflow.log_artifact(str(history_path), artifact_path='history')

print("Training finished.")


2025/11/10 14:32:19 INFO mlflow.tracking.fluent: Experiment with name 'ECG_Tensor_Research' does not exist. Creating a new experiment.


Epoch 01 | train_loss=5.2410 | val_loss=4.6069 | val_acc=0.111
Epoch 02 | train_loss=4.6908 | val_loss=4.4625 | val_acc=0.119
Epoch 03 | train_loss=4.5052 | val_loss=4.3770 | val_acc=0.123
Epoch 04 | train_loss=4.4075 | val_loss=4.3645 | val_acc=0.124
Epoch 05 | train_loss=4.3206 | val_loss=4.3288 | val_acc=0.124
Epoch 06 | train_loss=4.2411 | val_loss=4.3419 | val_acc=0.126
Epoch 07 | train_loss=4.1952 | val_loss=4.6939 | val_acc=0.114
Epoch 08 | train_loss=4.1296 | val_loss=4.3063 | val_acc=0.147
Loaded best model from epoch 8
Training finished.


### Evaluation: Metrics and Diagnostic Plots


In [None]:
# Evaluation and Visualization (lazy record loading)
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, confusion_matrix, ConfusionMatrixDisplay, roc_curve
from sklearn.preprocessing import label_binarize
from torch.utils.data import Dataset, DataLoader

splits_file = PROCESSED_DIR / 'splits.json'
label_names = np.load(PROCESSED_DIR / 'labels.npy', allow_pickle=True).tolist()
num_classes = len(label_names)

class TestRecordDataset(Dataset):
    def __init__(self, split: str, splits_path: Path, processed_root: Path):
        payload = json.loads(splits_path.read_text())
        self.paths = [processed_root / p for p in payload['paths'][split]]
    def __len__(self):
        return len(self.paths)
    def __getitem__(self, i: int):
        p = self.paths[i]
        data = np.load(p)
        sig = data['signal'].astype(np.float32)
        lab = int(data['label'])
        x = torch.from_numpy(sig).float().unsqueeze(0).unsqueeze(0)
        return x, lab

class _EvalSmallNet(SmallNet):
    pass


def _load_best_model() -> Tuple[torch.nn.Module, list]:
    ckpt_path = MODELS_DIR / 'best_model.pt'
    if not ckpt_path.exists():
        raise FileNotFoundError('best_model.pt not found. Train the model first.')
    payload = torch.load(ckpt_path, map_location=DEVICE)
    labels = payload.get('labels')
    model = SmallNet(num_classes=len(labels))
    model.load_state_dict(payload['state_dict'])
    model.to(DEVICE)
    model.eval()
    return model, labels

model, label_names = _load_best_model()

test_ds = TestRecordDataset('test', splits_file, PROCESSED_DIR)
test_loader = DataLoader(test_ds, batch_size=64, shuffle=False)

all_logits = []
all_targets = []
with torch.no_grad():
    for xb, yb in test_loader:
        xb = xb.to(DEVICE)
        logits = model(xb)
        all_logits.append(logits.cpu().numpy())
        all_targets.append(yb.numpy())

logits = np.concatenate(all_logits, axis=0)
y_true = np.concatenate(all_targets, axis=0)
y_prob = torch.softmax(torch.from_numpy(logits), dim=1).numpy()
y_pred = y_prob.argmax(axis=1)

acc = accuracy_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred, average='macro')
try:
    if num_classes == 2:
        auc = roc_auc_score(y_true, y_prob[:, 1])
    else:
        y_true_oh = label_binarize(y_true, classes=list(range(num_classes)))
        auc = roc_auc_score(y_true_oh, y_prob, average='macro', multi_class='ovr')
except Exception:
    auc = float('nan')

cm = confusion_matrix(y_true, y_pred, labels=list(range(num_classes)))

plt.figure(figsize=(6, 5))
if num_classes == 2:
    fpr, tpr, _ = roc_curve(y_true, y_prob[:, 1])
    plt.plot(fpr, tpr, label=f'AUC={auc:.3f}')
else:
    fpr_grid = np.linspace(0.0, 1.0, 1001)
    tprs = []
    for c in range(num_classes):
        try:
            y_true_c = np.asarray(y_true == c, dtype=int)
            fpr_c, tpr_c, _ = roc_curve(y_true_c, y_prob[:, c])
            tpr_interp = np.interp(fpr_grid, fpr_c, tpr_c)
            tprs.append(tpr_interp)
        except Exception:
            pass
    if tprs:
        mean_tpr = np.mean(tprs, axis=0)
        plt.plot(fpr_grid, mean_tpr, label=f'Macro AUC={auc:.3f}')
    else:
        plt.plot([0, 1], [0, 1], 'k--', label='No ROC')
plt.plot([0, 1], [0, 1], 'k--', alpha=0.5)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend(loc='lower right')
roc_path = FIGURES_DIR / 'roc.png'
plt.tight_layout()
plt.savefig(roc_path, dpi=150)
plt.close()

fig, ax = plt.subplots(figsize=(6, 5))
Disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=label_names)
Disp.plot(ax=ax, cmap='Blues', colorbar=False)
plt.title('Confusion Matrix')
cm_path = FIGURES_DIR / 'confusion_matrix.png'
plt.tight_layout()
plt.savefig(cm_path, dpi=150)
plt.close()

try:
    if mlflow.active_run():
        mlflow.end_run()
except Exception:
    pass

with mlflow.start_run(run_name='evaluation'):
    mlflow.log_metric('test_accuracy', float(acc))
    if not np.isnan(auc):
        mlflow.log_metric('test_auroc', float(auc))
    mlflow.log_metric('test_f1_macro', float(f1))
    mlflow.log_artifact(str(roc_path), artifact_path='figures')
    mlflow.log_artifact(str(cm_path), artifact_path='figures')

print("Evaluation summary:")
print(f"- Accuracy: {acc:.4f}")
print(f"- AUROC:   {auc:.4f}")
print(f"- F1:      {f1:.4f}")


### Serving via FastAPI


In [None]:
# Serving API (do not launch server from within the notebook)
import io
from fastapi import FastAPI, UploadFile, File
from fastapi.responses import JSONResponse
from scipy.signal import resample  # added: ensure resample available here, too

MODEL_CACHE = {}


def load_checkpoint(checkpoint_path: Optional[Path] = None):
    if checkpoint_path is None:
        candidates = sorted(MODELS_DIR.glob('*.pt'), key=lambda p: p.stat().st_mtime, reverse=True)
        if not candidates:
            raise FileNotFoundError('No checkpoint available. Train the model before serving.')
        checkpoint_path = candidates[0]
    payload = torch.load(checkpoint_path, map_location=DEVICE)
    labels = payload.get('labels')
    model = SmallNet(num_classes=len(labels))
    model.load_state_dict(payload['state_dict'])
    model.to(DEVICE)
    model.eval()
    return model, labels, checkpoint_path


def prepare_for_inference(arr: np.ndarray) -> torch.Tensor:
    arr = np.asarray(arr).astype(np.float32)
    if arr.ndim > 1:
        arr = arr.reshape(-1)
    if arr.size != TARGET_SAMPLES:
        arr = resample(arr, TARGET_SAMPLES)
    m = float(arr.mean())
    s = float(arr.std()) or 1.0
    arr = (arr - m) / s
    return torch.from_numpy(arr).float().unsqueeze(0).unsqueeze(0).to(DEVICE)


def predict_signal(input_data: np.ndarray) -> dict:
    if 'model' not in MODEL_CACHE:
        model, labels, ckpt = load_checkpoint()
        MODEL_CACHE['model'] = model
        MODEL_CACHE['labels'] = labels
        MODEL_CACHE['ckpt'] = ckpt
        print(f"Loaded checkpoint {ckpt.name} for serving.")
    model = MODEL_CACHE['model']
    labels = MODEL_CACHE['labels']
    xb = prepare_for_inference(input_data)
    with torch.no_grad():
        logits = model(xb)
        probs = torch.softmax(logits, dim=1).cpu().numpy()[0]
    top = int(np.argmax(probs))
    return {
        'predicted_class': labels[top],
        'confidence': float(probs[top]),
        'probabilities': {label: float(p) for label, p in zip(labels, probs)}
    }

app = FastAPI(title='ECG Tensor Inference API')

@app.post('/predict')
async def predict_endpoint(file: UploadFile = File(...)):
    payload = await file.read()
    buffer = io.BytesIO(payload)
    suffix = Path(file.filename).suffix.lower()
    if suffix == '.npy':
        buffer.seek(0)
        arr = np.load(buffer, allow_pickle=True)
    else:  # assume CSV
        buffer.seek(0)
        arr = np.loadtxt(buffer, delimiter=',')
    result = predict_signal(arr)
    return JSONResponse(result)

# Run this in terminal using uvicorn serve:app --reload


### 6. PowerShell Execution Instructions

```powershell
# (Optional) Create and activate a virtual environment
python -m venv .venv
.\.venv\Scripts\Activate.ps1

# Install dependencies
pip install -U pip
pip install -r requirements.txt

# Run preprocessing cell (streaming, memory-safe)
# Train model
# Evaluate and generate figures

# Start MLflow UI (optional)
mlflow ui --backend-store-uri sqlite:///artifacts/mlflow/mlflow.db --default-artifact-root ./artifacts --host 127.0.0.1 --port 5000

# Serve the API locally
uvicorn ecg_tensor_pipeline:app --reload
```
