# ECG Image Digitization - Kaggle Submission Notebook

This notebook trains an ECG digitization model and generates a submission file for the PhysioNet Challenge.

**Competition**: [PhysioNet ECG Image Digitization](https://www.kaggle.com/competitions/physionet-ecg-image-digitization)

## Pipeline Overview
1. ✅ Environment Setup
2. ✅ Dataset Loading
3. ✅ Model Training
4. ✅ Inference
5. ✅ Submission Generation

## 1. Environment Setup

Install dependencies and detect Kaggle environment.

In [None]:
import os
import sys
from pathlib import Path

# Detect Kaggle environment
IS_KAGGLE = 'KAGGLE_KERNEL_RUN_TYPE' in os.environ
print(f"Running on Kaggle: {IS_KAGGLE}")

# Install additional dependencies if needed
if IS_KAGGLE:
    !pip install -q segmentation-models-pytorch hydra-core omegaconf wfdb neurokit2 biosppy loguru rich
    
    # Set paths for Kaggle
    DATA_DIR = Path('/kaggle/input/physionet-ecg-image-digitization')
    OUTPUT_DIR = Path('/kaggle/working')
else:
    # Local paths
    DATA_DIR = Path('../data')
    OUTPUT_DIR = Path('../models')

print(f"Data directory: {DATA_DIR}")
print(f"Output directory: {OUTPUT_DIR}")

## 2. Dataset Loading

Load and verify the competition dataset.

In [None]:
import pandas as pd
import numpy as np
from PIL import Image
import matplotlib.pyplot as plt

# Print every file below DATA_DIR (relative paths), or explain that it is missing
if not DATA_DIR.exists():
    print(f"‚ö†Ô∏è  Data directory not found: {DATA_DIR}")
    print("Please ensure the competition data is linked/downloaded.")
else:
    print("\nüìÅ Available files:")
    for entry in sorted(DATA_DIR.rglob('*')):
        if not entry.is_file():
            continue  # directories are skipped; only files are listed
        print(f"  {entry.relative_to(DATA_DIR)}")

# Read the training metadata table when it is present and preview it
train_csv = DATA_DIR / 'train.csv'
if not train_csv.exists():
    print(f"‚ö†Ô∏è  Training metadata not found: {train_csv}")
else:
    train_df = pd.read_csv(train_csv)
    print(f"\nüìä Training samples: {len(train_df)}")
    print(f"Columns: {list(train_df.columns)}")
    # `display` is supplied by the IPython/Jupyter runtime
    display(train_df.head())

### Visualize Sample ECG Images

In [None]:
# Preview up to six training images in a 2x3 grid.
train_images = DATA_DIR / 'train'
if train_images.exists():
    # glob() yields entries in arbitrary order; sorting makes the preview
    # show the same images on every run.
    image_files = sorted(train_images.glob('*.png'))[:6]

    if image_files:
        fig, axes = plt.subplots(2, 3, figsize=(15, 10))
        axes = axes.flatten()

        for ax, img_path in zip(axes, image_files):
            # The context manager closes the file handle once the image
            # has been handed to matplotlib.
            with Image.open(img_path) as img:
                ax.imshow(img, cmap='gray')
            ax.set_title(img_path.name, fontsize=10)
            ax.axis('off')

        # Hide panels that received no image (fewer than six files found)
        for ax in axes[len(image_files):]:
            ax.axis('off')

        plt.tight_layout()
        plt.show()
        print(f"\n‚úÖ Displayed {len(image_files)} sample images")
    else:
        # Avoid rendering an empty grid when the directory has no top-level PNGs
        print(f"No PNG images found directly under: {train_images}")
else:
    print(f"‚ö†Ô∏è  Training images directory not found: {train_images}")

## 3. Configure Training Pipeline

Set up Hydra configuration programmatically with MLflow **disabled** for Kaggle.

In [None]:
from omegaconf import OmegaConf, DictConfig
import torch

# Assemble the run configuration section by section. Batch size and epoch
# count are reduced when running on Kaggle.
cfg = OmegaConf.create(dict(
    project=dict(
        name='ecg-digitization',
        version='0.1.0',
        seed=42,
    ),
    mlflow=dict(
        enabled=False,  # tracking is switched off for Kaggle runs
        tracking_uri='http://localhost:5050',
        experiment_name='ecg-digitization-kaggle',
    ),
    paths=dict(
        data_dir=str(DATA_DIR),
        train_dir=str(DATA_DIR / 'train'),
        test_dir=str(DATA_DIR / 'test'),
        output_dir=str(OUTPUT_DIR / 'models'),
        checkpoint_dir=str(OUTPUT_DIR / 'checkpoints'),
        submission_dir=str(OUTPUT_DIR),
        log_dir=str(OUTPUT_DIR / 'logs'),
    ),
    data=dict(
        image_size=[512, 512],
        batch_size=4 if IS_KAGGLE else 8,  # smaller batch on the Kaggle GPU
        num_workers=2,
        pin_memory=True,
        augment_prob=0.5,
    ),
    model=dict(
        encoder_name='resnet50',
        encoder_weights='imagenet',
        num_leads=12,
        signal_length=5000,
    ),
    training=dict(
        epochs=10 if IS_KAGGLE else 20,  # fewer epochs to fit Kaggle time limits
        learning_rate=1e-4,
        weight_decay=1e-5,
        val_split=0.2,
    ),
    approach=dict(
        method='baseline',
    ),
))

# Seed the torch and NumPy global random generators from the config
torch.manual_seed(cfg.project.seed)
np.random.seed(cfg.project.seed)

# Make sure the model, checkpoint and log directories exist before use
for dir_key in ('output_dir', 'checkpoint_dir', 'log_dir'):
    Path(cfg.paths[dir_key]).mkdir(parents=True, exist_ok=True)

# Echo the resolved configuration and runtime status
mlflow_state = '‚úÖ ENABLED' if cfg.mlflow.enabled else '‚ùå DISABLED'
device_label = 'GPU' if torch.cuda.is_available() else 'CPU'
print("\n‚öôÔ∏è  Configuration:")
print(OmegaConf.to_yaml(cfg))
print(f"\nüîß MLflow tracking: {mlflow_state}")
print(f"üñ•Ô∏è  Device: {device_label}")

## 4. Model Training

Train the ECG digitization model using our pipeline.

In [None]:
# Add src to path if running locally
if not IS_KAGGLE:
    src_path = Path('../src')
    if src_path.exists():
        # Compare the resolved path (the form that is actually inserted) so
        # re-running this cell does not add duplicate sys.path entries.
        resolved_src = str(src_path.resolve())
        if resolved_src not in sys.path:
            sys.path.insert(0, resolved_src)

import copy

from torch.utils.data import DataLoader, Subset, random_split
from ecg_digitization.data import ECGImageDataset, get_train_transforms, get_val_transforms, collate_fn
from ecg_digitization.models import ECGDigitizer
from ecg_digitization.training import ECGTrainer, CombinedLoss
from ecg_digitization.utils.mlflow_utils import create_mlflow_tracker
from ecg_digitization.utils import setup_logging

# Setup logging
setup_logging(cfg.paths.log_dir)

device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"\nüéØ Training on: {device}")

# Initialize MLflow tracker (will be no-op since mlflow.enabled=False)
mlflow_tracker = create_mlflow_tracker(
    enabled=cfg.mlflow.enabled,
    tracking_uri=cfg.mlflow.tracking_uri,
    experiment_name=cfg.mlflow.experiment_name,
    run_name="kaggle_training",
    tags={"environment": "kaggle" if IS_KAGGLE else "local"},
)

# Start MLflow run (no-op if disabled)
mlflow_tracker.start_run()

try:
    # Log config (no-op if disabled)
    config_dict = OmegaConf.to_container(cfg, resolve=True)
    mlflow_tracker.log_config(config_dict)

    # Create datasets
    print("\nüì¶ Preparing datasets...")
    train_transform = get_train_transforms(tuple(cfg.data.image_size), cfg.data.augment_prob)
    val_transform = get_val_transforms(tuple(cfg.data.image_size))

    full_dataset = ECGImageDataset(
        cfg.paths.data_dir,
        transform=train_transform,
        is_train=True,
    )

    # Split into train/val
    val_size = int(len(full_dataset) * cfg.training.val_split)
    train_size = len(full_dataset) - val_size
    train_dataset, val_split = random_split(full_dataset, [train_size, val_size])

    # Both subsets returned by random_split wrap the SAME dataset object, so
    # assigning `val_split.dataset.transform` would also replace the training
    # transform and silently disable augmentation. Give validation its own
    # shallow copy of the dataset and set the transform only on that copy.
    # NOTE(review): assumes ECGImageDataset reads `self.transform` at item
    # access time, as the previous in-place assignment already did.
    val_view = copy.copy(full_dataset)
    val_view.transform = val_transform
    val_dataset = Subset(val_view, val_split.indices)

    print(f"  Training samples: {train_size}")
    print(f"  Validation samples: {val_size}")

    train_loader = DataLoader(
        train_dataset,
        batch_size=cfg.data.batch_size,
        shuffle=True,
        num_workers=cfg.data.num_workers,
        pin_memory=cfg.data.pin_memory,
        collate_fn=collate_fn,
    )
    val_loader = DataLoader(
        val_dataset,
        batch_size=cfg.data.batch_size,
        shuffle=False,
        num_workers=cfg.data.num_workers,
        pin_memory=cfg.data.pin_memory,
        collate_fn=collate_fn,
    )

    # Create model
    print("\nüèóÔ∏è  Building model...")
    model = ECGDigitizer(
        encoder_name=cfg.model.encoder_name,
        encoder_weights=cfg.model.encoder_weights,
        num_leads=cfg.model.num_leads,
        signal_length=cfg.model.signal_length,
    )

    # Setup training
    optimizer = torch.optim.AdamW(
        model.parameters(),
        lr=cfg.training.learning_rate,
        weight_decay=cfg.training.weight_decay,
    )

    # One cosine cycle spanning the configured number of epochs
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
        optimizer, T_max=cfg.training.epochs
    )

    criterion = CombinedLoss()

    # Create trainer with MLflow integration (will be no-op)
    trainer = ECGTrainer(
        model=model,
        train_loader=train_loader,
        val_loader=val_loader,
        optimizer=optimizer,
        criterion=criterion,
        scheduler=scheduler,
        device=device,
        checkpoint_dir=cfg.paths.checkpoint_dir,
        mlflow_tracker=mlflow_tracker,
    )

    # Train model
    print(f"\nüöÄ Starting training for {cfg.training.epochs} epochs...")
    print("=" * 60)
    trainer.train(cfg.training.epochs)
    print("=" * 60)
    print("\n‚úÖ Training completed!")

    # Plot training curves
    fig, ax = plt.subplots(figsize=(10, 6))
    epochs = range(1, len(trainer.train_losses) + 1)
    ax.plot(epochs, trainer.train_losses, 'b-', label='Training Loss', linewidth=2)
    ax.plot(epochs, trainer.val_losses, 'r-', label='Validation Loss', linewidth=2)
    ax.set_xlabel('Epoch', fontsize=12)
    ax.set_ylabel('Loss', fontsize=12)
    ax.set_title('Training Progress', fontsize=14, fontweight='bold')
    ax.legend(fontsize=11)
    ax.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.show()

    print(f"\nüìà Best validation loss: {trainer.best_val_loss:.4f}")

except Exception as e:
    print(f"\n‚ùå Training failed: {e}")
    mlflow_tracker.end_run(status="FAILED")
    raise
else:
    # End MLflow run (no-op if disabled). Kept outside the `try` body so an
    # error raised by end_run itself does not trigger a second end_run call.
    mlflow_tracker.end_run(status="FINISHED")

## 5. Inference & Submission Generation

Generate predictions on the test set and create submission file.

In [None]:
from ecg_digitization.inference import ECGPredictor

print("\nüîÆ Running inference on test set...")

# Prepare test dataset
test_transform = get_val_transforms(tuple(cfg.data.image_size))
test_dataset = ECGImageDataset(
    cfg.paths.data_dir,
    transform=test_transform,
    is_train=False,
)

test_loader = DataLoader(
    test_dataset,
    batch_size=cfg.data.batch_size,
    shuffle=False,
    num_workers=cfg.data.num_workers,
    collate_fn=collate_fn,
)

print(f"  Test samples: {len(test_dataset)}")

# Load best model
model = ECGDigitizer(
    encoder_name=cfg.model.encoder_name,
    num_leads=cfg.model.num_leads,
    signal_length=cfg.model.signal_length,
)

predictor = ECGPredictor(
    model=model,
    checkpoint_path=f"{cfg.paths.checkpoint_dir}/best_model.pt",
    device=device,
)

# Generate predictions
predictions = predictor.predict(test_loader)
print(f"\n‚úÖ Generated predictions for {len(predictions)} samples")

# Load test metadata
test_csv = Path(cfg.paths.data_dir) / 'test.csv'
if test_csv.exists():
    metadata = pd.read_csv(test_csv)
    print(f"  Loaded metadata for {len(metadata)} test samples")
else:
    metadata = None
    print("  ‚ö†Ô∏è  No test metadata found")

# Generate submission file
submission_path = Path(cfg.paths.submission_dir) / 'submission.parquet'
predictor.generate_submission(
    predictions,
    str(submission_path),
    metadata,
)

print(f"\nüìù Submission file created: {submission_path}")
print(f"  File size: {submission_path.stat().st_size / 1024 / 1024:.2f} MB")

# Verify submission format
if submission_path.exists():
    submission_df = pd.read_parquet(submission_path)
    print(f"\n‚úÖ Submission verification:")
    print(f"  Shape: {submission_df.shape}")
    print(f"  Columns: {list(submission_df.columns)}")
    display(submission_df.head())
else:
    print("\n‚ùå Submission file not created!")

## 6. Visualize Sample Predictions

Display some sample predictions to verify quality.

In [None]:
# Visualize a few predictions (at most three), one panel per sample
num_samples = min(3, len(predictions))

if num_samples == 0:
    # plt.subplots() rejects a zero-row grid, so skip plotting entirely
    print("No predictions available to visualize")
else:
    # squeeze=False always returns a 2-D axes array, so the single-sample
    # case needs no special handling; take the only column.
    fig, axes = plt.subplots(
        num_samples, 1, figsize=(14, 4 * num_samples), squeeze=False
    )
    axes = axes[:, 0]

    for idx, ax in enumerate(axes):
        pred = predictions[idx]
        # Assuming 500 Hz sampling rate
        time_axis = np.arange(pred.shape[1]) / 500

        # Plot up to 12 leads, each shifted vertically by 2 units for legibility
        for lead_idx in range(min(12, pred.shape[0])):
            ax.plot(time_axis, pred[lead_idx, :] + lead_idx * 2, linewidth=0.8, alpha=0.8)

        ax.set_xlabel('Time (s)', fontsize=11)
        ax.set_ylabel('Lead (offset)', fontsize=11)
        ax.set_title(f'Sample {idx + 1} - Predicted ECG Signals', fontsize=12, fontweight='bold')
        ax.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

    print(f"\n‚úÖ Displayed {num_samples} sample predictions")

## 🎉 Submission Ready!

Your submission file has been generated and is ready to submit to Kaggle.

**Next Steps**:
1. Download `submission.parquet` from the output directory
2. Submit to the [PhysioNet ECG Image Digitization competition](https://www.kaggle.com/competitions/physionet-ecg-image-digitization)
3. Check your leaderboard score!

**Note**: MLflow tracking was disabled for this Kaggle run. To enable MLflow tracking when running locally, set `'enabled': True` under the `mlflow` section of the configuration cell (Section 3).