# HDF5 to Multi-resolution Zarr Pyramid

This notebook converts HDF5 data to a multi-resolution Zarr pyramid with optimal chunking.

**Pipeline:**
1. Convert HDF5 → Zarr
2. Rechunk to (64, 64, 64) using rechunker
3. Generate pyramid levels 1-4 (level 0 already exists from rechunking)
4. Add OME-Zarr metadata

## Install Required Packages

Uncomment and run if needed:

In [None]:
# !pip install rechunker scikit-image

## Import Libraries

In [None]:
import h5py
import zarr
import numpy as np
from rechunker import rechunk
import dask.array as da
from skimage.transform import downscale_local_mean
import tempfile
import shutil
from pathlib import Path

print("✓ All libraries imported successfully")

## Configuration

Set your input/output paths and parameters:

In [None]:
# Pipeline settings -- adjust these before running the cells below.

# Input: the HDF5 file and the dataset inside it (e.g., '/data', '/images/stack')
HDF5_PATH = "your_data.h5"
DATASET_NAME = "/data"

# Output: where the Zarr pyramid is written
OUTPUT_ZARR_PATH = "output_pyramid.zarr"

# Pyramid layout
CHUNK_SIZE = (64, 64, 64)  # chunk size used for every level
N_LEVELS = 5               # number of pyramid levels, including level 0
DOWNSCALE_FACTOR = 2       # downsampling factor between consecutive levels

# Memory limit handed to rechunker
MAX_MEMORY = "2GB"

# Echo the main settings, one per line, so the run is self-describing.
print(
    "Configuration:",
    f"  Input: {HDF5_PATH}",
    f"  Output: {OUTPUT_ZARR_PATH}",
    f"  Chunk size: {CHUNK_SIZE}",
    f"  Pyramid levels: {N_LEVELS}",
    sep="\n",
)

## Step 1: Convert HDF5 to Temporary Zarr

In [None]:
print("Step 1: Converting HDF5 to Zarr...")

# Stream the HDF5 dataset into a temporary on-disk Zarr store, one
# CHUNK_SIZE-shaped block at a time. Leaves `shape`, `dtype` and `temp_zarr`
# defined for the following cells.
with h5py.File(HDF5_PATH, 'r') as f:
    dset = f[DATASET_NAME]
    shape = dset.shape
    dtype = dset.dtype

    print(f"  Dataset shape: {shape}")
    print(f"  Dataset dtype: {dtype}")

    # The copy loop below indexes exactly three axes, so fail early with a
    # clear message instead of an IndexError after the temp store exists.
    if len(shape) != 3 or len(CHUNK_SIZE) != 3:
        raise ValueError(
            f"Expected a 3-D dataset and a 3-D CHUNK_SIZE, got shape {shape} "
            f"and CHUNK_SIZE {CHUNK_SIZE}"
        )

    # Create temporary Zarr for initial conversion
    temp_zarr = tempfile.mkdtemp(prefix='temp_zarr_')
    try:
        temp_store = zarr.open(temp_zarr, mode='w')
        temp_array = temp_store.create_dataset(
            'data',
            shape=shape,
            dtype=dtype,
            chunks=CHUNK_SIZE
        )

        # Copy data in chunks to avoid memory issues
        print("  Copying data...")
        for i in range(0, shape[0], CHUNK_SIZE[0]):
            i_end = min(i + CHUNK_SIZE[0], shape[0])
            for j in range(0, shape[1], CHUNK_SIZE[1]):
                j_end = min(j + CHUNK_SIZE[1], shape[1])
                for k in range(0, shape[2], CHUNK_SIZE[2]):
                    k_end = min(k + CHUNK_SIZE[2], shape[2])
                    temp_array[i:i_end, j:j_end, k:k_end] = \
                        dset[i:i_end, j:j_end, k:k_end]
            print(f"    Progress: {i_end}/{shape[0]} slices")
    except BaseException:
        # Do not leave a partially written temp store behind when the copy
        # fails or is interrupted; the error itself is re-raised unchanged.
        shutil.rmtree(temp_zarr, ignore_errors=True)
        raise

print("  ✓ Conversion complete")
print(f"  Temporary Zarr: {temp_zarr}")

## Step 2: Rechunk Using Rechunker

This creates level 0 of the pyramid with optimal chunking:

In [None]:
print("Step 2: Rechunking with rechunker...")

# Load source array (the temporary Zarr written in Step 1)
source_array = da.from_zarr(temp_zarr, component='data')
print(f"  Source array shape: {source_array.shape}")
print(f"  Source chunks: {source_array.chunksize}")

# Create output store; mode='w' starts from a fresh group at this path
output_store = zarr.open(OUTPUT_ZARR_PATH, mode='w')
level0_path = f"{OUTPUT_ZARR_PATH}/0"

# Setup rechunker
temp_rechunk = tempfile.mkdtemp(prefix='temp_rechunk_')
print(f"  Target chunks: {CHUNK_SIZE}")
print(f"  Max memory: {MAX_MEMORY}")

try:
    rechunked = rechunk(
        source_array,
        target_chunks=CHUNK_SIZE,
        max_mem=MAX_MEMORY,
        target_store=level0_path,
        temp_store=temp_rechunk
    )

    # Execute rechunking
    print("  Executing rechunk (this may take a while)...")
    rechunked.execute()
finally:
    # The rechunker scratch directory is never needed again, whether the
    # rechunk succeeded or failed, so always remove it.
    shutil.rmtree(temp_rechunk, ignore_errors=True)

print("  ✓ Rechunking complete")
print(f"  Level 0 created at: {level0_path}")

# Remove the Step 1 store only after a successful rechunk, so that a failed
# run can be retried without repeating the HDF5 conversion.
shutil.rmtree(temp_zarr)
print("  ✓ Temporary files cleaned up")

## Step 3: Generate Pyramid Levels

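Create the downsampled levels 1 to `N_LEVELS - 1` (levels 1-4 with the default `N_LEVELS = 5`). Each level is computed from the level before it; level 0 already exists from Step 2: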

In [None]:
print(f"Step 3: Generating pyramid levels 1-{N_LEVELS-1}...")

# Build levels 1..N_LEVELS-1, each by local-mean downsampling of the level
# before it. Every source level is loaded fully into memory.
output_store = zarr.open(OUTPUT_ZARR_PATH, mode='a')
previous_level = output_store['0']

# Take dtype and dimensionality from level 0 itself rather than from variables
# left over from Step 1, so this cell also works on an existing store.
level_dtype = previous_level.dtype
round_before_cast = np.issubdtype(level_dtype, np.integer)
scale_factors = (DOWNSCALE_FACTOR,) * previous_level.ndim

for level in range(1, N_LEVELS):
    print(f"\n  Creating level {level}...")

    # Calculate new shape. downscale_local_mean pads each axis up to a
    # multiple of the factor, so the result is the ceiling of the division.
    factor = DOWNSCALE_FACTOR ** level
    new_shape = tuple(-(-s // DOWNSCALE_FACTOR) for s in previous_level.shape)
    print(f"    Target shape: {new_shape}")
    print(f"    Downscale factor: {factor}x")

    # Load data from previous level
    data = previous_level[:]
    print(f"    Loaded data shape: {data.shape}")

    # Perform downsampling
    # NOTE: per the scikit-image docs the padding value (cval) defaults to 0,
    # so edge voxels of axes that are not a multiple of the factor may come
    # out darker than their neighbours.
    print(f"    Downsampling...")
    downsampled = downscale_local_mean(data, scale_factors)
    if round_before_cast:
        # The block means are floating point; round to nearest before the
        # integer cast, which would otherwise truncate and bias values low.
        downsampled = np.rint(downsampled)
    downsampled = downsampled.astype(level_dtype, copy=False)
    print(f"    Downsampled shape: {downsampled.shape}")

    # Create new level
    output_store.create_dataset(
        str(level),
        data=downsampled,
        chunks=CHUNK_SIZE,
        dtype=level_dtype,
        overwrite=True
    )

    previous_level = output_store[str(level)]
    # Release the in-memory copies before the next level is loaded.
    del data, downsampled
    print(f"    ✓ Level {level} created")

print(f"\n  ✓ All pyramid levels generated")

## Step 4: Add OME-Zarr Metadata

In [None]:
print("Step 4: Adding OME-Zarr metadata...")

# Create multiscales metadata: one entry per pyramid level, each with the
# scale of that level relative to level 0.
# NOTE(review): scale values are relative factors; with the "micrometer" unit
# below this presumes 1 µm voxels at level 0 -- confirm against the data.
datasets = []
for level in range(N_LEVELS):
    # Emitted as floats so the JSON carries 1.0, 2.0, ... rather than integers.
    factor = float(DOWNSCALE_FACTOR ** level)
    datasets.append({
        "path": str(level),
        "coordinateTransformations": [{
            "type": "scale",
            "scale": [factor, factor, factor]
        }]
    })

multiscales = [{
    "version": "0.4",
    "name": "pyramid",
    "axes": [
        {"name": "z", "type": "space", "unit": "micrometer"},
        {"name": "y", "type": "space", "unit": "micrometer"},
        {"name": "x", "type": "space", "unit": "micrometer"}
    ],
    "datasets": datasets,
    # Names the downscaling method actually used: the levels are produced by
    # local averaging, not by a Gaussian pyramid.
    "type": "local_mean",
    "metadata": {
        "description": "Multi-resolution pyramid",
        "method": "skimage.transform.downscale_local_mean"
    }
}]

# Stored on the root group of the output store opened in the earlier steps.
output_store.attrs['multiscales'] = multiscales
print("  ✓ OME-Zarr metadata added")

## Verification

Verify the pyramid structure:

In [None]:
# Print a summary of every pyramid level found in the output store, followed
# by instructions for opening the result in napari.
banner = "=" * 60
rule = "-" * 60

print("\n" + banner)
print("PYRAMID CREATED SUCCESSFULLY!")
print(banner)

# Read-only handle; the inspection cell below reuses `store`.
store = zarr.open(OUTPUT_ZARR_PATH, mode='r')

print(f"\nOutput location: {OUTPUT_ZARR_PATH}")
print("\nPyramid structure:")
print(rule)

for level in range(N_LEVELS):
    key = str(level)
    if key not in store:
        # Levels absent from the store are skipped without a message.
        continue

    arr = store[key]
    factor = DOWNSCALE_FACTOR ** level
    # Uncompressed size of the level: element count times bytes per element.
    memory_mb = np.prod(arr.shape) * arr.dtype.itemsize / (1024**2)

    print(f"Level {level}:")
    print(f"  Shape:  {arr.shape}")
    print(f"  Chunks: {arr.chunks}")
    print(f"  Factor: {factor}x downsampled")
    print(f"  Memory: {memory_mb:.2f} MB")
    print()

print(rule)
print("\nTo view in napari:")
print(f"  napari --plugin napari-ome-zarr {OUTPUT_ZARR_PATH}")
print("\nOr in Python:")
print("  import napari")
print("  viewer = napari.Viewer()")
print(f"  viewer.open('{OUTPUT_ZARR_PATH}', plugin='napari-ome-zarr')")

## Optional: Quick Data Inspection

In [None]:
# Inspect a middle slice from each level
import matplotlib.pyplot as plt

# squeeze=False keeps `axes` a 2-D array even when N_LEVELS == 1; without it
# subplots() returns a single Axes object and indexing it would fail.
fig, axes = plt.subplots(1, N_LEVELS, figsize=(15, 3), squeeze=False)
fig.suptitle('Middle Z-slice from each pyramid level')

for level in range(N_LEVELS):
    # `store` is the read-only handle opened in the verification cell.
    arr = store[str(level)]
    # Middle index along the first axis; only this one plane is read.
    mid_slice = arr[arr.shape[0]//2, :, :]

    ax = axes[0, level]
    ax.imshow(mid_slice, cmap='gray')
    ax.set_title(f'Level {level}\n{arr.shape}')
    ax.axis('off')

plt.tight_layout()
plt.show()