Third version HDF5 -> OME-Zarr Converter

In [1]:
import h5py
import zarr
from zarr.storage import LocalStore
import time
from datetime import timedelta
import dask
import dask.array as da
import numpy as np
from zarr.codecs import BloscCodec, BytesCodec
from pathlib import Path

print("All libraries imported succesfully!")
print(zarr.__version__)

All libraries imported succesfully!
3.0.0


In [2]:
# Conversion arguments

input_path = "/Users/tobiasschleiss/documents/dtu/thesis/input/brain_2bin_cropSmall.h5"
output_path = "/Users/tobiasschleiss/Documents/DTU/Thesis/output/output.ome.zarr"
target_chunks = (64, 64, 64)
dataset_path = 'exchange/data'
max_mem_gb=10
pyramid_levels = 5
downsample_factor=2
compression_level=3
target_top_level_mb=10

n_workers = 2
worker_mem = 4 # memory per wroker in GB

In [3]:
# Inspect HDF5 file
with h5py.File(input_path, 'r') as f:
    if dataset_path not in f:
        print(f"  ERROR: Dataset '{dataset_path}' not found")
        print(f"  Available paths: {list(f.keys())}")
        
    dataset = f[dataset_path]
    shape = dataset.shape
    dtype = dataset.dtype
    h5_chunks = dataset.chunks
    data_size_gb = dataset.nbytes / (1024**3)
    data_size_mb = dataset.nbytes / (1024**2)
    dtype_size = dtype.itemsize
        
    print(f"  Shape: {shape}")
    print(f"  Dtype: {dtype}")
    print(f"  Size: {data_size_gb:.2f} GB")
    print(f"  HDF5 chunks: {h5_chunks if h5_chunks else 'Contiguous'}")

  Shape: (150, 3768, 2008)
  Dtype: float32
  Size: 4.23 GB
  HDF5 chunks: Contiguous


In [4]:
"""Calculate pyramid levels based on target top-level size"""
    
# Calculate levels needed
levels = 1
current_size_mb = data_size_mb
    
while current_size_mb > target_top_level_mb:
    current_size_mb = current_size_mb / (downsample_factor ** 3)
    levels += 1

print(f"Target top level: {target_top_level_mb} MB")
print(f"Recommended levels: {levels}")
print(f"Actual top level: {current_size_mb:.1f} MB")

pyramid_levels = levels

Target top level: 10 MB
Recommended levels: 4
Actual top level: 8.5 MB


In [5]:
print("Stage 1: HDF5 -> Level 0 zarr (optimized for sequential reads)")

# Open HDF5
with h5py.File(input_path, 'r') as f:
    dataset = f[dataset_path]
    shape = dataset.shape
    dtype = dataset.dtype
        
    print(f"  Source shape: {shape}, dtype: {dtype}")
        
    # Setup output zarr store    
    # store = zarr.NestedDirectoryStore(output_path) # NestedDirectoryStore deprecated
    # root = zarr.open_group(store=store, mode='w')

    store = LocalStore(output_path)

    root = zarr.group(store=store, overwrite=True)

    # Compressor for all levels
    compressor = BloscCodec(
    cname='zstd',
    clevel=compression_level,
    shuffle='bitshuffle'
    )
        
    level_0 = root.create_array(
        '0',
        shape=shape,
        chunks=target_chunks,  # Large Z-slabs
        dtype=dtype,
        compressors=compressor,
        chunk_key_encoding={"name": "v2", "separator": "/"},
        overwrite=True
    )
        
    # Copy in large slabs (efficient for contiguous HDF5)

    level0_start = time.time()
    
    slab_size = target_chunks[0]
    for z_start in range(0, shape[0], slab_size):
        z_end = min(z_start + slab_size, shape[0])
        print(f"  Reading Z planes {z_start}-{z_end}")
            
        slab = dataset[z_start:z_end, :, :]
        level_0[z_start:z_end, :, :] = slab
        del slab

    elapsed_level0 = time.time() - level0_start

    print(f"\nTiming breakdown:")
    print(f"  Level 0 write: {elapsed_level0:.1f}s")

Stage 1: HDF5 -> Level 0 zarr (optimized for sequential reads)
  Source shape: (150, 3768, 2008), dtype: float32
  Reading Z planes 0-64
  Reading Z planes 64-128
  Reading Z planes 128-150

Timing breakdown:
  Level 0 write: 10.3s


In [6]:
# Inspection level_0
root = zarr.open_group(output_path, mode='r')
# Access level 0
level_0 = root['0']

# Wrap in Dask
source = da.from_array(level_0, chunks=level_0.chunks)

print(f"  Source chunks: {source.chunksize}")
print(f"  Source shape: {source.shape}")

  Source chunks: (64, 64, 64)
  Source shape: (150, 3768, 2008)


In [7]:
# Configure dask for memory constraints
dask.config.set({
    'array.chunk-size': f'{int(max_mem_gb * 0.3)}GB',
    'distributed.worker.memory.target': 0.7,
    'distributed.worker.memory.spill': 0.8,
})
    
print("=" * 60)
print("Building OME-Zarr Multi-Resolution Pyramid")
print("=" * 60)
    
# Compressor for all levels
codecs = [
    BytesCodec(),
    BloscCodec(cname='zstd', clevel=compression_level, shuffle='bitshuffle')
]

    
# Load input zarr
root = zarr.open_group(output_path, mode='r')

# Level 0
level0 = root['0']

# Wrap in Dask
source = da.from_array(level0, chunks=level0.chunks)

source_shape = source.shape
source_chunks = source.chunksize
    
# ===== PYRAMID LEVELS: Build each level from previous =====
current_shape = source_shape

pyramid_start = time.time()
    
for level in range(1, pyramid_levels):
    print(f"\n{'='*60}")
    print(f"LEVEL {level}: Downsampling")
    print(f"{'='*60}")
        
    # Calculate new shape after downsampling
    new_shape = tuple(max(1, s // downsample_factor) for s in current_shape)
        
    print(f"Previous shape: {current_shape}")
    print(f"New shape: {new_shape}")
        
    # Calculate optimal chunk size for reading previous level
    # We want chunks that align well with both source and target
    read_chunks = tuple(
        min(c * downsample_factor, s) 
        for c, s in zip(target_chunks, current_shape)
    )
    
    # Load previous level
    # prev_array = da.from_zarr(store, component=str(level - 1))
    prev_array = da.from_array(root[str(level-1)], chunks=read_chunks)
        
    # Rechunk previous level for efficient downsampling
    prev_array = prev_array.rechunk(read_chunks)
        
    print(f"Reading with chunks: {read_chunks}")
    
    # Downsample using coarsen (block mean)
    # This is memory efficient and HDD-friendly
    downsampled = da.coarsen(
        np.mean,
        prev_array,
        {0: downsample_factor, 1: downsample_factor, 2: downsample_factor},
        trim_excess=True
    ).astype(prev_array.dtype)
        
    # Rechunk to target chunks
    # Adjust chunk size if array is smaller than target chunks
    level_chunks = tuple(min(tc, ns) for tc, ns in zip(target_chunks, new_shape))
    downsampled = downsampled.rechunk(level_chunks)
        
    print(f"Target chunks: {level_chunks}")
        
    # Calculate memory for this level
    level_mem_gb = np.prod(read_chunks) * prev_array.dtype.itemsize * 10 / 1e9
    print(f"Estimated memory: {level_mem_gb:.2f} GB")
        
    # Write to zarr
    print(f"Writing level {level}...")
    downsampled.to_zarr(
        store,
        component=str(level),
        chunk_key_encoding={"name": "v2", "separator": "/"},
        codecs=codecs,
        overwrite=True
    )
        
    print(f"✓ Level {level} complete: shape={new_shape}, chunks={level_chunks}")
        
    # Update current shape for next iteration
    current_shape = new_shape

elapsed_pyramid = time.time() - pyramid_start

print("\n" + "=" * 60)
print("Pyramid Complete!")
print("=" * 60)

total_seconds = elapsed_pyramid + elapsed_level0

print(f"\nTiming breakdown:")
print(f"  Pyramid write: {elapsed_pyramid:.1f}s")
print(f"  Full multiscale image write: {total_seconds:.1f}s")
print(f"  Total runtime: {timedelta(seconds=int(total_seconds))}")

Building OME-Zarr Multi-Resolution Pyramid

LEVEL 1: Downsampling
Previous shape: (150, 3768, 2008)
New shape: (75, 1884, 1004)
Reading with chunks: (128, 128, 128)
Target chunks: (64, 64, 64)
Estimated memory: 0.08 GB
Writing level 1...
✓ Level 1 complete: shape=(75, 1884, 1004), chunks=(64, 64, 64)

LEVEL 2: Downsampling
Previous shape: (75, 1884, 1004)
New shape: (37, 942, 502)
Reading with chunks: (75, 128, 128)
Target chunks: (37, 64, 64)
Estimated memory: 0.05 GB
Writing level 2...
✓ Level 2 complete: shape=(37, 942, 502), chunks=(37, 64, 64)

LEVEL 3: Downsampling
Previous shape: (37, 942, 502)
New shape: (18, 471, 251)
Reading with chunks: (37, 128, 128)
Target chunks: (18, 64, 64)
Estimated memory: 0.02 GB
Writing level 3...
✓ Level 3 complete: shape=(18, 471, 251), chunks=(18, 64, 64)

Pyramid Complete!

Timing breakdown:
  Pyramid write: 6.4s
  Full multiscale image write: 16.6s
  Total runtime: 0:00:16


In [8]:
# ===== ADD OME-ZARR METADATA =====



store = LocalStore(output_path)

root = zarr.open_group(
    store,
    mode="a",
    zarr_version=2
)

print(f"\n{'='*60}")
print("Adding OME-Zarr Metadata")
print(f"{'='*60}")
    
# Build datasets list
datasets = []
for level in range(pyramid_levels):
    scale_factor = downsample_factor ** level
    datasets.append({
        'path': str(level),
        'coordinateTransformations': [{
            'type': 'scale',
            'scale': [
                float(scale_factor),  # z
                float(scale_factor),  # y
                float(scale_factor)   # x
            ]
        }]
    })
    
# Add multiscales metadata
root.attrs['multiscales'] = [{
    'version': '0.4',
    'name': 'pyramid',
    'axes': [
        {'name': 'z', 'type': 'space', 'unit': 'micrometer'},
        {'name': 'y', 'type': 'space', 'unit': 'micrometer'},
        {'name': 'x', 'type': 'space', 'unit': 'micrometer'}
    ],
    'datasets': datasets,
    'type': 'mean',  # Downsampling method
    'metadata': {
        'description': 'Multi-resolution pyramid',
        'method': 'block mean downsampling'
    }
}]

print("\nPyramid Summary:")
print(f"Number of levels: {pyramid_levels}")
print("-" * 60)
    
for level in range(pyramid_levels):
    arr = zarr.open(store, mode='r')[str(level)]
    size_gb = np.prod(arr.shape) * arr.dtype.itemsize / 1e9
    print(f"  Level {level}: shape={arr.shape}, chunks={arr.chunks}, size={size_gb:.2f} GB")    


Adding OME-Zarr Metadata

Pyramid Summary:
Number of levels: 4
------------------------------------------------------------


KeyError: '0'

In [9]:
import zarr
import json

def validate_ome_zarr_structure(path):
    """Check OME-Zarr v3 structure and print details nicely"""
    root = zarr.open(path, mode='r')

    print("="*60)
    print("OME-Zarr Structure Validation")
    print("="*60)

    # Root check
    if hasattr(root, 'attrs') and hasattr(root, '__getitem__'):
        print("✓ Root is a group")
    else:
        print("❌ Root is not a group!")
        return False

    # Multiscales metadata
    if 'multiscales' not in root.attrs:
        print("❌ Missing 'multiscales' attribute!")
        return False
    print("✓ Has multiscales metadata")

    multiscales = root.attrs['multiscales']
    if not isinstance(multiscales, list) or len(multiscales) == 0:
        print("❌ Multiscales is not a list or empty!")
        return False

    ms = multiscales[0]
    datasets = ms.get('datasets', [])
    print(f"✓ Multiscales defines {len(datasets)} datasets")

    # Dataset check
    print("\nDataset Check:")
    print("-"*60)
    for i, ds in enumerate(datasets):
        path_name = ds.get('path')
        print(f"\nDataset {i}: path='{path_name}'")

        if path_name not in root:
            print(f"  ❌ Array '{path_name}' not found!")
            return False

        arr = root[path_name]

        # Array validation (duck typing)
        if not hasattr(arr, 'shape') or not hasattr(arr, 'chunks') or not hasattr(arr, 'dtype'):
            print(f"  ❌ '{path_name}' is not array-like!")
            return False

        print(f"  ✓ Array exists")
        print(f"    Shape: {arr.shape}")
        print(f"    Chunks: {arr.chunks}")
        print(f"    Dtype: {arr.dtype}")

        # Coordinate transformations
        if 'coordinateTransformations' in ds:
            transforms = ds['coordinateTransformations']
            print(f"    Transforms: {transforms}")

    # Extra/missing arrays
    print("\n" + "="*60)
    print("Checking for unexpected arrays...")
    print("="*60)
    expected_paths = {ds['path'] for ds in datasets}
    actual_paths = set(root.array_keys())
    extra = actual_paths - expected_paths
    missing = expected_paths - actual_paths

    if extra:
        print(f"⚠️  Found unexpected arrays: {extra}")
    else:
        print("✓ No unexpected arrays")

    if missing:
        print(f"❌ Missing expected arrays: {missing}")
        return False

    # Full metadata
    print("\n" + "="*60)
    print("Full Multiscales Metadata:")
    print("="*60)
    print(json.dumps(multiscales, indent=2))

# Run validator
validate_ome_zarr_structure(output_path)

OME-Zarr Structure Validation
✓ Root is a group
✓ Has multiscales metadata
✓ Multiscales defines 4 datasets

Dataset Check:
------------------------------------------------------------

Dataset 0: path='0'
  ✓ Array exists
    Shape: (150, 3768, 2008)
    Chunks: (64, 64, 64)
    Dtype: float32
    Transforms: [{'type': 'scale', 'scale': [1.0, 1.0, 1.0]}]

Dataset 1: path='1'
  ✓ Array exists
    Shape: (75, 1884, 1004)
    Chunks: (64, 64, 64)
    Dtype: float32
    Transforms: [{'type': 'scale', 'scale': [2.0, 2.0, 2.0]}]

Dataset 2: path='2'
  ✓ Array exists
    Shape: (37, 942, 502)
    Chunks: (37, 64, 64)
    Dtype: float32
    Transforms: [{'type': 'scale', 'scale': [4.0, 4.0, 4.0]}]

Dataset 3: path='3'
  ✓ Array exists
    Shape: (18, 471, 251)
    Chunks: (18, 64, 64)
    Dtype: float32
    Transforms: [{'type': 'scale', 'scale': [8.0, 8.0, 8.0]}]

Checking for unexpected arrays...
✓ No unexpected arrays

Full Multiscales Metadata:
[
  {
    "version": "0.4",
    "name": "pyr