Third version HDF5 -> OME-Zarr Converter

In [1]:
import shutil
import time
from pathlib import Path

import dask
import dask.array as da
import h5py
import numpy as np
import zarr
from numcodecs import Blosc
from ome_zarr.io import parse_url
from ome_zarr.writer import write_multiscale

print("All libraries imported succesfully!")

All libraries imported succesfully!


In [2]:
# Conversion arguments

# --- Input / output locations ---
input_path = "/Users/tobiasschleiss/documents/dtu/thesis/input/brain_2bin_cropSmall.h5"
output_path = "/Users/tobiasschleiss/Documents/DTU/Thesis/output/output.ome.zarr"
dataset_path = 'exchange/data'  # location of the volume inside the HDF5 file

# --- Chunk layouts ---
target_chunks = (64, 64, 64)       # chunk shape of the final pyramid levels
temp_chunk_size = (64, 512, 512)   # chunk shape of the intermediate Zarr store

# --- Pyramid / compression settings ---
pyramid_levels = 5        # initial value; replaced by the level calculation further down
downsample_factor = 2     # per-axis reduction between consecutive levels
compression_level = 3     # passed to the Blosc compressor as clevel
target_top_level_mb = 10  # size threshold used when deriving the number of levels

# --- Resource limits ---
max_mem_gb = 12.4
n_workers = 2
worker_mem = 4  # memory per worker in GB

In [3]:
# Inspect HDF5 file: record shape, dtype, chunking and size of the source dataset
with h5py.File(input_path, 'r') as f:
    # Fail immediately with a descriptive message instead of printing a
    # warning and then crashing on the lookup below.
    if dataset_path not in f:
        raise KeyError(
            f"Dataset '{dataset_path}' not found in '{input_path}'. "
            f"Available top-level paths: {list(f.keys())}"
        )

    dataset = f[dataset_path]
    shape = dataset.shape
    dtype = dataset.dtype
    h5_chunks = dataset.chunks
    data_size_gb = dataset.nbytes / (1024**3)
    data_size_mb = dataset.nbytes / (1024**2)
    dtype_size = dtype.itemsize

    print(f"  Shape: {shape}")
    print(f"  Dtype: {dtype}")
    print(f"  Size: {data_size_gb:.2f} GB")
    print(f"  HDF5 chunks: {h5_chunks if h5_chunks else 'Contiguous'}")

  Shape: (150, 3768, 2008)
  Dtype: float32
  Size: 4.23 GB
  HDF5 chunks: Contiguous


In [4]:
"""Calculate pyramid levels based on target top-level size"""
    
# Calculate levels needed: keep dividing the volume size by
# downsample_factor**3 (one factor per axis) until it fits the target.

# Without these guards the loop below never terminates: a factor of 1 never
# shrinks the size, and a non-positive target can never be reached.
if downsample_factor < 2:
    raise ValueError(f"downsample_factor must be >= 2, got {downsample_factor}")
if target_top_level_mb <= 0:
    raise ValueError(f"target_top_level_mb must be > 0, got {target_top_level_mb}")

levels = 1
current_size_mb = data_size_mb

while current_size_mb > target_top_level_mb:
    current_size_mb = current_size_mb / (downsample_factor ** 3)
    levels += 1

print(f"Base size: {data_size_mb:.1f} MB")
print(f"Target top level: {target_top_level_mb} MB")
print(f"Recommended levels: {levels}")
print(f"Actual top level: {current_size_mb:.1f} MB")

# Replaces the value set in the configuration cell.
pyramid_levels = levels

Base size: 4329.4 MB
Target top level: 10 MB
Recommended levels: 4
Actual top level: 8.5 MB


In [5]:
print("Stage 1: HDF5 -> Temporary Zarr (optimized for sequential reads)")

# The intermediate store is placed next to the final output.
temp_path = Path(output_path).parent / "temp_rechunk.zarr"
print(temp_path)

# Everything that touches the HDF5 dataset stays inside the `with` block so
# the file handle is open for the whole copy.
with h5py.File(input_path, 'r') as f:
    dataset = f[dataset_path]
    shape, dtype = dataset.shape, dataset.dtype

    print(f"  Source shape: {shape}, dtype: {dtype}")

    # Temporary Zarr group holding a single 'data' array, chunked with
    # temp_chunk_size (large Z-slabs) and compressed with fast LZ4.
    temp_store = zarr.DirectoryStore(temp_path)
    temp_root = zarr.open_group(temp_store, mode='w')
    temp_array = temp_root.create_dataset(
        'data',
        shape=shape,
        chunks=temp_chunk_size,
        dtype=dtype,
        compressor=Blosc(cname='lz4', clevel=1),
    )

    # Copy one Z-slab at a time; the slab thickness equals the Z extent of a
    # temporary chunk, so each write fills whole chunks along Z.
    temp_start = time.time()
    slab_size = temp_chunk_size[0]

    for z_start in range(0, shape[0], slab_size):
        z_end = min(shape[0], z_start + slab_size)
        print(f"  Reading Z planes {z_start}-{z_end}")

        slab = dataset[z_start:z_end]
        temp_array[z_start:z_end] = slab
        del slab  # drop the reference before the next slab is read

    # Covers both reading from HDF5 and writing to Zarr.
    temp_time = time.time() - temp_start

    print("\nTiming breakdown:")
    print(f"  Writing: {temp_time:.1f}s")

Stage 1: HDF5 -> Temporary Zarr (optimized for sequential reads)
/Users/tobiasschleiss/Documents/DTU/Thesis/output/temp_rechunk.zarr
  Source shape: (150, 3768, 2008), dtype: float32
  Reading Z planes 0-64
  Reading Z planes 64-128
  Reading Z planes 128-150

Timing breakdown:
  Writing: 3.1s


In [6]:
# Configure dask for memory constraints.
# The chunk-size budget is 30% of max_mem_gb, truncated to whole gigabytes.
dask_settings = {
    'array.chunk-size': f'{int(max_mem_gb * 0.3)}GB',
    'distributed.worker.memory.target': 0.7,
    'distributed.worker.memory.spill': 0.8,
}
# NOTE(review): no distributed client is created in the visible notebook, so
# the two 'distributed.*' entries presumably only matter if one is added.
dask.config.set(dask_settings)
    
print("=" * 60)
print("Building OME-Zarr Multi-Resolution Pyramid")
print("=" * 60)

# Output store and its root group. Mode 'w' starts the group from scratch.
# NOTE(review): the recorded cell output shows a warning pointing at the
# NestedDirectoryStore line - presumably a deprecation notice; confirm
# against the installed zarr version.
store = zarr.NestedDirectoryStore(output_path)
root = zarr.open_group(store=store, mode='w')

# One compressor shared by every pyramid level.
compressor = Blosc(cname='zstd', clevel=compression_level, shuffle=Blosc.BITSHUFFLE)

# ===== LEVEL 0: Rechunk from (64, 512, 512) to (64, 64, 64) =====
print(f"\n{'='*60}")
print("LEVEL 0: Rechunking to target chunks")
print(f"{'='*60}")

# Lazily open the intermediate store written in stage 1.
source = da.from_zarr(temp_path, component='data')
source_shape, source_chunks = source.shape, source.chunksize

print(f"Input shape: {source_shape}")
print(f"Input chunks: {source_chunks}")
print(f"Target chunks: {target_chunks}")

# Size in bytes of one input chunk and one output chunk.
bytes_per_voxel = source.dtype.itemsize
source_chunk_bytes = np.prod(source_chunks) * bytes_per_voxel
target_chunk_bytes = np.prod(target_chunks) * bytes_per_voxel

# Rough estimate: assume about 10 chunks of each kind are resident at once.
estimated_mem_gb = 10 * (source_chunk_bytes + target_chunk_bytes) / 1e9
print(f"Estimated memory usage: {estimated_mem_gb:.2f} GB")

if estimated_mem_gb > max_mem_gb * 0.8:
    print("WARNING: May approach memory limit!")

# Full-resolution level: same data, re-chunked to the target layout.
level_0 = source.rechunk(target_chunks)

print("Writing level 0...")
level_0.to_zarr(
    store,
    component='0',
    compressor=compressor,
    overwrite=True,
)

print(f"✓ Level 0 complete: shape={source_shape}, chunks={target_chunks}")
    
# ===== PYRAMID LEVELS: Build each level from previous =====
# Each level is the block mean of the level before it, read back from the
# output store so only one level is processed at a time.
current_shape = source_shape

for level in range(1, pyramid_levels):
    # An axis shorter than the factor would be trimmed to zero length by
    # coarsen(..., trim_excess=True). Stop here and shrink pyramid_levels so
    # the metadata written afterwards only lists levels that exist.
    if min(current_shape) < downsample_factor:
        print(
            f"\nStopping at {level} levels: shape {current_shape} cannot be "
            f"downsampled by {downsample_factor} again"
        )
        pyramid_levels = level
        break

    print(f"\n{'='*60}")
    print(f"LEVEL {level}: Downsampling")
    print(f"{'='*60}")

    # Shape after downsampling; floor division matches what trim_excess
    # leaves behind on axes that are not a multiple of the factor.
    new_shape = tuple(s // downsample_factor for s in current_shape)

    print(f"Previous shape: {current_shape}")
    print(f"New shape: {new_shape}")

    # Load previous level
    prev_array = da.from_zarr(store, component=str(level - 1))

    # Read in chunks that are `downsample_factor` times the target chunk
    # (capped at the array extent), so one input chunk reduces to roughly
    # one output chunk.
    read_chunks = tuple(
        min(c * downsample_factor, s)
        for c, s in zip(target_chunks, current_shape)
    )

    # Rechunk previous level for efficient downsampling
    prev_array = prev_array.rechunk(read_chunks)

    print(f"Reading with chunks: {read_chunks}")

    # Downsample using coarsen (block mean); the result is cast back to the
    # dtype of the previous level.
    downsampled = da.coarsen(
        np.mean,
        prev_array,
        {0: downsample_factor, 1: downsample_factor, 2: downsample_factor},
        trim_excess=True
    ).astype(prev_array.dtype)

    # Rechunk to target chunks, shrinking any chunk axis that is larger than
    # the downsampled array itself.
    level_chunks = tuple(min(tc, ns) for tc, ns in zip(target_chunks, new_shape))
    downsampled = downsampled.rechunk(level_chunks)

    print(f"Target chunks: {level_chunks}")

    # Rough estimate assuming about 10 read chunks are resident at once.
    level_mem_gb = np.prod(read_chunks) * prev_array.dtype.itemsize * 10 / 1e9
    print(f"Estimated memory: {level_mem_gb:.2f} GB")

    # Write to zarr
    print(f"Writing level {level}...")
    downsampled.to_zarr(
        store,
        component=str(level),
        compressor=compressor,
        overwrite=True
    )

    print(f"✓ Level {level} complete: shape={new_shape}, chunks={level_chunks}")

    # Update current shape for next iteration
    current_shape = new_shape
    
# ===== ADD OME-ZARR METADATA =====
print(f"\n{'='*60}")
print("Adding OME-Zarr Metadata")
print(f"{'='*60}")

# One dataset entry per level; level N is scaled by downsample_factor**N
# along every axis relative to level 0.
datasets = []
for level in range(pyramid_levels):
    scale_factor = downsample_factor ** level
    datasets.append({
        'path': str(level),
        'coordinateTransformations': [{
            'type': 'scale',
            'scale': [
                float(scale_factor),  # z
                float(scale_factor),  # y
                float(scale_factor)   # x
            ]
        }]
    })

# Add multiscales metadata
root.attrs['multiscales'] = [{
    'version': '0.4',
    'name': 'pyramid',
    'axes': [
        {'name': 'z', 'type': 'space', 'unit': 'micrometer'},
        {'name': 'y', 'type': 'space', 'unit': 'micrometer'},
        {'name': 'x', 'type': 'space', 'unit': 'micrometer'}
    ],
    'datasets': datasets,
    'type': 'mean',  # Downsampling method
    'metadata': {
        'description': 'Multi-resolution pyramid',
        'method': 'block mean downsampling'
    }
}]

# Derive the display window from the data instead of assuming a 16-bit
# integer range, which does not fit floating-point volumes. The coarsest
# level is read in full for this; it is block-averaged, so its range can be
# somewhat narrower than that of level 0.
coarsest_level = root[str(pyramid_levels - 1)][:]
finite_values = coarsest_level[np.isfinite(coarsest_level)]
if finite_values.size:
    window_start = float(finite_values.min())
    window_end = float(finite_values.max())
else:
    # No finite samples to measure; fall back to a neutral unit window.
    window_start, window_end = 0.0, 1.0

if np.issubdtype(coarsest_level.dtype, np.integer):
    # Integer data: the permitted range is the full range of the dtype.
    dtype_limits = np.iinfo(coarsest_level.dtype)
    window_min, window_max = float(dtype_limits.min), float(dtype_limits.max)
else:
    window_min, window_max = window_start, window_end

# Add optional OMERO metadata for visualization
root.attrs['omero'] = {
    'version': '0.4',
    'channels': [{
        'color': 'FFFFFF',
        'window': {
            'start': window_start,
            'end': window_end,
            'min': window_min,
            'max': window_max
        },
        'label': 'Channel_0',
        'active': True
    }]
}
    
print("\n" + "=" * 60)
print("Pyramid Complete!")
print("=" * 60)
print(f"\nOutput: {output_path}")
print(f"Number of levels: {pyramid_levels}")
print("\nPyramid Summary:")
print("-" * 60)

# Open the output once and look each level up from the same handle.
pyramid = zarr.open(store, mode='r')
for level in range(pyramid_levels):
    arr = pyramid[str(level)]
    # Uncompressed size of the level in decimal gigabytes.
    size_gb = np.prod(arr.shape) * arr.dtype.itemsize / 1e9
    print(f"  Level {level}: shape={arr.shape}, chunks={arr.chunks}, size={size_gb:.2f} GB")

# Cleanup temp: the intermediate store is no longer needed once every level
# has been written.
shutil.rmtree(temp_path)
print("Temporary files cleaned up")
    

Building OME-Zarr Multi-Resolution Pyramid

LEVEL 0: Rechunking to target chunks
Input shape: (150, 3768, 2008)
Input chunks: (64, 512, 512)
Target chunks: (64, 64, 64)
Estimated memory usage: 0.68 GB
Writing level 0...


  store = zarr.NestedDirectoryStore(output_path)


✓ Level 0 complete: shape=(150, 3768, 2008), chunks=(64, 64, 64)

LEVEL 1: Downsampling
Previous shape: (150, 3768, 2008)
New shape: (75, 1884, 1004)
Reading with chunks: (128, 128, 128)
Target chunks: (64, 64, 64)
Estimated memory: 0.08 GB
Writing level 1...
✓ Level 1 complete: shape=(75, 1884, 1004), chunks=(64, 64, 64)

LEVEL 2: Downsampling
Previous shape: (75, 1884, 1004)
New shape: (37, 942, 502)
Reading with chunks: (75, 128, 128)
Target chunks: (37, 64, 64)
Estimated memory: 0.05 GB
Writing level 2...
✓ Level 2 complete: shape=(37, 942, 502), chunks=(37, 64, 64)

LEVEL 3: Downsampling
Previous shape: (37, 942, 502)
New shape: (18, 471, 251)
Reading with chunks: (37, 128, 128)
Target chunks: (18, 64, 64)
Estimated memory: 0.02 GB
Writing level 3...
✓ Level 3 complete: shape=(18, 471, 251), chunks=(18, 64, 64)

Adding OME-Zarr Metadata

Pyramid Complete!

Output: /Users/tobiasschleiss/Documents/DTU/Thesis/output/output.ome.zarr
Number of levels: 4

Pyramid Summary:
--------------

In [5]:
# Inspection of temp data.
# The pyramid cell deletes the temporary store when it finishes, so check
# that it is still on disk before opening it instead of failing.
temp_store_path = Path("/Users/tobiasschleiss/Documents/DTU/Thesis/output/temp_rechunk.zarr")

if temp_store_path.exists():
    source = da.from_zarr(str(temp_store_path), component='data')

    print(f"  Source chunks: {source.chunksize}")
    print(f"  Source shape: {source.shape}")
else:
    print(f"  Temporary store not found (already cleaned up?): {temp_store_path}")

  Source chunks: (64, 512, 512)
  Source shape: (150, 3768, 2008)


In [6]:
# Check for unexpected arrays: list the direct children of the output root,
# arrays first and then sub-groups.
root = zarr.open('/Users/tobiasschleiss/Documents/DTU/Thesis/output/output.ome.zarr', mode='r')

array_names = list(root.array_keys())
print("All arrays in root:", array_names)

group_names = list(root.group_keys())
print("All groups in root:", group_names)

All arrays in root: ['0', '1']
All groups in root: []


In [18]:
import zarr
import json

def validate_ome_zarr_structure(path):
    """Check that a store has the basic OME-Zarr multiscale layout.

    Prints a human-readable report while checking that the root is a group,
    that it carries a non-empty ``multiscales`` list, and that every dataset
    listed in the first multiscale entry has a path, exists as an array, and
    has ``scale`` transformations whose length matches the array rank.
    Arrays in the root that are not listed are reported as a warning only.

    Args:
        path: Location passed to ``zarr.open`` (opened read-only).

    Returns:
        bool: True when all checks pass, False at the first failed check.
    """

    root = zarr.open(path, mode='r')

    print("=" * 60)
    print("OME-Zarr Structure Validation")
    print("=" * 60)

    # Check root is a group (public class name rather than an internal
    # module path).
    if not isinstance(root, zarr.Group):
        print("❌ Root is not a group!")
        return False

    print("✓ Root is a group")

    # Check multiscales metadata
    if 'multiscales' not in root.attrs:
        print("❌ Missing 'multiscales' attribute!")
        return False

    print("✓ Has multiscales metadata")

    multiscales = root.attrs['multiscales']
    if not isinstance(multiscales, list) or len(multiscales) == 0:
        print("❌ Multiscales is not a list or is empty!")
        return False

    # Only the first multiscale entry is inspected.
    ms = multiscales[0]
    datasets = ms.get('datasets', [])

    print(f"✓ Multiscales defines {len(datasets)} datasets")

    # Check each dataset exists
    print("\nDataset Check:")
    print("-" * 60)

    for i, ds in enumerate(datasets):
        path_name = ds.get('path')
        print(f"\nDataset {i}: path='{path_name}'")

        # Report a missing or empty path explicitly instead of letting the
        # lookups below resolve it to the root group.
        if not isinstance(path_name, str) or not path_name:
            print("  ❌ Dataset entry has no valid 'path'!")
            return False

        if path_name not in root:
            print(f"  ❌ Array '{path_name}' not found in root!")
            return False

        arr = root[path_name]

        if not isinstance(arr, zarr.Array):
            print(f"  ❌ '{path_name}' is not an array!")
            return False

        print(f"  ✓ Array exists")
        print(f"    Shape: {arr.shape}")
        print(f"    Chunks: {arr.chunks}")
        print(f"    Dtype: {arr.dtype}")

        # Check coordinate transformations
        if 'coordinateTransformations' in ds:
            transforms = ds['coordinateTransformations']
            print(f"    Transforms: {transforms}")

            # A scale transformation needs one factor per array axis.
            for transform in transforms:
                if not isinstance(transform, dict) or transform.get('type') != 'scale':
                    continue
                n_factors = len(transform.get('scale', []))
                if n_factors != arr.ndim:
                    print(
                        f"  ❌ Scale has {n_factors} entries but array has "
                        f"{arr.ndim} dimensions!"
                    )
                    return False

    # Check for extra arrays
    print("\n" + "=" * 60)
    print("Checking for unexpected arrays...")
    print("=" * 60)

    # Compare against the first path component only, because array_keys()
    # lists direct children of the root. Every listed dataset was already
    # confirmed to exist above, so there is no separate "missing" check.
    expected_top_level = {
        ds['path'].strip('/').split('/')[0] for ds in datasets
    }
    extra = set(root.array_keys()) - expected_top_level

    if extra:
        print(f"⚠️  Found unexpected arrays: {extra}")
        print("   These might cause validator confusion!")
    else:
        print("✓ No unexpected arrays")

    # Print full metadata
    print("\n" + "=" * 60)
    print("Full Multiscales Metadata:")
    print("=" * 60)
    print(json.dumps(multiscales, indent=2))

    return True

# Run validation on the converted store. The call is the last expression of
# the cell, so its boolean result is shown as the cell output below the report.
validate_ome_zarr_structure('/Users/tobiasschleiss/Documents/DTU/Thesis/output/output.ome.zarr')

OME-Zarr Structure Validation
✓ Root is a group
✓ Has multiscales metadata
✓ Multiscales defines 2 datasets

Dataset Check:
------------------------------------------------------------

Dataset 0: path='0'
  ✓ Array exists
    Shape: (150, 3768, 2008)
    Chunks: (64, 64, 64)
    Dtype: float32
    Transforms: [{'scale': [1.0, 1.0, 1.0], 'type': 'scale'}]

Dataset 1: path='1'
  ✓ Array exists
    Shape: (75, 1884, 1004)
    Chunks: (64, 64, 64)
    Dtype: float32
    Transforms: [{'scale': [2.0, 2.0, 2.0], 'type': 'scale'}]

Checking for unexpected arrays...
✓ No unexpected arrays

Full Multiscales Metadata:
[
  {
    "axes": [
      {
        "name": "z",
        "type": "space",
        "unit": "micrometer"
      },
      {
        "name": "y",
        "type": "space",
        "unit": "micrometer"
      },
      {
        "name": "x",
        "type": "space",
        "unit": "micrometer"
      }
    ],
    "datasets": [
      {
        "coordinateTransformations": [
          {
     

True