In [1]:
import h5py
import zarr
import time
import dask.array as da
from numcodecs import Blosc
from pathlib import Path
from ome_zarr.io import parse_url
from ome_zarr.writer import write_multiscale

print("All libraries imported succesfully!")

All libraries imported succesfully!


In [2]:
# Conversion arguments

input_path = "/Users/tobiasschleiss/documents/dtu/thesis/input/brain_2bin_cropSmall.h5"
output_path = "/Users/tobiasschleiss/Documents/DTU/Thesis/output/output.ome.zarr"
target_chunks = (64, 64, 64)
dataset_path = 'exchange/data'
temp_chunk_size=(64, 512, 512)
max_mem_gb=12.4
pyramid_levels = 5

n_workers = 2
worker_mem = 4 # memory per wroker in GB

In [3]:
# Inspect HDF5 file
with h5py.File(input_path, 'r') as f:
    if dataset_path not in f:
        print(f"  ERROR: Dataset '{dataset_path}' not found")
        print(f"  Available paths: {list(f.keys())}")
        
    dataset = f[dataset_path]
    shape = dataset.shape
    dtype = dataset.dtype
    h5_chunks = dataset.chunks
    data_size_gb = dataset.nbytes / (1024**3)
    dtype_size = dtype.itemsize
        
    print(f"  Shape: {shape}")
    print(f"  Dtype: {dtype}")
    print(f"  Size: {data_size_gb:.2f} GB")
    print(f"  HDF5 chunks: {h5_chunks if h5_chunks else 'Contiguous'}")

  Shape: (150, 3768, 2008)
  Dtype: float32
  Size: 4.23 GB
  HDF5 chunks: Contiguous


In [4]:
print("Stage 1: HDF5 -> Temporary Zarr (optimized for sequential reads)")
    
temp_path = Path(output_path).parent / "temp_rechunk.zarr"
print(temp_path)
    
# Open HDF5
with h5py.File(input_path, 'r') as f:
    dataset = f[dataset_path]
    shape = dataset.shape
    dtype = dataset.dtype
        
    print(f"  Source shape: {shape}, dtype: {dtype}")
        
    # Create temporary zarr with chunks optimized for reading from contiguous HDF5
    temp_store = zarr.DirectoryStore(temp_path)
    temp_root = zarr.open_group(temp_store, mode='w')
        
    temp_array = temp_root.create_dataset(
        'data',
        shape=shape,
        chunks=temp_chunk_size,  # Large Z-slabs
        dtype=dtype,
        compressor=Blosc(cname='lz4', clevel=1)  # Fast compression for temp
    )
        
    # Copy in large slabs (efficient for contiguous HDF5)

    temp_start = time.time()
    
    slab_size = temp_chunk_size[0]
    for z_start in range(0, shape[0], slab_size):
        z_end = min(z_start + slab_size, shape[0])
        print(f"  Reading Z planes {z_start}-{z_end}")
            
        slab = dataset[z_start:z_end, :, :]
        temp_array[z_start:z_end, :, :] = slab
        del slab

    temp_time = time.time() - temp_start

    print(f"\nTiming breakdown:")
    print(f"  Writing: {temp_time:.1f}s")

Stage 1: HDF5 -> Temporary Zarr (optimized for sequential reads)
/Users/tobiasschleiss/Documents/DTU/Thesis/output/temp_rechunk.zarr
  Source shape: (150, 3768, 2008), dtype: float32
  Reading Z 0-64
  Reading Z 64-128
  Reading Z 128-150

Timing breakdown:
  Writing: 3.3s


In [6]:
print("\nStage 2: Temporary Zarr -> Final Zarr (target chunks)")

# Write to final location
compressor = Blosc(cname='zstd', clevel=3, shuffle=Blosc.BITSHUFFLE)

try:
    print(f"  Reading from Zarr...")
    data = da.from_zarr(temp_path, component='data')
    print(f"  Source chunks: {data.chunksize}")
    print(f"  Target chunks: {target_chunks}")

    # Create pyramid
    pyramid = [data]
    print(f"  Creating {pyramid_levels}-level pyramid...")
    for i in range(1, pyramid_levels):
        pyramid.append(pyramid[-1][::2, ::2, ::2])
            
    # Write OME-Zarr
    print("Writing OME-Zarr...")
    write_start = time.time()
        
    store = parse_url(output_path, mode='w').store
    root = zarr.group(store=store)
            
    write_multiscale(
        pyramid=pyramid,
        group=root,
        axes="zyx",
        storage_options={
            'chunks': target_chunks,
        },
        compute=True
    )
        

    write_time = time.time() - write_start
            
    print(f"\n✓ Conversion complete!")
    print(f"  Output: {output_path}")
         
    print(f"\nTiming breakdown:")
    print(f"  Writing: {write_time:.1f}s")
            
    # Show output size
    if Path(output_path).exists():
        output_size_gb = sum(f.stat().st_size for f in Path(output_path).rglob('*') if f.is_file()) / (1024**3)
        write_speed_mbps = (output_size_gb * 1024) / write_time if write_time > 0 else 0
        print(f"  Output size: {output_size_gb:.2f} GB")
        print(f"  Write speed: {write_speed_mbps:.1f} MB/s")

    # Cleanup temp
    import shutil
    shutil.rmtree(temp_path)
    print("Temporary files cleaned up")
        
except Exception as e:
    print(f"\n✗ Error during conversion: {e}")
    import traceback
    traceback.print_exc()



Stage 2: Temporary Zarr -> Final Zarr (target chunks)
  Source chunks: (64, 512, 512)
  Target chunks: (64, 64, 64)
  Reading from Zarr...
  Source chunks: (64, 512, 512)
  Target chunks: (64, 64, 64)
  Creating 5-level pyramid...
Writing OME-Zarr...

✓ Conversion complete!
  Output: /Users/tobiasschleiss/Documents/DTU/Thesis/output/output.ome.zarr

Timing breakdown:
  Writing: 7.8s
  Output size: 1.76 GB
  Write speed: 229.0 MB/s
Temporary files cleaned up


In [7]:
# Inspection of temp data
source = da.from_zarr("/Users/tobiasschleiss/Documents/DTU/Thesis/output/temp_rechunk.zarr", component='data')
    
print(f"  Source chunks: {source.chunksize}")
print(f"  Source shape: {source.shape}")

TypeError: shape is None