In [None]:
#Requires Zarr version less than 3

In [5]:
import h5py
import zarr
import time
from datetime import timedelta
import dask
import dask.array as da
from dask.diagnostics import ProgressBar
import numpy as np
from numcodecs import Blosc
from pathlib import Path
from tqdm import tqdm

print("All libraries imported succesfully!")
print(zarr.__version__)

All libraries imported succesfully!
2.18.7


In [6]:
# Conversion arguments
#input_path = "/dtu/3d-imaging-center/projects/2023_CoM-BraiN/analysis/OME_Output/small_wMB_4bin.h5"
#output_path = "/dtu/3d-imaging-center/projects/2023_CoM-BraiN/analysis/OME_Output/contiguous_test1.ome.zarr"
input_path = "/Users/tobiasschleiss/documents/dtu/thesis/input/new_small_wMB_4bin.h5"
output_path = "/Users/tobiasschleiss/Documents/DTU/Thesis/output/contiguous_test1.ome.zarr"

target_chunks = (64, 64, 64)
dataset_path = 'exchange/data'
compression_level=3

no_seeks = True

if no_seeks == False:
    block_shape = (64, 2200, 2200) #Approximatly 1.2GB with no disk seeks
else:
    block_shape = (640, 640, 640) #Approximatly 1GB with many disk seeks

In [7]:
# Inspect HDF5 file
with h5py.File(input_path, 'r') as f:
    if dataset_path not in f:
        print(f"  ERROR: Dataset '{dataset_path}' not found")
        print(f"  Available paths: {list(f.keys())}")
        
    dataset = f[dataset_path]
    shape = dataset.shape
    dtype = dataset.dtype
    data_size_gb = dataset.nbytes / (1024**3)
    dtype_size = dtype.itemsize
        
    print(f"  Shape: {shape}")
    print(f"  Dtype: {dtype}")
    print(f"  Size: {data_size_gb:.2f} GB")

  Shape: (1651, 2200, 2200)
  Dtype: float32
  Size: 29.77 GB


In [8]:
print("Stage 1: HDF5 -> Level 0 zarr (optimized for contigous reads)")

# Open HDF5
with h5py.File(input_path, 'r') as f:
    dataset = f[dataset_path]
    shape = dataset.shape
    dtype = dataset.dtype 

    block_z, block_y, block_x = block_shape
    print(f"Selected block shape: {block_shape}")
    z_total, y_total, x_total = shape
        
    # Setup output zarr store    
    store = zarr.NestedDirectoryStore(output_path)
    root = zarr.open_group(store=store, mode='w')

    # Compressor for all levels
    compressor = Blosc(cname='zstd', clevel=compression_level, shuffle=Blosc.BITSHUFFLE)
        
    level_0 = root.create_dataset(
        '0',
        shape=shape,
        chunks=target_chunks,
        dtype=dtype,
        compressor=compressor
    )

    level0_start = time.time()
    block_count = 0
        
    # Calculate total blocks
    total_blocks = (
        ((z_total + block_z - 1) // block_z) *
        ((y_total + block_y - 1) // block_y) *
        ((x_total + block_x - 1) // block_x)
    )
        
    print(f"Processing {total_blocks} blocks...")

    # Iterate over dataset
    for z_start in range(0, z_total, block_z):
        z_end = min(z_start + block_z, z_total)
            
        for y_start in range(0, y_total, block_y):
            y_end = min(y_start + block_y, y_total)

            for x_start in range(0, x_total, block_x):
                x_end = min(x_start + block_x, x_total)
                    
                block_count += 1
                        
                # Read block from HDF5
                block = dataset[z_start:z_end, y_start:y_end, x_start:x_end]
                        
                # Write to zarr (zarr will internally chunk to target_chunks)
                level_0[z_start:z_end, y_start:y_end, x_start:x_end] = block
                        
                del block
                        
                # Progress reporting
                if block_count % 1 == 0 or block_count == total_blocks:
                    elapsed = time.time() - level0_start
                    rate = block_count / elapsed if elapsed > 0 else 0
                    eta = (total_blocks - block_count) / rate if rate > 0 else 0
                    progress = block_count / total_blocks * 100
                        
                    print(f"  Block {block_count:4d}/{total_blocks} ({progress:5.1f}%) - "
                            f"{rate:5.1f} blocks/s - ETA: {eta:6.0f}s")   
        
    elapsed_level0 = time.time() - level0_start
    throughput = (np.prod(shape) * dtype_size / 1e9) / elapsed
        
    print(f"\n✓ Level 0 complete in {elapsed_level0:.1f}s")
    print(f"  {timedelta(seconds=int(elapsed_level0))}")
    print(f"  Throughput: {throughput:.2f} GB/s")

Stage 1: HDF5 -> Level 0 zarr (optimized for contigous reads)
Selected block shape: (640, 640, 640)
Processing 48 blocks...


  store = zarr.NestedDirectoryStore(output_path)


  Block    1/48 (  2.1%) -   0.3 blocks/s - ETA:    149s
  Block    2/48 (  4.2%) -   0.4 blocks/s - ETA:    107s
  Block    3/48 (  6.2%) -   0.5 blocks/s - ETA:     90s
  Block    4/48 (  8.3%) -   0.6 blocks/s - ETA:     75s
  Block    5/48 ( 10.4%) -   0.5 blocks/s - ETA:     88s
  Block    6/48 ( 12.5%) -   0.5 blocks/s - ETA:     83s
  Block    7/48 ( 14.6%) -   0.5 blocks/s - ETA:     77s
  Block    8/48 ( 16.7%) -   0.6 blocks/s - ETA:     69s
  Block    9/48 ( 18.8%) -   0.5 blocks/s - ETA:     75s
  Block   10/48 ( 20.8%) -   0.5 blocks/s - ETA:     71s
  Block   11/48 ( 22.9%) -   0.5 blocks/s - ETA:     68s
  Block   12/48 ( 25.0%) -   0.6 blocks/s - ETA:     63s
  Block   13/48 ( 27.1%) -   0.6 blocks/s - ETA:     61s
  Block   14/48 ( 29.2%) -   0.6 blocks/s - ETA:     56s
  Block   15/48 ( 31.2%) -   0.6 blocks/s - ETA:     52s
  Block   16/48 ( 33.3%) -   0.7 blocks/s - ETA:     48s
  Block   17/48 ( 35.4%) -   0.6 blocks/s - ETA:     50s
  Block   18/48 ( 37.5%) -   0.