In [7]:
import h5py
import zarr
import numpy as np
import time
from datetime import timedelta
from numcodecs import Blosc
import dask
from dask.distributed import Client, LocalCluster, progress
from dask.diagnostics import ProgressBar
import dask.array as da
from pathlib import Path
from math import ceil

# Confirm the import cell completed and report the installed zarr version,
# one item per output line.
print("All libraries imported succesfully!", zarr.__version__, sep="\n")

All libraries imported succesfully!
2.18.7


In [8]:
# ---------------------------------------------------------------------------
# Conversion arguments
# ---------------------------------------------------------------------------

# Input HDF5 file, the dataset inside it, and the output Zarr store.
# NOTE(review): the two paths spell the same directories with different letter
# case ("documents/dtu/thesis" vs "Documents/DTU/Thesis"); presumably this only
# resolves on a case-insensitive filesystem -- confirm and unify.
input_path = "/Users/tobiasschleiss/documents/dtu/thesis/input/brain_2bin_cropSmall.h5"
output_path = "/Users/tobiasschleiss/Documents/DTU/Thesis/output/parallel.ome.zarr"
dataset_path = "exchange/data"

# Chunk shape the array is rechunked to before it is written.
target_chunks = (64, 64, 64)

# Blosc compression level handed to the compressor.
compression_level = 3

# NOTE(review): the settings below are not used by the conversion cell that
# follows; presumably they belong to later memory-planning / pyramid steps --
# confirm before relying on them.
max_mem_gb = 32
safety_factor = 0.7
pyramid_levels = 5
downsample_factor = 2
target_top_level_mb = 10

# Local Dask cluster sizing (passed to LocalCluster).
n_workers = 4
threads_per_worker = 1
memory_limit = "6GB"

In [9]:
# =========================
# CLUSTER + OPEN HDF5
# =========================
# Convert one HDF5 dataset into component "0" of a Zarr store using a local
# Dask cluster.
#
# The cluster, the client and the HDF5 handle are managed by a single `with`
# statement so they are released even if the conversion raises. Without this,
# a failed run leaves worker processes alive and the dashboard port occupied.
# Exit order is the reverse of entry: HDF5 file, then client, then cluster.
with LocalCluster(
    n_workers=n_workers,
    threads_per_worker=threads_per_worker,
    processes=True,
    memory_limit=memory_limit,
    dashboard_address=":8787",
) as cluster, Client(cluster) as client, h5py.File(input_path, "r") as h5:
    print(f"✓ Dask dashboard: {client.dashboard_link}")

    dset = h5[dataset_path]
    shape = dset.shape

    print(f"Dataset shape: {shape}, dtype={dset.dtype}")

    # Read-side chunking. 512 is a multiple of the 64-wide target chunks, so
    # every target chunk lies inside exactly one read chunk.
    read_chunks = (512, 512, 512)

    print(read_chunks)

    # =========================
    # DASK ARRAY
    # =========================
    darr = da.from_array(
        dset,
        chunks=read_chunks,
        lock=True  # h5py is not thread-safe
    )

    # =========================
    # RECHUNK (single stage)
    # =========================
    print("Rechunking...")
    darr = darr.rechunk(target_chunks)

    # =========================
    # WRITE ZARR
    # =========================
    compressor = Blosc(
        cname="zstd",
        clevel=compression_level,
        shuffle=Blosc.BITSHUFFLE
    )

    # perf_counter is monotonic, so the measured interval cannot be distorted
    # by wall-clock adjustments.
    start = time.perf_counter()

    # dask.diagnostics.ProgressBar only reports for the single-machine
    # schedulers and stays silent once a distributed Client is active.
    # Build the write lazily, submit it to the cluster, and follow it with
    # dask.distributed.progress instead.
    write_task = darr.to_zarr(
        output_path,
        component="0",
        compressor=compressor,
        overwrite=True,
        dimension_separator="/",
        compute=False,
    )
    write_future = client.compute(write_task)
    progress(write_future)
    # Block until the write has finished; re-raises any worker-side error here.
    write_future.result()

    elapsed = time.perf_counter() - start
    # Uncompressed size of the array. `nbytes` is computed with Python
    # integers, so it cannot overflow the way np.prod can on platforms whose
    # default NumPy integer is 32-bit.
    total_gb = darr.nbytes / 1e9

    print(f"\n✓ Done in {elapsed:.1f}s")
    print(f"✓ Throughput: {total_gb / elapsed:.2f} GB/s")

# =========================
# CLEANUP
# =========================
# Nothing to do here: leaving the `with` block above has already closed the
# HDF5 file, the client and the cluster.

✓ Dask dashboard: http://127.0.0.1:8787/status
Dataset shape: (150, 3768, 2008), dtype=float32
(512, 512, 512)
Rechunking...

✓ Done in 7.2s
✓ Throughput: 0.63 GB/s


In [4]:
# Inspection of level 0: open component "0" of the written store as a Dask
# array and report its chunk size and overall shape.
source = da.from_zarr(output_path, component="0")

print(
    f"  Source chunks: {source.chunksize}",
    f"  Source shape: {source.shape}",
    sep="\n",
)

  Source chunks: (64, 64, 64)
  Source shape: (150, 3768, 2008)
