In [28]:
import h5py
import zarr
import time
from datetime import timedelta
import dask
import dask.array as da
import numpy as np
from numcodecs import Blosc
from pathlib import Path
from ome_zarr.io import parse_url
from ome_zarr.writer import write_multiscale

print("All libraries imported succesfully!")
print(zarr.__version__)

All libraries imported succesfully!
2.18.7


In [32]:
# Conversion arguments

input_path = "/Users/tobiasschleiss/documents/dtu/thesis/input/brain_2bin_cropSmall.h5"
output_path = "/Users/tobiasschleiss/Documents/DTU/Thesis/output/test.ome.zarr"
target_chunks = (128, 128, 128)
dataset_path = 'exchange/data'
max_mem_gb=0.2
safety_factor=0.6
pyramid_levels = 5
downsample_factor=2
compression_level=3
target_top_level_mb=10

n_workers = 2
worker_mem = 4 # memory per wroker in GB

In [33]:
# Inspect HDF5 file
with h5py.File(input_path, 'r') as f:
    if dataset_path not in f:
        print(f"  ERROR: Dataset '{dataset_path}' not found")
        print(f"  Available paths: {list(f.keys())}")
        
    dataset = f[dataset_path]
    shape = dataset.shape
    dtype = dataset.dtype
    h5_chunks = dataset.chunks
    data_size_gb = dataset.nbytes / (1024**3)
    data_size_mb = dataset.nbytes / (1024**2)
    dtype_size = dtype.itemsize
        
    print(f"  Shape: {shape}")
    print(f"  Dtype: {dtype}")
    print(f"  Size: {data_size_gb:.2f} GB")
    print(f"  HDF5 chunks: {h5_chunks if h5_chunks else 'Contiguous'}")

  Shape: (150, 3768, 2008)
  Dtype: float32
  Size: 4.23 GB
  HDF5 chunks: Contiguous


In [34]:
"""
Calculate optimal block size when full Z-slabs don't fit.
Returns block dimensions that align with target chunks.
"""
    
z_total, y_total, x_total = shape
target_z, target_y, target_x = target_chunks
    
available_bytes = max_mem_gb * 1e9 * safety_factor
    
print(f"\n{'='*60}")
print("Block Size Calculation (Memory-Constrained)")
print(f"{'='*60}")
print(f"Memory budget: {max_mem_gb:.2f} GB (safety: {int(safety_factor*100)}%)")
print(f"Dataset shape: {shape}")
print(f"Target chunks: {target_chunks}")
    
# Try to use target Z (best for zarr alignment)
block_z = target_z
bytes_for_full_plane = block_z * y_total * x_total * dtype_size
    
if bytes_for_full_plane <= available_bytes:
    # Full Z-slab fits!
    block_shape = (block_z, y_total, x_total)
    block_gb = bytes_for_full_plane / 1e9
    num_blocks = (z_total + block_z - 1) // block_z
        
    print(f"\n✓ Strategy: Full Z-slabs")
    print(f"  Block shape: {block_shape}")
    print(f"  Block size: {block_gb:.2f} GB")
    print(f"  Number of blocks: {num_blocks}")
        
    #return block_shape, 'full_slab'
    
# Full slab doesn't fit - use Y-blocks
print(f"\n⚠️  Full Z-slab ({bytes_for_full_plane/1e9:.2f} GB) doesn't fit")
    
# Calculate max Y that fits with target Z and full X
bytes_per_y_row = block_z * x_total * dtype_size
max_y = int(available_bytes / bytes_per_y_row)
print(f"max y rows: {max_y}")
    
if max_y < target_y:
    # Even one Y-row doesn't fit - need X-blocks too
    print(f"  target Y-row doesn't fit - using Y-X blocks")
        
    # Calculate Y and X that fit
    available_elements = available_bytes / (block_z * dtype_size)
        
    # Try to keep Y as large as possible, reduce X
    for y_mult in [8, 4, 2, 1]:
        block_y = target_y * y_mult
        if block_y > y_total:
            continue
            
        max_x = int(available_elements / block_y)
            
        # Align X to target chunks
        block_x = (max_x // target_x) * target_x
        block_x = max(target_x, block_x)
        block_x = min(block_x, x_total)
            
        block_bytes = block_z * block_y * block_x * dtype_size
            
        if block_bytes <= available_bytes:
            block_shape = (block_z, block_y, block_x)
            block_gb = block_bytes / 1e9
                
            num_z_blocks = (z_total + block_z - 1) // block_z
            num_y_blocks = (y_total + block_y - 1) // block_y
            num_x_blocks = (x_total + block_x - 1) // block_x
            total_blocks = num_z_blocks * num_y_blocks * num_x_blocks
                
            print(f"\n✓ Strategy: Y-X blocks")
            print(f"  Block shape: {block_shape}")
            print(f"  Block size: {block_gb:.2f} GB")
            print(f"  Blocks per dimension: Z:{num_z_blocks}, Y:{num_y_blocks}, X:{num_x_blocks}")
            print(f"  Total blocks: {total_blocks}")
                
            # return block_shape, 'yx_blocks'
        
    raise MemoryError(f"Cannot fit even minimal block size in {max_mem_gb:.2f} GB")
    
# Y-blocks with full X
# Align Y to target chunks
block_y = (max_y // target_y) * target_y
block_y = max(target_y, block_y)
block_y = min(block_y, y_total)
    
block_shape = (block_z, block_y, x_total)
block_gb = block_z * block_y * x_total * dtype_size / 1e9
    
num_z_blocks = (z_total + block_z - 1) // block_z
num_y_blocks = (y_total + block_y - 1) // block_y
total_blocks = num_z_blocks * num_y_blocks
    
print(f"\n✓ Strategy: Y-blocks (your approach!)")
print(f"  Block shape: {block_shape}")
print(f"  Block size: {block_gb:.2f} GB")
print(f"  Blocks per Z-slab: {num_y_blocks}")
print(f"  Total blocks: {total_blocks}")
print(f"  Note: {num_y_blocks} seeks per Z-slab vs 1 for full slab")
    
# return block_shape, 'y_blocks'


Block Size Calculation (Memory-Constrained)
Memory budget: 0.20 GB (safety: 60%)
Dataset shape: (150, 3768, 2008)
Target chunks: (128, 128, 128)

⚠️  Full Z-slab (3.87 GB) doesn't fit
max y rows: 116
  Even one Y-row doesn't fit - using Y-X blocks

✓ Strategy: Y-X blocks
  Block shape: (128, 1024, 128)
  Block size: 0.07 GB
  Blocks per dimension: Z:2, Y:4, X:16
  Total blocks: 128

✓ Strategy: Y-X blocks
  Block shape: (128, 512, 384)
  Block size: 0.10 GB
  Blocks per dimension: Z:2, Y:8, X:6
  Total blocks: 96

✓ Strategy: Y-X blocks
  Block shape: (128, 256, 896)
  Block size: 0.12 GB
  Blocks per dimension: Z:2, Y:15, X:3
  Total blocks: 90

✓ Strategy: Y-X blocks
  Block shape: (128, 128, 1792)
  Block size: 0.12 GB
  Blocks per dimension: Z:2, Y:30, X:2
  Total blocks: 120


MemoryError: Cannot fit even minimal block size in 0.20 GB

In [23]:
"""
HDF5 to Zarr conversion with automatic block sizing.
Falls back to Y-blocks or Y-X-blocks when memory constrained.
"""
    
print("="*60)
print("HDF5 to Zarr with Adaptive Blocking")
print("="*60)
    
with h5py.File(input_path, 'r') as f:
    dataset = f[dataset_path]
    shape = dataset.shape
    dtype = dataset.dtype
        
    print(f"\nDataset: {shape}, {dtype}")
        
    # Calculate optimal block size
    block_shape, strategy = calculate_optimal_block_size(-----
        shape, dtype, target_chunks, max_mem_gb, safety_factor
    )
        
    block_z, block_y, block_x = block_shape
    z_total, y_total, x_total = shape
        
    # Create zarr
    root = zarr.open_group(str(output_path), mode='w', zarr_format=2)
    compressor = Blosc(cname='zstd', clevel=compression_level, shuffle=Blosc.BITSHUFFLE)
        
    zarr_array = root.create_dataset(
        '0',
        shape=shape,
        chunks=target_chunks,
        dtype=dtype,
        compressor=compressor,
        dimension_separator='/'
    )
        
    print(f"\n{'='*60}")
    print("Writing Data")
    print(f"{'='*60}")
        
    start_time = time.time()
    block_count = 0
        
    # Calculate total blocks
    total_blocks = (
        ((z_total + block_z - 1) // block_z) *
        ((y_total + block_y - 1) // block_y) *
        ((x_total + block_x - 1) // block_x)
    )
        
    print(f"Processing {total_blocks} blocks...")
        
    # Iterate over blocks
    for z_start in range(0, z_total, block_z):
        z_end = min(z_start + block_z, z_total)
            
        for y_start in range(0, y_total, block_y):
            y_end = min(y_start + block_y, y_total)
                
            for x_start in range(0, x_total, block_x):
                x_end = min(x_start + block_x, x_total)
                    
                block_count += 1
                    
                # Read block from HDF5
                block = dataset[z_start:z_end, y_start:y_end, x_start:x_end]
                    
                # Write to zarr (zarr will internally chunk to target_chunks)
                zarr_array[z_start:z_end, y_start:y_end, x_start:x_end] = block
                    
                del block
                    
                # Progress reporting
                if block_count % 10 == 0 or block_count == total_blocks:
                    elapsed = time.time() - start_time
                    rate = block_count / elapsed if elapsed > 0 else 0
                    eta = (total_blocks - block_count) / rate if rate > 0 else 0
                    progress = block_count / total_blocks * 100
                    
                    print(f"  Block {block_count:4d}/{total_blocks} ({progress:5.1f}%) - "
                          f"{rate:5.1f} blocks/s - ETA: {eta:6.0f}s")
        
    elapsed = time.time() - start_time
    throughput = (np.prod(shape) * np.dtype(dtype).itemsize / 1e9) / elapsed
        
    print(f"\n✓ Level 0 complete in {elapsed:.1f}s")
    print(f"  Throughput: {throughput:.2f} GB/s")
        
    return root, strategy

HDF5 to Zarr with Adaptive Blocking


SyntaxError: 'return' outside function (2867855773.py, line 90)