In [4]:
import h5py
import zarr
import time
from datetime import timedelta
import dask
import dask.array as da
from dask.diagnostics import ProgressBar
import numpy as np
from numcodecs import Blosc
from pathlib import Path

print("All libraries imported succesfully!")
print(zarr.__version__)

All libraries imported succesfully!
2.18.7


In [5]:
# Conversion arguments

input_path = "/Users/tobiasschleiss/documents/dtu/thesis/input/brain_2bin_cropSmall.h5"
output_path = "/Users/tobiasschleiss/Documents/DTU/Thesis/output/test.ome.zarr"
target_chunks = (64, 64, 64)
dataset_path = 'exchange/data'
max_mem_gb=1
safety_factor=0.5
pyramid_levels = 5
downsample_factor=2
compression_level=3
target_top_level_mb=10

n_workers = 2
worker_mem = 4 # memory per wroker in GB

In [6]:
def benchmark_parallel_vs_sequential():
    """Quick test to see if parallelization helps"""
    import time
    
    # Test sequential
    print("Testing sequential...")
    start = time.time()
    with h5py.File(input_path, 'r') as f:
        for i in range(10):
            _ = f[dataset_path][i*100:(i+1)*100, :1000, :1000]
    seq_time = time.time() - start
    
    # Test parallel
    print("Testing parallel...")
    def read_block(i):
        with h5py.File(input_path, 'r') as f:
            return f[dataset_path][i*100:(i+1)*100, :1000, :1000]
    
    start = time.time()
    from concurrent.futures import ThreadPoolExecutor
    with ThreadPoolExecutor(max_workers=4) as executor:
        list(executor.map(read_block, range(10)))
    par_time = time.time() - start
    
    print(f"\nResults:")
    print(f"  Sequential: {seq_time:.2f}s")
    print(f"  Parallel:   {par_time:.2f}s")
    
    if par_time < seq_time * 0.8:
        print(f"  → Parallel is {seq_time/par_time:.1f}× faster - USE IT!")
    else:
        print(f"  → Sequential is faster - KEEP CURRENT APPROACH")

# Run this test on your system
benchmark_parallel_vs_sequential()

Testing sequential...
Testing parallel...

Results:
  Sequential: 1.23s
  Parallel:   0.16s
  → Parallel is 7.5× faster - USE IT!


In [8]:
import multiprocessing
import psutil

def check_parallelization_capability():
    """Check if parallelization would help"""
    
    # CPU cores
    physical_cores = psutil.cpu_count(logical=False)
    logical_cores = psutil.cpu_count(logical=True)
    
    # Memory
    total_memory_gb = psutil.virtual_memory().total / 1e9
    
    # Storage type (harder to detect, but you know it's HDD)
    storage_type = "SSD"  # You mentioned this
    
    print("="*60)
    print("Parallelization Analysis")
    print("="*60)
    print(f"Physical CPU cores: {physical_cores}")
    print(f"Logical cores (with hyperthreading): {logical_cores}")
    print(f"Total memory: {total_memory_gb:.1f} GB")
    print(f"Storage type: {storage_type}")
    
    # Decision logic
    print("\n" + "="*60)
    print("Recommendation:")
    print("="*60)
    
    if storage_type == "HDD":
        print("✗ HDD Storage: Parallelization NOT recommended")
        print("  Reason: Multiple parallel reads cause disk thrashing")
        print("  HDD has only one read/write head - sequential is faster")
        print("  Recommendation: Use single-threaded with large slabs")
        return False
    
    elif storage_type == "SSD":
        if physical_cores >= 4 and total_memory_gb >= 32:
            print("✓ SSD + 4+ cores + 32GB+ RAM: Parallelization recommended")
            print(f"  Recommended workers: {physical_cores}")
            return True
        else:
            print("△ SSD but limited resources: Marginal benefit")
            print("  Recommendation: Try with 2 workers")
            return True
    
    elif storage_type == "NVMe" or storage_type == "Network":
        print("✓ Fast storage: Parallelization highly recommended")
        print(f"  Recommended workers: {physical_cores * 2}")
        return True

check_parallelization_capability()

Parallelization Analysis
Physical CPU cores: 10
Logical cores (with hyperthreading): 10
Total memory: 34.4 GB
Storage type: SSD

Recommendation:
✓ SSD + 4+ cores + 32GB+ RAM: Parallelization recommended
  Recommended workers: 10


True