# Comparing data loading speeds of the `kvikIO` and `Zarr` engines

Benchmark details:
- Loading a 1-year subset of the WeatherBench2 dataset (~18.2 GB)
- Calculate the median time per epoch to load the entire dataset in 23 mini-batches
  (of batch size 32), across 10 epochs.

In [1]:
import importlib
import time

import lightning as L
import numpy as np
import torch
from tqdm.auto import tqdm, trange

# The benchmark module's name starts with a digit, so it is not a valid identifier
# for a regular `import` statement; load it by name through importlib instead.
module = importlib.import_module("1_benchmark_kvikIOzarr")

In [2]:
# Global variables
# Per-engine median epoch time in seconds; starts at 0 and is filled in after each benchmark
time_dict: dict[str, float] = dict.fromkeys(("kvikio", "zarr"), 0)

In [3]:
# Training loop data loading function
def training_loop(
    train_dataloader, num_epochs: int = 10, num_batches: int = 23
) -> list[float]:
    """
    Time how long it takes to iterate over a dataloader for several epochs.

    Every mini-batch is unpacked into ``(input, target, metadata)`` and a Mean
    Squared Error loss between input and target is computed, so that each batch
    is actually consumed rather than just fetched.

    Parameters
    ----------
    train_dataloader
        Iterable yielding 3-tuples ``(input, target, metadata)``, where input
        and target are tensors accepted by ``torch.nn.functional.mse_loss``.
    num_epochs : int
        Number of full passes over ``train_dataloader`` to time. Default is 10.
    num_batches : int
        Expected number of mini-batches per epoch. Only used as the ``total``
        of the inner progress bar. Default is 23.

    Returns
    -------
    list[float]
        Wall-clock duration of each epoch in seconds, in epoch order.
    """
    epoch_timings: list[float] = []
    for _epoch in trange(num_epochs):
        # Start timing
        tic: float = time.perf_counter()

        # Mini-batch processing
        for inputs, targets, _metadata in tqdm(
            iterable=train_dataloader, total=num_batches
        ):
            # Compute Mean Squared Error loss between t=0 and t=1, just for fun.
            # The value is discarded; only the time to produce it matters here.
            torch.nn.functional.mse_loss(input=inputs, target=targets)

        # Stop timing
        # NOTE(review): if the batches live on a GPU, the loss kernels may still be
        # running asynchronously at this point — confirm whether a device
        # synchronisation is wanted before reading the clock.
        toc: float = time.perf_counter()
        epoch_timings.append(toc - tic)
    return epoch_timings

## kvikIO engine

In [4]:
# Setup data
# Build the datamodule defined in the imported benchmark module, selecting the kvikio engine
datamodule: L.LightningDataModule = module.WeatherBench2DataModule(engine="kvikio")
# setup() is called before requesting the dataloader
# (presumably this is where the dataset gets opened — TODO confirm against the module)
datamodule.setup()
# Dataloader that is handed to training_loop() for timing
train_dataloader = datamodule.train_dataloader()

Loading data using kvikio engine


In [5]:
# Time every epoch using the dataloader built in the kvikIO setup cell
epoch_timings: list[float] = training_loop(train_dataloader=train_dataloader)

# Summarise the per-epoch durations (seconds): sum, median, mean and standard deviation
total_time, median_time, mean_time, std_time = (
    statistic(epoch_timings) for statistic in (np.sum, np.median, np.mean, np.std)
)
print(
    f"Total: {total_time:0.4f} seconds, "
    f"Median: {median_time:0.4f} seconds/epoch, "
    f"Mean: {mean_time:0.4f} ± {std_time:0.4f} seconds/epoch"
)

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/23 [00:00<?, ?it/s]

  0%|          | 0/23 [00:00<?, ?it/s]

  0%|          | 0/23 [00:00<?, ?it/s]

  0%|          | 0/23 [00:00<?, ?it/s]

  0%|          | 0/23 [00:00<?, ?it/s]

  0%|          | 0/23 [00:00<?, ?it/s]

  0%|          | 0/23 [00:00<?, ?it/s]

  0%|          | 0/23 [00:00<?, ?it/s]

  0%|          | 0/23 [00:00<?, ?it/s]

  0%|          | 0/23 [00:00<?, ?it/s]

Total: 113.9475 seconds, Median: 11.3795 seconds/epoch, Mean: 11.3948 ± 0.5356 seconds/epoch


In [6]:
# Keep the kvikIO median seconds/epoch for the summary comparison
time_dict.update({"kvikio": median_time})

## Zarr engine

In [7]:
# Setup data
# Build the datamodule defined in the imported benchmark module, selecting the zarr engine
datamodule: L.LightningDataModule = module.WeatherBench2DataModule(engine="zarr")
# setup() is called before requesting the dataloader
# (presumably this is where the dataset gets opened — TODO confirm against the module)
datamodule.setup()
# Dataloader that is handed to training_loop() for timing; replaces the kvikio one
train_dataloader = datamodule.train_dataloader()

Loading data using zarr engine


In [8]:
# Time every epoch using the dataloader built in the Zarr setup cell
epoch_timings: list[float] = training_loop(train_dataloader=train_dataloader)

# Summarise the per-epoch durations (seconds): sum, median, mean and standard deviation
total_time, median_time, mean_time, std_time = (
    statistic(epoch_timings) for statistic in (np.sum, np.median, np.mean, np.std)
)
print(
    f"Total: {total_time:0.4f} seconds, "
    f"Median: {median_time:0.4f} seconds/epoch, "
    f"Mean: {mean_time:0.4f} ± {std_time:0.4f} seconds/epoch"
)

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/23 [00:00<?, ?it/s]

  0%|          | 0/23 [00:00<?, ?it/s]

  0%|          | 0/23 [00:00<?, ?it/s]

  0%|          | 0/23 [00:00<?, ?it/s]

  0%|          | 0/23 [00:00<?, ?it/s]

  0%|          | 0/23 [00:00<?, ?it/s]

  0%|          | 0/23 [00:00<?, ?it/s]

  0%|          | 0/23 [00:00<?, ?it/s]

  0%|          | 0/23 [00:00<?, ?it/s]

  0%|          | 0/23 [00:00<?, ?it/s]

Total: 136.7952 seconds, Median: 13.6663 seconds/epoch, Mean: 13.6795 ± 0.0845 seconds/epoch


In [9]:
# Keep the Zarr median seconds/epoch for the summary comparison
time_dict.update({"zarr": median_time})

## Summary results

In [10]:
# kvikIO engine should take less time than Zarr engine.
# The message reports both stored median epoch times so that a failure is self-explanatory.
assert time_dict["kvikio"] < time_dict["zarr"], (
    "Expected kvikIO engine to be faster than Zarr engine, but median epoch times were "
    f"kvikio={time_dict['kvikio']:0.4f}s vs zarr={time_dict['zarr']:0.4f}s"
)

In [11]:
# Relative difference between the two stored median epoch times, expressed as a
# fraction of the kvikIO time: (zarr - kvikio) / kvikio.
# Relies on time_dict["kvikio"] having been replaced by a measured median; its
# initial placeholder value of 0 would mean dividing by zero here.
speedup: float = (time_dict["zarr"] - time_dict["kvikio"]) / time_dict["kvikio"]
# Report the fraction as a percentage with two decimals
print(f"kvikIO engine is {speedup * 100:0.2f}% faster than Zarr engine")

kvikIO engine is 20.10% faster than Zarr engine
