In [1]:
import sys
import numpy as np
import h5py

from plottr.data import datadict as dd
from plottr.data import datadict_storage as dds

# Simple timing for writing/reading a datadict

## Write

In [8]:
# Path handed to dds.datadict_to_hdf5 / dds.datadict_from_hdf5 in the write
# and read-back timing cells below.
FN = './ddh5_test-1'

In [9]:
%%timeit
nrows = 10000

# np.float was merely an alias for the builtin float and has been removed from
# NumPy (1.24+); np.float64 is the same dtype and works on every version.
x = np.arange(nrows, dtype=np.float64)
y = np.repeat(np.linspace(0., 1., 1001).reshape(1, -1), nrows, 0)
z = np.arange(y.size, dtype=np.float64).reshape(y.shape)

# Payload: x has shape (nrows,), y and z have shape (nrows, 1001), all float64,
# i.e. roughly 160 MB in total. Uncomment to print the exact figure (it will be
# printed on every timing iteration).
# print(f"total size = {(x.nbytes + y.nbytes + z.nbytes) * 1e-6} MB")

# z is declared as depending on the axes x and y.
data = dd.DataDict(
    x=dict(values=x, unit='nA'),
    y=dict(values=y, unit='nB'),
    z=dict(values=z, unit='nC', axes=['x', 'y']),
)
if not data.validate():
    raise ValueError("benchmark DataDict failed validation")

# Note: array construction and validation above are part of the timed region.
dds.datadict_to_hdf5(data, FN)

71.7 ms ± 1.33 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


## Read back

In [10]:
%%timeit
# Load the datadict stored under FN, then total the in-memory size of its
# three fields, expressed in MB.
ret_data = dds.datadict_from_hdf5(FN)
nbytes_total = 0
for key in ('x', 'y', 'z'):
    nbytes_total += ret_data.data_vals(key).nbytes
size = nbytes_total * 1e-6
# print(f"total size = {size} MB")

310 ms ± 11.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


## Appending row by row

In [11]:
# Path handed to dds.datadict_to_hdf5 in the append benchmark below; the later
# file-open timing cell opens FN + '.dd.h5'.
FN = './ddh5_test-2'
# Number of single-row appends performed after the initial write.
nrows = 100

In [12]:
%%timeit

# y and z each hold a single row of 1001 points (shape (1, 1001)); the same
# arrays are reused for every write, only x changes.
# np.float has been removed from NumPy (1.24+); np.float64 is the same dtype.
y = np.linspace(0., 1., 1001).reshape(1, -1)
z = np.arange(y.size, dtype=np.float64).reshape(y.shape)


def single_row(x_value):
    """Build a one-row DataDict whose x entry is ``x_value``; z depends on (x, y)."""
    return dd.DataDict(
        x=dict(values=np.array([x_value], dtype=np.float64), unit='nA'),
        y=dict(values=y, unit='nB'),
        z=dict(values=z, unit='nC', axes=['x', 'y']),
    )


# Initial write without appending.
# NOTE(review): AppendMode.none presumably replaces any existing content, so
# every timing iteration starts from a one-row file -- confirm in plottr docs.
dds.datadict_to_hdf5(single_row(0), FN, append_mode=dds.AppendMode.none)

# Then append nrows further rows, one datadict_to_hdf5 call per row.
for n in range(nrows):
    dds.datadict_to_hdf5(single_row(n + 1), FN, append_mode=dds.AppendMode.all)

452 ms ± 52.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


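It's important to note that the bulk of this time is spent just opening the file: each of the 101 `datadict_to_hdf5` calls above opens it again. Below we can see that opening the HDF5 file in append mode takes around 3 ms, so roughly 101 × 3.3 ms ≈ 0.33 s of the ≈ 0.45 s total is file-opening overhead.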

In [15]:
%%timeit
# Time a bare open/close of the file in append mode; listing the names under
# the 'data' group is only there so that the file is actually touched.
h5_path = FN + '.dd.h5'
with h5py.File(h5_path, 'a') as h5file:
    dsets = [name for name in h5file['data'].keys()]

3.26 ms ± 163 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


# Bare HDF5 benchmarking

## Appending row by row, resizing every time

In [16]:
%%timeit

FN = './hdf5_test.h5'

nrows = 100

# One row of data per dataset: x has shape (1,), y and z have shape (1, 1001).
# np.float has been removed from NumPy (1.24+); np.float64 is the same dtype.
x = np.array([0.])
y = np.linspace(0., 1., 1001).reshape(1, -1)
z = np.arange(y.size, dtype=np.float64).reshape(y.shape)
rows = (('x', x), ('y', y), ('z', z))

# Create the file with one initial row per dataset. maxshape leaves the first
# axis unlimited (None) so that the datasets can be grown later.
with h5py.File(FN, 'w', libver='latest') as f:
    grp = f.create_group('data')
    for dn, d in rows:
        grp.create_dataset(dn, maxshape=(None,) + d.shape[1:], data=d)

# Append nrows rows, re-opening the file for every row.
for n in range(nrows):
    with h5py.File(FN, 'a', libver='latest') as f:
        grp = f['data']
        for dn, d in rows:
            ds = grp[dn]
            # Grow the first axis by one and write the new row into the last slot.
            ds.resize((ds.shape[0] + 1,) + ds.shape[1:])
            ds[-1:] = d
            ds.flush()
        f.flush()

460 ms ± 37.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
