## Setup

In [1]:
import zarr
zarr.__version__

'2.2.0a2.dev9+dirty'

In [2]:
import bsddb3
bsddb3.__version__

'6.2.5'

In [3]:
import lmdb
lmdb.__version__

'0.93'

In [4]:
import numpy as np

In [5]:
import dbm.gnu

In [6]:
import os
import shutil
bench_dir = '../data/bench'


def clean():
    """Reset the benchmark directory.

    Removes ``bench_dir`` together with everything inside it when it
    exists as a directory, then creates it again.
    """
    leftover = os.path.isdir(bench_dir)
    if leftover:
        # drop whatever earlier benchmark runs wrote
        shutil.rmtree(bench_dir)
    # always finish by creating the directory afresh
    os.makedirs(bench_dir)

    
def setup(a, name='foo/bar'):
    """Create one empty zarr array per storage backend, using ``a`` as template.

    Stores left open by a previous call are closed first, then the
    benchmark directory is reset with ``clean()`` and a fresh root group
    is created on each backend: a plain ``dict``, ``zarr.DictStore``,
    ``zarr.LMDBStore``, ``zarr.DBMStore`` (once with ``dbm.gnu.open`` and
    once with ``bsddb3.btopen``), ``zarr.ZipStore`` and
    ``zarr.DirectoryStore``.

    The arrays are published as the module-level globals ``fdict_z``,
    ``hdict_z``, ``lmdb_z``, ``gdb_z``, ``bdb_z``, ``zip_z`` and ``dir_z``
    so that the timing cells can refer to them by name.

    Parameters
    ----------
    a : array-like
        Template handed to ``empty_like`` for every backend; it is also
        written in full to the dict-backed array.
    name : str, optional
        Path of the array inside each root group.

    Returns
    -------
    The ``info`` attribute of the dict-backed array after ``a`` has been
    stored in it, so the notebook can display it.
    """
    global fdict_z, hdict_z, lmdb_z, gdb_z, bdb_z, zip_z, dir_z

    # A previous call may have left file-backed stores open. Close them
    # before clean() deletes the files underneath, otherwise their handles
    # leak and LMDB keeps an environment open on a path that no longer exists.
    # globals().get() is used because the names are undefined on first call.
    for previous_name in ('lmdb_z', 'gdb_z', 'bdb_z', 'zip_z'):
        previous = globals().get(previous_name)
        close = getattr(getattr(previous, 'store', None), 'close', None)
        if close is not None:
            close()

    clean()
    fdict_root = zarr.group(store=dict())
    hdict_root = zarr.group(store=zarr.DictStore())
    lmdb_root = zarr.group(store=zarr.LMDBStore(os.path.join(bench_dir, 'lmdb')))
    gdb_root = zarr.group(store=zarr.DBMStore(os.path.join(bench_dir, 'gdb'), open=dbm.gnu.open))
    bdb_root = zarr.group(store=zarr.DBMStore(os.path.join(bench_dir, 'bdb'), open=bsddb3.btopen))
    zip_root = zarr.group(store=zarr.ZipStore(os.path.join(bench_dir, 'zip'), mode='w'))
    dir_root = zarr.group(store=zarr.DirectoryStore(os.path.join(bench_dir, 'dir')))

    fdict_z = fdict_root.empty_like(name, a)
    hdict_z = hdict_root.empty_like(name, a)
    lmdb_z = lmdb_root.empty_like(name, a)
    gdb_z = gdb_root.empty_like(name, a)
    bdb_z = bdb_root.empty_like(name, a)
    zip_z = zip_root.empty_like(name, a)
    dir_z = dir_root.empty_like(name, a)

    # check compression ratio
    fdict_z[:] = a
    return fdict_z.info
    
    

## Main benchmarks

In [7]:
def save(a, z):
    """Write the whole of ``a`` into the array ``z``.

    A zip-backed store is emptied first, and any store that offers a
    ``flush`` method is flushed once the data has been written.
    """
    backend = z.store
    if isinstance(backend, zarr.ZipStore):
        # needed for zip benchmarks to avoid duplicate entries
        backend.clear()

    z[:] = a

    flush = getattr(backend, 'flush', None)
    if flush is not None:
        flush()
    
    
def load(z, a):
    """Read the array ``z`` into the existing buffer ``a``.

    ``a`` is passed as the ``out`` argument of ``z.get_basic_selection``;
    nothing is returned.
    """
    read_into = z.get_basic_selection
    read_into(out=a)


## arange

In [8]:
a = np.arange(500000000)
setup(a)

0,1
Name,/foo/bar
Type,zarr.core.Array
Data type,int64
Shape,"(500000000,)"
Chunk shape,"(488282,)"
Order,C
Read-only,False
Compressor,"Blosc(cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=0)"
Store type,builtins.dict
No. bytes,4000000000 (3.7G)


### save

In [9]:
%timeit save(a, fdict_z)

319 ms ± 10.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [10]:
%timeit save(a, hdict_z)

322 ms ± 8.88 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [11]:
%timeit save(a, lmdb_z)

347 ms ± 14.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [12]:
%timeit save(a, gdb_z)

918 ms ± 182 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [13]:
%timeit save(a, bdb_z)

1.32 s ± 389 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [14]:
%timeit save(a, zip_z)

494 ms ± 20.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [15]:
%timeit save(a, dir_z)

614 ms ± 4.18 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


### load

In [16]:
%timeit load(fdict_z, a)

427 ms ± 11.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [19]:
%timeit load(hdict_z, a)

451 ms ± 11.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [20]:
%timeit load(lmdb_z, a)

460 ms ± 11.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [21]:
%timeit load(gdb_z, a)

482 ms ± 11.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [22]:
%timeit load(bdb_z, a)

538 ms ± 5.55 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [23]:
%timeit load(zip_z, a)

613 ms ± 8.06 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [24]:
%timeit load(dir_z, a)

526 ms ± 9.25 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


## randint

In [25]:
np.random.seed(42)
a = np.random.randint(0, 2**30, size=500000000)
setup(a)

0,1
Name,/foo/bar
Type,zarr.core.Array
Data type,int64
Shape,"(500000000,)"
Chunk shape,"(488282,)"
Order,C
Read-only,False
Compressor,"Blosc(cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=0)"
Store type,builtins.dict
No. bytes,4000000000 (3.7G)


### save

In [26]:
%timeit -r3 save(a, fdict_z)

654 ms ± 8.74 ms per loop (mean ± std. dev. of 3 runs, 1 loop each)


In [27]:
%timeit -r3 save(a, hdict_z)

677 ms ± 62.5 ms per loop (mean ± std. dev. of 3 runs, 1 loop each)


In [28]:
%timeit -r3 save(a, lmdb_z)

926 ms ± 75.2 ms per loop (mean ± std. dev. of 3 runs, 1 loop each)


In [29]:
%timeit -r3 save(a, gdb_z)

4.25 s ± 58.9 ms per loop (mean ± std. dev. of 3 runs, 1 loop each)


In [30]:
%timeit -r3 save(a, bdb_z)

5.93 s ± 311 ms per loop (mean ± std. dev. of 3 runs, 1 loop each)


In [31]:
%timeit -r3 save(a, zip_z)

3.52 s ± 139 ms per loop (mean ± std. dev. of 3 runs, 1 loop each)


In [32]:
%timeit -r3 save(a, dir_z)

3.09 s ± 137 ms per loop (mean ± std. dev. of 3 runs, 1 loop each)


### load

In [33]:
%timeit -r3 load(fdict_z, a)

616 ms ± 121 ms per loop (mean ± std. dev. of 3 runs, 1 loop each)


In [34]:
%timeit -r3 load(hdict_z, a)

534 ms ± 13.8 ms per loop (mean ± std. dev. of 3 runs, 1 loop each)


In [35]:
%timeit -r3 load(lmdb_z, a)

582 ms ± 29.5 ms per loop (mean ± std. dev. of 3 runs, 1 loop each)


In [36]:
%timeit -r3 load(gdb_z, a)

1.23 s ± 4.51 ms per loop (mean ± std. dev. of 3 runs, 1 loop each)


In [37]:
%timeit -r3 load(bdb_z, a)

1.64 s ± 6.16 ms per loop (mean ± std. dev. of 3 runs, 1 loop each)


In [38]:
%timeit -r3 load(zip_z, a)

2.38 s ± 19.1 ms per loop (mean ± std. dev. of 3 runs, 1 loop each)


In [39]:
%timeit -r3 load(dir_z, a)

874 ms ± 41.8 ms per loop (mean ± std. dev. of 3 runs, 1 loop each)


### dask

In [40]:
import dask.array as da

In [41]:
def dask_op(source, sink, chunks=None):
    """Run a small dask computation from ``source`` into ``sink``.

    ``source`` is wrapped as a dask array, ``(d // 2) * 2`` is computed
    and the result is stored into ``sink`` without locking.

    Parameters
    ----------
    source
        Array-like to read from.
    sink
        Target to store into. If it has a ``store`` attribute that is a
        ``zarr.ZipStore``, that store is cleared first; if the store has a
        ``flush`` method, it is called after the data has been written.
    chunks : optional
        Chunking for the dask array. Defaults to ``sink.chunks``, falling
        back to ``source.chunks`` when the sink has no such attribute.
    """
    # The chunks lookup below already allows for a sink that is not a zarr
    # array, so do not assume the sink has a ``store`` attribute either.
    store = getattr(sink, 'store', None)
    if isinstance(store, zarr.ZipStore):
        # empty the zip store so rewriting chunks does not add duplicate entries
        store.clear()
    if chunks is None:
        try:
            chunks = sink.chunks
        except AttributeError:
            chunks = source.chunks
    d = da.from_array(source, chunks=chunks, asarray=False, fancy=False, lock=False)
    result = (d // 2) * 2
    da.store(result, sink, lock=False)
    flush = getattr(store, 'flush', None)
    if flush is not None:
        flush()
    

#### Compare sources

In [42]:
%time dask_op(fdict_z, fdict_z)

CPU times: user 16.7 s, sys: 1.42 s, total: 18.1 s
Wall time: 3.26 s


In [43]:
%time dask_op(hdict_z, fdict_z)

CPU times: user 18.2 s, sys: 172 ms, total: 18.4 s
Wall time: 3.03 s


In [44]:
%time dask_op(lmdb_z, fdict_z)

CPU times: user 17.6 s, sys: 72 ms, total: 17.7 s
Wall time: 2.79 s


In [45]:
%time dask_op(gdb_z, fdict_z)

CPU times: user 17.2 s, sys: 712 ms, total: 17.9 s
Wall time: 3.23 s


In [46]:
%time dask_op(bdb_z, fdict_z)

CPU times: user 20.4 s, sys: 1.29 s, total: 21.6 s
Wall time: 3.42 s


In [47]:
%time dask_op(zip_z, fdict_z)

CPU times: user 16 s, sys: 652 ms, total: 16.7 s
Wall time: 3.13 s


In [48]:
%time dask_op(dir_z, fdict_z)

CPU times: user 18 s, sys: 880 ms, total: 18.9 s
Wall time: 3 s


#### Compare sinks

In [49]:
%time dask_op(fdict_z, hdict_z)

CPU times: user 16.7 s, sys: 1.08 s, total: 17.8 s
Wall time: 3.11 s


In [50]:
%time dask_op(fdict_z, lmdb_z)

CPU times: user 17.4 s, sys: 1.37 s, total: 18.7 s
Wall time: 2.82 s


In [51]:
%time dask_op(fdict_z, gdb_z)

CPU times: user 19 s, sys: 3.02 s, total: 22 s
Wall time: 8.72 s


In [52]:
%time dask_op(fdict_z, bdb_z)

CPU times: user 14.2 s, sys: 3.38 s, total: 17.5 s
Wall time: 6.87 s


In [53]:
%time dask_op(fdict_z, zip_z)

CPU times: user 14.3 s, sys: 2.66 s, total: 16.9 s
Wall time: 3.95 s


In [54]:
%time dask_op(fdict_z, dir_z)

CPU times: user 16.5 s, sys: 3.54 s, total: 20 s
Wall time: 3.19 s


In [56]:
# Close the LMDB, gdbm, Berkeley DB and zip stores created by setup();
# the dict-backed and directory-backed arrays are not closed here.
lmdb_z.store.close()
gdb_z.store.close()
bdb_z.store.close()
zip_z.store.close()