## Setup

In [1]:
import zarr
zarr.__version__

'2.2.0a2.dev2+dirty'

In [2]:
import bsddb3
bsddb3.__version__

'6.2.5'

In [3]:
import lmdb
lmdb.__version__

'0.93'

In [4]:
import numpy as np

In [5]:
import os
import shutil
bench_dir = '../data/bench'
if os.path.isdir(bench_dir):
    shutil.rmtree(bench_dir)
os.makedirs(bench_dir)

In [6]:
mem_store = dict()
lmdb_store = zarr.LMDBStore(os.path.join(bench_dir, 'lmdb'))
bdb_store = zarr.DBMStore(os.path.join(bench_dir, 'bdb'), open=bsddb3.btopen)
dir_store = zarr.DirectoryStore(os.path.join(bench_dir, 'dir'))

In [7]:
def save(a, z):
    z[:] = a
    if hasattr(z.store, 'sync'):
        z.store.sync()
    
    
def load(z, a):
    z.get_basic_selection(out=a)


## arange

In [8]:
a = np.arange(1000000000)
mem_z = zarr.empty_like(a, store=mem_store, overwrite=True)
lmdb_z = zarr.empty_like(a, store=lmdb_store, overwrite=True)
bdb_z = zarr.empty_like(a, store=bdb_store, overwrite=True)
dir_z = zarr.empty_like(a, store=dir_store, overwrite=True)

# check compression ratio
mem_z[:] = a
mem_z.info

0,1
Type,zarr.core.Array
Data type,int64
Shape,"(1000000000,)"
Chunk shape,"(244141,)"
Order,C
Read-only,False
Compressor,"Blosc(cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=0)"
Store type,builtins.dict
No. bytes,8000000000 (7.5G)
No. bytes stored,118578855 (113.1M)


In [9]:
%timeit save(a, mem_z)

768 ms ± 74.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [10]:
%timeit save(a, lmdb_z)

853 ms ± 58.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [11]:
%timeit save(a, bdb_z)

1.29 s ± 78.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [12]:
%timeit save(a, dir_z)

1.44 s ± 9.71 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [13]:
%timeit load(mem_z, a)

1.06 s ± 11 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [14]:
%timeit load(lmdb_z, a)

1.09 s ± 10 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [15]:
%timeit load(bdb_z, a)

1.11 s ± 4.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [16]:
%timeit load(dir_z, a)

1.26 s ± 16.4 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


## randint

In [17]:
a = np.random.randint(0, 2**30, size=1000000000)
mem_z = zarr.empty_like(a, store=mem_store, overwrite=True)
lmdb_z = zarr.empty_like(a, store=lmdb_store, overwrite=True)
bdb_z = zarr.empty_like(a, store=bdb_store, overwrite=True)
dir_z = zarr.empty_like(a, store=dir_store, overwrite=True)

# check compression ratio
mem_z[:] = a
mem_z.info

0,1
Type,zarr.core.Array
Data type,int64
Shape,"(1000000000,)"
Chunk shape,"(244141,)"
Order,C
Read-only,False
Compressor,"Blosc(cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=0)"
Store type,builtins.dict
No. bytes,8000000000 (7.5G)
No. bytes stored,4041602661 (3.8G)


In [18]:
%timeit save(a, mem_z)

1.26 s ± 15.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [19]:
%timeit save(a, lmdb_z)

6 s ± 985 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [20]:
%timeit save(a, bdb_z)

9.78 s ± 1.25 s per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [21]:
%timeit save(a, dir_z)

5.96 s ± 312 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [22]:
%timeit load(mem_z, a)

1.3 s ± 60.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [23]:
%timeit load(lmdb_z, a)

1.4 s ± 42.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [24]:
%timeit load(bdb_z, a)

3.76 s ± 25.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [25]:
%timeit load(dir_z, a)

1.81 s ± 64.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


## dask

In [26]:
import dask.array as da

In [27]:
def dask_op(source, sink, chunks=None):
    if chunks is None:
        try:
            chunks = sink.chunks
        except AttributeError:
            chunks = source.chunks
    d = da.from_array(source, chunks=chunks, asarray=False, fancy=False, lock=False)
    result = (d // 2) * 2
    da.store(result, sink, lock=False)
    if hasattr(sink, 'sync'):
        sink.sync()
    

In [28]:
%time dask_op(mem_z, mem_z)

CPU times: user 31.4 s, sys: 3.16 s, total: 34.6 s
Wall time: 6.98 s


In [29]:
%time dask_op(mem_z, lmdb_z)

CPU times: user 34.3 s, sys: 3.24 s, total: 37.5 s
Wall time: 7.92 s


In [30]:
%time dask_op(lmdb_z, mem_z)

CPU times: user 33.9 s, sys: 204 ms, total: 34.1 s
Wall time: 6.23 s


In [31]:
%time dask_op(mem_z, bdb_z)

CPU times: user 32.6 s, sys: 4.25 s, total: 36.8 s
Wall time: 10 s


In [32]:
%time dask_op(bdb_z, mem_z)

CPU times: user 37.8 s, sys: 1.79 s, total: 39.6 s
Wall time: 7.14 s


In [33]:
%time dask_op(mem_z, dir_z)

CPU times: user 33.9 s, sys: 3.94 s, total: 37.8 s
Wall time: 7.35 s


In [34]:
%time dask_op(dir_z, mem_z)

CPU times: user 34.8 s, sys: 1.66 s, total: 36.5 s
Wall time: 6.85 s


In [35]:
from dask.diagnostics import Profiler, ResourceProfiler, CacheProfiler, visualize
from bokeh.io import output_notebook
output_notebook()

In [36]:
with ResourceProfiler(dt=0.25) as rprof:
    dask_op(mem_z, mem_z)
rprof.visualize();

In [37]:
with ResourceProfiler(dt=0.25) as rprof:
    dask_op(mem_z, lmdb_z)
rprof.visualize();

In [38]:
with ResourceProfiler(dt=0.25) as rprof:
    dask_op(mem_z, bdb_z)
rprof.visualize();

In [39]:
with ResourceProfiler(dt=0.25) as rprof:
    dask_op(mem_z, dir_z)
rprof.visualize();

In [40]:
lmdb_store.close()
bdb_store.close()