## Setup

In [1]:
import zarr
zarr.__version__

'2.2.0a2.dev2+dirty'

In [2]:
import bsddb3
bsddb3.__version__

'6.2.5'

In [3]:
import lmdb
lmdb.__version__

'0.93'

In [4]:
import numpy as np

In [5]:
import dbm.gnu

In [6]:
import os
import shutil
# Root directory for every on-disk store used below; clean() removes and
# recreates it.
bench_dir = '../data/bench'


def clean():
    """Reset the benchmark directory to an empty state.

    If ``bench_dir`` is an existing directory it is removed together with
    everything below it; the directory is then created afresh.
    """
    stale = os.path.isdir(bench_dir)
    if stale:
        shutil.rmtree(bench_dir)
    os.makedirs(bench_dir)


In [7]:
# start from an empty bench_dir before opening any store
clean()

# in-memory baseline
mem_store = {}

# stores backed by plain files / directories under bench_dir
dir_store = zarr.DirectoryStore(os.path.join(bench_dir, 'dir'))
zip_store = zarr.ZipStore(os.path.join(bench_dir, 'zip'), mode='w')

# stores backed by key-value databases under bench_dir
lmdb_store = zarr.LMDBStore(os.path.join(bench_dir, 'lmdb'))
gdb_store = zarr.DBMStore(os.path.join(bench_dir, 'gdb'), open=dbm.gnu.open)
bdb_store = zarr.DBMStore(os.path.join(bench_dir, 'bdb'), open=bsddb3.btopen)

## Main benchmarks

# NOTE(review): this repeats the store setup from the cell above; the only
# difference in arguments is flag='nf' on the gdbm store. Running both in one
# kernel re-opens the same paths while the first set of stores is still open
# -- confirm which of the two cells is meant to be kept.
mem_store = dict()
lmdb_store = zarr.LMDBStore(os.path.join(bench_dir, 'lmdb'))
bdb_store = zarr.DBMStore(os.path.join(bench_dir, 'bdb'), open=bsddb3.btopen)
gdb_store = zarr.DBMStore(os.path.join(bench_dir, 'gdb'), open=dbm.gnu.open, flag='nf')
zip_store = zarr.ZipStore(os.path.join(bench_dir, 'zip'), mode='w')
dir_store = zarr.DirectoryStore(os.path.join(bench_dir, 'dir'))

In [8]:
def save(a, z):
    """Write ``a`` into the whole of array ``z`` and flush its store.

    Parameters
    ----------
    a : array-like
        Data assigned to ``z[:]``.
    z : zarr array
        Destination array; its ``store`` attribute is inspected to decide
        whether clearing and flushing are needed.
    """
    target = z.store
    if isinstance(target, zarr.ZipStore):
        # needed for zip benchmarks to avoid duplicate entries
        target.clear()
    z[:] = a
    # only some store types expose flush(); call it when it is there
    if hasattr(target, 'flush'):
        target.flush()
    
    
def load(z, a):
    """Read from array ``z`` into the existing buffer ``a``.

    ``a`` is handed to ``z.get_basic_selection`` as its ``out`` argument. No
    explicit selection is passed, so the method's default selection applies
    (presumably the whole array -- confirm against the zarr API).
    Nothing is returned.
    """
    z.get_basic_selection(out=a)


## arange

In [9]:
a = np.arange(500000000)

# one zarr array created with empty_like(a) per store, built in the same
# order as the names on the left-hand side
mem_z, lmdb_z, gdb_z, bdb_z, zip_z, dir_z = [
    zarr.empty_like(a, store=backend)
    for backend in (mem_store, lmdb_store, gdb_store,
                    bdb_store, zip_store, dir_store)
]

# check compression ratio
mem_z[:] = a
mem_z.info

0,1
Type,zarr.core.Array
Data type,int64
Shape,"(500000000,)"
Chunk shape,"(488282,)"
Order,C
Read-only,False
Compressor,"Blosc(cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=0)"
Store type,builtins.dict
No. bytes,4000000000 (3.7G)
No. bytes stored,59269657 (56.5M)


In [10]:
%timeit save(a, mem_z)

369 ms ± 68.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [11]:
%timeit save(a, lmdb_z)

310 ms ± 16 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [12]:
%timeit save(a, gdb_z)

805 ms ± 115 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [13]:
%timeit save(a, bdb_z)

1.42 s ± 197 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [14]:
%timeit save(a, zip_z)

479 ms ± 25.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [15]:
%timeit save(a, dir_z)

543 ms ± 34.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [16]:
%timeit load(mem_z, a)

441 ms ± 33.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [17]:
%timeit load(lmdb_z, a)

443 ms ± 5.94 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [18]:
%timeit load(gdb_z, a)

463 ms ± 9.89 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [19]:
%timeit load(bdb_z, a)

512 ms ± 7.15 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [20]:
%timeit load(zip_z, a)

589 ms ± 12 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [21]:
%timeit load(dir_z, a)

509 ms ± 11 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


## randint

In [9]:
# fix the seed so the random benchmark data, and therefore the compression
# ratio reported below, is the same on every run
np.random.seed(42)
a = np.random.randint(0, 2**30, size=500000000)

# recreate the arrays in each store, overwriting what the previous section left
mem_z = zarr.empty_like(a, store=mem_store, overwrite=True)
lmdb_z = zarr.empty_like(a, store=lmdb_store, overwrite=True)
gdb_z = zarr.empty_like(a, store=gdb_store, overwrite=True)
bdb_z = zarr.empty_like(a, store=bdb_store, overwrite=True)
# the zip store is cleared explicitly rather than passing overwrite=True
# (presumably for the same reason save() clears it: to avoid duplicate entries)
zip_store.clear()
zip_z = zarr.empty_like(a, store=zip_store)
dir_z = zarr.empty_like(a, store=dir_store, overwrite=True)

# check compression ratio
mem_z[:] = a
mem_z.info

0,1
Type,zarr.core.Array
Data type,int64
Shape,"(500000000,)"
Chunk shape,"(488282,)"
Order,C
Read-only,False
Compressor,"Blosc(cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=0)"
Store type,builtins.dict
No. bytes,4000000000 (3.7G)
No. bytes stored,2020785315 (1.9G)


In [10]:
%timeit -r3 save(a, mem_z)

655 ms ± 39.1 ms per loop (mean ± std. dev. of 3 runs, 1 loop each)


In [11]:
%timeit -r3 save(a, lmdb_z)

1.02 s ± 210 ms per loop (mean ± std. dev. of 3 runs, 1 loop each)


In [12]:
%timeit -r3 save(a, gdb_z)

4.96 s ± 685 ms per loop (mean ± std. dev. of 3 runs, 1 loop each)


In [13]:
%timeit -r3 save(a, bdb_z)

6 s ± 602 ms per loop (mean ± std. dev. of 3 runs, 1 loop each)


In [14]:
%timeit -r3 save(a, zip_z)

3.38 s ± 46.8 ms per loop (mean ± std. dev. of 3 runs, 1 loop each)


In [15]:
%timeit -r3 save(a, dir_z)

2.71 s ± 252 ms per loop (mean ± std. dev. of 3 runs, 1 loop each)


In [19]:
%timeit -r3 load(mem_z, a)

536 ms ± 12.9 ms per loop (mean ± std. dev. of 3 runs, 1 loop each)


In [20]:
%timeit -r3 load(lmdb_z, a)

558 ms ± 21.2 ms per loop (mean ± std. dev. of 3 runs, 1 loop each)


In [21]:
%timeit -r3 load(gdb_z, a)

1.22 s ± 25.5 ms per loop (mean ± std. dev. of 3 runs, 1 loop each)


In [22]:
%timeit -r3 load(bdb_z, a)

1.6 s ± 6.66 ms per loop (mean ± std. dev. of 3 runs, 1 loop each)


In [23]:
%timeit -r3 load(zip_z, a)

2.42 s ± 44 ms per loop (mean ± std. dev. of 3 runs, 1 loop each)


In [24]:
%timeit -r3 load(dir_z, a)

797 ms ± 98.4 ms per loop (mean ± std. dev. of 3 runs, 1 loop each)


## dask

In [16]:
import dask.array as da

In [35]:
def dask_op(source, sink, chunks=None):
    """Run ``(x // 2) * 2`` over ``source`` with dask and store it in ``sink``.

    Parameters
    ----------
    source : array-like
        Input wrapped with ``da.from_array``.
    sink : array-like
        Destination passed to ``da.store``; its ``store`` attribute is
        cleared first when it is a ``zarr.ZipStore`` and flushed afterwards
        when it has a ``flush`` method.
    chunks : optional
        Chunking for the dask array. When None, ``sink.chunks`` is used,
        falling back to ``source.chunks`` if the sink has no such attribute.
    """
    sink_store = sink.store
    if isinstance(sink_store, zarr.ZipStore):
        sink_store.clear()

    if chunks is None:
        try:
            chunks = sink.chunks
        except AttributeError:
            chunks = source.chunks

    lazy_input = da.from_array(source, chunks=chunks, asarray=False, fancy=False, lock=False)
    halved = lazy_input // 2
    rounded = halved * 2
    da.store(rounded, sink, lock=False)

    # only some store types expose flush(); call it when it is there
    if hasattr(sink_store, 'flush'):
        sink_store.flush()
    

### Compare sources

In [36]:
%time dask_op(mem_z, mem_z)

CPU times: user 17 s, sys: 100 ms, total: 17.1 s
Wall time: 2.78 s


In [37]:
%time dask_op(lmdb_z, mem_z)

CPU times: user 16.6 s, sys: 100 ms, total: 16.7 s
Wall time: 2.59 s


In [38]:
%time dask_op(gdb_z, mem_z)

CPU times: user 16.9 s, sys: 648 ms, total: 17.5 s
Wall time: 3.12 s


In [39]:
%time dask_op(bdb_z, mem_z)

CPU times: user 19.2 s, sys: 1.14 s, total: 20.4 s
Wall time: 3.14 s


In [40]:
%time dask_op(zip_z, mem_z)

CPU times: user 15.2 s, sys: 752 ms, total: 16 s
Wall time: 3.02 s


In [41]:
%time dask_op(dir_z, mem_z)

CPU times: user 17.5 s, sys: 948 ms, total: 18.5 s
Wall time: 2.91 s


### Compare sinks

In [42]:
%time dask_op(mem_z, lmdb_z)

CPU times: user 17.5 s, sys: 824 ms, total: 18.4 s
Wall time: 2.86 s


In [43]:
%time dask_op(mem_z, gdb_z)

CPU times: user 19.5 s, sys: 1.79 s, total: 21.3 s
Wall time: 6.99 s


In [44]:
%time dask_op(mem_z, bdb_z)

CPU times: user 14.2 s, sys: 2.59 s, total: 16.7 s
Wall time: 7.01 s


In [45]:
%time dask_op(mem_z, zip_z)

CPU times: user 14 s, sys: 1.6 s, total: 15.6 s
Wall time: 3.67 s


In [46]:
%time dask_op(mem_z, dir_z)

CPU times: user 17.7 s, sys: 2.15 s, total: 19.9 s
Wall time: 3.27 s


In [47]:
# close the stores that were opened on paths under bench_dir, in the same
# order as before: lmdb, gdbm, bdb, zip
for open_store in (lmdb_store, gdb_store, bdb_store, zip_store):
    open_store.close()