# Object arrays

In [1]:
import numpy as np

In [2]:
import zarr
zarr.__version__

'2.2.0a2.dev61'

In [3]:
import numcodecs
numcodecs.__version__

'0.4.1.dev3'

## API changes

Creation of an object array requires providing the new ``object_codec`` argument:

In [4]:
z = zarr.empty(10, chunks=5, dtype=object, object_codec=numcodecs.MsgPack())
z

<zarr.core.Array (10,) object>

To maintain backwards compatibility with previously-created data, the object codec is treated as a filter and inserted as the first filter in the chain:

In [5]:
z.info

0,1
Type,zarr.core.Array
Data type,object
Shape,"(10,)"
Chunk shape,"(5,)"
Order,C
Read-only,False
Filter [0],MsgPack(encoding='utf-8')
Compressor,"Blosc(cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=0)"
Store type,builtins.dict
No. bytes,80


In [6]:
# Store a mix of Python object types in the first chunk (indices 0-4);
# indices 5-9 are never assigned and read back as None in the output.
z[0] = 'foo'
z[1] = b'bar'  # msgpack gets this one wrong: the bytes value comes back as the str 'bar' (see output)
z[2] = 1
z[3] = [2, 4, 6, 'baz']
z[4] = {'a': 'b', 'c': 'd'}
# Read the whole array back to inspect what survived the encode/decode round trip.
a = z[:]
a

array(['foo', 'bar', 1, list([2, 4, 6, 'baz']), {'a': 'b', 'c': 'd'}, None,
       None, None, None, None], dtype=object)

If no ``object_codec`` is provided, a ``ValueError`` is raised:

In [7]:
z = zarr.empty(10, chunks=5, dtype=object)

ValueError: an object_codec is required for object arrays

If a user tries to subvert the system and create an object array with no object codec, a runtime check is added to ensure no object arrays are passed down to the compressor (which could lead to nasty errors and/or segfaults):

In [8]:
z = zarr.empty(10, chunks=5, dtype=object, object_codec=numcodecs.MsgPack())
z._filters = None  # try to live dangerously, manually wipe filters

In [9]:
z[0] = 'foo'

RuntimeError: cannot write object array without object codec

Here is another way to subvert the system, wiping filters **after** storing some data. To cover this case a runtime check is added to ensure no object arrays are handled inappropriately during decoding (which could lead to nasty errors and/or segfaults).

In [10]:
from numcodecs.tests.common import greetings
z = zarr.array(greetings, chunks=5, dtype=object, object_codec=numcodecs.MsgPack())
z[:]

array(['¡Hola mundo!', 'Hej Världen!', 'Servus Woid!', 'Hei maailma!',
       'Xin chào thế giới', 'Njatjeta Botë!', 'Γεια σου κόσμε!', 'こんにちは世界',
       '世界，你好！', 'Helló, világ!', 'Zdravo svete!', 'เฮลโลเวิลด์'], dtype=object)

In [11]:
z._filters = []  # try to live dangerously, manually wipe filters
z[:]

RuntimeError: cannot read object array without object codec

## Object codec benchmarks

In [17]:
# One million greetings drawn at random and stored as an object array.
# No random seed is set here, so the exact sample differs between runs.
# NOTE: `a` is read as a global by benchmark_codec and by FastParquetCodec.decode,
# so this cell has to be run before either of them is called.
a = np.random.choice(greetings, size=1000000).astype(object)
a

array(['Hej Världen!', 'こんにちは世界', 'Servus Woid!', ..., 'Helló, világ!',
       'Zdravo svete!', 'เฮลโลเวิลด์'], dtype=object)

In [12]:
# Candidate object codecs, all benchmarked on the same array below.
msgpack_codec = numcodecs.MsgPack()
json_codec = numcodecs.JSON()
pickle_codec = numcodecs.Pickle()
# Categorize is handed the full list of greetings as its labels.
# astype='u1' is presumably one unsigned byte per item, which is consistent with
# the 1,000,000-byte encoded size reported in the benchmark output.
cat_codec = numcodecs.Categorize(greetings, dtype=object, astype='u1')

In [13]:
# Zstd compressors used by benchmark_codec to report how well each codec's
# output compresses further.
# NOTE(review): the positional argument is presumably the compression level
# (1, 5, 9), matching the "size (zstd N)" labels — confirm against numcodecs.
zstd1 = numcodecs.Zstd(1)
zstd5 = numcodecs.Zstd(5)
zstd9 = numcodecs.Zstd(9)

In [14]:
def benchmark_codec(codec):
    print(codec)
    print('encode')
    %timeit codec.encode(a)
    enc = codec.encode(a)
    print('decode')
    %timeit codec.decode(enc)
    print('size         : {:,}'.format(len(enc)))
    print('size (zstd 1): {:,}'.format(len(zstd1.encode(enc))))
    print('size (zstd 5): {:,}'.format(len(zstd5.encode(enc))))
    print('size (zstd 9): {:,}'.format(len(zstd9.encode(enc))))
    

In [16]:
benchmark_codec(msgpack_codec)

MsgPack(encoding='utf-8')
encode
121 ms ± 1.66 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
decode
219 ms ± 3.27 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
size         : 18,999,255
size (zstd 1): 1,575,652
size (zstd 5): 1,409,429
size (zstd 9): 1,310,588


In [17]:
benchmark_codec(json_codec)

JSON(encoding='utf-8', allow_nan=True, check_circular=True, ensure_ascii=True,
     indent=None, separators=(',', ':'), skipkeys=False, sort_keys=True,
     strict=True)
encode
180 ms ± 8.71 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
decode
415 ms ± 3.97 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
size         : 33,334,342
size (zstd 1): 1,840,323
size (zstd 5): 1,675,332
size (zstd 9): 1,523,912


In [18]:
benchmark_codec(pickle_codec)

Pickle(protocol=4)
encode
240 ms ± 10.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
decode
204 ms ± 4.92 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
size         : 20,837,805
size (zstd 1): 1,607,658
size (zstd 5): 1,436,410
size (zstd 9): 1,334,162


In [19]:
benchmark_codec(cat_codec)

Categorize(dtype='|O', astype='|u1', labels=['¡Hola mundo!', 'Hej Världen!', 'Servus Woid!', ...])
encode
216 ms ± 6.39 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
decode
29 ms ± 221 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
size         : 1,000,000
size (zstd 1): 458,146
size (zstd 5): 493,686
size (zstd 9): 490,583


In [19]:
import fastparquet


class FastParquetCodec(numcodecs.abc.Codec):
    """Benchmark-only codec hacked together from fastparquet's utf8 speedups.

    Not a general-purpose codec: ``decode`` takes the item count from the
    notebook-level array ``a`` rather than from the encoded data.
    """

    codec_id = 'xxx-fastparquet'

    def encode(self, buf):
        """Run ``buf`` through ``array_encode_utf8`` and then ``pack_byte_array``."""
        utf8_items = fastparquet.speedups.array_encode_utf8(np.asanyarray(buf))
        return fastparquet.speedups.pack_byte_array(utf8_items.tolist())

    def decode(self, buf, out=None):
        """Unpack ``buf`` and decode it from utf8; fill and return ``out`` when given."""
        # Hack: the item count is taken from the global ``a`` for now,
        # just to get a sense of max speed.
        packed_items = fastparquet.speedups.unpack_byte_array(buf, a.size)
        dec = fastparquet.speedups.array_decode_utf8(np.array(packed_items, dtype=object))
        if out is None:
            return dec
        out[:] = dec
        return out
    

In [20]:
fp_codec = FastParquetCodec()

In [21]:
benchmark_codec(fp_codec)

FastParquetCodec()
encode
85.1 ms ± 1.34 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
decode
215 ms ± 3.61 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
size         : 21,832,801
size (zstd 1): 1,761,609
size (zstd 5): 1,545,885
size (zstd 9): 1,359,361
