## Setup

In [1]:
import numpy as np
import numcodecs
numcodecs.__version__

'0.9.1'

## Unicode string benchmarks

In [2]:
import fastparquet


class FastParquetCodec(numcodecs.abc.Codec):
    """Benchmark-only codec wrapping the ``fastparquet.speedups`` UTF-8 helpers.

    This is a hack rather than a well-formed codec: ``encode`` returns an
    ``(n, enc)`` tuple instead of a single buffer, and ``decode`` expects that
    same tuple back.
    """

    codec_id = 'xxx-fastparquet'

    def encode(self, buf):
        """Return ``(n, enc)`` for ``buf``.

        ``n`` is the number of items in ``buf`` and ``enc`` is the output of
        ``pack_byte_array`` applied to the ``array_encode_utf8`` result.
        """
        items = np.asanyarray(buf)
        utf8_items = fastparquet.speedups.array_encode_utf8(items)
        packed = fastparquet.speedups.pack_byte_array(utf8_items.tolist())
        # hack for now: the item count is returned next to the packed data
        # because decode needs it to unpack.
        return items.size, packed

    def decode(self, data, out=None):
        """Reverse ``encode``.

        ``data`` is the ``(n, enc)`` tuple produced by ``encode``. When ``out``
        is given, the decoded items are assigned into it and ``out`` is
        returned; otherwise the decoded result is returned directly.
        """
        count, packed = data
        unpacked = fastparquet.speedups.unpack_byte_array(packed, count)
        decoded = fastparquet.speedups.array_decode_utf8(np.array(unpacked, dtype=object))
        if out is None:
            return decoded
        out[:] = decoded
        return out

In [3]:
# Zstd compressors at levels 1, 5 and 9; benchmark_codec uses them to report
# how large each codec's output is after compression.
zstd1, zstd5, zstd9 = (numcodecs.Zstd(level) for level in (1, 5, 9))


def benchmark_codec(codec, a):
    print(codec)
    print('encode')
    %timeit codec.encode(a)
    enc = codec.encode(a)
    print('decode')
    %timeit codec.decode(enc)
    if isinstance(codec, FastParquetCodec):
        enc = enc[1]  # hack
    print('size         : {:,}'.format(len(enc)))
    print('size (zstd 1): {:,}'.format(len(zstd1.encode(enc))))
    print('size (zstd 5): {:,}'.format(len(zstd5.encode(enc))))
    print('size (zstd 9): {:,}'.format(len(zstd9.encode(enc))))


In [4]:
from numcodecs.tests.common import greetings
# General-purpose object serializers.
json_codec = numcodecs.JSON()
msgpack_codec = numcodecs.MsgPack()
pickle_codec = numcodecs.Pickle()

# String-oriented codecs; Categorize is given the full list of greetings as
# its labels and stores the encoded values as 'u1'.
vlen_codec = numcodecs.VLenUTF8()
cat_codec = numcodecs.Categorize(greetings, dtype=object, astype='u1')

# The fastparquet-based hack codec defined earlier in this notebook.
fp_codec = FastParquetCodec()

### Greetings benchmark

In [5]:
# Fix the global NumPy seed so the same sample is drawn on every run.
np.random.seed(42)
# One million greetings sampled from the `greetings` list, stored as an
# object array of Python strings.
data = np.random.choice(greetings, size=1000000).astype(object)
data

array(['Γεια σου κόσμε!', 'Hei maailma!', 'Zdravo svete!', ...,
       'Servus Woid!', 'เฮลโลเวิลด์', 'Zdravo svete!'], dtype=object)

In [6]:
%time enc = vlen_codec.encode(data)

CPU times: user 104 ms, sys: 32.9 ms, total: 137 ms
Wall time: 137 ms


In [7]:
%time dec = vlen_codec.decode(enc)

CPU times: user 172 ms, sys: 30.1 ms, total: 202 ms
Wall time: 202 ms


In [8]:
benchmark_codec(msgpack_codec, data)

MsgPack(raw=False, use_bin_type=True, use_single_float=False)
encode
83.5 ms ± 3.15 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
decode
288 ms ± 6.58 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
size         : 18,913,397
size (zstd 1): 1,529,314
size (zstd 5): 1,405,819
size (zstd 9): 1,178,324


In [9]:
benchmark_codec(json_codec, data)

JSON(encoding='utf-8', allow_nan=True, check_circular=True, ensure_ascii=True,
     indent=None, separators=(',', ':'), skipkeys=False, sort_keys=True,
     strict=True)
encode
291 ms ± 6.69 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
decode
420 ms ± 11.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
size         : 33,322,605
size (zstd 1): 1,840,791
size (zstd 5): 1,675,175
size (zstd 9): 1,360,789


In [10]:
benchmark_codec(pickle_codec, data)

Pickle(protocol=5)
encode
285 ms ± 7.03 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
decode
273 ms ± 6.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
size         : 20,835,273
size (zstd 1): 1,565,100
size (zstd 5): 1,435,771
size (zstd 9): 1,204,419


In [11]:
benchmark_codec(cat_codec, data)

Categorize(dtype='|O', astype='|u1', labels=['¡Hola mundo!', 'Hej Världen!', 'Servus Woid!', ...])
encode
263 ms ± 10.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
decode
46.7 ms ± 2.79 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
size         : 1,000,000
size (zstd 1): 458,196
size (zstd 5): 490,680
size (zstd 9): 490,487


In [12]:
benchmark_codec(vlen_codec, data)

VLenUTF8()
encode
125 ms ± 14.3 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
decode
234 ms ± 18.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
size         : 21,830,275
size (zstd 1): 1,762,783
size (zstd 5): 1,546,616
size (zstd 9): 1,216,314


In [13]:
benchmark_codec(fp_codec, data)

FastParquetCodec()
encode
118 ms ± 3.43 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
decode
325 ms ± 31.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
size         : 21,830,271
size (zstd 1): 1,762,787
size (zstd 5): 1,546,612
size (zstd 9): 1,216,426


### Lorem benchmark

In [14]:
from faker import Faker
fake = Faker()

In [15]:
# Seed Faker's shared random generator so the lorem data set is reproducible,
# in line with the fixed NumPy seed used for the other data sets here.
Faker.seed(42)
# Join 200,000 fake sentences and split on whitespace to get an object array
# of individual words.
data2 = np.array(' '.join(fake.sentences(nb=200000)).split(), dtype=object)
len(data2), data2[:10]

(1101020,
 array(['Most', 'top', 'magazine', 'bed', 'successful.', 'Center',
        'exactly', 'and', 'hour', 'wide'], dtype=object))

In [16]:
benchmark_codec(msgpack_codec, data2)

MsgPack(raw=False, use_bin_type=True, use_single_float=False)
encode
76.9 ms ± 602 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
decode
191 ms ± 21.7 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
size         : 7,399,529
size (zstd 1): 3,308,650
size (zstd 5): 2,706,469
size (zstd 9): 2,697,930


In [17]:
benchmark_codec(json_codec, data2)

JSON(encoding='utf-8', allow_nan=True, check_circular=True, ensure_ascii=True,
     indent=None, separators=(',', ':'), skipkeys=False, sort_keys=True,
     strict=True)
encode
226 ms ± 18.7 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
decode
271 ms ± 29.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
size         : 9,601,571
size (zstd 1): 2,895,764
size (zstd 5): 2,713,287
size (zstd 9): 2,681,954


In [18]:
benchmark_codec(pickle_codec, data2)

Pickle(protocol=5)
encode
342 ms ± 36 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
decode
193 ms ± 6.01 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
size         : 9,607,048
size (zstd 1): 3,052,431
size (zstd 5): 2,754,370
size (zstd 9): 2,828,895


In [19]:
benchmark_codec(vlen_codec, data2)

VLenUTF8()
encode
116 ms ± 6.88 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
decode
140 ms ± 13.4 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
size         : 10,702,579
size (zstd 1): 3,638,377
size (zstd 5): 3,458,719
size (zstd 9): 3,023,123


In [20]:
benchmark_codec(fp_codec, data2)

FastParquetCodec()
encode
126 ms ± 8.51 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
decode
211 ms ± 14.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
size         : 10,702,575
size (zstd 1): 3,638,601
size (zstd 5): 3,458,675
size (zstd 9): 3,022,583


## Byte string benchmarks

In [21]:
vlen_bytes_codec = numcodecs.VLenBytes()

In [22]:
# Same seed as the unicode greetings benchmark, so the sampled sequence of
# greetings matches that data set.
np.random.seed(42)
# UTF-8 encode every greeting to get the pool of byte strings to sample from.
greetings_bytes = [g.encode('utf-8') for g in greetings]
# One million byte strings, stored as an object array.
data3 = np.random.choice(greetings_bytes, size=1000000).astype(object)
data3

array([b'\xce\x93\xce\xb5\xce\xb9\xce\xb1 \xcf\x83\xce\xbf\xcf\x85 \xce\xba\xcf\x8c\xcf\x83\xce\xbc\xce\xb5!',
       b'Hei maailma!', b'Zdravo svete!', ..., b'Servus Woid!',
       b'\xe0\xb9\x80\xe0\xb8\xae\xe0\xb8\xa5\xe0\xb9\x82\xe0\xb8\xa5\xe0\xb9\x80\xe0\xb8\xa7\xe0\xb8\xb4\xe0\xb8\xa5\xe0\xb8\x94\xe0\xb9\x8c',
       b'Zdravo svete!'], dtype=object)

In [23]:
vlen_bytes_codec.decode(vlen_bytes_codec.encode(data3))

array([b'\xce\x93\xce\xb5\xce\xb9\xce\xb1 \xcf\x83\xce\xbf\xcf\x85 \xce\xba\xcf\x8c\xcf\x83\xce\xbc\xce\xb5!',
       b'Hei maailma!', b'Zdravo svete!', ..., b'Servus Woid!',
       b'\xe0\xb9\x80\xe0\xb8\xae\xe0\xb8\xa5\xe0\xb9\x82\xe0\xb8\xa5\xe0\xb9\x80\xe0\xb8\xa7\xe0\xb8\xb4\xe0\xb8\xa5\xe0\xb8\x94\xe0\xb9\x8c',
       b'Zdravo svete!'], dtype=object)

In [24]:
benchmark_codec(pickle_codec, data3)

Pickle(protocol=5)
encode
253 ms ± 9.93 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
decode
111 ms ± 1.78 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
size         : 20,835,273
size (zstd 1): 1,565,112
size (zstd 5): 1,435,770
size (zstd 9): 1,204,445


In [25]:
benchmark_codec(vlen_bytes_codec, data3)

VLenBytes()
encode
32.3 ms ± 418 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
decode
71.5 ms ± 1.72 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
size         : 21,830,275
size (zstd 1): 1,762,783
size (zstd 5): 1,546,616
size (zstd 9): 1,216,314


## Array benchmarks

In [26]:
# Fix the global NumPy seed so the same arrays are generated on every run.
np.random.seed(42)
# 100,000 variable-length int32 arrays: each length is drawn from [0, 20) and
# the values from [0, 100). Both draws come from the same seeded generator, so
# the order of the two randint calls determines the data.
data4 = np.array([np.random.randint(0, 100, size=np.random.randint(0, 20)).astype('i4')
                  for i in range(100000)], dtype=object)
data4

array([array([51, 92, 14, 71, 60, 20], dtype=int32),
       array([82, 86, 74, 74, 87, 99], dtype=int32),
       array([23,  2, 21, 52,  1, 87, 29], dtype=int32), ...,
       array([19, 62, 18], dtype=int32),
       array([93, 20,  7, 50], dtype=int32), array([51, 28], dtype=int32)],
      dtype=object)

In [27]:
vlen_arr_codec = numcodecs.VLenArray('<i4')

In [28]:
vlen_arr_codec.decode(vlen_arr_codec.encode(data4))

array([array([51, 92, 14, 71, 60, 20], dtype=int32),
       array([82, 86, 74, 74, 87, 99], dtype=int32),
       array([23,  2, 21, 52,  1, 87, 29], dtype=int32), ...,
       array([19, 62, 18], dtype=int32),
       array([93, 20,  7, 50], dtype=int32), array([51, 28], dtype=int32)],
      dtype=object)

In [29]:
benchmark_codec(vlen_arr_codec, data4)

VLenArray(dtype='<i4')
encode
27.5 ms ± 1.15 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
decode
58.2 ms ± 1.31 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
size         : 4,195,540
size (zstd 1): 1,299,769
size (zstd 5): 1,119,369
size (zstd 9): 1,196,642


In [30]:
benchmark_codec(pickle_codec, data4)

Pickle(protocol=5)
encode
313 ms ± 8.21 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
decode
141 ms ± 3.63 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
size         : 6,296,822
size (zstd 1): 1,619,421
size (zstd 5): 1,507,086
size (zstd 9): 1,493,343
