In [1]:
import numpy as np
import numcodecs
numcodecs.__version__

'0.4.2.dev0+dirty'

In [2]:
from numcodecs.tests.common import greetings
greetings

['¡Hola mundo!',
 'Hej Världen!',
 'Servus Woid!',
 'Hei maailma!',
 'Xin chào thế giới',
 'Njatjeta Botë!',
 'Γεια σου κόσμε!',
 'こんにちは世界',
 '世界，你好！',
 'Helló, világ!',
 'Zdravo svete!',
 'เฮลโลเวิลด์']

In [3]:
# Seed NumPy's global RNG so the random sample drawn below is repeatable.
np.random.seed(42)
# Draw 1,000,000 entries from `greetings` and store them as an object-dtype
# array of Python strings (the input used for the first round of benchmarks).
data = np.random.choice(greetings, size=1000000).astype(object)
data

array(['Γεια σου κόσμε!', 'Hei maailma!', 'Zdravo svete!', ...,
       'Servus Woid!', 'เฮลโลเวิลด์', 'Zdravo svete!'], dtype=object)

In [4]:
# Zstd compressors at levels 1, 5 and 9. benchmark_codec reads these as
# module-level globals to report how far each codec's encoded output can be
# compressed afterwards.
zstd1 = numcodecs.Zstd(1)
zstd5 = numcodecs.Zstd(5)
zstd9 = numcodecs.Zstd(9)


def benchmark_codec(codec, a):
    print(codec)
    print('encode')
    %timeit codec.encode(a)
    enc = codec.encode(a)
    print('decode')
    %timeit codec.decode(enc)
    print('size         : {:,}'.format(len(enc)))
    print('size (zstd 1): {:,}'.format(len(zstd1.encode(enc))))
    print('size (zstd 5): {:,}'.format(len(zstd5.encode(enc))))
    print('size (zstd 9): {:,}'.format(len(zstd9.encode(enc))))


In [5]:
# Object codecs compared in the benchmark cells that follow.
msgpack_codec = numcodecs.MsgPack()
json_codec = numcodecs.JSON()
pickle_codec = numcodecs.Pickle()
# Categorize is given the full `greetings` list as its labels and encodes to
# 'u1', so it is only applicable to data drawn from that fixed label set.
cat_codec = numcodecs.Categorize(greetings, dtype=object, astype='u1')
vlen_codec = numcodecs.VLenUTF8()

In [6]:
benchmark_codec(msgpack_codec, data)

MsgPack(encoding='utf-8')
encode
130 ms ± 6.53 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
decode
240 ms ± 7.65 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
size         : 18,996,503
size (zstd 1): 1,576,435
size (zstd 5): 1,409,320
size (zstd 9): 1,310,380


In [7]:
benchmark_codec(json_codec, data)

JSON(encoding='utf-8', allow_nan=True, check_circular=True, ensure_ascii=True,
     indent=None, separators=(',', ':'), skipkeys=False, sort_keys=True,
     strict=True)
encode
184 ms ± 16.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
decode
469 ms ± 44.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
size         : 33,322,595
size (zstd 1): 1,840,414
size (zstd 5): 1,675,163
size (zstd 9): 1,522,853


In [8]:
benchmark_codec(pickle_codec, data)

Pickle(protocol=4)
encode
304 ms ± 58.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
decode
225 ms ± 10.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
size         : 20,835,275
size (zstd 1): 1,608,227
size (zstd 5): 1,436,093
size (zstd 9): 1,333,676


In [9]:
benchmark_codec(cat_codec, data)

Categorize(dtype='|O', astype='|u1', labels=['¡Hola mundo!', 'Hej Världen!', 'Servus Woid!', ...])
encode
286 ms ± 58.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
decode
36.7 ms ± 4.13 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
size         : 1,000,000
size (zstd 1): 458,191
size (zstd 5): 493,638
size (zstd 9): 490,483


In [6]:
benchmark_codec(vlen_codec, data)

VLenUTF8()
encode
78.2 ms ± 1.64 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
decode
155 ms ± 202 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
size         : 21,830,279
size (zstd 1): 1,762,828
size (zstd 5): 1,546,671
size (zstd 9): 1,358,818


In [7]:
from faker import Faker
fake = Faker()

In [8]:
# Second dataset: generate 200,000 fake sentences, join them with spaces and
# split on whitespace, giving a flat object array of word tokens (punctuation
# stays attached to the words).
# NOTE(review): no Faker seed is set anywhere visible in this notebook, and
# np.random.seed above presumably does not control Faker -- so this array (and
# the sizes measured on it) may differ between runs; confirm and seed if
# reproducibility matters.
data2 = np.array(' '.join(fake.sentences(nb=200000)).split(), dtype=object)
# Show the number of tokens and the first ten of them.
len(data2), data2[:10]

(1101578,
 array(['Reprehenderit', 'labore', 'numquam.', 'Labore', 'quod', 'fugit.',
        'Quisquam', 'reprehenderit', 'libero', 'corrupti'], dtype=object))

In [13]:
benchmark_codec(msgpack_codec, data2)

MsgPack(encoding='utf-8')
encode
105 ms ± 7.36 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
decode
170 ms ± 4.01 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
size         : 8,380,619
size (zstd 1): 2,631,293
size (zstd 5): 2,358,494
size (zstd 9): 2,265,121


In [14]:
benchmark_codec(json_codec, data2)

JSON(encoding='utf-8', allow_nan=True, check_circular=True, ensure_ascii=True,
     indent=None, separators=(',', ':'), skipkeys=False, sort_keys=True,
     strict=True)
encode
121 ms ± 2.65 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
decode
141 ms ± 461 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
size         : 10,584,987
size (zstd 1): 2,547,087
size (zstd 5): 2,438,584
size (zstd 9): 2,312,637


In [15]:
benchmark_codec(pickle_codec, data2)

Pickle(protocol=4)
encode
242 ms ± 7.83 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
decode
138 ms ± 11.6 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
size         : 10,593,806
size (zstd 1): 2,584,223
size (zstd 5): 2,456,961
size (zstd 9): 2,455,358


In [9]:
benchmark_codec(vlen_codec, data2)

VLenUTF8()
encode
53.3 ms ± 945 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
decode
82.7 ms ± 232 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
size         : 11,682,782
size (zstd 1): 2,959,017
size (zstd 5): 2,860,869
size (zstd 9): 2,557,625
