## Setup

In [1]:
import numpy as np
import numcodecs
# Show the numcodecs version this notebook is running against.
numcodecs.__version__

'0.4.2.dev6+dirty'

In [2]:
import fastparquet


class FastParquetCodec(numcodecs.abc.Codec):
    """Benchmark-only hack: a codec built on fastparquet's utf8 speedups.

    Unlike a regular codec, ``encode`` returns an ``(n, payload)`` tuple and
    ``decode`` expects that same tuple back, because the item count is
    passed to ``unpack_byte_array`` when unpacking the payload.
    """

    codec_id = 'xxx-fastparquet'

    def encode(self, buf):
        """Return ``(item_count, packed)`` for the array-like ``buf``."""
        items = np.asanyarray(buf)
        utf8_items = fastparquet.speedups.array_encode_utf8(items)
        packed = fastparquet.speedups.pack_byte_array(utf8_items.tolist())
        # hack for now: hand the item count back alongside the packed data
        return items.size, packed

    def decode(self, data, out=None):
        """Invert ``encode``; fill and return ``out`` when it is given."""
        item_count, packed = data
        unpacked = fastparquet.speedups.unpack_byte_array(packed, item_count)
        decoded = fastparquet.speedups.array_decode_utf8(
            np.array(unpacked, dtype=object)
        )
        if out is None:
            return decoded
        out[:] = decoded
        return out

In [3]:
# Zstd compressors at levels 1, 5 and 9, used by benchmark_codec to report
# how well each codec's encoded output compresses further.
zstd1, zstd5, zstd9 = (numcodecs.Zstd(level) for level in (1, 5, 9))


def benchmark_codec(codec, a):
    print(codec)
    print('encode')
    %timeit codec.encode(a)
    enc = codec.encode(a)
    print('decode')
    %timeit codec.decode(enc)
    if isinstance(codec, FastParquetCodec):
        enc = enc[1]  # hack
    print('size         : {:,}'.format(len(enc)))
    print('size (zstd 1): {:,}'.format(len(zstd1.encode(enc))))
    print('size (zstd 5): {:,}'.format(len(zstd5.encode(enc))))
    print('size (zstd 9): {:,}'.format(len(zstd9.encode(enc))))


In [4]:
from numcodecs.tests.common import greetings
# General-purpose object serializers.
pickle_codec = numcodecs.Pickle()
json_codec = numcodecs.JSON()
msgpack_codec = numcodecs.MsgPack()

# String-specific codecs; Categorize maps each greeting label to a u1 code.
vlen_codec = numcodecs.VLenUTF8()
cat_codec = numcodecs.Categorize(labels=greetings, dtype=object, astype='u1')
fp_codec = FastParquetCodec()

## Greetings benchmark

In [5]:
# Fix NumPy's global seed so the same sample is drawn on every run.
np.random.seed(42)
# One million entries sampled from `greetings`, cast to object dtype.
data = np.random.choice(greetings, size=1000000).astype(object)
data

array(['Γεια σου κόσμε!', 'Hei maailma!', 'Zdravo svete!', ...,
       'Servus Woid!', 'เฮลโลเวิลด์', 'Zdravo svete!'], dtype=object)

In [6]:
# MsgPack codec on the greetings sample.
benchmark_codec(msgpack_codec, data)

MsgPack(encoding='utf-8')
encode
122 ms ± 823 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
decode
226 ms ± 3.14 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
size         : 18,996,503
size (zstd 1): 1,576,435
size (zstd 5): 1,409,320
size (zstd 9): 1,310,380


In [7]:
# JSON codec on the greetings sample.
benchmark_codec(json_codec, data)

JSON(encoding='utf-8', allow_nan=True, check_circular=True, ensure_ascii=True,
     indent=None, separators=(',', ':'), skipkeys=False, sort_keys=True,
     strict=True)
encode
178 ms ± 4.09 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
decode
422 ms ± 4.02 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
size         : 33,322,595
size (zstd 1): 1,840,414
size (zstd 5): 1,675,163
size (zstd 9): 1,522,853


In [8]:
# Pickle codec on the greetings sample.
benchmark_codec(pickle_codec, data)

Pickle(protocol=4)
encode
238 ms ± 25.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
decode
212 ms ± 3.05 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
size         : 20,835,275
size (zstd 1): 1,608,227
size (zstd 5): 1,436,093
size (zstd 9): 1,333,676


In [9]:
# Categorize codec (labels = greetings, encoded as u1) on the greetings sample.
benchmark_codec(cat_codec, data)

Categorize(dtype='|O', astype='|u1', labels=['¡Hola mundo!', 'Hej Världen!', 'Servus Woid!', ...])
encode
221 ms ± 10.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
decode
34.8 ms ± 11 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
size         : 1,000,000
size (zstd 1): 458,191
size (zstd 5): 493,638
size (zstd 9): 490,483


In [10]:
# VLenUTF8 codec on the greetings sample.
benchmark_codec(vlen_codec, data)

VLenUTF8()
encode
23.3 ms ± 962 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
decode
157 ms ± 268 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
size         : 21,830,287
size (zstd 1): 1,762,831
size (zstd 5): 1,546,643
size (zstd 9): 1,358,821


In [11]:
# Hacked fastparquet-based codec on the greetings sample.
benchmark_codec(fp_codec, data)

FastParquetCodec()
encode
96.9 ms ± 15.3 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
decode
227 ms ± 6.01 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
size         : 21,830,271
size (zstd 1): 1,762,809
size (zstd 5): 1,546,612
size (zstd 9): 1,358,813


## Lorem benchmark

In [12]:
from faker import Faker
# Faker instance whose sentences() output supplies the words for data2.
fake = Faker()

In [13]:
# Seed this Faker instance so the generated words, and therefore every size
# reported for data2, are the same on each run, mirroring the fixed NumPy
# seed used for the greetings data.
fake.seed_instance(42)
# Join the generated sentences and split on whitespace to get one word per
# element, stored with object dtype.
data2 = np.array(' '.join(fake.sentences(nb=200000)).split(), dtype=object)
len(data2), data2[:10]

(1100827, array(['Est', 'excepturi', 'animi', 'velit', 'vero', 'amet.', 'Modi',
        'voluptates', 'explicabo', 'provident'], dtype=object))

In [14]:
# MsgPack codec on the Faker-generated word array.
benchmark_codec(msgpack_codec, data2)

MsgPack(encoding='utf-8')
encode
99.4 ms ± 7.05 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
decode
141 ms ± 334 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
size         : 8,372,944
size (zstd 1): 2,627,063
size (zstd 5): 2,354,835
size (zstd 9): 2,262,203


In [15]:
# JSON codec on the Faker-generated word array.
benchmark_codec(json_codec, data2)

JSON(encoding='utf-8', allow_nan=True, check_circular=True, ensure_ascii=True,
     indent=None, separators=(',', ':'), skipkeys=False, sort_keys=True,
     strict=True)
encode
120 ms ± 1.31 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
decode
144 ms ± 12.9 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
size         : 10,574,596
size (zstd 1): 2,544,155
size (zstd 5): 2,435,165
size (zstd 9): 2,310,001


In [16]:
# Pickle codec on the Faker-generated word array.
benchmark_codec(pickle_codec, data2)

Pickle(protocol=4)
encode
235 ms ± 7.98 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
decode
133 ms ± 577 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
size         : 10,571,711
size (zstd 1): 2,581,019
size (zstd 5): 2,454,825
size (zstd 9): 2,452,222


In [17]:
# VLenUTF8 codec on the Faker-generated word array.
benchmark_codec(vlen_codec, data2)

VLenUTF8()
encode
21.2 ms ± 1.05 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
decode
88.4 ms ± 6.08 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
size         : 11,675,433
size (zstd 1): 2,956,604
size (zstd 5): 2,858,397
size (zstd 9): 2,554,779


In [18]:
# Hacked fastparquet-based codec on the Faker-generated word array.
benchmark_codec(fp_codec, data2)

FastParquetCodec()
encode
86.5 ms ± 8.32 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
decode
151 ms ± 3.29 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
size         : 11,675,417
size (zstd 1): 2,955,903
size (zstd 5): 2,857,882
size (zstd 9): 2,555,282
