In [1]:
import numpy as np
import pyarrow as pa
import pyarrow.parquet as pq
import pandas as pd
from pandas.util.testing import rands
        
# Benchmark configuration.
NUNIQUE = 1000                 # number of distinct strings (dictionary size)
STRING_SIZE = 50               # characters per generated string
LENGTH = 10_000_000            # number of rows in the benchmark column
REPEATS = LENGTH // NUNIQUE    # expected occurrences of each distinct string
SEED = 42                      # fixed seed so every run benchmarks the same data

# Seed NumPy's global RNG before drawing anything, so that a fresh
# "Restart Kernel -> Run All" regenerates the same `indices` every time.
np.random.seed(SEED)

# NOTE(review): `rands` is imported from pandas.util.testing, which newer
# pandas releases no longer ship -- confirm the pinned pandas version. It is
# presumed to draw from NumPy's global RNG (and so to be covered by the seed
# above); verify.
uniques = np.array([rands(STRING_SIZE) for _ in range(NUNIQUE)], dtype='O')

# One int32 dictionary index per row, drawn uniformly from [0, NUNIQUE).
indices = np.random.randint(0, NUNIQUE, size=LENGTH).astype('i4')

# Dense (fully materialised) object array: one string reference per row.
data = uniques.take(indices)

In [2]:
import gc
class memory_use:
    """Context manager that reports Arrow memory usage changes over a block.

    On exit it prints two numbers, both relative to the moment the ``with``
    block was entered:

    * the change in ``pa.total_allocated_bytes()``;
    * the change in the default memory pool's ``max_memory()`` (its peak).

    Usage::

        with memory_use():
            ...  # code to measure
    """

    def __init__(self):
        self.pool = pa.default_memory_pool()
        # Take an initial baseline so the attributes exist right after
        # construction; __enter__ refreshes them.
        self._snapshot()

    def _snapshot(self):
        """Record the current allocation and peak counters as the baseline."""
        self.start_use = pa.total_allocated_bytes()
        self.start_peak_use = self.pool.max_memory()

    def __enter__(self):
        # Re-baseline at entry: an instance created ahead of time (or reused
        # for several blocks) must not count allocations made before the
        # measured block starts.
        self._snapshot()
        # Return the tracker so `with memory_use() as m:` binds something useful.
        return self

    def __exit__(self, type, value, traceback):
        # Run a garbage collection pass before reading the counters, so
        # objects that are already unreachable do not inflate the reading.
        gc.collect()
        print("Change in memory use: {}"
              .format(pa.total_allocated_bytes() - self.start_use))
        print("Change in peak use: {}"
              .format(self.pool.max_memory() - self.start_peak_use))

In [3]:
# Build a dictionary-encoded Arrow array: `indices` holds, per row, the
# position of its value inside the `uniques` dictionary.
dict_data = pa.DictionaryArray.from_arrays(indices, uniques)

In [4]:
# Peak allocation reported so far by the default Arrow memory pool, displayed
# for reference before the Parquet write is measured.
pa.default_memory_pool().max_memory()

72320

In [5]:
# Single-column table ('f0') holding the dictionary-encoded array.
table = pa.table([dict_data], names=['f0'])
# Write the table as Parquet into an in-memory stream; memory_use prints the
# change in allocated and peak bytes when the block exits.
with memory_use():
    out_stream = pa.BufferOutputStream()
    pq.write_table(table, out_stream)
    # Keep the written Parquet data; later cells read it back from `contents`.
    contents = out_stream.getvalue()

Change in memory use: 16777216
Change in peak use: 753475648


In [6]:
%%timeit
# Time the in-memory Parquet write of the current `table`
# (the dictionary-encoded one when the notebook is run top to bottom).
out_stream = pa.BufferOutputStream()
pq.write_table(table, out_stream)
contents = out_stream.getvalue()

820 ms ± 11.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [7]:
# Length of the serialized Parquet output held in `contents`.
len(contents)

12576182

In [8]:
# Time reading the Parquet data back with default read options.
%timeit returned_table = pq.read_table(contents)

495 ms ± 8.04 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [9]:
# Same read, but naming column 'f0' in `read_dictionary` so that it is
# requested as a dictionary-encoded column.
%timeit returned_table = pq.read_table(contents, read_dictionary=['f0'])

93.1 ms ± 3.12 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [10]:
# Cast the dictionary-encoded array to a plain utf8 string array.
dense_data = dict_data.cast(pa.utf8())
# NOTE: this rebinds `table`; every later cell that uses `table` now works on
# the dense string column instead of the dictionary-encoded one.
table = pa.table([dense_data], names=['f0'])

In [11]:
%%timeit
# Time the in-memory Parquet write of the current `table`
# (the dense string table once the preceding cell has run).
out_stream = pa.BufferOutputStream()
pq.write_table(table, out_stream)
contents = out_stream.getvalue()

405 ms ± 27.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [12]:
# Repeat the write outside %%timeit: names assigned inside a %%timeit cell are
# not kept in the notebook namespace, so this is what rebinds `contents` to
# the Parquet data of the dense table for the cells below.
out_stream = pa.BufferOutputStream()
pq.write_table(table, out_stream)
contents = out_stream.getvalue()

In [13]:
%%timeit
# Time reading back the Parquet data currently held in `contents`.
returned_table = pq.read_table(contents)

430 ms ± 8.12 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [14]:
# Display the table read back from `contents` to inspect its column type.
pq.read_table(contents)

pyarrow.Table
f0: string