In [5]:
import numpy as np
import pyarrow as pa
import pyarrow.parquet as pq
import pandas as pd
from pandas.util.testing import rands
        
# Benchmark parameters: a small pool of distinct strings tiled many times.
NUNIQUE = 1000
STRING_SIZE = 50
LENGTH = 10_000_000
REPEATS = LENGTH // NUNIQUE

# Build the single string column by repeating NUNIQUE random strings
# REPEATS times, then wrap it in a one-column table named 'f0'.
unique_strings = [rands(STRING_SIZE) for _ in range(NUNIQUE)]
data = unique_strings * REPEATS
table = pa.table({'f0': data})

# Write the table as Parquet into an in-memory stream and keep the
# resulting buffer; the later cells read it back repeatedly.
out_stream = pa.BufferOutputStream()
pq.write_table(table, out_stream)
contents = out_stream.getvalue()

In [6]:
# Length of the in-memory Parquet buffer produced by pq.write_table above
# (presumably its size in bytes -- confirm against the pyarrow Buffer docs).
len(contents)

1129939

In [12]:
import gc
class memory_use:
    """Context manager that reports the change in ``pa.total_allocated_bytes()``.

    On exit it runs ``gc.collect()``, computes the difference between the
    current ``pa.total_allocated_bytes()`` and the baseline, prints it, and
    stores it on ``self.delta`` so it can be reused without copying the
    printed number by hand.

    The baseline is sampled when the ``with`` block is entered, so work done
    between constructing the object and entering the block is not counted,
    and one instance can be reused for several measurements.

    Attributes:
        start_use: Baseline value of ``pa.total_allocated_bytes()``.
        delta: Difference measured by the most recent ``with`` block, or
            ``None`` if no block has finished yet.
    """

    def __init__(self):
        # Sampled here as well so ``start_use`` exists right after
        # construction, as it did before; ``__enter__`` refreshes it.
        self.start_use = pa.total_allocated_bytes()
        self.delta = None

    def __enter__(self):
        """Reset the baseline and return ``self`` for ``with ... as m`` use."""
        self.start_use = pa.total_allocated_bytes()
        self.delta = None
        return self

    def __exit__(self, type, value, traceback):
        """Collect garbage, then record and print the allocation difference.

        Returns ``None``, so an exception raised inside the block propagates.
        """
        # Drop unreachable Python objects first so buffers they still hold
        # do not inflate the figure.
        gc.collect()
        self.delta = pa.total_allocated_bytes() - self.start_use
        print(self.delta)

0


In [13]:
# Read the Parquet buffer with default options; on exit memory_use prints the
# change in pa.total_allocated_bytes(). Despite its name, memory_use_no_dict
# holds the value returned by pq.read_table, not a byte count.
with memory_use():
    memory_use_no_dict = pq.read_table(pa.BufferReader(contents))

541250112


In [15]:
# Same measurement, but passing read_dictionary=['f0'] (presumably keeps
# column 'f0' dictionary-encoded instead of expanding every string --
# confirm against the pq.read_table docs). memory_use_dict holds the value
# returned by pq.read_table, not a byte count.
with memory_use():
    memory_use_dict = pq.read_table(pa.BufferReader(contents), read_dictionary=['f0'])

41304128


In [16]:
# Time reading the Parquet buffer with default options (no read_dictionary).
%timeit memory_use_no_dict = pq.read_table(pa.BufferReader(contents))

1.79 s ± 7.86 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [17]:
# Time reading the same buffer with read_dictionary=['f0'] for comparison.
%timeit memory_use_dict = pq.read_table(pa.BufferReader(contents), read_dictionary=['f0'])

106 ms ± 11.2 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [18]:
# Byte count printed by the default read (In [13]) converted to MiB
# (1 << 20 == 1048576 bytes).
541250112 / (1 << 20)

516.1763305664062

In [19]:
# Byte count printed by the read_dictionary read (In [15]) converted to MiB
# (1 << 20 == 1048576 bytes).
41304128 / (1 << 20)

39.39068603515625