In [None]:
import glob
import io

import pyarrow as pa
import pyarrow.parquet as pq

### 1. Read a single file that just fits in memory to do a data science task

Load the file's bytes into RAM first so that we profile only the Parquet decoding code and not disk access or the operating system's disk cache.

In [2]:
# Read the whole Parquet file into memory as raw bytes so the timed cell
# that follows decodes from RAM instead of touching the disk.
with open('../data/yellow_tripdata_2016-01.parquet', 'rb') as f:
    parquet_bytes = f.read()

In [3]:
%%timeit
reader = pa.BufferReader(parquet_bytes)
df = pq.read_table(reader).to_pandas()

1.08 s ± 79.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [4]:
# Same in-memory load for the `str_` variant of the file
# (presumably the same data with an added string column — confirm against
# the script that generated the data files).
with open('../data/str_yellow_tripdata_2016-01.parquet', 'rb') as f:
    parquet_bytes = f.read()

In [5]:
%%timeit
reader = pa.BufferReader(parquet_bytes)
df = pq.read_table(reader).to_pandas()

3.68 s ± 75.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [6]:
# Same in-memory load for the `cat_` variant of the file; the timed cell
# that follows reads its 'str' column as a categorical.
with open('../data/cat_yellow_tripdata_2016-01.parquet', 'rb') as f:
    parquet_bytes = f.read()

In [7]:
%%timeit
reader = pa.BufferReader(parquet_bytes)
df = pq.read_table(reader).to_pandas(categories=['str'])

1.45 s ± 8.24 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


### 2. Read multiple files (e.g. for online algorithms)

In [8]:
# Collect every file matching yellow_tripdata_2016-*.parquet in the data
# directory. glob returns matches in a filesystem-dependent order, so sort
# them to make the iteration order of the benchmark below reproducible.
files = sorted(glob.glob('../data/yellow_tripdata_2016-*.parquet'))

In [9]:
%%timeit
for f in files:
    df = pq.read_table(f).to_pandas()
    # Here you would normally update e.g. your online algorithm.
    # We skip any work here as we only want to measure I/O time.

7.1 s ± 74.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


### 3. Store/Checkpoint your current state

In [10]:
# Load the plain file into a pandas DataFrame; this is the state that the
# timed cell below serializes back to Parquet.
df = pq.read_table('../data/yellow_tripdata_2016-01.parquet').to_pandas()

In [11]:
%%timeit
buf = io.BytesIO()
df.to_parquet(buf, engine='pyarrow')

5.66 s ± 70.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [12]:
# Load the `str_` variant of the file into a pandas DataFrame for the
# write benchmark in the next cell.
df = pq.read_table('../data/str_yellow_tripdata_2016-01.parquet').to_pandas()

In [13]:
%%timeit
buf = io.BytesIO()
df.to_parquet(buf, engine='pyarrow')

10.1 s ± 95.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [14]:
# Load the `cat_` variant, asking pyarrow to return the 'str' column as a
# pandas categorical, for the write benchmark in the next cell.
df = pq.read_table('../data/cat_yellow_tripdata_2016-01.parquet').to_pandas(categories=["str"])

In [15]:
%%timeit
buf = io.BytesIO()
df.to_parquet(buf, engine='pyarrow')

6.56 s ± 88.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
