In [1]:
import numpy as np
import pyarrow as pa
import pyarrow.parquet as pq
import pandas as pd

Here is a campaign contribution dataset from the 2012 presidential election, stored as a Parquet file.

In [2]:
# Load the whole Parquet file into an in-memory Arrow table
fec = pq.read_table('fec-2012.parquet')

In [3]:
# Show the column names and Arrow types, plus any schema-level metadata
fec.schema

cmte_id: string
cand_id: string
cand_nm: string
contbr_nm: string
contbr_city: string
contbr_st: string
contbr_zip: int64
contbr_employer: string
contbr_occupation: string
contb_receipt_amt: double
contb_receipt_dt: string
receipt_desc: string
memo_cd: string
memo_text: string
form_tp: string
file_num: int64
metadata
--------
{b'pandas': b'{"index_columns": [], "column_indexes": [], "columns": [{"name":'
            b' "cmte_id", "field_name": "cmte_id", "pandas_type": "unicode", "'
            b'numpy_type": "object", "metadata": null}, {"name": "cand_id", "f'
            b'ield_name": "cand_id", "pandas_type": "unicode", "numpy_type": "'
            b'object", "metadata": null}, {"name": "cand_nm", "field_name": "c'
            b'and_nm", "pandas_type": "unicode", "numpy_type": "object", "meta'
            b'data": null}, {"name": "contbr_nm", "field_name": "contbr_nm", "'
            b'pandas_type": "unicode", "numpy_type": "object", "metadata": nul'
            b'l}, {"name": "cont

The extra metadata is a pandas-specific detail (it records that the file was produced from pandas), so it can be ignored.

In [4]:
# Baseline: bytes currently allocated from Arrow's default memory pool
# (taken right after loading the Parquet table, before any memory mapping)
pa.total_allocated_bytes()

180296064

I'm going to write 50 copies of the table end-to-end in a stream so we have an 8+ gigabyte file to work with

In [5]:
# Write the table 50 times back-to-back into a single Arrow IPC stream file.
# Both the file handle and the stream writer are managed by the `with`
# statement, so the writer is closed (and the stream properly terminated)
# before the file is closed, even if one of the writes raises part-way through.
with open('fec.arrow', 'wb') as f, pa.ipc.new_stream(f, fec.schema) as writer:
    for _ in range(50):  # the loop index is unused; only the repeat count matters
        writer.write_table(fec)

In [6]:
# List the working directory to check the size of the fec.arrow stream file
!ls -l

total 8718892
-rw-r--r-- 1 wesm wesm       8687 Sep  1 16:22 Demo1-MemoryMapping.ipynb
-rw-r--r-- 1 wesm wesm       9096 Sep  1 16:22 Demo2-Flight.ipynb
-rw-r--r-- 1 wesm wesm   26351036 Sep  1 16:22 fec-2012.parquet
-rw-r--r-- 1 wesm wesm 8901354244 Sep  4 10:32 fec.arrow
-rw-r--r-- 1 wesm wesm        359 Sep  4 10:31 README.md
-rw-r--r-- 1 wesm wesm     403260 Sep  4 10:30 slides.pdf


In [7]:
# Memory-map the stream file rather than reading its bytes into process memory.
# NOTE(review): `mmap` is also the name of a standard-library module, and `f`
# rebinds the name used for the output file handle in the write cell; consider
# more specific names if this notebook grows.
mmap = pa.memory_map('fec.arrow')
# Open an IPC stream reader on top of the memory map
f = pa.ipc.open_stream(mmap)

Now we're going to "parse" the stream to obtain Arrow data structures referencing the memory map

In [8]:
%%time
# Read every record batch from the stream reader and assemble them into one table
t = f.read_all()

CPU times: user 724 ms, sys: 59.4 ms, total: 783 ms
Wall time: 782 ms


Note that the read does take a little bit of time (~780 ms here) because the 50 table chunks referencing the memory map have to be reconstructed.

In [9]:
# Column index 2 is `cand_nm` (third field in the schema): list the distinct candidate names
t[2].unique()

<pyarrow.lib.StringArray object at 0x7fd86f32fdb0>
[
  "Bachmann, Michelle",
  "Romney, Mitt",
  "Obama, Barack",
  "Roemer, Charles E. 'Buddy' III",
  "Pawlenty, Timothy",
  "Johnson, Gary Earl",
  "Paul, Ron",
  "Santorum, Rick",
  "Cain, Herman",
  "Gingrich, Newt",
  "McCotter, Thaddeus G",
  "Huntsman, Jon",
  "Perry, Rick"
]

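The amount of allocated memory is essentially unchanged (only a few hundred bytes more than the earlier reading) because the data is memory-mapped rather than copied into Arrow-allocated memory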

In [13]:
# Re-check the bytes allocated from Arrow's default memory pool after reading the stream
pa.total_allocated_bytes()

180296384

In [10]:
# Column index 0 is `cmte_id`: take element 1000 of its chunk at index 5
t[0].chunk(5)[1000]

'C00431171'

In [11]:
# Inspect the raw buffers backing chunk 5 of column 0 (`cmte_id`).
# NOTE(review): for an Arrow string array these are presumably the validity
# bitmap (None when the chunk has no nulls), the offsets and the character
# data — confirm against the Arrow columnar format specification.
t[0].chunk(5).buffers()

[None,
 <pyarrow.lib.Buffer at 0x7fd86f352ab0>,
 <pyarrow.lib.Buffer at 0x7fd86f352570>]

In [12]:
# Number of chunks in column 0; this matches the 50 copies of the table written to the stream
t[0].num_chunks

50