In [14]:
import numpy as np
import pyarrow as pa
import pyarrow.parquet as pq
import pandas as pd

Here is a campaign-contribution dataset from the 2012 presidential election, stored as a Parquet file.

In [15]:
# Read the whole Parquet file into an in-memory pyarrow Table.
fec = pq.read_table('fec-2012.parquet')

In [16]:
# Show column names and types only; per-field metadata is suppressed to keep
# the listing short. print() is used because to_string() returns a multi-line str.
print(fec.schema.to_string(show_field_metadata=False))

cmte_id: string
cand_id: string
cand_nm: string
contbr_nm: string
contbr_city: string
contbr_st: string
contbr_zip: string
contbr_employer: string
contbr_occupation: string
contb_receipt_amt: double
contb_receipt_dt: string
receipt_desc: string
memo_cd: string
memo_text: string
form_tp: string
file_num: int64
-- schema metadata --
pandas: '{"index_columns": [{"kind": "range", "name": null, "start": 0, "' + 2218


The extra schema metadata is pandas-specific (the file was originally written from pandas), so we can ignore it.

In [17]:
# Baseline: bytes currently allocated from Arrow's default memory pool,
# with only the Parquet-derived table loaded.
pa.total_allocated_bytes()

364795968

I'm going to write 50 copies of the table end-to-end in a single stream so we have a multi-gigabyte file (about 9 GB, per the directory listing below) to work with.

In [18]:
# Number of times the table is repeated in the output stream.
N_COPIES = 50

# Write the table N_COPIES times back-to-back as a single Arrow IPC stream.
# The writer is used as a context manager so that it is closed (and the
# end-of-stream marker written) even if one of the writes raises; calling
# close() manually after the loop only covers the success path.
with open('fec.arrow', 'wb') as f:
    with pa.ipc.new_stream(f, fec.schema) as writer:
        for _ in range(N_COPIES):
            writer.write(fec)

In [19]:
# List the working directory to confirm the size of the generated fec.arrow file.
!ls -l

total 8921544
-rw------- 1 wesm wesm       3921 Apr  3 11:11 Demo1-MemoryMapping.ipynb
-rw------- 1 wesm wesm       9148 Apr  3 11:10 Demo2-Flight.ipynb
-rw------- 1 wesm wesm   27867532 Feb 26 13:40 fec-2012.parquet
-rw------- 1 wesm wesm 9107358528 Apr  3 11:32 fec.arrow
-rw------- 1 wesm wesm        600 Feb 26 13:31 README.md
-rw------- 1 wesm wesm     403260 Feb 26 13:31 slides.pdf


In [20]:
# Memory-map the stream file instead of reading it into process memory.
# NOTE(review): the name `mmap` would shadow the stdlib module if it were
# ever imported, and `f` rebinds the name used for the file handle in the
# writing cell; both names are relied on by later cells, so they are kept.
mmap = pa.memory_map('fec.arrow')
# Open an IPC stream reader on top of the mapping.
f = pa.ipc.open_stream(mmap)

Now we're going to "parse" the stream to obtain Arrow data structures referencing the memory map

In [21]:
%%time
# Collect every record batch in the stream into a single Table; the timing
# reported by %%time covers only this call.
t = f.read_all()

CPU times: user 3.79 ms, sys: 0 ns, total: 3.79 ms
Wall time: 3.05 ms


In [22]:
# Total row count of the table read from the stream (all written copies combined).
len(t)

50086550

In [23]:
# Re-check Arrow's allocated bytes after read_all() to compare with the
# value reported before the stream file was written.
pa.total_allocated_bytes()

364795968

Note that the read does take a little bit of time (about 3 ms in the timing above) because the 50 table chunks referencing the memory map have to be reconstructed.

In [24]:
# Column at index 2 (cand_nm in the schema), returned as a ChunkedArray.
t[2]

<pyarrow.lib.ChunkedArray object at 0x7f388c1d45f0>
[
  [
    "Bachmann, Michelle",
    "Bachmann, Michelle",
    "Bachmann, Michelle",
    "Bachmann, Michelle",
    "Bachmann, Michelle",
    "Bachmann, Michelle",
    "Bachmann, Michelle",
    "Bachmann, Michelle",
    "Bachmann, Michelle",
    "Bachmann, Michelle",
    ...
    "Perry, Rick",
    "Perry, Rick",
    "Perry, Rick",
    "Perry, Rick",
    "Perry, Rick",
    "Perry, Rick",
    "Perry, Rick",
    "Perry, Rick",
    "Perry, Rick",
    "Perry, Rick"
  ],
  [
    "Bachmann, Michelle",
    "Bachmann, Michelle",
    "Bachmann, Michelle",
    "Bachmann, Michelle",
    "Bachmann, Michelle",
    "Bachmann, Michelle",
    "Bachmann, Michelle",
    "Bachmann, Michelle",
    "Bachmann, Michelle",
    "Bachmann, Michelle",
    ...
    "Perry, Rick",
    "Perry, Rick",
    "Perry, Rick",
    "Perry, Rick",
    "Perry, Rick",
    "Perry, Rick",
    "Perry, Rick",
    "Perry, Rick",
    "Perry, Rick",
    "Perry, Rick"
  ],
  [
    "Bachm

In [25]:
# Distinct values of column 2 (the candidate names).
t[2].unique()

<pyarrow.lib.StringArray object at 0x7f388bf43c90>
[
  "Bachmann, Michelle",
  "Romney, Mitt",
  "Obama, Barack",
  "Roemer, Charles E. 'Buddy' III",
  "Pawlenty, Timothy",
  "Johnson, Gary Earl",
  "Paul, Ron",
  "Santorum, Rick",
  "Cain, Herman",
  "Gingrich, Newt",
  "McCotter, Thaddeus G",
  "Huntsman, Jon",
  "Perry, Rick"
]

The amount of allocated memory is unchanged because of memory mapping

In [None]:
# Check Arrow's allocated bytes once more, after inspecting and
# de-duplicating a column of the memory-mapped table.
pa.total_allocated_bytes()

In [None]:
# Element at position 1000 within chunk 5 of column 0 (cmte_id in the schema).
t[0].chunk(5)[1000]

In [None]:
# Buffers backing chunk 5 of column 0.
# NOTE(review): presumably validity/offsets/data buffers that point into the
# memory map rather than owned allocations — confirm from the output.
t[0].chunk(5).buffers()

In [None]:
# Number of chunks making up column 0.
# NOTE(review): presumably at least one chunk per written copy of the table — confirm.
t[0].num_chunks