# In memory data model
Apache Arrow defines columnar array data structures by composing type metadata with memory buffers, like the ones explained in the documentation on Memory and IO. These data structures are exposed in Python through a series of interrelated classes:

- Type Metadata: Instances of pyarrow.DataType, which describe a logical array type
- Schemas: Instances of pyarrow.Schema, which describe a named collection of types. These can be thought of as the column types in a table-like object.
- Arrays: Instances of pyarrow.Array, which are atomic, contiguous columnar data structures composed from Arrow Buffer objects
- Record Batches: Instances of pyarrow.RecordBatch, which are a collection of Array objects with a particular Schema
- Tables: Instances of pyarrow.Table, a logical table data structure in which each column consists of one or more pyarrow.Array objects of the same type.

We will examine these in the sections below in a series of examples.

## Type Metadata

In [1]:
import pyarrow as pa

In [2]:
t1 = pa.int32()
t1

DataType(int32)

In [3]:
t2 = pa.string()
t2

DataType(string)

In [4]:
t3 = pa.binary()
t3

DataType(binary)

In [5]:
t4 = pa.binary(10)
t4

FixedSizeBinaryType(fixed_size_binary[10])

In [6]:
t5 = pa.timestamp("ms")
t5

TimestampType(timestamp[ms])

In [7]:
f0 = pa.field("int32_field", t1)
f0

pyarrow.Field<int32_field: int32>

In [8]:
f0.name

'int32_field'

In [9]:
f0.type

DataType(int32)

In [10]:
t6 = pa.list_(t1)
t6

ListType(list<item: int32>)

In [11]:
fields = [
    pa.field("s0", t1),
    pa.field("s1", t2),
    pa.field("s2", t4),
    pa.field("s3", t6)
]

In [12]:
t7 = pa.struct(fields)
t7

StructType(struct<s0: int32, s1: string, s2: fixed_size_binary[10], s3: list<item: int32>>)

## Schemas

In [13]:
my_schema = pa.schema(fields)
my_schema

s0: int32
s1: string
s2: fixed_size_binary[10]
s3: list<item: int32>
  child 0, item: int32

In [18]:
buf = my_schema.serialize()
buf.to_pybytes()

'<\x01\x00\x00\x10\x00\x00\x00\x0c\x00\x0e\x00\x06\x00\x05\x00\x08\x00\x00\x00\x0c\x00\x00\x00\x00\x01\x03\x00\x10\x00\x00\x00\x00\x00\n\x00\x08\x00\x00\x00\x04\x00\x00\x00\n\x00\x00\x00\x04\x00\x00\x00\x04\x00\x00\x00\xd4\x00\x00\x00\x94\x00\x00\x00`\x00\x00\x00\x04\x00\x00\x00N\xff\xff\xff\x00\x00\x01\x0cH\x00\x00\x00\x10\x00\x00\x00\x04\x00\x00\x00\x01\x00\x00\x00\x08\x00\x00\x00|\xff\xff\xffn\xff\xff\xff\x00\x00\x01\x02\x1c\x00\x00\x00\x0c\x00\x00\x00\x04\x00\x00\x00\x00\x00\x00\x00\\\xff\xff\xff\x00\x00\x00\x01 \x00\x00\x00\x04\x00\x00\x00item\x00\x00\x00\x00\x02\x00\x00\x00s3\x00\x00\xa6\xff\xff\xff\x00\x00\x01\x0f \x00\x00\x00\x14\x00\x00\x00\x04\x00\x00\x00\x00\x00\x00\x00\x00\x00\x06\x00\x08\x00\x04\x00\x06\x00\x00\x00\n\x00\x00\x00\x02\x00\x00\x00s2\x00\x00\xd6\xff\xff\xff\x00\x00\x01\x05\x18\x00\x00\x00\x10\x00\x00\x00\x04\x00\x00\x00\x00\x00\x00\x00\x04\x00\x04\x00\x04\x00\x00\x00\x02\x00\x00\x00s1\x00\x00\x00\x00\x12\x00\x14\x00\x08\x00\x06\x00\x07\x00\x0c\x00\x00\x00\x10\

## Arrays

In [72]:
arr = pa.array([1, None, 2,4,8], type=pa.uint32())
print arr

<pyarrow.lib.UInt32Array object at 0x10e3c2fc8>
[
  1,
  NA,
  2,
  4,
  8
]


In [73]:
arr.null_count

1

In [74]:
bufs = arr.buffers()

In [75]:
buf0 = bufs[0]
buf1 = bufs[1]

In [76]:
buf0.to_pybytes()

'\x1d\x00\x00\x00'

In [77]:
buf1.to_pybytes()

'\x01\x00\x00\x00\x00\x00\x00\x00\x02\x00\x00\x00\x04\x00\x00\x00\x08\x00\x00\x00'

In [78]:
buf1.size

20

In [80]:
pa.formatting.value_format(arr)

'<pyarrow.lib.UInt32Array object at 0x10e3c2fc8>\n[\n  1,\n  NA,\n  2,\n  4,\n  8\n]'

In [82]:
arr_nonull = pa.array([1, 2, 3, 4, 8] , type=pa.uint32())

In [84]:
bufs_nonull = arr_nonull.buffers()

In [85]:
len(bufs_nonull)

2

In [86]:
bufs_nonull[0].to_pybytes()

'\x1f\x00\x00\x00'

In [87]:
bufs_nonull[1].to_pybytes()

'\x01\x00\x00\x00\x02\x00\x00\x00\x03\x00\x00\x00\x04\x00\x00\x00\x08\x00\x00\x00'

In [96]:
arr_list = pa.array([["j", "o", "e"], None, ["m", "a", "r", "k"], []], type=pa.list_(pa.binary(1)))

In [97]:
arr_list.type

ListType(list<item: fixed_size_binary[1]>)

In [99]:
bufs_list = arr_list.buffers()

In [100]:
buf0 = bufs_list[0]
buf1 = bufs_list[1]
buf2 = bufs_list[2]
buf3 = bufs_list[3]

In [107]:
buf0.to_pybytes()

'\r'

In [103]:
buf1.to_pybytes()

'\x00\x00\x00\x00\x03\x00\x00\x00\x03\x00\x00\x00\x07\x00\x00\x00\x07\x00\x00\x00'

In [104]:
buf2.to_pybytes()

'\x7f'

In [102]:
buf3.to_pybytes()

'joemark'

In [118]:
for x in arr_list:
    if x is pa.lib.NAType: 
        continue
    print x
    print type(x)
    for y in x:
        print y

['j', 'o', 'e']
<type 'pyarrow.lib.ListValue'>
'j'
'o'
'e'
NA
<type 'pyarrow.lib.NAType'>


TypeError: 'pyarrow.lib.NAType' object is not iterable

In [129]:
ty = pa.struct([
    pa.field("name", pa.string()), 
    pa.field("value", pa.int32())
])
print ty

struct<name: string, value: int32>


In [132]:
arr_struct = pa.array([{"name" : "joe", "value" : 1}, {"name" : None, "value" : 2}, None, {"name" : "mark", "value" : 4}], 
                      type=ty)

In [133]:
arr_struct

<pyarrow.lib.StructArray object at 0x10e3d6998>
[
  {'name': u'joe', 'value': 1},
  {'name': None, 'value': 2},
  NA,
  {'name': u'mark', 'value': 4}
]

In [134]:
bufs = arr_struct.buffers()

In [135]:
len(bufs)

6

In [139]:
bufs[0].to_pybytes()

'\x0b'

In [140]:
bufs[1].to_pybytes()

'\t'

In [141]:
bufs[2].to_pybytes()

'\x00\x00\x00\x00\x03\x00\x00\x00\x03\x00\x00\x00\x03\x00\x00\x00\x07\x00\x00\x00'

In [142]:
bufs[3].to_pybytes()

'joemark'

In [143]:
bufs[4].to_pybytes()

'\x0b\x00\x00\x00'

In [144]:
bufs[5].to_pybytes()

'\x01\x00\x00\x00\x02\x00\x00\x00\x00\x00\x00\x00\x04\x00\x00\x00'

In [19]:
arr = pa.array([1,2,None,3])
arr

<pyarrow.lib.Int64Array object at 0x10e2c78e8>
[
  1,
  2,
  NA,
  3
]

In [20]:
some = arr.buffers()[1]
some.to_pybytes()

'\x01\x00\x00\x00\x00\x00\x00\x00\x02\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x03\x00\x00\x00\x00\x00\x00\x00'

In [21]:
pa.array([1, 2], type=pa.uint16())

<pyarrow.lib.UInt16Array object at 0x10e2c7cb0>
[
  1,
  2
]

In [22]:
arr.type

DataType(int64)

In [23]:
len(arr)

4

In [24]:
arr.null_count

1

## List Arrays

In [25]:
nested_arr = pa.array([[[]], None, [[1,2],[3,4]], [[None], [1]]])

In [26]:
nested_arr.type

ListType(list<item: list<item: int64>>)

In [27]:
print(nested_arr.type)

list<item: list<item: int64>>


## Struct Arrays

In [30]:
ty = pa.struct([
    pa.field("x", pa.int8()),
    pa.field("y", pa.bool_())
])
print ty

struct<x: int8, y: bool>


In [31]:
cmplx_arr = pa.array([{"x" : 1, "y" : True}, {"x" : 2, "y" : False}], type=ty)
print cmplx_arr

<pyarrow.lib.StructArray object at 0x10e2db4c8>
[
  {'y': True, 'x': 1},
  {'y': False, 'x': 2}
]


In [32]:
another_arr = pa.array([(3, True), (4, False)], type=ty)
print another_arr

<pyarrow.lib.StructArray object at 0x10e2db730>
[
  {'y': True, 'x': 3},
  {'y': False, 'x': 4}
]


## Union Arrays

In [33]:
xs = pa.array([5,6,7])
ys = pa.array([False, True, False])
zs = pa.array([b"viktor", b"jim", b"maria"])

In [34]:
types = pa.array([0,1,2], type=pa.int8())

In [35]:
union_arr = pa.UnionArray.from_sparse(types, [xs, ys, zs])

In [36]:
print union_arr.type

union[sparse]<0: int64=0, 1: bool=1, 2: binary=2>


In [37]:
union_arr

<pyarrow.lib.UnionArray object at 0x10e2dbb50>
[
  5,
  True,
  'maria'
]

In [38]:
xs = pa.array([5,6, 7])
ys = pa.array([False, True])
types = pa.array([0,1,1,0,0], type = pa.int8())
offsets = pa.array([0,0,1,1,2], type=pa.int32())
dense_union_arr = pa.UnionArray.from_dense(types, offsets, [xs, ys])

In [39]:
print dense_union_arr.type

union[dense]<0: int64=0, 1: bool=1>


In [40]:
dense_union_arr

<pyarrow.lib.UnionArray object at 0x10e2db940>
[
  5,
  False,
  True,
  6,
  7
]

In [41]:
indices = pa.array([0,1,0,1,2,0,None,2])
dictionary = pa.array(["foo", "bar", "baz"])
dict_array = pa.DictionaryArray.from_arrays(indices, dictionary)

In [42]:
dict_array

<pyarrow.lib.DictionaryArray object at 0x10e2de460>
[
  'foo',
  'bar',
  'foo',
  'bar',
  'baz',
  'foo',
  NA,
  'baz'
]

## Record Batches

In [43]:
data = [
    pa.array([1,2,3,4]),
    pa.array(["foo", "bar", "baz", None]),
    pa.array([True, None, False, True])
]

In [44]:
batch = pa.RecordBatch.from_arrays(data, ["f0", "f1", "f2"])
batch.num_columns

3

In [45]:
batch.num_rows

4

In [47]:
batch.to_pandas()

Unnamed: 0,f0,f1,f2
0,1,foo,True
1,2,bar,
2,3,baz,False
3,4,,True


In [50]:
print batch.schema

f0: int64
f1: binary
f2: bool


In [51]:
batch.to_pydict()

OrderedDict([(u'f0', [1, 2, 3, 4]),
             (u'f1', ['foo', 'bar', 'baz', None]),
             (u'f2', [True, None, False, True])])

In [81]:
batch.schema

f0: int64
f1: binary
f2: bool

In [82]:
batch2 = batch.slice(1,3)

In [85]:
print batch2[1]

<pyarrow.lib.BinaryArray object at 0x10e401fc8>
[
  'bar',
  'baz',
  NA
]


## Tables

In [86]:
batches = [batch] * 5
table = pa.Table.from_batches(batches)

In [87]:
table

pyarrow.Table
f0: int64
f1: binary
f2: bool

In [88]:
c = table[0]
c

<pyarrow.lib.Column object at 0x10e35a630>
chunk 0: <pyarrow.lib.Int64Array object at 0x10e405310>
[
  1,
  2,
  3,
  4
]
chunk 1: <pyarrow.lib.Int64Array object at 0x10e405368>
[
  1,
  2,
  3,
  4
]
chunk 2: <pyarrow.lib.Int64Array object at 0x10e4053c0>
[
  1,
  2,
  3,
  4
]
chunk 3: <pyarrow.lib.Int64Array object at 0x10e405418>
[
  1,
  2,
  3,
  4
]
chunk 4: <pyarrow.lib.Int64Array object at 0x10e405470>
[
  1,
  2,
  3,
  4
]

In [89]:
c.to_pandas()

0     1
1     2
2     3
3     4
4     1
5     2
6     3
7     4
8     1
9     2
10    3
11    4
12    1
13    2
14    3
15    4
16    1
17    2
18    3
19    4
Name: f0, dtype: int64