In [1]:
import numpy as np
import nanoarrow as na
import pyarrow as pa

### Intermezzo: inspecting the buffers using PyArrow and nanoarrow

### PyArrow

The `buffers()` method returns a list of all buffers for a specific array. The information for each buffer includes:
- address of the buffer
- buffer size in bytes
- whether the buffer is mutable or not (buffers are generally mutable, i.e. changeable, but an Array is an immutable container)

In [None]:
# Build a five-element string array containing one null, then list the
# buffers that back it.
column4 = pa.array(
    ["python", "data", "conference", None, "Berlin"],
    type=pa.string(),
)
column4.buffers()

[<pyarrow.Buffer address=0x7f4375608180 size=1 is_cpu=True is_mutable=True>,
 <pyarrow.Buffer address=0x7f4375608080 size=24 is_cpu=True is_mutable=True>,
 <pyarrow.Buffer address=0x7f43756081c0 size=26 is_cpu=True is_mutable=True>]

In [None]:
# Buffer 0 is the validity bitmap. Reinterpret it as a boolean array of the
# same length as the column to see which slots hold a value.
validity_bitmap_buffer = column4.buffers()[0]
pa.Array.from_buffers(
    type=pa.bool_(),
    length=len(column4),
    buffers=[None, validity_bitmap_buffer],
)

<pyarrow.lib.BooleanArray object at 0x7f4363e46980>
[
  true,
  true,
  true,
  false,
  true
]

In [None]:
# Buffer 1 holds the offsets. Read it as int32 values; there is one more
# offset than there are elements, because each element is delimited by a
# start and an end position.
offsets_buffer = column4.buffers()[1]
pa.Array.from_buffers(
    type=pa.int32(),
    length=len(column4) + 1,
    buffers=[None, offsets_buffer],
)

<pyarrow.lib.Int32Array object at 0x7f4363e78b20>
[
  0,
  6,
  10,
  20,
  20,
  26
]

In [None]:
# Buffer 2 holds the bytes of all values stored back to back; show them as a
# Python bytes object.
values_buffer = column4.buffers()[2]
values_buffer.to_pybytes()

b'pythondataconferenceBerlin'

### Inspecting buffers using nanoarrow

In [None]:
# Pass the same PyArrow array to nanoarrow and display the resulting object.
na_column4 = na.c_array(column4)
na_column4

In [None]:
# Display nanoarrow's array view of the converted array, for comparison with
# the buffers inspected through PyArrow.
na.c_array_view(na_column4)

The same inspection works for a binary array:

In [None]:
# Same values as before, this time stored with the binary type; list the
# buffers that back the array.
column4 = pa.array(
    ["python", "data", "conference", None, "Berlin"],
    type=pa.binary(),
)
column4.buffers()

In [None]:
# Convert the current column4 (the binary array when cells are run in order)
# with nanoarrow and display its array view.
na_column4 = na.c_array(column4)
na.c_array_view(na_column4)

In [None]:
# Take the three buffers of column4 in one step: validity bitmap, offsets,
# and the concatenated values.
validity_bitmap_buffer, offsets_buffer, values_buffer = column4.buffers()

# Decode each buffer and show the three results together as a tuple.
(
    pa.Array.from_buffers(pa.bool_(), len(column4), [None, validity_bitmap_buffer]),
    pa.Array.from_buffers(pa.int32(), len(column4) + 1, [None, offsets_buffer]),
    values_buffer.to_pybytes(),
)

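String vs large string: both types hold the same values, but `large_string` uses 64-bit offsets where `string` uses 32-bit ones. Compare the offsets buffers in the two views below.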

In [None]:
# Rebuild the column with the string type and display nanoarrow's view of it.
column4 = pa.array(
    ["python", "data", "conference", None, "Berlin"],
    type=pa.string(),
)
na.c_array_view(na.c_array(column4))

In [None]:
# Same values stored with the large_string type; display nanoarrow's view so
# its buffers can be compared with those of the string array.
column4 = pa.array(
    ["python", "data", "conference", None, "Berlin"],
    type=pa.large_string(),
)
na.c_array_view(na.c_array(column4))