In [98]:
from pathlib import Path
import pyarrow as pa
import pyarrow.compute as pc   
import pyarrow.dataset as ds   
import pyarrow.parquet as pq   
from rich import print as rprint
from rich.pretty import pprint

# path = Path('large2_raw_dataset')
path = Path('tiny_ingested_dataset')
path = Path('tiny_compacted_dataset')

!tree $path | head -n 100

tiny_compacted_dataset
└── year=2020
    ├── month=1
    │   └── d4293db4-e00b-4a41-8c64-4c2a05af9f4a-0.parquet
    ├── month=2
    │   └── d4293db4-e00b-4a41-8c64-4c2a05af9f4a-0.parquet
    ├── month=3
    │   └── d4293db4-e00b-4a41-8c64-4c2a05af9f4a-0.parquet
    └── month=4
        └── d4293db4-e00b-4a41-8c64-4c2a05af9f4a-0.parquet

5 directories, 4 files


# Dataset read

`ds.dataset` only creates a lazy pointer to the dataset (file discovery and schema); no rows are loaded yet:

In [80]:
# Build a dataset handle over the hive-partitioned directory, then report
# how many files were discovered and the resulting schema.
dataset = ds.dataset(source=path, partitioning='hive')
discovered_files = dataset.files
rprint('Dataset files:', len(discovered_files))
rprint(dataset.schema)

In [81]:
# Summarise the first fragments (files): path, row count, number of row groups
# and the min/max of the 'datetime' column.
for i, fragment in enumerate(dataset.get_fragments()):
    # Read only the 'datetime' column; loading every column of every file just
    # to compute a min/max is needlessly expensive. The local is also named
    # `datetime_col` so it does not shadow the stdlib `datetime` module name.
    datetime_col = fragment.to_table(columns=['datetime']).column('datetime')
    rprint(f'\n{fragment.path}\nnum_rows = {fragment.count_rows():8d}  num_row_groups={fragment.num_row_groups}',
           pc.min(datetime_col).as_py(), pc.max(datetime_col).as_py())
    # Cap the output: indices 0..21 are printed (22 fragments), then stop.
    if i > 20:
        break

In [83]:
# Careful: iterating over the batches scans the entire dataset!
batch_iter = dataset.to_batches()
batch_rows = [record_batch.num_rows for record_batch in batch_iter]

# (number of batches, total rows, sizes of the first 20 batches)
n_batches, n_rows = len(batch_rows), sum(batch_rows)
n_batches, n_rows, batch_rows[:20]

(3000,
 300000000,
 [100000,
  100000,
  100000,
  100000,
  100000,
  100000,
  100000,
  100000,
  100000,
  100000,
  100000,
  100000,
  100000,
  100000,
  100000,
  100000,
  100000,
  100000,
  100000,
  100000])

# Query


In [84]:
# Display the dataset directory currently bound to `path`.
path

PosixPath('large2_ingested_dataset')

In [85]:
# Columns to materialise in the result; the filter may reference other columns.
columns = ['datetime', 'cat_col_01', 'num_col_01']

# Build the predicate with the public dataset expression API instead of the
# private `pq._filters_to_expression` helper. A flat list of
# (column, op, value) tuples means a conjunction, so the three conditions
# are AND-ed together here.
filters = (
    (ds.field('month') < 4)
    & (ds.field('day') < 5)
    & (ds.field('cat_col_00') == 'foo')
)

# Scan the dataset with the projection and predicate applied.
table = dataset.to_table(columns=columns, filter=filters)

In [87]:
# Convert the Arrow table to a pandas DataFrame and show it as the cell output.
table.to_pandas()

Unnamed: 0,datetime,cat_col_01,num_col_01
0,2020-01-01 18:00:00.216,red,-0.362310
1,2020-01-01 18:00:02.160,green,1.620285
2,2020-01-01 18:00:04.752,red,-0.753150
3,2020-01-01 18:00:05.400,red,-0.701831
4,2020-01-01 18:00:06.048,green,1.125831
...,...,...,...
3733115,2022-01-04 17:59:56.112,blue,-0.845091
3733116,2022-01-04 17:59:57.408,blue,-0.561497
3733117,2022-01-04 17:59:58.272,red,-1.398892
3733118,2022-01-04 17:59:58.704,green,0.921700
