In [1]:
import os
import pandas as pd
import pyarrow.dataset as ds

In [2]:
# 🔧 CONFIGURATION
# Root folder of the partitioned Parquet files; same as the output-dir
# used in run_sqlxport.sh.
LOCAL_PARQUET_DIR = "logs_partitioned"
# Column name that prefixes the partition folders ("<column>=<value>").
PARTITION_COLUMN = "service"

In [3]:
# 📂 List the partition subfolders directly under LOCAL_PARQUET_DIR
# (directories named "<PARTITION_COLUMN>=<value>").
partition_prefix = f"{PARTITION_COLUMN}="

# os.scandir reads only the top level, so no recursive walk has to be aborted
# with `break`. Unlike os.walk, it raises (e.g. FileNotFoundError) when the
# directory is missing instead of silently printing an empty list.
with os.scandir(LOCAL_PARQUET_DIR) as entries:
    # Sort the names so the listing is stable across filesystems and re-runs.
    partition_dirs = sorted(
        entry.name
        for entry in entries
        if entry.is_dir() and entry.name.startswith(partition_prefix)
    )

print("Available partitions:")
for partition_dir in partition_dirs:
    print(f"  - {partition_dir}")

Available partitions:
  - service=auth
  - service=billing


In [4]:
# 🧾 Open every Parquet file under the directory as a single dataset,
# using hive-style partitioning for the folder layout.
dataset = ds.dataset(
    LOCAL_PARQUET_DIR,
    format="parquet",
    partitioning="hive",
)
# Read the whole dataset into an Arrow table and convert it to pandas.
df = dataset.to_table().to_pandas()

In [5]:
# 🧠 Inspect the schema: show the dtype of each column in the loaded frame
df.dtypes

id                    int64
message              object
timestamp    datetime64[ns]
service              object
dtype: object

In [6]:
# 👀 Preview the first rows of the loaded frame (head() defaults to 5 rows)
df.head()

Unnamed: 0,id,message,timestamp,service
0,2,Test message 2,2025-06-05 19:12:28.812518,auth
1,4,Test message 4,2025-06-05 19:12:28.812518,auth
2,6,Test message 6,2025-06-05 19:12:28.812518,auth
3,8,Test message 8,2025-06-05 19:12:28.812518,auth
4,10,Test message 10,2025-06-05 19:12:28.812518,auth


In [7]:
# 📊 Count the rows in each group of the partition column
rows_per_partition = df.groupby(PARTITION_COLUMN).size()
rows_per_partition

service
auth       500
billing    500
dtype: int64