In [3]:
import pyarrow.dataset as ds, pyarrow as pa, numpy as np, math, tqdm, pathlib, pandas as pd

# Bounds applied to the per-day sum of n_workloads (used by the range check
# further down; pandas' Series.between treats both ends as inclusive).
WL_MIN = 1_000_000
WL_MAX = 50_000_000

# Parquet inputs, resolved relative to the current working directory.
blocks_path = pathlib.Path("../data/blocks.parquet")
wl_path = pathlib.Path("../data/workloads_daily.parquet")

# ---------- 1. uniqueness of qpu_units in blocks ----------
# Stream the blocks file batch by batch and verify that no qpu_units value
# occurs more than once.  A value is a duplicate when it either
#   (a) repeats inside the current batch, or
#   (b) was already seen in an earlier batch.
# Case (a) has to be counted explicitly: building set(arr) would silently
# collapse in-batch repeats and the check would report a false "OK".
#
# NOTE: the loop stops at the first offending batch, so when a duplicate is
# found `seen_sizes` only covers the batches read up to that point.
seen_sizes = set()
dup_found   = False
for batch in ds.dataset(blocks_path).to_batches(columns=["qpu_units"], batch_size=100_000):
    arr = batch["qpu_units"].to_numpy()
    # Distinct values of this batch together with how often each one occurs.
    uniq, counts = np.unique(arr, return_counts=True)
    within_batch = set(uniq[counts > 1])        # case (a)
    across_batches = set(uniq) & seen_sizes     # case (b)
    intersect = within_batch | across_batches
    if intersect:
        dup_found = True
        # Show at most five offenders to keep the output readable.
        print("❌ DUPLICATE SIZE(S):", list(intersect)[:5])
        break
    seen_sizes.update(uniq)
print("Blocks size uniqueness:", "✔ OK" if not dup_found else "FAILED")

# ---------- 2. workload qpu_units ⊆ blocks.qpu_units ----------
# Every qpu_units value in the workloads file must also exist in the blocks
# file.  `seen_sizes` is the set of block sizes collected by check 1.
#
# The reference array is built once instead of on every batch.  It is
# duplicate-free because it comes from a set, but the workload column is not
# guaranteed to be, so np.isin must NOT be called with assume_unique=True:
# that flag asserts that *both* inputs are unique and yields wrong masks when
# the first input contains repeated values.
known_sizes = np.asarray(list(seen_sizes))
missing = set()
for batch in ds.dataset(wl_path).to_batches(columns=["qpu_units"], batch_size=500_000):
    arr = batch["qpu_units"].to_numpy()
    # True where the workload size does not appear among the block sizes.
    mask = np.isin(arr, known_sizes, invert=True)
    if mask.any():
        missing.update(arr[mask])
        # Stop early once more than ten offenders are known; the count
        # printed below is therefore a lower bound, not an exact total.
        if len(missing) > 10:
            break
print("Workload size coverage:", "✔ OK" if not missing else f"❌ Missing {len(missing)} size(s)")

# ---------- 3. per-day ranges ----------
# Blocks: the number of rows per lease_day must lie within 1_000..10_000.
blk_tbl = ds.dataset(blocks_path)
blk_cnt = (
    blk_tbl.to_table(columns=["lease_day"])
    .to_pandas()["lease_day"]
    .value_counts()
)
blk_ok = blk_cnt.between(1_000, 10_000).all()

# Workloads: n_workloads summed per day must lie within WL_MIN..WL_MAX.
# The Arrow group-by names the aggregated column "n_workloads_sum".
wl_tbl = ds.dataset(wl_path)
daily_wl = (
    wl_tbl.to_table(columns=["day", "n_workloads"])
    .group_by("day")
    .aggregate([("n_workloads", "sum")])
    .to_pandas()
)
rng_ok = daily_wl["n_workloads_sum"].between(WL_MIN, WL_MAX).all()

# Report both verdicts, blocks first.
for label, ok in (
    ("Daily block count range :", blk_ok),
    ("Daily workload count rng:", rng_ok),
):
    print(label, "✔ OK" if ok else "❌ OUT OF RANGE")

Blocks size uniqueness: ✔ OK
Workload size coverage: ✔ OK
Daily block count range : ✔ OK
Daily workload count rng: ✔ OK
