In [1]:
# pip -q install pandas pyarrow numpy tqdm

In [4]:
import os, pathlib, math, time, numpy as np, pandas as pd

# Output folder for the generated Parquet files, relative to the notebook's
# working directory; created on first run, left alone if it already exists.
DATA_DIR = pathlib.Path("..") / "data"
DATA_DIR.mkdir(parents=False, exist_ok=True)

# One seeded generator shared by every later cell, so a fresh
# Restart-and-Run-All reproduces the same random draws.
RNG_SEED = 42
rng = np.random.default_rng(seed=RNG_SEED)

print("Folders ready ▶", DATA_DIR.resolve())

Folders ready ▶ /Users/tej/Career/Projects/Quantum Compute Optimization/v1/data


In [5]:
# ------------------------------------------------------------
# Cell 3 – Simulation parameters
# ------------------------------------------------------------
DAYS = 180                 # number of simulated days (six-month horizon)

# Inclusive bounds for the number of new blocks drawn per day.
BLOCKS_MIN = 1_000
BLOCKS_MAX = 10_000

# Inclusive bounds for the number of workloads drawn per day.
WL_MIN = 1_000_000
WL_MAX = 50_000_000

QPU_MAX = 100_000_000      # largest QPU-unit size a block may have

In [6]:
# ------------------------------------------------------------
# Cell 4 – Helper to pick *unique* QPU sizes without huge arrays
# ------------------------------------------------------------
def pick_unique_sizes(n_needed: int, used: set, high: int,
                      generator: "np.random.Generator | None" = None) -> list[int]:
    """Return `n_needed` distinct integers in [1, high] not present in `used`.

    Parameters
    ----------
    n_needed : int
        How many fresh values to draw. Zero or negative yields an empty list.
    used : set
        Values that are already taken. **Mutated in place**: every value
        returned by this call is added to it, so consecutive calls sharing
        the same set never hand out the same value twice.
    high : int
        Inclusive upper bound of the sampling range.
    generator : numpy.random.Generator, optional
        Random source. Defaults to the notebook-level `rng`.

    Returns
    -------
    list[int]
        Plain Python ints, in the order they were drawn.

    Raises
    ------
    ValueError
        If fewer than `n_needed` unused values remain in [1, high]. Without
        this guard the rejection loop below would never terminate.
    """
    gen = rng if generator is None else generator
    n_needed = int(n_needed)

    # Cheap lower bound on the free slots first; only when that fails do we
    # pay for an exact count of how many `used` values fall inside the range.
    if n_needed > high - len(used):
        taken_in_range = sum(1 for value in used if 1 <= value <= high)
        if n_needed > high - taken_in_range:
            raise ValueError(
                f"cannot pick {n_needed} unique sizes: only "
                f"{max(high - taken_in_range, 0)} unused values remain in [1, {high}]"
            )

    out = []
    while len(out) < n_needed:
        m = n_needed - len(out)
        # Sample WITHOUT replacement so one batch never contains duplicates;
        # `+ 1` shifts the 0-based draw into [1, high].
        trial = gen.choice(high, size=m, replace=False) + 1
        # Drop collisions with earlier picks; the loop retries for the shortfall.
        fresh = [value for value in trial.tolist() if value not in used]
        out.extend(fresh)
        used.update(fresh)
    return out

In [7]:
# ------------------------------------------------------------
# Cell 5  –  Streaming writer helpers (PyArrow)
# ------------------------------------------------------------
import pyarrow as pa, pyarrow.parquet as pq
from tqdm.auto import tqdm     # nice progress bar

# Column layout of the per-block table (one row per block).
blocks_schema = pa.schema([
    pa.field("block_id", pa.string()),
    pa.field("qpu_units", pa.int32()),
    pa.field("type_initial", pa.string()),
    pa.field("lease_day", pa.int16()),
    pa.field("acq_cost", pa.float32()),
])

# Column layout of the daily workload table (one row per day and size).
wl_schema = pa.schema([
    pa.field("day", pa.int16()),
    pa.field("qpu_units", pa.int32()),
    pa.field("n_workloads", pa.int64()),
])

# Destination files inside DATA_DIR.
blocks_path = DATA_DIR / "blocks.parquet"
wl_path = DATA_DIR / "workloads_daily.parquet"

# Open both files for incremental writing; the simulation cell appends one
# table per simulated day and closes the writers when it finishes.
blocks_writer = pq.ParquetWriter(
    where=blocks_path,
    schema=blocks_schema,
    compression="snappy",
)
wl_writer = pq.ParquetWriter(
    where=wl_path,
    schema=wl_schema,
    compression="snappy",
)

In [8]:
# ------------------------------------------------------------
# Cell 6  –  Streamed simulation loop  (one Parquet write per day)
# ------------------------------------------------------------
# For every simulated day this cell
#   1. draws a random number of new blocks, gives each a `qpu_units` size that
#      is unique across the whole run, and appends them to the blocks file;
#   2. draws a random number of workloads, assigns each one uniformly (with
#      replacement) to a size seen so far, and appends the per-size counts
#      to the workload file.
# NOTE(review): rng.choice materialises all `n_wl` draws before np.unique
# collapses them, so peak memory grows with WL_MAX — the earlier "< 0.5 GB"
# figure should be re-measured before it is relied on.
t0 = time.time()
all_used_sizes = set()

# Both writers are context managers: leaving the `with` block closes them
# (which writes the Parquet footers) even when a day fails part-way through,
# so an exception no longer leaves open handles and unreadable files behind.
with blocks_writer, wl_writer:
    for day in tqdm(range(DAYS), desc="simulating"):
        # ---------- blocks ----------
        # Plain int so list repetition and range() below never depend on
        # NumPy-scalar semantics.
        n_blocks = int(rng.integers(BLOCKS_MIN, BLOCKS_MAX + 1))
        sizes    = pick_unique_sizes(n_blocks, all_used_sizes, QPU_MAX)

        blk_tbl = pa.Table.from_pydict({
            "block_id"     : [f"B-{day:03}-{i:05}" for i in range(n_blocks)],
            "qpu_units"    : sizes,
            "type_initial" : [None]*n_blocks,      # left null at generation time
            "lease_day"    : [day]*n_blocks,
            "acq_cost"     : [0.20]*n_blocks       # same fixed cost for every block
        }, schema=blocks_schema)
        blocks_writer.write_table(blk_tbl)

        # ---------- workload counts ----------
        n_wl = rng.integers(WL_MIN, WL_MAX + 1)
        # Sample a size for every workload, then collapse to (size, count)
        # pairs so only the aggregated rows are written.
        wl_sizes, freqs = np.unique(
            rng.choice(list(all_used_sizes), size=n_wl, replace=True),
            return_counts=True)

        wl_tbl = pa.Table.from_pydict({
            "day"         : np.full_like(wl_sizes, day, dtype=np.int16),
            "qpu_units"   : wl_sizes.astype(np.int32),
            "n_workloads" : freqs.astype(np.int64)
        }, schema=wl_schema)
        wl_writer.write_table(wl_tbl)

print(f"completed in {time.time()-t0:.1f}s")

simulating:   0%|          | 0/180 [00:00<?, ?it/s]

completed in 301.1s


In [9]:
# ------------------------------------------------------------
# Cell 7  –  On-disk size of the two generated files
# ------------------------------------------------------------
# Sizes are reported in decimal megabytes (bytes / 1e6), rounded to 2 places.
for label, parquet_path in (
    ("Blocks  file:", blocks_path),
    ("Workload file:", wl_path),
):
    size_mb = round(parquet_path.stat().st_size / 1e6, 2)
    print(label, parquet_path, "→", size_mb, "MB")



Blocks  file: ../data/blocks.parquet → 12.21 MB
Workload file: ../data/workloads_daily.parquet → 514.6 MB


In [10]:
# Sanity check: load just the two listed columns of the blocks file and show
# the first five rows.
import pandas as pd
sample_blocks = (
    pd.read_parquet(blocks_path, columns=["block_id", "qpu_units"])
    .head()
)
display(sample_blocks)

Unnamed: 0,block_id,qpu_units
0,B-000-00000,37417910
1,B-000-00001,99702449
2,B-000-00002,10674071
3,B-000-00003,92483044
4,B-000-00004,62600725


In [11]:
# NOTE(review): rebinds the name to 0, dropping this cell's reference to the
# preview DataFrame — presumably to release its memory; confirm intent.
sample_blocks = 0