In [2]:
import pandas as pd, numpy as np,pathlib

TRACE = pathlib.Path("../data/trace.parquet")
OUT   = pathlib.Path("../data/trace_chunks.parquet")

CHUNK_BYTES = 4_000_000   # 4 MB segments

df = pd.read_parquet(TRACE).sort_values("ts").reset_index(drop=True)

# Ensure vid_idx exists (needed by LSTM policy)
if "vid_idx" not in df.columns:
    cats = df.video.astype("category")
    df["vid_idx"] = cats.cat.codes

# How many chunks per row
df["n_chunks"] = np.maximum(1, (df["bytes"] // CHUNK_BYTES).astype(int))

# Repeat rows
df_rep = df.loc[df.index.repeat(df["n_chunks"])].copy()
df_rep["chunk_id"] = df_rep.groupby(level=0).cumcount()

# Adjust last chunk size
df_rep["chunk_bytes"] = CHUNK_BYTES
mask_last = df_rep["chunk_id"] == (df_rep["n_chunks"] - 1)
df_rep.loc[mask_last, "chunk_bytes"] = (
    df_rep.loc[mask_last, "bytes"] - CHUNK_BYTES*(df_rep.loc[mask_last, "n_chunks"] - 1)
)

# Build cache key
df_rep["key"] = df_rep.apply(lambda r: f"{r.video}_{r.ladder}_{r.chunk_id}", axis=1)

# Keep final columns
keep = df_rep[["ts","user","video","ladder","key","chunk_bytes","vid_idx"]].rename(
    columns={"chunk_bytes":"bytes"}
)

keep.to_parquet(OUT, compression="zstd")
print("Wrote", OUT, "rows:", len(keep))

Wrote ../data/trace_chunks.parquet rows: 699813
