# Notebook: split by FILL + clean per-block (B2OC/B2CC)

Load ROOT trees per block, exclude known bad runs, write one cleaned per-block file, and write per-FILL files that pass a minimum-entries cut.

**Important notes**
- **Input:** `data/real_5to8_raw/…` - this folder only has blocks **5–8** (earlier blocks were tests). Monte Carlo is in `data/monte_carlo/` and not used here.
- **Output:** `data/processed/`
- **Filenames:** per-block → `YYYY_DECAY_B{block}.root`; per-FILL → `YYYY_DECAY_B{block}_F{fill}.root`.
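- **Bad runs:** listed in `BAD_RUNS` in the configuration cell below; entries from these runs are removed before anything is written.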
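- **Threshold:** a per-FILL file is written only when the fill has at least `MIN_ENTRIES` entries after cleaning; fills below the threshold are skipped (they remain in the per-block file).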

In [3]:
import ROOT as r
import glob
from pathlib import Path
from typing import Optional

# Enable ROOT's implicit multi-threading before any dataframe is built.
r.ROOT.EnableImplicitMT()

# ---- Fixed locations ----
DATA_RAW = Path("data/real_5to8_raw")    # inputs: blocks 5–8 only
DATA_PROCESSED = Path("data/processed")  # outputs
DATA_PROCESSED.mkdir(parents=True, exist_ok=True)

# ---- Naming / selection parameters ----
YEAR = 2024
MIN_ENTRIES = 20_000  # minimum number of entries for a per-FILL file to be written

# Runs excluded from every output file.
BAD_RUNS = {
    303681,
    304475,
    304487,
    304488,
    304489,
    304490,
    304491,
}

# ---- Inputs ----
# Each entry pairs a path (or wildcard pattern) under DATA_RAW with its block number.
files_info_b2oc = [
    {"filename": str(DATA_RAW / name), "block": block}
    for block, name in (
        (5, "00289228_00000001_1.highstats-Small-B2OC-UP.root"),
        (6, "00289233_00000001_1.highstats-Small-B2OC-DOWN.root"),
        (7, "00289239_00000001_1.highstats-Small-B2OC-DOWN.root"),
        (8, "00289237_00000001_1.highstats-Small-B2OC-UP.root"),
    )
]

# NOTE(review): the block-6 prefix 00289331 differs from the 002892xx prefixes
# used by every other input — confirm it is not a typo.
files_info_b2cc = [
    {"filename": str(DATA_RAW / name), "block": block}
    for block, name in (
        (5, "00289229_0000000*_1.highstats-Small-B2CC-UP.root"),
        (6, "00289331_0000000*_1.highstats-Small-B2CC-DOWN.root"),
        (7, "00289235_0000000*_1.highstats-Small-B2CC-DOWN.root"),
        (8, "00289231_0000000*_1.highstats-Small-B2CC-UP.root"),
    )
]

# ---- Snapshot options shared by every output file ----
SNAPSHOT_OPTS = r.RDF.RSnapshotOptions()
SNAPSHOT_OPTS.fMode = "RECREATE"         # overwrite an existing output file
SNAPSHOT_OPTS.fCompressionAlgorithm = 1  # ZLIB
SNAPSHOT_OPTS.fCompressionLevel = 4


In [4]:
def _expand(pattern: str) -> list[str]:
    """Expand a wildcard or accept a single path; return sorted file list."""
    paths = sorted(glob.glob(pattern))
    if not paths:
        print(f"no files for: {pattern}")
    return paths

def _outfile(decay: str, block: int, fill: Optional[int]) -> Path:
    """Return the output path under ``DATA_PROCESSED`` for a block or a fill.

    The file name is ``{YEAR}_{decay}_B{block}.root`` when *fill* is ``None``
    and ``{YEAR}_{decay}_B{block}_F{fill}.root`` otherwise.
    """
    stem = f"{YEAR}_{decay}_B{block}"
    if fill is not None:
        stem = f"{stem}_F{fill}"
    return DATA_PROCESSED / f"{stem}.root"

def _bad_expr() -> str:
    """Build the cut string that rejects every run listed in ``BAD_RUNS``.

    Returns ``"true"`` when ``BAD_RUNS`` is empty, otherwise the clauses
    ``run != <n>`` for each run in ascending order, joined with ``&&``.
    """
    if not BAD_RUNS:
        return "true"
    clauses = [f"run != {run_number}" for run_number in sorted(BAD_RUNS)]
    return " && ".join(clauses)

def process_blocks(files_info: list[dict], treename: str, decay_label: str) -> None:
    """
    Clean each block's inputs and write per-block and per-FILL ROOT files.

    Input files are grouped by block, de-duplicated and loaded into one
    RDataFrame per block. Entries are kept only if their ``block`` column
    equals the block number and they pass the ``_bad_expr()`` cut. The
    selection is written to the per-block file, and every FILL with at least
    ``MIN_ENTRIES`` selected entries additionally gets its own file.

    Args:
        files_info: dicts with a ``"filename"`` (path or wildcard pattern)
            and a ``"block"`` number.
        treename: name of the tree to read; also used as the output tree name.
        decay_label: label used in output file names and in the printed log.

    Returns:
        None. Results are the files written by ``Snapshot`` and the printed log.
    """
    by_block: dict[int, list[str]] = {}
    for info in files_info:
        block = int(info["block"])
        files = _expand(info["filename"])
        if not files:
            continue
        by_block.setdefault(block, []).extend(files)

    if not by_block:
        print("nothing to process")
        return

    bad = _bad_expr()

    for block in sorted(by_block):
        files = sorted(set(by_block[block]))  # de-duplicate within block
        print(f"\n[{decay_label}] block={block} | {len(files)} file(s)")

        rdf = r.RDataFrame(treename, files)
        base = rdf.Filter(f"(block == {block}) && ({bad})")

        # per-block
        block_path = _outfile(decay_label, block, fill=None)
        # Book the count before the snapshot so that the event loop run by
        # Snapshot fills it too, instead of paying for a separate pass.
        block_count = base.Count()
        # "" is the column-name regexp that selects all columns.
        base.Snapshot(treename, str(block_path), "", SNAPSHOT_OPTS)
        n_block = int(block_count.GetValue())
        print(f"per-block  -> {block_path}  ({n_block} entries)")

        # per-FILL
        # Deliberately broad: a missing or unreadable FILL column should only
        # disable the per-FILL step for this block, not abort the whole run.
        try:
            fills = sorted(set(map(int, base.AsNumpy(['FILL'])['FILL'])))
        except Exception as e:
            print(f"cannot read FILL: {e}; skip per-FILL")
            continue

        print(f"fills: {len(fills)}")

        # Book every per-FILL count up front; the first GetValue() below then
        # computes all of them in one event loop rather than one loop per fill.
        selections = [(F, base.Filter(f"FILL == {F}")) for F in fills]
        booked_counts = [sub.Count() for _, sub in selections]

        for (F, sub), booked in zip(selections, booked_counts):
            nF = int(booked.GetValue())
            if nF < MIN_ENTRIES:
                print(f"  skip F{F}: {nF} < {MIN_ENTRIES}")
                continue
            fill_path = _outfile(decay_label, block, fill=F)
            sub.Snapshot(treename, str(fill_path), "", SNAPSHOT_OPTS)
            print(f"  F{F} -> {fill_path}  ({nF} entries)")


In [5]:
# Run the cleaning and splitting once per decay label.
for job_files, job_tree, job_label in (
    (files_info_b2oc, "ST-b2oc", "B2OC"),
    (files_info_b2cc, "ST-b2cc", "B2CC"),
):
    process_blocks(job_files, treename=job_tree, decay_label=job_label)

no files for: data/real_5to8_raw/00289228_00000001_1.highstats-Small-B2OC-UP.root
no files for: data/real_5to8_raw/00289233_00000001_1.highstats-Small-B2OC-DOWN.root
no files for: data/real_5to8_raw/00289239_00000001_1.highstats-Small-B2OC-DOWN.root
no files for: data/real_5to8_raw/00289237_00000001_1.highstats-Small-B2OC-UP.root
nothing to process
no files for: data/real_5to8_raw/00289229_0000000*_1.highstats-Small-B2CC-UP.root
no files for: data/real_5to8_raw/00289331_0000000*_1.highstats-Small-B2CC-DOWN.root
no files for: data/real_5to8_raw/00289235_0000000*_1.highstats-Small-B2CC-DOWN.root
no files for: data/real_5to8_raw/00289231_0000000*_1.highstats-Small-B2CC-UP.root
nothing to process
