# Notebook: split by FILL + clean per-block (B2OC/B2CC)

Load ROOT trees per block, exclude known bad runs, write one cleaned per-block file, and write per-FILL files that pass a minimum-entries cut.

**Important notes**
- **Input:** `data/real_5to8_raw/…` - this folder only has blocks **5–8** (earlier blocks were tests). Monte Carlo is in `data/monte_carlo/` and not used here.
- **Output:** `data/processed/`
- **Filenames:** per-block → `YYYY_DECAY_B{block}.root`; per-FILL → `YYYY_DECAY_B{block}_F{fill}.root`.
- **Bad runs:** listed in `BAD_RUNS` in the first code cell; entries from these runs are excluded from every output file.
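- **Threshold:** a per-FILL file is written only when that FILL has at least `MIN_ENTRIES` entries after cleaning; smaller FILLs are skipped (their entries remain in the per-block file).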

In [1]:
import glob
from pathlib import Path
from typing import Optional

import numpy as np
import ROOT as r

# Let ROOT parallelize its work
r.ROOT.EnableImplicitMT()

# Fixed locations
DATA_RAW = Path("data/real_5to8_raw")    # inputs: blocks 5–8
DATA_PROCESSED = Path("data/processed")  # outputs (directory is created if missing)
DATA_PROCESSED.mkdir(parents=True, exist_ok=True)

YEAR = 2024           # prefix of every output file name
MIN_ENTRIES = 20_000  # per-FILL write threshold

# Runs that are excluded from the outputs
BAD_RUNS = {
    303681,
    304475,
    304487,
    304488,
    304489,
    304490,
    304491,
}

# Inputs: one {"filename", "block"} dict per entry; "filename" is a single file
# or a wildcard pattern under DATA_RAW.
files_info_b2oc = [
    {"filename": str(DATA_RAW / pattern), "block": block}
    for block, pattern in (
        (5, "00289228_00000001_1.highstats-Small-B2OC-UP.root"),
        (6, "00289233_00000001_1.highstats-Small-B2OC-DOWN.root"),
        (7, "00289239_00000001_1.highstats-Small-B2OC-DOWN.root"),
        (8, "00289237_00000001_1.highstats-Small-B2OC-UP.root"),
    )
]

files_info_b2cc = [
    {"filename": str(DATA_RAW / pattern), "block": block}
    for block, pattern in (
        (5, "00289229_0000000*_1.highstats-Small-B2CC-UP.root"),
        (6, "00289331_0000000*_1.highstats-Small-B2CC-DOWN.root"),
        (7, "00289235_0000000*_1.highstats-Small-B2CC-DOWN.root"),
        (8, "00289231_0000000*_1.highstats-Small-B2CC-UP.root"),
    )
]

# Snapshot options shared by every output file
SNAPSHOT_OPTS = r.RDF.RSnapshotOptions()
SNAPSHOT_OPTS.fMode = "RECREATE"         # overwrite an existing output file
SNAPSHOT_OPTS.fCompressionAlgorithm = 1  # ZLIB
SNAPSHOT_OPTS.fCompressionLevel = 4



In [2]:
def _outfile(decay: str, block: int, fill: Optional[int]) -> Path:
    """Return the output path under DATA_PROCESSED for one block or one FILL.

    With ``fill=None`` the file name is ``{YEAR}_{decay}_B{block}.root``;
    otherwise ``_F{fill}`` is inserted before the extension.
    """
    stem = f"{YEAR}_{decay}_B{block}"
    if fill is not None:
        stem = f"{stem}_F{fill}"
    return DATA_PROCESSED / f"{stem}.root"

def _bad_expr() -> str:
    """Build the ROOT cut string that rejects every run listed in BAD_RUNS.

    Returns ``"true"`` (a cut that keeps everything) when BAD_RUNS is empty,
    otherwise ``run != A && run != B && ...`` with the runs in ascending order.
    """
    if not BAD_RUNS:
        return "true"
    clauses = [f"run != {run_number}" for run_number in sorted(BAD_RUNS)]
    return " && ".join(clauses)

def _group_files_by_block(files_info: list[dict], decay_label: str) -> dict[int, list[str]]:
    """Expand each entry's glob pattern and collect the matching paths per block."""
    by_block: dict[int, list[str]] = {}
    for info in files_info:
        block = int(info["block"])
        matches = glob.glob(info["filename"])
        if not matches:
            # Report the miss instead of skipping silently: a mistyped pattern
            # would otherwise drop a whole block without any trace.
            print(f"[{decay_label}] warning: no files match {info['filename']!r} (block {block}); skipping")
            continue
        by_block.setdefault(block, []).extend(matches)
    # De-duplicate (patterns may overlap) and sort for a deterministic file order.
    return {block: sorted(set(paths)) for block, paths in by_block.items()}


def process_blocks(files_info: list[dict], treename: str, decay_label: str) -> None:
    """
    Merge all input files per block, apply (block && !bad_runs),
    write one per-block file and per-FILL files (>= MIN_ENTRIES).

    Parameters
    ----------
    files_info:
        Dicts with a ``"filename"`` (path or glob pattern) and a ``"block"``
        number. Patterns that match nothing are reported and skipped.
    treename:
        Name of the tree to read from the inputs; the outputs use the same name.
    decay_label:
        Label used in the output file names and in the log lines.

    Output paths come from ``_outfile``; all columns of the filtered dataframe
    are written. If the ``FILL`` column cannot be read for a block, the
    per-block file is still written and only the per-FILL step is skipped.
    """
    by_block = _group_files_by_block(files_info, decay_label)
    if not by_block:
        print("nothing to process")
        return

    bad = _bad_expr()

    for block in sorted(by_block):
        files = by_block[block]
        print(f"\n[{decay_label}] block={block} | {len(files)} file(s)")

        rdf = r.RDataFrame(treename, files)

        # Keep only entries of this block that are not in a bad run
        base = rdf.Filter(f"(block == {block}) && ({bad})")
        # Filters do not change the column set, so one list serves every Snapshot.
        columns = [str(c) for c in base.GetColumnNames()]

        # ---- per-block ----
        block_path = _outfile(decay_label, block, fill=None)
        # Book the count BEFORE the snapshot: the snapshot's event loop then
        # fills it too, instead of a separate pass over the data just to count.
        n_block_result = base.Count()
        base.Snapshot(treename, str(block_path), columns, SNAPSHOT_OPTS)
        n_block = int(n_block_result.GetValue())
        print(f"per-block  -> {block_path}  ({n_block} entries)")

        # ---- per-FILL ----
        try:
            # One read of FILL yields both the distinct fills and their entry
            # counts, so no per-fill counting pass is needed.
            # Assumes FILL holds integer-valued numbers.
            fill_values, fill_counts = np.unique(
                base.AsNumpy(["FILL"])["FILL"], return_counts=True
            )
            entries_per_fill = {
                int(value): int(count) for value, count in zip(fill_values, fill_counts)
            }
        except Exception as e:
            # Deliberate best effort: the per-block file is already written.
            print(f"cannot read FILL: {e}; skip per-FILL")
            continue

        print(f"fills: {len(entries_per_fill)}")
        for fill, n_fill in sorted(entries_per_fill.items()):
            if n_fill < MIN_ENTRIES:
                print(f"  skip F{fill}: {n_fill} < {MIN_ENTRIES}")
                continue
            fill_path = _outfile(decay_label, block, fill=fill)
            base.Filter(f"FILL == {fill}").Snapshot(treename, str(fill_path), columns, SNAPSHOT_OPTS)
            print(f"  F{fill} -> {fill_path}  ({n_fill} entries)")


In [None]:
# B2OC sample (D-Pi): write the cleaned per-block file and the per-FILL files
process_blocks(
    files_info=files_info_b2oc,
    treename="ST-b2oc",
    decay_label="B2OC",
)


[B2OC] block=5 | 1 file(s)
per-block  -> data/processed/2024_B2OC_B5.root  (12453113 entries)
fills: 25
  F10059 -> data/processed/2024_B2OC_B5_F10059.root  (97942 entries)
  F10061 -> data/processed/2024_B2OC_B5_F10061.root  (299599 entries)
  F10066 -> data/processed/2024_B2OC_B5_F10066.root  (336671 entries)
  F10069 -> data/processed/2024_B2OC_B5_F10069.root  (757938 entries)
  F10070 -> data/processed/2024_B2OC_B5_F10070.root  (108898 entries)
  F10072 -> data/processed/2024_B2OC_B5_F10072.root  (732723 entries)
  F10073 -> data/processed/2024_B2OC_B5_F10073.root  (770704 entries)
  F10074 -> data/processed/2024_B2OC_B5_F10074.root  (737512 entries)
  F10075 -> data/processed/2024_B2OC_B5_F10075.root  (194991 entries)
  F10077 -> data/processed/2024_B2OC_B5_F10077.root  (548795 entries)
  F10082 -> data/processed/2024_B2OC_B5_F10082.root  (142982 entries)
  F10084 -> data/processed/2024_B2OC_B5_F10084.root  (1366337 entries)
  F10086 -> data/processed/2024_B2OC_B5_F10086.root  (2

In [None]:
# B2CC sample (J/Psi-K): write the cleaned per-block file and the per-FILL files
process_blocks(
    files_info=files_info_b2cc,
    treename="ST-b2cc",
    decay_label="B2CC",
)

In [None]:
# Quick check of one output file: prints the tree name, its number of entries
# and its branch names. To inspect an input file instead, build the path from
# DATA_RAW. This code is only for the purpose of manual inspection.
file_to_check = DATA_PROCESSED / "2024_B2CC_B8_F10232.root"

if not file_to_check.exists():
    print(f"File not found: {file_to_check}")
else:
    f = r.TFile.Open(str(file_to_check))
    if not f or f.IsZombie():
        print(f"Could not open file: {file_to_check}")
        if f:
            # A non-null but unusable handle still has to be released.
            f.Close()
    else:
        try:
            # Take the first of the two known tree names that is present.
            tree = None
            for name in ("ST-b2cc", "ST-b2oc"):
                obj = f.Get(name)
                if obj and obj.InheritsFrom("TTree"):
                    tree = obj
                    break

            if tree is None:
                print("No TTree found in file.")
            else:
                print(f"Tree: {tree.GetName()}")
                print(f"Entries: {tree.GetEntries()}")
                print("Branches:")
                for branch in tree.GetListOfBranches():
                    print(f"  - {branch.GetName()}")
        finally:
            # Close the file even if the inspection above raises.
            f.Close()

Tree: ST-b2oc
Entries: 11683229
Branches:
  - event
  - run
  - block
  - GPSTIME
  - FILL
  - nLongTracks
  - nPVs
  - nVeloTracks
  - Bp_DTF_OwnPV_CHI2DOF
  - Bp_DTF_OwnPV_MASS
  - Bp_DTF_OwnPV_CTAU
  - Bp_DTF_OwnPV_CTAUERR
  - Bp_DTF_OwnPV_FD
  - Bp_M
  - Bp_PT
  - Bp_P
  - Bp_BPVIPCHI2
  - Db_M
  - Db_PT
  - Db_P
  - Db_BPVIPCHI2
  - Kp_TRCHI2
  - Kp_PT
  - Kp_P
  - Kp_BPVIPCHI2
  - Kp_PID_K
  - Kp_PROBNN_K
  - Kp_PROBNN_PI
  - pim_TRCHI2
  - pim_PT
  - pim_P
  - pim_PID_K
  - pim_BPVIPCHI2
  - pim_PROBNN_K
  - pim_PROBNN_PI
  - pip_PARTICLE_ID
  - pip_TRCHI2
  - pip_PT
  - pip_P
  - pip_BPVIPCHI2
  - pip_PID_K
  - pip_PROBNN_K
  - pip_PROBNN_PI
  - nFTClusters
  - nUTClusters
  - nVPClusters
