# Notebook: de-duplicate per event by Bp_P clustering

Per file and per event:
- drop entries whose `Bp_P` is non-finite or `≤ 0`, if any exist
- cluster entries of the same event by the relative difference in `Bp_P` (threshold `epsilon`)
- keep one representative per cluster (chosen at random)
- write the cleaned file and print duplicate statistics before/after

In [None]:
import ROOT as r
import numpy as np
from pathlib import Path
from collections import Counter
from typing import Optional, List, Dict, Tuple
import os
import random

# Turn on ROOT's implicit multi-threading before any file is touched.
r.ROOT.EnableImplicitMT()

# Input / output locations; the output directory is created on first run.
BASE_IN = Path("data") / "processed"
BASE_OUT = Path("data") / "processed_clean_bp_p"
BASE_OUT.mkdir(parents=True, exist_ok=True)

# Relative Bp_P difference below which two entries of one event are
# treated as the same candidate (5e-3 == 0.5%).
EPSILON = 5e-3


In [None]:
def _list_root_files(root_dir: Path, include: Optional[str] = None) -> List[Path]:
    """All .root files under root_dir. Optional filename filter."""
    files = sorted(root_dir.rglob("*.root"))
    if include:
        files = [f for f in files if include in f.name]
    return files

def _open_tree(filename: Path, treename: str):
    """Open ``filename`` with ROOT and fetch the object named ``treename``.

    Returns ``(file, tree)`` on success. Returns ``(None, None)`` when the
    file cannot be opened (a message is printed) or when the object is
    absent or is not a ``TTree`` (the file is closed again, silently).
    The caller is responsible for closing the returned file.
    """
    handle = r.TFile.Open(str(filename))
    if not handle or handle.IsZombie():
        print(f"skip (cannot open): {filename}")
        return None, None

    tree = handle.Get(treename)
    if tree and tree.InheritsFrom("TTree"):
        return handle, tree

    # Wrong or missing object: release the file before reporting failure.
    handle.Close()
    return None, None

def _choose_representative(bp_p_list: List[Tuple[float, int]]) -> int:
    """Pick a random entry index from the list."""
    return random.choice(bp_p_list)[1]

def _cluster_keep_indices_fast(bp_p_list: List[Tuple[float, int]], epsilon: float) -> set:
    """
    Cluster consecutive values in log-space with a relative threshold.

    Here is the math of the logic:
      We want entries p_i, p_{i-1} to be in the same cluster if their
      relative difference is at most ε:

          |p_i - p_{i-1}| / p_{i-1} <= ε          (1)

      Assuming sorted order p_i >= p_{i-1} > 0, this is equivalent to

          p_i / p_{i-1} <= 1 + ε                  (2)

      Taking logs:

          log(p_i) - log(p_{i-1}) <= log(1 + ε)   (3)

      So in code we set gap_max = log1p(ε) and split a cluster whenever

          log(p_i) - log(p_{i-1}) > gap_max.

    From each cluster, keep one representative (chosen randomly).
    """
    if len(bp_p_list) <= 1:
        return {bp_p_list[0][1]} if bp_p_list else set()

    # sort by p
    bp_sorted = sorted(bp_p_list, key=lambda x: x[0])  # (p, idx)
    logs = np.log([p for p, _ in bp_sorted])
    gap_max = np.log1p(epsilon)

    keep = set()
    start = 0
    n = len(bp_sorted)
    # sweep once; split whenever the log-gap exceeds the threshold
    for i in range(1, n):
        if logs[i] - logs[i-1] > gap_max:
            # finalize cluster [start, i)
            keep.add(_choose_representative(bp_sorted[start:i]))
            start = i
    # last cluster
    keep.add(_choose_representative(bp_sorted[start:n]))
    return keep

def _duplicate_stats_from_events(events: Dict[int, List[Tuple[float,int]]]) -> Tuple[int, int, float]:
    """
    Given {event: [(p, idx), ...]}, return:
      total_valid, duplicate_count, duplicate_fraction
    duplicate_count sums (len(list)-1) over events with >1 entries.
    """
    total = sum(len(v) for v in events.values())
    dupes = sum(len(v) - 1 for v in events.values() if len(v) > 1)
    frac = (dupes / total) if total > 0 else 0.0
    return total, dupes, frac


In [None]:
def _scan_events_by_bp_p(tree) -> Tuple[Dict[int, List[Tuple[float, int]]], int, int]:
    """Group the usable entries of ``tree`` by their ``event`` value.

    Only the ``event`` and ``Bp_P`` branches are enabled for the scan.

    Returns ``(events, zeros, missing)``:
      events  -- {event: [(Bp_P, entry index), ...]} for entries with a
                 finite, strictly positive Bp_P
      zeros   -- number of entries dropped because Bp_P was non-finite or <= 0
      missing -- number of entries whose values could not be read/converted
    """
    # Read only what's needed for the scan (I/O speedup)
    tree.SetBranchStatus("*", 0)
    tree.SetBranchStatus("event", 1)
    tree.SetBranchStatus("Bp_P", 1)

    events: Dict[int, List[Tuple[float, int]]] = {}
    zeros = 0
    missing = 0
    for i in range(tree.GetEntries()):
        tree.GetEntry(i)
        try:
            ev = int(getattr(tree, "event"))
            p = float(getattr(tree, "Bp_P"))
        except Exception:
            # Deliberate best-effort: unreadable entries are counted and
            # reported in the per-file summary instead of aborting the scan.
            missing += 1
            continue
        if not np.isfinite(p) or p <= 0.0:
            zeros += 1
            continue
        events.setdefault(ev, []).append((p, i))
    return events, zeros, missing


def _write_kept_entries(src_path: Path, out_path: Path, treename: str, keep: set) -> None:
    """Copy the entries of ``treename`` listed in ``keep`` from ``src_path`` to ``out_path``.

    The input is reopened so that all branches are active for the copy.
    Raises ``RuntimeError`` if either file or the tree cannot be used.
    """
    fin = r.TFile.Open(str(src_path))
    if not fin or fin.IsZombie():
        if fin:
            fin.Close()
        raise RuntimeError(f"cannot reopen input file for writing stage: {src_path}")
    try:
        tree = fin.Get(treename)
        if not tree or not tree.InheritsFrom("TTree"):
            raise RuntimeError(f"tree '{treename}' missing when rewriting {src_path.name}")

        fout = r.TFile(str(out_path), "RECREATE")
        if not fout or fout.IsZombie():
            if fout:
                fout.Close()
            raise RuntimeError(f"cannot create output file: {out_path}")
        try:
            # Build entry list of kept indices. Detach it from any file so it
            # is neither written to the output nor deleted when a file closes.
            elist = r.TEntryList("elist", "kept entries")
            elist.SetDirectory(0)
            for i in sorted(keep):
                elist.Enter(i)
            tree.SetEntryList(elist)

            # CopyTree creates the new tree in ROOT's *current* directory.
            # Make the output file current explicitly; otherwise the copy is
            # attached to whichever file was opened last.
            fout.cd()
            new_tree = tree.CopyTree("")
            if not new_tree:
                raise RuntimeError(f"CopyTree failed for tree '{treename}' in {src_path.name}")

            # Writes every object attached to fout, including the new tree.
            fout.Write()
        finally:
            fout.Close()
    finally:
        fin.Close()


def _count_event_duplicates(path: Path, treename: str) -> Tuple[int, int]:
    """Re-open a written file and return ``(n_entries, n_duplicates)``.

    ``n_duplicates`` counts, per ``event`` value, every entry beyond the first.
    Raises ``RuntimeError`` if the file or the tree cannot be read back.
    """
    fcheck = r.TFile.Open(str(path))
    if not fcheck or fcheck.IsZombie():
        if fcheck:
            fcheck.Close()
        raise RuntimeError(f"cannot open written file for check: {path}")
    try:
        tcheck = fcheck.Get(treename)
        if not tcheck or not hasattr(tcheck, "GetEntries"):
            fcheck.ls()  # show what the file does contain, to aid debugging
            raise RuntimeError(f"written file has no tree '{treename}' (write failure?).")

        n_after = int(tcheck.GetEntries())
        # read only event branch for the check
        tcheck.SetBranchStatus("*", 0)
        tcheck.SetBranchStatus("event", 1)
        per_event: Counter = Counter()
        for i in range(n_after):
            tcheck.GetEntry(i)
            per_event[int(getattr(tcheck, "event"))] += 1
        dupes_after = sum(c - 1 for c in per_event.values() if c > 1)
        return n_after, dupes_after
    finally:
        fcheck.Close()


def clean_decay(base_in: Path, base_out: Path, include_token: str, treename: str, epsilon: float) -> None:
    """
    De-duplicate every ROOT file under ``base_in`` whose name contains ``include_token``.

    For each file: entries of ``treename`` are grouped by ``event``; entries
    with non-finite or non-positive ``Bp_P`` are dropped; within an event,
    entries whose ``Bp_P`` values are within the relative threshold
    ``epsilon`` of a neighbour are clustered and one random representative
    per cluster is kept. The kept entries are written to
    ``base_out / <file name>`` and duplicate statistics are printed before
    and after.

    Files that cannot be opened, lack the tree, or contain no valid entry
    are skipped with a message and processing continues with the next file.

    Raises:
        PermissionError: ``base_out`` is not writable.
        RuntimeError: writing or re-reading an output file failed. (The
            function no longer calls ``exit()``, which would end the whole
            interpreter/notebook session from inside a library function.)
    """
    # Ensure output dir exists and is writable
    base_out.mkdir(parents=True, exist_ok=True)
    if not os.access(base_out, os.W_OK):
        raise PermissionError(f"Output directory not writable: {base_out}")

    files = _list_root_files(base_in, include=include_token)
    if not files:
        print(f"no .root files under {base_in} matching '{include_token}'")
        return

    for fpath in files:
        fin, t = _open_tree(fpath, treename)
        if not t:
            print(f"skip (tree '{treename}' not found) in {fpath.name}")
            if fin:
                fin.Close()
            continue

        # Scan phase: the input is closed again even if the scan raises.
        try:
            events, zeros, missing = _scan_events_by_bp_p(t)
        finally:
            fin.Close()

        total_before, dupes_before, frac_before = _duplicate_stats_from_events(events)
        print(f"{fpath.name}: valid={total_before}, zeros={zeros}, missing={missing}, duplicates(before)={dupes_before} ({frac_before:.3%})")

        # Nothing valid in this file: skip it, but keep processing the rest.
        if total_before == 0:
            print(f"No valid entries; skipping write for {fpath.name}")
            continue

        # Build keep indices via clustering
        keep: set = set()
        for bp_list in events.values():
            keep |= _cluster_keep_indices_fast(bp_list, epsilon)

        # Write cleaned file via TEntryList + CopyTree
        out_path = base_out / fpath.name
        _write_kept_entries(fpath, out_path, treename, keep)

        # Post-check on written file
        n_after, dupes_after = _count_event_duplicates(out_path, treename)
        frac_after = (dupes_after / n_after) if n_after > 0 else 0.0

        print(f" -> cleaned: kept={n_after} / {total_before}, duplicates(after)={dupes_after} ({frac_after:.3%})")


In [None]:
# B2OC: de-duplicate every matching file, reading tree "ST-b2oc"
clean_decay(
    base_in=BASE_IN,
    base_out=BASE_OUT,
    include_token="B2OC",
    treename="ST-b2oc",
    epsilon=EPSILON,
)

2024_B2OC_B5.root: valid=12453113, zeros=0, missing=0, duplicates(before)=260306 (2.090%)


In [None]:
def final_report(base_out: Path, treename: str, include_token: str):
    """Print entry and duplicate counts for every matching ROOT file under ``base_out``.

    Files are selected by ``include_token`` appearing in the file name. For
    each file the ``event`` branch of ``treename`` is read, and every entry
    beyond the first one sharing an ``event`` value is counted as a
    duplicate. Files that cannot be opened or lack the tree are reported
    and skipped.
    """
    files = _list_root_files(base_out, include=include_token)
    if not files:
        print(f"no files in {base_out} for token {include_token}")
        return

    for fpath in files:
        try:
            handle = r.TFile.Open(str(fpath))
            if not handle or handle.IsZombie():
                print(f"skip (cannot open): {fpath}")
                continue
        except OSError as e:
            print(f"skip (OSError opening file): {fpath} — {e}")
            continue

        tree = handle.Get(treename)
        if not (tree and tree.InheritsFrom("TTree")):
            print(f"{fpath.name}: no tree '{treename}'")
            handle.Close()
            continue

        n = int(tree.GetEntries())
        # Only the event branch is needed for counting.
        tree.SetBranchStatus("*", 0)
        tree.SetBranchStatus("event", 1)

        per_event = Counter()
        for entry in range(n):
            tree.GetEntry(entry)
            per_event[int(getattr(tree, "event"))] += 1

        # Each event contributes (occurrences - 1) duplicates.
        dupes = sum(occurrences - 1 for occurrences in per_event.values())
        frac = 0.0 if n <= 0 else dupes / n
        print(f"{fpath.name}: {n} entries, duplicates={dupes} ({frac:.3%})")
        handle.Close()

# Report on the cleaned files written to BASE_OUT. Pointing this at BASE_IN
# would only re-count the duplicates of the raw, uncleaned inputs.
print("\nCleaned outputs — B2OC:")
final_report(BASE_OUT, "ST-b2oc", "B2OC")

print("\nCleaned outputs — B2CC:")
final_report(BASE_OUT, "ST-b2cc", "B2CC")



Cleaned outputs — B2OC:
2024_B2OC_B5.root: 12453113 entries, duplicates=260306 (2.090%)
2024_B2OC_B5_F10059.root: 97942 entries, duplicates=2204 (2.250%)
2024_B2OC_B5_F10061.root: 299599 entries, duplicates=6404 (2.138%)
2024_B2OC_B5_F10066.root: 336671 entries, duplicates=7171 (2.130%)
2024_B2OC_B5_F10069.root: 757938 entries, duplicates=16152 (2.131%)
2024_B2OC_B5_F10070.root: 108898 entries, duplicates=2213 (2.032%)
2024_B2OC_B5_F10072.root: 732723 entries, duplicates=15052 (2.054%)
2024_B2OC_B5_F10073.root: 770704 entries, duplicates=16132 (2.093%)
2024_B2OC_B5_F10074.root: 737512 entries, duplicates=15185 (2.059%)
2024_B2OC_B5_F10075.root: 194991 entries, duplicates=4008 (2.055%)
2024_B2OC_B5_F10077.root: 548795 entries, duplicates=11497 (2.095%)

Cleaned outputs — B2CC:
no files in data/processed for token B2CC
