# ROOT File Deduplication Script

Cleans ROOT TTrees under `data/processed/` by removing duplicate `Bp_P` momenta per event.  
Within each event, entries are clustered by the relative difference of their `Bp_P` values (ε = 0.005), and one randomly chosen entry per cluster is kept.  
Cleaned files are saved in `data/processed_clean_bp_p/` under their original file names, in a flat layout: only `*.root` files located directly in `data/processed/` are scanned.

In [3]:
import os, glob, random
import numpy as np
import ROOT as r
from scipy.cluster.hierarchy import fclusterdata

# Turn on ROOT's implicit multi-threading for this session.
r.ROOT.EnableImplicitMT()

# Directory scanned for input ROOT files.
BASE_IN = "data/processed"
# Directory that receives the cleaned copies.
BASE_OUT = "data/processed_clean_bp_p"
# Distance threshold handed to the clustering of Bp_P values; the metric used
# there is the relative difference |a - b| / min(a, b).
EPSILON = 0.005
# Candidate tree names, tried in order; the first one present in a file is used.
TREENAMES = ("ST-b2oc", "ST-b2cc")

In [None]:
# De-duplicate every ROOT file found directly under BASE_IN.
#
# For each file: pick the first tree from TREENAMES that exists, group its
# entries by the `event` branch, cluster the `Bp_P` values within each event
# (single linkage, relative-difference metric, threshold EPSILON), keep one
# randomly chosen entry per cluster, and write the surviving entries to a file
# of the same name under BASE_OUT.
#
# Entries whose `event`/`Bp_P` cannot be read, or whose `Bp_P` is non-finite or
# <= 0, are not written to the output.

# Sorted so that files are processed in a deterministic order.
file_list = sorted(glob.glob(os.path.join(BASE_IN, "*.root")))
print(f"[dedup] scanning {BASE_IN}: {len(file_list)} files found")

# The output directory is the same for every file, so create it once.
if file_list:
    os.makedirs(BASE_OUT, exist_ok=True)


def _relative_diff(u, v):
    """Return |u - v| / min(u, v) for two 1-element points (inf if min <= 0)."""
    smaller = min(u[0], v[0])
    return abs(u[0] - v[0]) / smaller if smaller > 0 else np.inf


for infile in file_list:
    filename = os.path.basename(infile)
    outfile = os.path.join(BASE_OUT, filename)

    # Recent PyROOT versions raise OSError when a file cannot be opened, older
    # ones return a null/zombie object; handle both so that one bad file does
    # not abort the whole batch.
    try:
        f = r.TFile.Open(infile)
    except OSError:
        f = None
    if not f or f.IsZombie():
        print(f"cannot open: {infile}")
        if f:
            f.Close()
        continue

    try:
        # pick whichever tree exists
        tree = None
        treename = None
        for tn in TREENAMES:
            t = f.Get(tn)
            if t and t.InheritsFrom("TTree"):
                tree, treename = t, tn
                break
        if tree is None:
            print(f"no expected tree {TREENAMES} in {infile}")
            continue  # the input file is closed by the `finally` below

        nentries = tree.GetEntries()
        print(f"[dedup] {filename} | tree='{treename}' | entries={nentries}")

        # collect (Bp_P, entry index) pairs by event
        # NOTE(review): the event number alone is the grouping key; if a file
        # can mix several runs, a (run, event) key may be needed -- confirm.
        events = {}
        n_unreadable = 0
        for i in range(nentries):
            tree.GetEntry(i)
            ev = getattr(tree, "event", None)
            bp_p = getattr(tree, "Bp_P", None)
            if ev is None or bp_p is None:
                n_unreadable += 1
                continue
            if not np.isfinite(bp_p) or bp_p <= 0.0:
                print(f"[dedup] {filename} | skipping entry {i} with invalid Bp_P={bp_p}")
                continue
            events.setdefault(int(ev), []).append((float(bp_p), i))

        # Make the otherwise silent loss of entries visible (e.g. a missing
        # branch would drop every entry and yield an empty output tree).
        if n_unreadable:
            print(
                f"[dedup] {filename} | dropped {n_unreadable} entries: "
                "'event' or 'Bp_P' not readable"
            )

        # cluster per event, keep one entry index per cluster
        keep_indices = set()
        for candidates in events.values():
            if len(candidates) == 1:
                keep_indices.add(candidates[0][1])
                continue
            momenta = np.array([[p] for p, _ in candidates])  # shape (k, 1)
            labels = fclusterdata(
                momenta, t=EPSILON, criterion="distance", method="single",
                metric=_relative_diff,
            )
            # Group entry indices by cluster label in a single pass.
            entries_by_cluster = {}
            for (_, entry), label in zip(candidates, labels):
                entries_by_cluster.setdefault(int(label), []).append(entry)
            for entries in entries_by_cluster.values():
                keep_indices.add(random.choice(entries))

        print(f"[dedup] {filename} | keeping {len(keep_indices)} entries")

        # write cleaned file (flat structure)
        try:
            out_f = r.TFile(outfile, "RECREATE")
        except OSError:
            out_f = None
        if not out_f or out_f.IsZombie():
            print(f"cannot create: {outfile}")
            if out_f:
                out_f.Close()
            continue

        try:
            # CloneTree(0) copies the tree structure without any entries; the
            # clone is filled from the source tree's currently loaded entry.
            new_tree = tree.CloneTree(0)
            for i in sorted(keep_indices):  # preserve the original entry order
                tree.GetEntry(i)
                new_tree.Fill()
            new_tree.Write()
        finally:
            out_f.Close()
    finally:
        f.Close()

    print(f"cleaned -> {outfile}")