In [1]:
import itertools
import logging
from pathlib import Path
import numba as nb

import awkward as ak
import click
import h5py
import numpy as np
import vector

from src.data.cms.convert_to_h5 import MIN_JETS, N_JETS, N_FJETS

vector.register_awkward()

logging.basicConfig(level=logging.INFO)

In [2]:
# a function that loads jets from hhh_test.h5
def load_jets(in_file):
    # load jets from the h5
    pt = ak.Array(in_file["INPUTS"]["Jets"]["pt"])
    eta = ak.Array(in_file["INPUTS"]["Jets"]["eta"])
    phi = ak.Array(in_file["INPUTS"]["Jets"]["phi"])
    btag = ak.Array(in_file["INPUTS"]["Jets"]["btag"])
    mass = ak.Array(in_file["INPUTS"]["Jets"]["mass"])
    mask = ak.Array(in_file["INPUTS"]["Jets"]["MASK"])

    jets = ak.zip(
        {
            "pt": pt,
            "eta": eta,
            "phi": phi,
            "btag": btag,
            "mass": mass,
            "mask": mask
        },
        with_name="Momentum4D",
    )
    
    return jets

In [3]:
# a function that loads fat jets from hhh_test.h5
def load_fjets(in_file):
     # load fatjets from h5
    fj_pt = ak.Array(in_file["INPUTS"]["BoostedJets"]["fj_pt"])
    fj_eta = ak.Array(in_file["INPUTS"]["BoostedJets"]["fj_eta"])
    fj_phi = ak.Array(in_file["INPUTS"]["BoostedJets"]["fj_phi"])
    fj_mass = ak.Array(in_file["INPUTS"]["BoostedJets"]["fj_mass"])
    fj_mask = ak.Array(in_file["INPUTS"]["BoostedJets"]["MASK"])

    fjets = ak.zip(
        {
            "pt": fj_pt,
            "eta": fj_eta,
            "phi": fj_phi,
            'mass': fj_mass,
            'mask': fj_mask
        },
        with_name="Momentum4D"
    )
    
    return fjets

In [4]:
FJET_DR = 0.8

@nb.njit
def match_fjet_to_jet(fjets, jets, builder):
    for fjets_event, jets_event in zip(fjets, jets):
        builder.begin_list()
        for i, jet in enumerate(jets_event):
            match_idx = -1
            for j, fjet in enumerate(fjets_event):
                if jet.deltaR(fjet) < FJET_DR:
                    match_idx = j
            builder.append(match_idx)
        builder.end_list()

    return builder

In [5]:
@nb.njit
def ja_list_2_ak(JET_ASSIGNMENTS, NrH, jcounts, builder):
    builder.begin_list()
    for NrH_e, jcounts_e in zip(NrH, jcounts):
        builder.begin_list()
        ja = JET_ASSIGNMENTS[NrH_e][jcounts_e]
        for comb in ja:
            builder.begin_list()
            for b1_b2 in comb:
                builder.begin_list()
                for bx in b1_b2:
                    builder.append(bx)
                builder.end_list()
            builder.end_list()
        builder.end_list()
    
    builder.end_list()
    return builder

In [6]:
@nb.njit
def chi2_matching(js, jcounts, NrHs, jet_assignments, builder):
    HIGGS_MASS = 125
    builder.begin_list()
    for js_e, jcount_e, ja_e, NrH_e in zip(js, jcounts, jet_assignments, NrHs):
        builder.begin_list()
        if NrH_e == 0:
            continue

        chi2_argmin = 0
        chi2_min = 99999
        for i, comb in enumerate(ja_e):
            chi2 = 0
            for b1_b2 in comb:
                j_b1_idx = b1_b2[0]
                j_b2_idx = b1_b2[1]
                
                j_b1 = js_e[j_b1_idx]
                j_b2 = js_e[j_b2_idx]
                mjj = (j_b1 + j_b2).mass
                
                chi2 += np.square(mjj - HIGGS_MASS)
            if chi2<chi2_min:
                argmin = i

            
        best_comb = ja_e[chi2_argmin]
        for b1_b2 in best_comb:
            builder.begin_list()
            for bx in b1_b2:
                builder.append(bx)
            builder.end_list()
        
        builder.end_list()
        
    builder.end_list()

    return builder

In [7]:
# for ja_e in jet_assignments_ak:
#     for comb in ja_e:
#         for b1_b2 in comb:
#             print(b1_b2[0], b1_b2[1])

In [39]:
def to_np_array(ak_array, axis=-1, max_n=10, pad=0):
    return ak.fill_none(ak.pad_none(ak_array, max_n, clip=True, axis=axis), pad, axis=axis).to_numpy()

In [None]:
def fjIdx2Prob

In [8]:
pred_file = "//home/billyli/UCSD/hhh/reports/bv2/pred_baseline.h5"
test_file = "//home/billyli/UCSD/hhh/reports/bv2/hhh_test.h5"

In [49]:
# load jets and fat jets from test h5 file
in_file = h5py.File(test_file)
js = load_jets(in_file)
js_idx = ak.local_index(js)
fjs = load_fjets(in_file)
fj_idx = ak.local_index(fjs)

# select real fjets based on pT and mass cut
fj_mask = fjs['mask']
fjmass_cond = (fjs['mass']>110) & (fjs['mass']<140)
fjpt_cond = fjs['pt']>300
fj_cond = fjmass_cond & fjpt_cond & fj_mask
fjs_selected = fjs[fj_cond]

# save the qualified fjets indices
# they will be bH candidates
bh_fj_idx = fj_idx[fj_cond]
bh_fj_idx = to_np_array(bh_fj_idx, max_n=3, pad=-1)

# convert indices to AP and DP
bhs_dp = np.zeros(shape=bh_fj_idx.shape)
fjs_ap = np.zeros(shape=bh_fj_idx.shape)
bhs_dp[bh_fj_idx!=-1] = 1
fjs_ap[bh_fj_idx!=-1] = 1

# find ak4jets that matched to selected ak8jets (dR check)
matched_fj_idx = match_fjet_to_jet(fjs_selected, js, ak.ArrayBuilder()).snapshot()

# remove overlapped ak4jets and padded jets
unoverlapped = matched_fj_idx==-1
unmasked = js['mask']
j_cond = unoverlapped & unmasked
js_selected = js[j_cond]
idx_js_selected = js_idx[j_cond]

# get the auxiliary information for chi2_matching 
jcounts = ak.count(js_selected, axis=-1)
n_bhs_matched = ak.count(fjs_selected, axis=-1)
NrHs = (3-n_bhs_matched).to_numpy()

# chi2 

# mask events that don't have enough resolved jets to match to 3H (X bH + Y rH)
# those events will be extracted later
suff = jcounts >= NrHs*2
js_suff = js_selected[suff]
jcounts_suff = jcounts[suff]
NrHs_suff = NrHs[suff]

# construct jet assignment look-up array that has 
# all combinations of input jets
# for different numbers of resolved higgs and jets
JET_ASSIGNMENTS_ak = []
for nH in range(0, 1+3):
    JET_ASSIGNMENTS_ak.append([])
    for nj in range(0, nH*2):
        JET_ASSIGNMENTS_ak[nH].append([])
    for nj in range(nH*2, N_JETS + 1):
        JET_ASSIGNMENTS_ak[nH].append([])
        a = list(itertools.combinations(range(nj), 2))
        b = np.array([ assignment for assignment in itertools.combinations(a, nH) if len(np.unique(assignment)) == nH*2])
        JET_ASSIGNMENTS_ak[nH][nj] = b

JET_ASSIGNMENTS_ak = ak.Array(JET_ASSIGNMENTS_ak)

# find the jet assignment combinations for each event
jet_assignments_ak = ja_list_2_ak(JET_ASSIGNMENTS_ak, NrHs_suff, jcounts_suff, ak.ArrayBuilder()).snapshot()
jet_assignments_ak = ak.flatten(jet_assignments_ak, axis=1)

# assign ak4 jets to the resolved higgs by chi2
rh_j_idx_suff = chi2_matching(js_suff[0:1000], jcounts_suff[0:1000], NrHs_suff[0:1000], jet_assignments_ak[0:1000], ak.ArrayBuilder()).snapshot()
rh_j_idx_suff = ak.flatten(rh_j_idx_suff, axis=1)
rh_j_idx_suff = ak.fill_none(ak.pad_none(rh_j_idx_suff, 3, clip=True, axis=-2), ak.Array(np.array([-1, -1], dtype=np.int64)), axis=-2).to_numpy()

# convert resolved jet indices to AP and DP
rhs_dp = np.zeros(shape=rh_j_idx_suff.shape[0:-1])
js_dp = np.zeros(shape=rh_j_idx_suff.shape)

# For events that don't have enough jets
# try reconstruct one less higgs


# save all assignment to the h5file
datasets = {}
datasets["TARGETS/bh1/bb"] = bh_fj_idx[:,0]
datasets["TARGETS/bh2/bb"] = bh_fj_idx[:,1]
datasets["TARGETS/bh3/bb"] = bh_fj_idx[:,2]

datasets["TARGETS/bh1/detection_probability"] = bhs_dp[:,0]
datasets["TARGETS/bh2/detection_probability"] = bhs_dp[:,1]
datasets["TARGETS/bh3/detection_probability"] = bhs_dp[:,2]

with h5py.File(pred_file, "w") as output:
    for dataset_name, all_data in all_datasets.items():
        concat_data = np.concatenate(all_data, axis=0)
        output.create_dataset(dataset_name, data=concat_data)

NameError: name 'all_datasets' is not defined

In [10]:
rh_rj_idx_suff_flatten = ak.flatten(rh_rj_idx_suff, axis=1)

In [45]:
rh_rj_idx_suff_flatten


AttributeError: no field named 'shape'

In [47]:
rh_rj_idx_suff = ak.fill_none(ak.pad_none(rh_rj_idx_suff_flatten, 3, clip=True, axis=-2), ak.Array(np.array([-1, -1], dtype=np.int64)), axis=-2).to_numpy()
rh_rj_idx_suff.shape[0:-1]
rh_rj_idx_suff[0]

array([[0, 1],
       [2, 3],
       [4, 5]])

In [43]:
ak.fill_none(ak.pad_none(rh_rj_idx_suff_flatten, 3, clip=True, axis=-2), ak.Array(np.array([-1, -1], dtype=np.int64)), axis=-2)

In [12]:
rh_rj_idx_suff

In [13]:
bh_fj_idx.to_numpy()

ValueError: cannot convert to RegularArray because subarray lengths are not regular (in compiled code: https://github.com/scikit-hep/awkward/blob/awkward-cpp-23/awkward-cpp/src/cpu-kernels/awkward_ListOffsetArray_toRegularArray.cpp#L22)

This error occurred while calling

    ak.to_numpy(
        <Array [[], [], [], [], ..., [], [0], []] type='42829 * var * int64'>
        allow_missing = True
    )

In [25]:
bhs_dp = np.zeros(shape=bh_fj_idx.shape)
bhs_ap = np.zeros(shape=bh_fj_idx.shape)
bhs_dp[bh_fj_idx!=-1] = 1
bhs_ap[bh_fj_idx!=-1] = 1

In [21]:
bh_fj_idx = to_np_array(bh_fj_idx, max_n=3, pad=-1)


In [24]:
bh_fj_idx.shape

(42829, 3)