In [1]:
import itertools
import logging
from pathlib import Path
import numba as nb

import awkward as ak
import click
import h5py
import numpy as np
import vector

#from src.data.cms.convert_to_h5 import MIN_JETS, N_JETS, N_FJETS

vector.register_awkward()

logging.basicConfig(level=logging.INFO)

In [2]:
import xgboost as xgb

In [3]:
# a function that loads jets from hhh_test.h5
def load_jets(in_file):
    """Read the jet collection stored under ``INPUTS/Jets``.

    Parameters
    ----------
    in_file : mapping-like, e.g. an open ``h5py.File``
        Must provide ``in_file["INPUTS"]["Jets"][name]`` for ``pt``, ``eta``,
        ``phi``, ``btag``, ``mass`` and ``MASK``.

    Returns
    -------
    ak.Array
        Records named ``Momentum4D`` with the fields ``pt``, ``eta``, ``phi``,
        ``btag``, ``mass`` and ``mask`` (``mask`` is read from ``MASK``).
    """
    jet_group = in_file["INPUTS"]["Jets"]

    # output field name -> dataset name in the file (only the mask is spelled differently)
    dataset_for_field = {
        "pt": "pt",
        "eta": "eta",
        "phi": "phi",
        "btag": "btag",
        "mass": "mass",
        "mask": "MASK",
    }
    columns = {
        field: ak.Array(jet_group[dataset])
        for field, dataset in dataset_for_field.items()
    }
    return ak.zip(columns, with_name="Momentum4D")

In [4]:
# a function that loads fat jets from hhh_test.h5
def load_fjets(in_file):
    """Read the fat-jet collection stored under ``INPUTS/BoostedJets``.

    Parameters
    ----------
    in_file : mapping-like, e.g. an open ``h5py.File``
        Must provide ``in_file["INPUTS"]["BoostedJets"][name]`` for ``fj_pt``,
        ``fj_eta``, ``fj_phi``, ``fj_mass`` and ``MASK``.

    Returns
    -------
    ak.Array
        Records named ``Momentum4D`` with the fields ``pt``, ``eta``, ``phi``,
        ``mass`` and ``mask`` (the ``fj_`` prefix is dropped; ``mask`` is read
        from ``MASK``).
    """
    fatjet_group = in_file["INPUTS"]["BoostedJets"]

    # output field name -> dataset name in the file
    dataset_for_field = {
        "pt": "fj_pt",
        "eta": "fj_eta",
        "phi": "fj_phi",
        "mass": "fj_mass",
        "mask": "MASK",
    }
    columns = {
        field: ak.Array(fatjet_group[dataset])
        for field, dataset in dataset_for_field.items()
    }
    return ak.zip(columns, with_name="Momentum4D")

In [5]:
@nb.njit
def match_fjet_to_jet(fjets, jets, builder, FJET_DR = 0.8):
    # For every jet of every event, append to `builder` the index of a fat jet
    # whose deltaR to that jet is below FJET_DR, or -1 when no fat jet qualifies.
    # One list is opened/closed per event, and `builder` is returned.
    # NOTE(review): `builder` is presumably an ak.ArrayBuilder (it needs
    # begin_list / append / end_list) — confirm at the call site; this function
    # is not called in the cells shown.
    for fjets_event, jets_event in zip(fjets, jets):
        builder.begin_list()
        for i, jet in enumerate(jets_event):
            match_idx = -1
            for j, fjet in enumerate(fjets_event):
                if jet.deltaR(fjet) < FJET_DR:
                    # no break here: if several fat jets are within FJET_DR,
                    # the last one in iteration order is recorded, not the closest
                    match_idx = j
            builder.append(match_idx)
        builder.end_list()

    return builder

In [6]:
def to_np_array(ak_array, axis=-1, max_n=10, pad=0):
    """Turn a ragged awkward array into a rectangular NumPy array.

    Along ``axis`` every list is clipped or padded to exactly ``max_n``
    entries; the padding slots are filled with ``pad``.
    """
    fixed_length = ak.pad_none(ak_array, max_n, clip=True, axis=axis)
    without_none = ak.fill_none(fixed_length, pad, axis=axis)
    return without_none.to_numpy()

### BDT working points (WP), defined by background misidentification rate
Tight: 0.3%

Medium: 1%

Loose: 2%

In [7]:
# BDT score thresholds for the tight / medium / loose working points
# (a fat jet passes when its predicted score is above the threshold)
WP_tight = 0.95626426
WP_medium = 0.93498826
WP_loose = 0.911348

In [8]:
# Working point applied to the BDT scores, and the HDF5 file the predictions
# are written to. The file name spells out "loose", so keep it in step with WP.
WP = WP_loose
pred_file = "../../predictions/hh_chi2_loose_bdt_baseline_predictions_west.h5"

In [9]:
# Inputs: the HDF5 test sample and the saved XGBoost model loaded further down
test_file = "../../data/hh_mh125_testing.h5"
bdt_file = "../../models/bdt_trained_on_hhh_qcd.json"

In [10]:
# Open the test sample explicitly read-only: without a mode, older h5py
# releases opened files in append/create mode, which could create or modify
# the input. The handle is reused by later cells and is not closed in the
# cells shown.
in_file = h5py.File(test_file, "r")

In [11]:
# List the datasets available for the fat-jet ("BoostedJets") collection
in_file["INPUTS"]["BoostedJets"].keys()

<KeysViewHDF5 ['MASK', 'fj_charge', 'fj_chargedenergyfrac', 'fj_cosphi', 'fj_ehadovereem', 'fj_eta', 'fj_mass', 'fj_ncharged', 'fj_neutralenergyfrac', 'fj_nneutral', 'fj_phi', 'fj_pt', 'fj_sdmass', 'fj_sinphi', 'fj_tau21', 'fj_tau32']>

In [12]:
# Show shape and dtype of the fat-jet MASK dataset
in_file["INPUTS"]["BoostedJets"]['MASK']

<HDF5 dataset "MASK": shape (60369, 2), type "|b1">

In [13]:
# preliminary
# N_JETS: largest jet multiplicity for which the pairing lookup table is built
# (assigned again, to the same value, in a later cell)
N_JETS = 10
# HIGGS_MASS: target value in the chi2 on dijet masses; presumably GeV —
# confirm against the units of the input file
HIGGS_MASS = 125

### Reconstruct boosted H

In [14]:
def get_test_XY(file):
    """Build the per-fat-jet BDT feature matrix and truth labels.

    Each event contributes one row per fat-jet slot: the ``(n_events, n_slots)``
    datasets are flattened event by event. The label stacking below assumes two
    slots per event (indices 0 and 1).

    Parameters
    ----------
    file : mapping-like, e.g. an open ``h5py.File``
        Must provide ``TARGETS/bh1/bb`` and ``TARGETS/bh2/bb`` (fat-jet index
        of each boosted Higgs target) and the feature datasets listed below
        under ``INPUTS/BoostedJets``.

    Returns
    -------
    data : np.ndarray, float, shape (n_events * n_slots, n_features)
        One column per feature, in ``feature_names`` order.
    labels : np.ndarray, bool, shape (n_events * 2,)
        True where the fat-jet slot is the target of ``bh1`` or ``bh2``.

    Notes
    -----
    The shapes of ``data`` and ``labels`` are printed as a sanity check.
    """
    feature_names = [
        "fj_pt",
        "fj_eta",
        # "fj_phi",
        "fj_mass",
        "fj_sdmass",
        # "fj_charge",
        "fj_chargedenergyfrac",
        "fj_ncharged",
        # "fj_neutralenergyfrac",
        "fj_nneutral",
        "fj_tau21",
        "fj_tau32",
    ]

    bh1 = file["TARGETS"]["bh1"]["bb"][:]
    bh2 = file["TARGETS"]["bh2"]["bb"][:]

    # A slot is signal when either target points at it.
    is_signal_fj1 = (bh1 == 0) | (bh2 == 0)
    is_signal_fj2 = (bh1 == 1) | (bh2 == 1)
    labels = np.stack([is_signal_fj1, is_signal_fj2], axis=1).flatten()

    boosted = file["INPUTS"]["BoostedJets"]
    data = np.stack(
        [boosted[key][:].astype(float).flatten() for key in feature_names],
        axis=1,
    )

    print(data.shape)
    print(labels.shape)
    return data, labels

In [15]:
# Build the per-fat-jet feature matrix and truth labels from the test file
hh_data, hh_labels = get_test_XY(in_file)

(120738, 9)
(120738,)


In [16]:
# reconstruct BDT dataset
# Column names for hh_data. This list repeats the one inside get_test_XY and
# has to stay identical to it (same entries, same order).
feature_names = [
        "fj_pt",
        "fj_eta",
        # "fj_phi",
        "fj_mass",
        "fj_sdmass",
        # "fj_charge",
        "fj_chargedenergyfrac",
        "fj_ncharged",
        # "fj_neutralenergyfrac",
        "fj_nneutral",
        "fj_tau21",
        "fj_tau32",
    ]
test = xgb.DMatrix(data=hh_data, label=hh_labels, feature_names=feature_names)

In [17]:
# Load the trained BDT from bdt_file.
# The parameters are handed to the Booster as a list of (name, value) pairs.
param = list(
    {
        "seed": 42,  # set seed for reproducibility
        # Booster parameters
        "eta": 0.1,  # learning rate
        "max_depth": 5,  # maximum depth of a tree
        # "subsample": 0.8,  # fraction of events to train tree on
        # "colsample_bytree": 0.8,  # fraction of features to train tree on
        # Learning task parameters
        # "scale_pos_weight": scale_pos_weight,
        "objective": "binary:logistic",  # objective function
        # evaluation metric for cross validation, note: last one is used for early stopping
        "eval_metric": "error",
    }.items()
)

num_trees = 150  # number of trees to make (not referenced again in the cells shown)
booster = xgb.Booster(param, model_file=bdt_file)

In [18]:
# predict data
# Threshold the BDT scores at WP, then fold the flat per-fat-jet decisions back
# to one row per event with two fat-jet slots (get_test_XY flattened them).
pred_label = (booster.predict(test) > WP).reshape(-1, 2)
print(pred_label.shape)

(60369, 2)


In [19]:
# load jets and fat jets from test h5 file
jets = load_jets(in_file)
js_idx = ak.local_index(jets)  # not used by the cells shown
fjs = load_fjets(in_file)
fj_idx = ak.local_index(fjs)

# keep fat jets that are flagged in MASK and pass the BDT working point
# (no explicit pT or mass cut is applied in this cell)
fj_mask = fjs['mask']
fj_cond = pred_label & fj_mask
fjs_selected = fjs[fj_cond]  # not used by the cells shown

# save the qualified fjets indices
# they will be bH candidates; events with fewer than two candidates are padded with -1
bh_fj_idx = fj_idx[fj_cond]
bh_fj_idx = to_np_array(bh_fj_idx, max_n=2, pad=-1)

# detection (DP) and assignment (AP) probabilities:
# 1 where a candidate exists, 0 in the padded slots
bhs_dp = np.zeros(shape=bh_fj_idx.shape)
fjs_ap = np.zeros(shape=bh_fj_idx.shape)
bhs_dp[bh_fj_idx!=-1] = 1
fjs_ap[bh_fj_idx!=-1] = 1

### Select unpadded jets

In [20]:
# keep only the ak4 jets flagged in MASK (i.e. drop padded entries)
# NOTE(review): no dR matching against the selected ak8 jets happens here, and
# js_selected is not used by the cells shown — confirm whether an overlap
# removal step was intended.
not_padded = jets['mask']
j_cond = not_padded
js_selected = jets[j_cond]

### Reconstruct resolved H

In [21]:
# number of resolved Higgs candidates (jet pairs) to reconstruct per event
n_higgs = 2
# largest jet multiplicity covered by the pairing lookup table (same value as set earlier)
N_JETS = 10

In [22]:
 MIN_JETS = 2 * n_higgs
# compute possible jet assignments lookup table
JET_ASSIGNMENTS = {}
for nj in range(MIN_JETS, N_JETS + 1):
    a = list(itertools.combinations(range(nj), 2))
    b = np.array([(i, j, k) for i, j, k in itertools.combinations(a, 3) if len(set(i + j + k)) == MIN_JETS])
    JET_ASSIGNMENTS[nj] = b

In [23]:
# Score every candidate pairing built from the first nj = 2 * n_higgs jets of
# each event (JET_ASSIGNMENTS[nj] only contains indices below nj).
# NOTE(review): this indexes `jets`, not `js_selected`, so entries that are
# masked out still take part — confirm that is intended.
nj = 2 * n_higgs
# invariant mass of each jet pair, for every pairing and every event
mjj = (jets[:, JET_ASSIGNMENTS[nj][:, :, 0]] + jets[:, JET_ASSIGNMENTS[nj][:, :, 1]]).mass
# chi2 of a pairing = sum over its pairs of (m_jj - HIGGS_MASS)^2
chi2 = ak.sum(np.square(mjj - HIGGS_MASS), axis=-1)
# per event, index of the pairing with the smallest chi2
chi2_argmin = ak.argmin(chi2, axis=-1)

In [24]:
# just consider top 2*N_rH jets

# per event, the first (b1) and second (b2) jet index of every pair in the
# chi2-minimizing pairing
rH_b1 = JET_ASSIGNMENTS[nj][chi2_argmin][:, :, 0]
rH_b2 = JET_ASSIGNMENTS[nj][chi2_argmin][:, :, 1]

# every pair of every event gets detection / assignment probability 1
rH_dp = np.ones(shape=rH_b1.shape)
rH_ap = np.ones(shape=rH_b2.shape)

In [25]:
# Collect every predicted target under the HDF5 dataset path it will be written to.
datasets = {}

# boosted
# Fat-jet indices are stored with an offset of 10.
# NOTE(review): the offset is also added to the -1 padding produced for events
# with fewer than two candidates, so "no candidate" is written as 9 — confirm
# that downstream readers expect this (e.g. by subtracting 10 again).
datasets["TARGETS/bh1/bb"] = bh_fj_idx[:,0]+10
datasets["TARGETS/bh2/bb"] = bh_fj_idx[:,1]+10

datasets["TARGETS/bh1/detection_probability"] = bhs_dp[:,0]
datasets["TARGETS/bh2/detection_probability"] = bhs_dp[:,1]

# Use the dedicated assignment-probability array (as the resolved branch does
# with rH_ap) instead of reusing the detection-probability one.
datasets["TARGETS/bh1/assignment_probability"] = fjs_ap[:,0]
datasets["TARGETS/bh2/assignment_probability"] = fjs_ap[:,1]

# resolved
# Column i-1 of the rH_* arrays holds the i-th jet pair of the selected pairing.
for i in range(1, n_higgs+1):
    datasets[f"TARGETS/h{i}/b1"] = rH_b1[:, i-1]
    datasets[f"TARGETS/h{i}/b2"] = rH_b2[:, i-1]

    datasets[f"TARGETS/h{i}/detection_probability"] = rH_dp[:, i-1]
    datasets[f"TARGETS/h{i}/assignment_probability"] = rH_ap[:, i-1]

In [26]:
# Group the arrays by dataset path: each path maps to a list of chunks that are
# concatenated when the file is written.
all_datasets = {}
for dataset_name, data in datasets.items():
    all_datasets.setdefault(dataset_name, []).append(data)

In [27]:
# Write the prediction file ("w" overwrites an existing file): first copy every
# INPUTS dataset from the test file unchanged, then add the predicted TARGETS.
with h5py.File(pred_file, "w") as output:
    for jet_type_name, jet_type in in_file["INPUTS"].items():
        for feature_name, feature in jet_type.items():
            dataset_name = f"INPUTS/{jet_type_name}/{feature_name}"
            # materialize the HDF5 dataset in memory before writing it out
            data = np.array(feature)
            output.create_dataset(dataset_name, data=data)
    for dataset_name, all_data in all_datasets.items():
        # join the chunks collected for this dataset path along the event axis
        concat_data = np.concatenate(all_data, axis=0)
        output.create_dataset(dataset_name, data=concat_data)

In [28]:
# Re-open the freshly written prediction file for inspection, explicitly
# read-only so the checks below cannot modify it (older h5py releases opened
# files in append mode when no mode was given).
pred_h5 = h5py.File(pred_file, "r")

In [29]:
# Check which datasets were written for the second resolved Higgs candidate
pred_h5['TARGETS']['h2'].keys()

<KeysViewHDF5 ['assignment_probability', 'b1', 'b2', 'detection_probability']>

In [30]:
# Top-level groups of the prediction file
pred_h5.keys()

<KeysViewHDF5 ['INPUTS', 'TARGETS']>

In [31]:
# Spot check: first jet index assigned to h2 in the event at position 2
pred_h5['TARGETS']['h2']['b1'][2]

0

In [32]:
# Spot check: second jet index assigned to h1 in the event at position 2
pred_h5['TARGETS']['h1']['b2'][2]

1

In [33]:
# Shape and dtype of the stored h1 detection probability (one value per event)
pred_h5['TARGETS']['h1']['detection_probability']

<HDF5 dataset "detection_probability": shape (60369,), type "<f8">

In [34]:
# Shape of the in-memory resolved detection-probability array:
# (number of events, number of jet pairs per pairing)
rH_dp.shape

(60369, 3)