### Import packages and load data

In [1]:
import itertools
import logging
from pathlib import Path
import numba as nb

import awkward as ak
import click
import h5py as h5
import numpy as np
import vector

from coffea.hist.plot import clopper_pearson_interval
import matplotlib.pyplot as plt

# from src.data.cms.convert_to_h5 import MIN_JETS, N_JETS, N_FJETS

vector.register_awkward()

logging.basicConfig(level=logging.INFO)

(Set coffea.deprecations_as_errors = True to get a stack trace now.)
ImportError: coffea.hist is deprecated


In [2]:
# Every HDF5 file is opened explicitly read-only: this notebook only reads from
# them, and older h5py releases fall back to a writable mode when none is given.

# read test target file
test_file = "//Users/billyli/UCSD/hhh/reports/bv2/hhh_test.h5"
test_h5 = h5.File(test_file, "r")

# read baseline prediction
baseline_file = "//Users/billyli/UCSD/hhh/reports/bv2/pred_baseline.h5"
b_h5 = h5.File(baseline_file, "r")

# read spanet prediction
spanet_file = "//Users/billyli/UCSD/hhh/reports/bv2/dp_on/pred_v53.h5"
s_h5 = h5.File(spanet_file, "r")

### Abstract
Overlapped-jet removal is motivated by the fact that some resolved Higgs (rH) candidates also have a corresponding boosted Higgs (bH). Since such an rH and bH are two different representations of the same Higgs, we should choose only one of them to avoid double counting when evaluating purity/efficiency. As a natural way to prioritize the bH, overlapped-jet removal removes the jets that overlap with a bH according to a delta_R condition: we calculate dR between each jet and each bH candidate, and the jets with dR less than dR_min are removed. The removal is carried out after reconstructing the bH, because we prioritize the bH reconstruction.

It is possible that some rH jets with no bH correspondence are still removed if delta_R_min is set too large. On the other hand, the jets of an rH that does correspond to a bH could be left in place if delta_R_min is too small.

The goal of this notebook is not to pick the optimal deltaR_min, but to have a grasp of how well the default deltaR_min = 0.8 does.

### Methods

### Gen-level Overlap Removal Study
The study below quantifies how often a resolved Higgs and a boosted Higgs overlap.

* N1: number of resolved H without overlap removal
* N2: number of resolved H with overlap removal
* N3: number of Higgs that are both resolved and boosted

N1-N2-N3 = (number of resolved Higgs that were removed) - (number of Higgs that are both resolved and boosted)

We can also build a confusion matrix of bi-cat status (a Higgs that is both bH and rH) versus removal.

### General Outline

1. Load functions to reco bH
2. Load functions to remove overlap
3. Find the necessary information

### 1.

In [3]:
def sel_pred_bH_by_dp(dps, aps, bb_ps, dp_cut, ap_cut=1/13):
    """Select the predicted bb assignments that pass the detection-probability cut.

    Args:
        dps: per-candidate detection probabilities, compared against ``dp_cut``.
        aps: per-candidate assignment probabilities. Accepted for interface
            compatibility; not used by the selection (see note below).
        bb_ps: per-candidate predicted bb assignment indices. Only entries
            greater than 9 are kept (presumably the fat-jet index range, since
            callers subtract 10 to index fat jets -- TODO confirm).
        dp_cut: a candidate is kept only if its detection probability is
            strictly greater than this value.
        ap_cut: accepted for interface compatibility; currently has no effect.

    Returns:
        ``bb_ps`` with the failing entries dropped, so each event keeps only
        the assignments that passed.
    """
    # NOTE(review): the assignment-probability cut (aps > ap_cut) has never been
    # part of the selection; it is left out here so results stay unchanged.
    # Confirm whether it was meant to be applied.
    passed = (dps > dp_cut) & (bb_ps > 9)

    # mask the failing entries to None, then drop them
    return ak.drop_none(bb_ps.mask[passed])

In [4]:
def sel_target_bH_by_mask(bb_ts, bh_pts, bh_masks):
    """Keep only the target entries selected by ``bh_masks``.

    The same mask is applied to the target bb assignments and to the target
    boosted-Higgs pts; masked-out entries are dropped from both.

    Returns:
        (selected bb assignments, selected pts)
    """
    kept_assignments, kept_pts = (
        ak.drop_none(values.mask[bh_masks]) for values in (bb_ts, bh_pts)
    )
    return kept_assignments, kept_pts

In [5]:
def gen_pred_bH_LUT(bb_ps_passed, bb_ts_selected, fj_pts):
    """Build the prediction look-up table.

    The table is nested as
        [event][predicted H] -> [correct, pred_H_pt]
    where ``correct`` is 1 if some target assignment in the same event,
    shifted by 10, equals the predicted assignment, and 0 otherwise.
    ``pred_H_pt`` is read from the event's ``fj_pts`` at index
    ``prediction - 10``.
    """
    table = []
    for targets, preds, event_fj_pts in zip(bb_ts_selected, bb_ps_passed, fj_pts):
        rows = []
        for pred in preds:
            pred_pt = event_fj_pts[pred - 10]
            # a prediction is correct when any target in this event matches it
            matched = 1 if any(pred == target + 10 for target in targets) else 0
            rows.append([matched, pred_pt])
        table.append(rows)
    return table

In [6]:
def gen_target_bH_LUT(bb_ps_passed, bb_ts_selected, targetH_pts):
    """Build the target look-up table.

    The table is nested as
        [event][target H] -> [retrieved, targetH_pt]
    where ``retrieved`` is 1 if some predicted assignment in the same event
    equals the target assignment shifted by 10, and 0 otherwise.
    ``targetH_pt`` is the entry of the event's ``targetH_pts`` at the same
    position as the target.
    """
    table = []
    for targets, preds, event_pts in zip(bb_ts_selected, bb_ps_passed, targetH_pts):
        rows = []
        for position, target in enumerate(targets):
            target_pt = event_pts[position]
            # a target is retrieved when any prediction in this event matches it
            found = 1 if any(pred == target + 10 for pred in preds) else 0
            rows.append([found, target_pt])
        table.append(rows)
    return table

In [7]:
def _stack_bh_field(h5file, field):
    """Read ``field`` for bh1, bh2 and bh3 under ``TARGETS`` and stack them as columns."""
    columns = [
        np.array(h5file["TARGETS"][name][field]).reshape(-1, 1)
        for name in ("bh1", "bh2", "bh3")
    ]
    return np.concatenate(columns, axis=1)


def parse_boosted_w_target(testfile, predfile, dp_cut=0.8):
    """Generate the pred/target look-up tables and the reconstructed boosted-Higgs fat jets.

    Each pred LUT entry is ``[recoH correct or not, reco H pt]`` and each
    target LUT entry is ``[targetH retrieved or not, target H pt]``.

    Args:
        testfile: open HDF5 file providing ``TARGETS/bh{1,2,3}/{pt,mask,bb}``
            and ``INPUTS/BoostedJets/fj_{pt,eta,phi,mass}``.
        predfile: open HDF5 file providing ``TARGETS/bh{1,2,3}/bb`` plus either
            ``detection_probability`` and ``assignment_probability``, or (as a
            fallback) ``mask``.
        dp_cut: detection-probability threshold forwarded to
            ``sel_pred_bH_by_dp``.

    Returns:
        (LUT_pred, LUT_target, fj_reco), where ``fj_reco`` holds the fat-jet
        four-momenta picked by the selected predicted assignments.
    """
    # targets: Higgs pt, mask and bb assignment, one column per bH
    bh_pts = ak.Array(_stack_bh_field(testfile, "pt"))
    bh_masks = ak.Array(_stack_bh_field(testfile, "mask"))
    bb_ts = ak.Array(_stack_bh_field(testfile, "bb"))

    try:
        # prediction file: assignment, detection and assignment probabilities
        bb_ps = _stack_bh_field(predfile, "bb")
        dps = _stack_bh_field(predfile, "detection_probability")
        aps = _stack_bh_field(predfile, "assignment_probability")
    except KeyError:
        # The file carries no probabilities (a missing HDF5 key raises
        # KeyError): use its mask as a 0/1 probability and shift the
        # assignments by 10 so they match the convention compared against
        # targets (pred == target + 10).
        bb_ps = _stack_bh_field(predfile, "bb") + 10
        dps = _stack_bh_field(predfile, "mask").astype("float")
        aps = _stack_bh_field(predfile, "mask").astype("float")

    dps = ak.Array(dps)
    aps = ak.Array(aps)
    bb_ps = ak.Array(bb_ps)

    # collect fatjet pt
    fj_pt = ak.Array(np.array(testfile['INPUTS']['BoostedJets']['fj_pt']))

    # select predictions and targets
    bb_ts_selected, targetH_selected_pts = sel_target_bH_by_mask(bb_ts, bh_pts, bh_masks)
    bb_ps_selected = sel_pred_bH_by_dp(dps, aps, bb_ps, dp_cut)

    # generate correct/retrieved LUT for pred/target respectively
    LUT_pred = gen_pred_bH_LUT(bb_ps_selected, bb_ts_selected, fj_pt)
    LUT_target = gen_target_bH_LUT(bb_ps_selected, bb_ts_selected, targetH_selected_pts)

    # reconstruct bH to remove overlapped ak4 jets
    fj_eta = np.array(testfile['INPUTS']['BoostedJets']['fj_eta'])
    fj_phi = np.array(testfile['INPUTS']['BoostedJets']['fj_phi'])
    fj_mass = np.array(testfile['INPUTS']['BoostedJets']['fj_mass'])

    fjs = ak.zip(
        {
            "pt": fj_pt,
            "eta": fj_eta,
            "phi": fj_phi,
            "mass": fj_mass,
        },
        with_name="Momentum4D"
    )
    # selected assignments are offset by 10 relative to the fat-jet index
    fj_reco = fjs[bb_ps_selected-10]

    return LUT_pred, LUT_target, fj_reco

### 2.

In [8]:
def get_unoverlapped_jet_index(fjs, js, dR_min=0.8):
    """Return, per event, the local indices of the jets in ``js`` that are kept.

    A jet is dropped when its deltaR to at least one entry of ``fjs`` is
    below ``dR_min``; the indices of the remaining jets are returned.
    """
    # deltaR between every jet and every fat jet of the same event
    pairwise_dR = js[:, np.newaxis].deltaR(fjs)
    # number of close fat jets per jet (reduction over axis -2)
    n_close = ak.sum(pairwise_dR < dR_min, axis=-2)
    keep = ~(n_close > 0)
    return ak.drop_none(ak.local_index(js).mask[keep])

### 3.

#### Getting N1

In [9]:
def get_unmasked_arr(arr, mask):
    """Return the entries of ``arr`` selected by ``mask``, with the rest dropped."""
    masked = arr.mask[mask]
    return ak.drop_none(masked)

In [10]:
testfile = test_h5

# N1 inputs: target pt of each of the three Higgs (h1, h2, h3)
h1_pt, h2_pt, h3_pt = (
    ak.Array(testfile['TARGETS'][higgs]['pt']) for higgs in ('h1', 'h2', 'h3')
)

# matching target masks
h1_mask, h2_mask, h3_mask = (
    ak.Array(testfile['TARGETS'][higgs]['mask']) for higgs in ('h1', 'h2', 'h3')
)


In [11]:
# N1: number of h1/h2/h3 entries selected by their target masks
N1 = sum(
    ak.count(get_unmasked_arr(pt, mask))
    for pt, mask in ((h1_pt, h1_mask), (h2_pt, h2_mask), (h3_pt, h3_mask))
)

#### Getting N2

In [12]:
# assume perfect reconstruction: the target file is also used as the prediction file.
# Only the third return value (reconstructed fat jets) is needed; it is taken by
# index rather than unpacked into `_`, because assigning to `_` in IPython stops
# it from tracking the last cell output.
fjs_reco = parse_boosted_w_target(test_h5, test_h5)[2]

In [13]:
# target b1 / b2 assignments of h1, h2, h3, cast to int
b1_h1_t, b1_h2_t, b1_h3_t = (
    np.array(testfile["TARGETS"][higgs]['b1']).astype('int') for higgs in ("h1", "h2", "h3")
)
b2_h1_t, b2_h2_t, b2_h3_t = (
    np.array(testfile["TARGETS"][higgs]['b2']).astype('int') for higgs in ("h1", "h2", "h3")
)

# stack into one column per Higgs
b1_ts = ak.Array(
    np.concatenate([col.reshape(-1, 1) for col in (b1_h1_t, b1_h2_t, b1_h3_t)], axis=1)
)
b2_ts = ak.Array(
    np.concatenate([col.reshape(-1, 1) for col in (b2_h1_t, b2_h2_t, b2_h3_t)], axis=1)
)

# reconstruct jet 4-momentum objects
j_pt, j_eta, j_phi, j_mass = (
    np.array(testfile['INPUTS']['Jets'][field]) for field in ('pt', 'eta', 'phi', 'mass')
)
js = ak.zip(
    dict(pt=j_pt, eta=j_eta, phi=j_phi, mass=j_mass),
    with_name="Momentum4D",
)

# indices of the jets that survive overlap removal against the reconstructed fat jets
goodJetIdx = get_unoverlapped_jet_index(fjs_reco, js, dR_min=0.8)


In [14]:
# N2: count the target Higgs whose b1 and b2 assignments are both among the
# jets that survived overlap removal in the same event
N2 = sum(
    1
    for tb1_e, tb2_e, goodJetIdx_e in zip(b1_ts, b2_ts, goodJetIdx)
    for b1Hx, b2Hx in zip(tb1_e, tb2_e)
    if (b1Hx in goodJetIdx_e) & (b2Hx in goodJetIdx_e)
)

#### Getting N3

In [15]:
# resolved-Higgs target masks as numpy arrays, then stacked one column per Higgs
h1_mask, h2_mask, h3_mask = (
    np.array(testfile['TARGETS'][higgs]['mask']) for higgs in ('h1', 'h2', 'h3')
)
h_masks = np.concatenate(
    [m.reshape(-1, 1) for m in (h1_mask, h2_mask, h3_mask)], axis=1
)

# boosted-Higgs target masks, stacked the same way
bh1_mask, bh2_mask, bh3_mask = (
    np.array(testfile['TARGETS'][higgs]['mask']) for higgs in ('bh1', 'bh2', 'bh3')
)
bh_masks = np.concatenate(
    [m.reshape(-1, 1) for m in (bh1_mask, bh2_mask, bh3_mask)], axis=1
)

In [16]:
# N3: entries flagged in both the resolved and the boosted mask
N3 = (h_masks & bh_masks).sum()

#### Summary

In [17]:
# Fraction (not a percentage) of all resolved Higgs given by
# (resolved Higgs lost to overlap removal, N1-N2) minus (Higgs that are both
# resolved and boosted, N3): an estimate of the resolved Higgs that were
# removed although they should not have been.
(N1-N2-N3)/N1

0.017112822393350446

In [18]:
# Number of resolved Higgs lost to overlap removal: resolved Higgs before
# removal (N1) minus those whose two b-jets both survive it (N2).
N1-N2

7944

In [19]:
# Number of Higgs flagged in both the resolved and the boosted target masks,
# i.e. resolved Higgs that can also be reconstructed as a boosted Higgs.
N3

6544

In [20]:
# Fraction of resolved Higgs that are also flagged as boosted.
N3/N1

0.07999022124434665

#### Confusion Matrix

In [21]:
# Confusion matrix of "removed by overlap removal" versus "resolved+boosted
# (bi-cat)" / "resolved only", counted per target Higgs.
N_overlap_bicat = 0
N_overlap_resolve = 0
N_unoverlap_bicat = 0
N_unoverlap_resolve = 0

# elementwise resolved-and-boosted mask (not read by the loop below)
bAndr = h_masks&bh_masks
for tb1_e, tb2_e, goodJetIdx_e, h_masks_e, bh_masks_e, in zip(b1_ts, b2_ts, goodJetIdx, h_masks, bh_masks):
    for b1Hx, b2Hx, resolve, boost in zip(tb1_e, tb2_e, h_masks_e, bh_masks_e):
        # A Higgs counts as removed unless both of its b assignments are among
        # the jets that survived overlap removal in this event.
        overlap = not ((b1Hx in goodJetIdx_e) and (b2Hx in goodJetIdx_e))

        # bi-cat: flagged in both the resolved and the boosted mask
        bicat = bool(resolve and boost)

        # Plain `not` / `and` are used on these scalars: `~` on a Python bool is
        # integer inversion (~True == -2), which only worked through the low bit
        # and is deprecated.
        if bicat:
            if overlap:
                N_overlap_bicat += 1
            else:
                N_unoverlap_bicat += 1
        elif resolve:
            if overlap:
                N_overlap_resolve += 1
            else:
                N_unoverlap_resolve += 1

In [22]:
# Print the confusion matrix: rows are removed / not removed by overlap removal,
# columns are resolved+boosted (bi-cat) / resolved only.
print("        ", "Resolved+Boosted |",    "Resolved only")
print("Removed:      ", N_overlap_bicat, "            ",N_overlap_resolve)
print("Not Removed:     ", N_unoverlap_bicat, "         ", N_unoverlap_resolve)

         Resolved+Boosted | Resolved only
Removed:       6544              1400
Not Removed:      0           73866
