In [28]:
from collections import defaultdict
import awkward as ak
import numba
import numpy as np
import pandas as pd
import h5py
import vector
vector.register_numba()
vector.register_awkward()

#import matplotlib.pyplot as plt
#from matplotlib.colors import LogNorm
#import mplhep as hep
#hep.style.use(hep.style.ROOT)

In [29]:
prefix="/run/user/1000/gvfs/sftp:host=lxplus-gpu.cern.ch,user=mmalucch/"
filename_test = prefix+"/eos/home-m/mmalucch/spanet_inputs/output_JetGoodHiggs_test.h5"
filename_pred = prefix+"/eos/home-r/ramellar/prediction.h5"
filename_pred = "/home/matteo/Downloads/prediction.h5"
df_test = h5py.File(filename_test,'r')
df_pred = h5py.File(filename_pred,'r')

In [30]:
df_pred["TARGETS"]["h1"].keys()

<KeysViewHDF5 ['assignment_probability', 'b1', 'b2', 'detection_probability', 'marginal_probability']>

## Compute the jet assignment efficiency
We extract the predicted and true indices for the individual quarks.
Jets are correctly assigned when the predicted jet index is equal to the true index. We can compute the efficiency as the ratio of the correctly assigned jets over the total number of jets.

In [31]:
idx_b1_pred = df_pred["TARGETS"]["h1"]["b1"][()]
idx_b2_pred = df_pred["TARGETS"]["h1"]["b2"][()]
idx_b1_pred

array([0, 1, 0, ..., 2, 1, 0])

In [32]:
idx_b1_true = df_test["TARGETS"]["h1"]["b1"][()]
idx_b2_true = df_test["TARGETS"]["h1"]["b2"][()]
idx_b1_true

array([1, 0, 1, ..., 0, 0, 2])

In [33]:
idx_h1_pred = ak.concatenate((ak.unflatten(idx_b1_pred, ak.ones_like(idx_b1_pred)), ak.unflatten(idx_b2_pred, ak.ones_like(idx_b2_pred))), axis=1)
idx_h1_true = ak.concatenate((ak.unflatten(idx_b1_true, ak.ones_like(idx_b1_true)), ak.unflatten(idx_b2_true, ak.ones_like(idx_b2_true))), axis=1)
idx_h1_pred

<Array [[0, 2], [1, 2], ... [1, 3], [0, 1]] type='129710 * var * int64'>

In [34]:
print(idx_h1_pred)
print(idx_h1_true)

[[0, 2], [1, 2], [0, 3], [0, 2], [0, 1], ... [0, 2], [2, 3], [2, 3], [1, 3], [0, 1]]
[[1, 3], [0, 3], [1, 2], [0, 2], [2, 3], ... [1, 3], [2, 3], [0, 1], [0, 2], [2, 3]]


In [35]:
idx_b3_pred = df_pred["TARGETS"]["h2"]["b3"][()]
idx_b4_pred = df_pred["TARGETS"]["h2"]["b4"][()]
idx_b3_true = df_test["TARGETS"]["h2"]["b3"][()]
idx_b4_true = df_test["TARGETS"]["h2"]["b4"][()]

idx_h2_pred = ak.concatenate((ak.unflatten(idx_b3_pred, ak.ones_like(idx_b3_pred)), ak.unflatten(idx_b4_pred, ak.ones_like(idx_b4_pred))), axis=1)
idx_h2_true = ak.concatenate((ak.unflatten(idx_b3_true, ak.ones_like(idx_b3_true)), ak.unflatten(idx_b4_true, ak.ones_like(idx_b4_true))), axis=1)
print(idx_h2_pred)
print(idx_h2_true)

[[1, 3], [0, 3], [1, 2], [1, 3], [2, 3], ... [1, 3], [0, 1], [0, 1], [0, 2], [2, 3]]
[[0, 2], [1, 2], [0, 3], [1, -1], [0, 1], ... 0, -1], [0, 1], [2, 3], [1, 3], [0, 1]]


In [42]:
# order the true array in increasing order
idx_h1_true_order = ak.sort(idx_h1_true, axis=1)
print(idx_h1_true)
print(idx_h1_true_order)
print(ak.sum(idx_h1_true[:,0]>idx_h1_true[:,1]))
print(ak.sum(idx_h1_true_order[:,0]>idx_h1_true_order[:,1]))

idx_h2_true_order=ak.sort(idx_h2_true, axis=1)


[[1, 3], [0, 3], [1, 2], [0, 2], [2, 3], ... [1, 3], [2, 3], [0, 1], [0, 2], [2, 3]]
[[1, 3], [0, 3], [1, 2], [0, 2], [2, 3], ... [1, 3], [2, 3], [0, 1], [0, 2], [2, 3]]
5630
0


In [62]:
# concatenate the two arrays for h1 and h2 adding another axis
idx_true = ak.concatenate((ak.unflatten(idx_h1_true_order, ak.ones_like(idx_h1_true_order[:,0])), ak.unflatten(idx_h2_true_order, ak.ones_like(idx_h2_true_order[:,0]))), axis=1)
print(idx_true[2:5])

idx_pred = ak.concatenate((ak.unflatten(idx_h1_pred, ak.ones_like(idx_h1_pred[:,0])), ak.unflatten(idx_h2_pred, ak.ones_like(idx_h2_pred[:,0]))), axis=1)
print(idx_pred[2:5])

# if an event has a -1 in the true array, remove it
mask = ak.all(ak.all(idx_true>=0, axis=1), axis=1)
print(mask)

idx_true_fully_matched = idx_true[mask]
idx_pred_fully_matched = idx_pred[mask]
print(idx_true_fully_matched[2:5])
print(idx_pred_fully_matched[2:5])

print(len(idx_true_fully_matched))

[[[1, 2], [0, 3]], [[0, 2], [-1, 1]], [[2, 3], [0, 1]]]
[[[0, 3], [1, 2]], [[0, 2], [1, 3]], [[0, 1], [2, 3]]]
[True, True, True, False, True, True, True, ... True, False, True, True, True, True]
[[[1, 2], [0, 3]], [[2, 3], [0, 1]], [[0, 1], [2, 3]]]
[[[0, 3], [1, 2]], [[0, 1], [2, 3]], [[0, 1], [2, 3]]]
116863


In [61]:
# CHECK IF THE TWO ARRAYS ARE EQUAL and if you swap the columns
matched_events =[]
for i in range(len(idx_true_fully_matched)):
    arr_true = idx_true_fully_matched[i]
    arr_pred = idx_pred_fully_matched[i]
    if ak.all(arr_true[0]==arr_pred[0]) or ak.all(arr_true[0]==arr_pred[1]):
        matched_events.append(i)

print(len(matched_events))


115049


In [63]:
print("eff: ", len(matched_events)/len(idx_true_fully_matched))

eff:  0.984477550636215


IGNORE

In [37]:
is_correct_higgs = ak.sum(idx_h1_pred == idx_h1_true, axis=1) == 2
is_correct_higgs

<Array [False, False, False, ... False, False] type='129710 * bool'>

In [38]:
n_tot = len(is_correct_higgs)
n_tot

129710

In [39]:
n_correct = ak.sum(is_correct_higgs)
n_correct

62438

In [40]:
eff_h = n_correct / n_tot
eff_h

0.48136612443142396

In [41]:
idx_q1_pred = df_pred["TARGETS"]["t1"]["q1"][()]
idx_q2_pred = df_pred["TARGETS"]["t1"]["q2"][()]
idx_b_pred = df_pred["TARGETS"]["t1"]["b"][()]
idx_q1_true = df_test["TARGETS"]["t1"]["q1"][()]
idx_q2_true = df_test["TARGETS"]["t1"]["q2"][()]
idx_b_true = df_test["TARGETS"]["t1"]["b"][()]
idx_b_pred

KeyError: "Unable to synchronously open object (object 't1' doesn't exist)"

In [None]:
idx_thad_pred = ak.concatenate(
     (ak.unflatten(idx_q1_pred, ak.ones_like(idx_q1_pred)),
     ak.unflatten(idx_q2_pred, ak.ones_like(idx_q2_pred)),
     ak.unflatten(idx_b_pred, ak.ones_like(idx_b_pred))),
     axis=1)
idx_thad_true = ak.concatenate(
     (ak.unflatten(idx_q1_true, ak.ones_like(idx_q1_true)),
     ak.unflatten(idx_q2_true, ak.ones_like(idx_q2_true)),
     ak.unflatten(idx_b_true, ak.ones_like(idx_b_true))),
     axis=1)
idx_thad_pred

In [None]:
is_correct_thad = ak.sum(idx_thad_pred == idx_thad_true, axis=1) == 2
is_correct_thad

In [None]:
n_correct = ak.sum(is_correct_thad)
n_tot = len(is_correct_thad)
eff_thad = n_correct / n_tot
eff_thad

0.26598998420327363

In [None]:
idx_b_pred = df_pred["TARGETS"]["t2"]["b"][()]
idx_b_true = df_test["TARGETS"]["t2"]["b"][()]
idx_tlep_pred = ak.unflatten(idx_b_pred, ak.ones_like(idx_b_pred))
idx_tlep_true = ak.unflatten(idx_b_true, ak.ones_like(idx_b_pred))
idx_tlep_pred

In [None]:
is_correct_tlep = ak.sum(idx_tlep_pred == idx_tlep_true, axis=1) == 1
is_correct_tlep

In [None]:
n_correct = ak.sum(is_correct_tlep)
n_tot = len(is_correct_tlep)
eff_tlep = n_correct / n_tot
eff_tlep

0.7139448122878366

## Resulting Higgs and top reconstruction efficiencies
The Higgs and top efficiencies are defined as the number of events with all the jets correctly assigned over the total number of events:

$\epsilon = \frac{N_{assigned}}{N_{tot}}$

In [None]:
eff_h, eff_thad, eff_tlep

(0.5520787819715659, 0.26598998420327363, 0.7139448122878366)