In [1]:
from collections import defaultdict
import awkward as ak
import numba
import numpy as np
import pandas as pd
import h5py
import vector
vector.register_numba()
vector.register_awkward()

#import matplotlib.pyplot as plt
#from matplotlib.colors import LogNorm
#import mplhep as hep
#hep.style.use(hep.style.ROOT)

In [5]:
prefix="/run/user/1000/gvfs/sftp:host=lxplus.cern.ch,user=mmalucch/"
filename_test = prefix+"/eos/home-m/mmalucch/spanet_inputs/output_JetGoodHiggs_test.h5"
filename_test="/work/mmalucch/out_hh4b/hh4b_btag_in_presel_0pad/output_JetGoodHiggs_test.h5"
filename_pred = prefix+"/eos/home-r/ramellar/prediction.h5"
filename_pred = "/home/matteo/Downloads/prediction.h5"
filename_pred="/work/mmalucch/out_hh4b/hh4b_btag_in_presel_0pad/output_JetGoodHiggs_test.h5"
df_test = h5py.File(filename_test,'r')
df_pred = h5py.File(filename_pred,'r')

In [6]:
df_pred["TARGETS"]["h1"].keys()

<KeysViewHDF5 ['b1', 'b2']>

## Compute the jet assignment efficiency
We extract the predicted and true indices for the individual quarks.
Jets are correctly assigned when the predicted jet index is equal to the true index. We can compute the efficiency as the ratio of the correctly assigned jets over the total number of jets.

In [7]:
idx_b1_pred = df_pred["TARGETS"]["h1"]["b1"][()]
idx_b2_pred = df_pred["TARGETS"]["h1"]["b2"][()]
idx_b1_pred

array([1, 0, 1, ..., 0, 0, 2])

In [8]:
idx_b1_true = df_test["TARGETS"]["h1"]["b1"][()]
idx_b2_true = df_test["TARGETS"]["h1"]["b2"][()]
idx_b1_true

array([1, 0, 1, ..., 0, 0, 2])

In [9]:
idx_h1_pred = ak.concatenate((ak.unflatten(idx_b1_pred, ak.ones_like(idx_b1_pred)), ak.unflatten(idx_b2_pred, ak.ones_like(idx_b2_pred))), axis=1)
idx_h1_true = ak.concatenate((ak.unflatten(idx_b1_true, ak.ones_like(idx_b1_true)), ak.unflatten(idx_b2_true, ak.ones_like(idx_b2_true))), axis=1)
idx_h1_pred

<Array [[1, 3], [0, 3], ... [0, 2], [2, 3]] type='129710 * var * int64'>

In [10]:
print(idx_h1_pred)
print(idx_h1_true)

[[1, 3], [0, 3], [1, 2], [0, 2], [2, 3], ... [1, 3], [2, 3], [0, 1], [0, 2], [2, 3]]
[[1, 3], [0, 3], [1, 2], [0, 2], [2, 3], ... [1, 3], [2, 3], [0, 1], [0, 2], [2, 3]]


In [11]:
idx_b3_pred = df_pred["TARGETS"]["h2"]["b3"][()]
idx_b4_pred = df_pred["TARGETS"]["h2"]["b4"][()]
idx_b3_true = df_test["TARGETS"]["h2"]["b3"][()]
idx_b4_true = df_test["TARGETS"]["h2"]["b4"][()]

idx_h2_pred = ak.concatenate((ak.unflatten(idx_b3_pred, ak.ones_like(idx_b3_pred)), ak.unflatten(idx_b4_pred, ak.ones_like(idx_b4_pred))), axis=1)
idx_h2_true = ak.concatenate((ak.unflatten(idx_b3_true, ak.ones_like(idx_b3_true)), ak.unflatten(idx_b4_true, ak.ones_like(idx_b4_true))), axis=1)
print(idx_h2_pred)
print(idx_h2_true)

[[0, 2], [1, 2], [0, 3], [1, -1], [0, 1], ... 0, -1], [0, 1], [2, 3], [1, 3], [0, 1]]
[[0, 2], [1, 2], [0, 3], [1, -1], [0, 1], ... 0, -1], [0, 1], [2, 3], [1, 3], [0, 1]]


In [45]:
# order the true array in increasing order
idx_h1_true_order = ak.sort(idx_h1_true, axis=1)
idx_h1_true_order = idx_h1_true
print(idx_h1_true)
print(idx_h1_true_order)
print(ak.sum(idx_h1_true[:,0]>idx_h1_true[:,1]))
print(ak.sum(idx_h1_true_order[:,0]>idx_h1_true_order[:,1]))

print(idx_h1_true[idx_h1_true[:,0]>idx_h1_true[:,1]])
print(len(idx_h1_true[idx_h1_true[:,0]>idx_h1_true[:,1]]))

print(idx_h1_true_order)

print(idx_h1_true_order[idx_h1_true_order[:,1]==-1])
print(ak.num(idx_h1_true_order[(idx_h1_true[:,1]==-1) & (idx_h1_true[:,0]!=-1)], axis=0))
print(len(idx_h1_true[(idx_h1_true[:,1]==-1) & (idx_h1_true[:,0]!=-1)]))

idx_h2_true_order=ak.sort(idx_h2_true, axis=1)
idx_h2_true_order = idx_h2_true


[[1, 3], [0, 3], [1, 2], [0, 2], [2, 3], ... [1, 3], [2, 3], [0, 1], [0, 2], [2, 3]]
[[1, 3], [0, 3], [1, 2], [0, 2], [2, 3], ... [1, 3], [2, 3], [0, 1], [0, 2], [2, 3]]
5630
5630
[[0, -1], [0, -1], [0, -1], [0, -1], [2, ... -1], [2, -1], [2, -1], [1, -1], [1, -1]]
5630
[[1, 3], [0, 3], [1, 2], [0, 2], [2, 3], ... [1, 3], [2, 3], [0, 1], [0, 2], [2, 3]]
[[0, -1], [0, -1], [0, -1], [0, -1], [2, ... -1], [2, -1], [2, -1], [1, -1], [1, -1]]
5630
5630


In [32]:
# concatenate the two arrays for h1 and h2 adding another axis
idx_true = ak.concatenate((ak.unflatten(idx_h1_true_order, ak.ones_like(idx_h1_true_order[:,0])), ak.unflatten(idx_h2_true_order, ak.ones_like(idx_h2_true_order[:,0]))), axis=1)
print(idx_true[2:5])

idx_pred = ak.concatenate((ak.unflatten(idx_h1_pred, ak.ones_like(idx_h1_pred[:,0])), ak.unflatten(idx_h2_pred, ak.ones_like(idx_h2_pred[:,0]))), axis=1)
print(idx_pred[2:5])

# if an event has a -1 in the true array, remove it
mask_fully_matched = ak.all(ak.all(idx_true>=0, axis=-1), axis=-1)
print(mask_fully_matched)

idx_true_fully_matched = idx_true[mask_fully_matched]
idx_pred_fully_matched = idx_pred[mask_fully_matched]
print(idx_true_fully_matched[2:5])
print(idx_pred_fully_matched[2:5])

print(ak.sum(idx_true_fully_matched[idx_true_fully_matched[:,:,0]>idx_true_fully_matched[:,:,1]]))

print(len(idx_true_fully_matched))
print(len(idx_true_fully_matched)/len(idx_true))

# idx_true_fully_matched=ak.to_numpy  (idx_true_fully_matched)
# idx_pred_fully_matched=ak.to_numpy  (idx_pred_fully_matched)


[[[1, 2], [0, 3]], [[0, 2], [1, -1]], [[2, 3], [0, 1]]]
[[[1, 2], [0, 3]], [[0, 2], [1, -1]], [[2, 3], [0, 1]]]
[True, True, True, False, True, True, True, ... True, False, True, True, True, True]
[[[1, 2], [0, 3]], [[2, 3], [0, 1]], [[0, 1], [2, 3]]]
[[[1, 2], [0, 3]], [[2, 3], [0, 1]], [[0, 1], [2, 3]]]
0
116863
0.9009559787217639


In [24]:
# # CHECK IF THE TWO ARRAYS ARE EQUAL and if you swap the columns
# matched_events =[]
# for i in range(len(idx_true_fully_matched)):
#     arr_true = idx_true_fully_matched[i]
#     arr_pred = idx_pred_fully_matched[i]
#     if ak.all(arr_true[0]==arr_pred[0]) or ak.all(arr_true[0]==arr_pred[1]):
#         matched_events.append(i)

# print(len(matched_events))


In [25]:
# print("eff: ", len(matched_events)/len(idx_true_fully_matched))

In [26]:
correctly_fully_matched = (
    ak.all(idx_true_fully_matched[:, 0] == idx_pred_fully_matched[:, 0], axis=1)
    | ak.all(idx_true_fully_matched[:, 0] == idx_pred_fully_matched[:, 1], axis=1)
    | ak.all(idx_true_fully_matched[:, 1] == idx_pred_fully_matched[:, 0], axis=1)
    | ak.all(idx_true_fully_matched[:, 1] == idx_pred_fully_matched[:, 1], axis=1)
)
print(ak.sum(correctly_fully_matched) / len(idx_true_fully_matched))

1.0


In [27]:
mask_1h = ak.sum(ak.any(idx_true == -1, axis=-1), axis=-1) == 1
print(mask_1h)
print(idx_true)
idx_true_partially_matched_1h = idx_true[mask_1h]
idx_pred_partially_matched_1h = idx_pred[mask_1h]
print(idx_true_partially_matched_1h[2:5])
print(idx_pred_partially_matched_1h[2:5])
print(len(idx_true_partially_matched_1h))
print(len(idx_true_partially_matched_1h) / len(idx_true))

correctly_partially_matched_1h = (
    ak.all(
        idx_true_partially_matched_1h[:, 0] == idx_pred_partially_matched_1h[:, 0],
        axis=1,
    )
    | ak.all(
        idx_true_partially_matched_1h[:, 0] == idx_pred_partially_matched_1h[:, 1],
        axis=1,
    )
    | ak.all(
        idx_true_partially_matched_1h[:, 1] == idx_pred_partially_matched_1h[:, 0],
        axis=1,
    )
    | ak.all(
        idx_true_partially_matched_1h[:, 1] == idx_pred_partially_matched_1h[:, 1],
        axis=1,
    )
)
print(correctly_partially_matched_1h[2:5])
print(ak.sum(correctly_partially_matched_1h) / len(idx_true_partially_matched_1h))

[False, False, False, True, False, False, ... True, False, False, False, False]
[[[1, 3], [0, 2]], [[0, 3], [1, 2]], [[1, ... [[0, 2], [1, 3]], [[2, 3], [0, 1]]]
[[[0, 1], [2, -1]], [[0, 3], [1, -1]], [[0, -1], [2, 3]]]
[[[0, 1], [2, -1]], [[0, 3], [1, -1]], [[0, -1], [2, 3]]]
12279
0.09466502197209159
[True, True, True]
1.0


In [28]:
mask_0h=ak.sum(ak.any(idx_true==-1, axis=-1), axis=-1)==2
print(mask_0h)
print(idx_true)
idx_true_unmatched=idx_true[mask_0h]
idx_pred_unmatched=idx_pred[mask_0h]
print(idx_true_unmatched[2:5])
print(idx_pred_unmatched[2:5])
print(len(idx_true_unmatched))
print(len(idx_true_unmatched)/len(idx_true))


[False, False, False, False, False, False, ... False, False, False, False, False]
[[[1, 3], [0, 2]], [[0, 3], [1, 2]], [[1, ... [[0, 2], [1, 3]], [[2, 3], [0, 1]]]
[[[1, -1], [2, -1]], [[0, -1], [1, -1]], [[1, -1], [0, -1]]]
[[[1, -1], [2, -1]], [[0, -1], [1, -1]], [[1, -1], [0, -1]]]
568
0.004378999306144476


In [29]:
print(len(idx_true_fully_matched)+len(idx_true_partially_matched_1h)+len(idx_true_unmatched))
print(len(idx_true))

129710
129710


In [30]:
# summary
frac_fully_matched = len(idx_true_fully_matched) / len(idx_true)
frac_partially_matched_1h = len(idx_true_partially_matched_1h) / len(idx_true)
frac_unmatched = len(idx_true_unmatched) / len(idx_true)
eff_fully_matched = ak.sum(correctly_fully_matched) / len(idx_true_fully_matched)
eff_partially_matched_1h = ak.sum(correctly_partially_matched_1h) / len(idx_true_partially_matched_1h)
eff_unmatched = len(idx_true_unmatched) / len(idx_true)

print(
    f"Fraction of events fully matched: {frac_fully_matched:.3f}\n"
    f"Fraction of events partially matched (1h): {frac_partially_matched_1h:.3f}\n"
    f"Fraction of events unmatched: {frac_unmatched:.3f}\n"
    f"Purity fully matched: {eff_fully_matched:.3f}\n"
    f"Purity partially matched (1h): {eff_partially_matched_1h:.3f}\n"
    f"Purity unmatched: {eff_unmatched:.3f}"
)

Fraction of events fully matched: 0.901
Fraction of events partially matched (1h): 0.095
Fraction of events unmatched: 0.004
Purity fully matched: 1.000
Purity partially matched (1h): 1.000
Purity unmatched: 0.004
