In [1]:
from collections import defaultdict
import awkward as ak
import numba
import numpy as np
import pandas as pd
import h5py
import vector
vector.register_numba()
vector.register_awkward()

#import matplotlib.pyplot as plt
#from matplotlib.colors import LogNorm
#import mplhep as hep
#hep.style.use(hep.style.ROOT)

# Predictions on fully-matched dataset of model trained on inclusive dataset

In [2]:
filename_test = "/afs/cern.ch/work/m/mmarcheg/ttHbb/ttHbb_SPANet/datasets/test_file_test_29753.h5"
filename_pred = "/afs/cern.ch/work/m/mmarcheg/ttHbb/ttHbb_SPANet/spanet_output/predictions/predictions_version_4_fullymatched_test_29753.h5"
df_test = h5py.File(filename_test,'r')
df_pred = h5py.File(filename_pred,'r')

In [3]:
df_pred["TARGETS"]["h"].keys()

<KeysViewHDF5 ['assignment_probability', 'b1', 'b2', 'detection_probability', 'marginal_probability']>

## Compute the jet assignment efficiency
We extract the predicted and true indices for the individual quarks.
Jets are correctly assigned when the predicted jet index is equal to the true index. We can compute the efficiency as the ratio of the correctly assigned jets over the total number of jets.

In [4]:
idx_b1_pred = df_pred["TARGETS"]["h"]["b1"][()]
idx_b2_pred = df_pred["TARGETS"]["h"]["b2"][()]
idx_b1_pred

array([0, 0, 2, ..., 0, 4, 1])

In [5]:
idx_b1_true = df_test["TARGETS"]["h"]["b1"][()]
idx_b2_true = df_test["TARGETS"]["h"]["b2"][()]
idx_b1_true

array([0, 0, 2, ..., 0, 4, 1])

In [6]:
idx_h_pred = ak.concatenate((ak.unflatten(idx_b1_pred, ak.ones_like(idx_b1_pred)), ak.unflatten(idx_b2_pred, ak.ones_like(idx_b2_pred))), axis=1)
idx_h_true = ak.concatenate((ak.unflatten(idx_b1_true, ak.ones_like(idx_b1_true)), ak.unflatten(idx_b2_true, ak.ones_like(idx_b2_true))), axis=1)
idx_h_pred

In [7]:
idx_h_true

In [8]:
is_correct_higgs = ak.sum(idx_h_pred == idx_h_true, axis=1) == 2
is_correct_higgs

In [9]:
n_tot = len(is_correct_higgs)
n_tot

29753

In [10]:
n_correct = ak.sum(is_correct_higgs)
n_correct

18392

In [11]:
eff_h = n_correct / n_tot
eff_h

0.6181561523207744

In [12]:
idx_q1_pred = df_pred["TARGETS"]["t1"]["q1"][()]
idx_q2_pred = df_pred["TARGETS"]["t1"]["q2"][()]
idx_b_pred = df_pred["TARGETS"]["t1"]["b"][()]
idx_q1_true = df_test["TARGETS"]["t1"]["q1"][()]
idx_q2_true = df_test["TARGETS"]["t1"]["q2"][()]
idx_b_true = df_test["TARGETS"]["t1"]["b"][()]
idx_b_pred

array([6, 5, 0, ..., 1, 0, 4])

In [13]:
idx_thad_pred = ak.concatenate(
     (ak.unflatten(idx_q1_pred, ak.ones_like(idx_q1_pred)),
     ak.unflatten(idx_q2_pred, ak.ones_like(idx_q2_pred)),
     ak.unflatten(idx_b_pred, ak.ones_like(idx_b_pred))),
     axis=1)
idx_thad_true = ak.concatenate(
     (ak.unflatten(idx_q1_true, ak.ones_like(idx_q1_true)),
     ak.unflatten(idx_q2_true, ak.ones_like(idx_q2_true)),
     ak.unflatten(idx_b_true, ak.ones_like(idx_b_true))),
     axis=1)
idx_thad_pred

In [14]:
is_correct_thad = ak.sum(idx_thad_pred == idx_thad_true, axis=1) == 2
is_correct_thad

In [15]:
n_correct = ak.sum(is_correct_thad)
n_tot = len(is_correct_thad)
eff_thad = n_correct / n_tot
eff_thad

0.19003125735219978

In [16]:
idx_b_pred = df_pred["TARGETS"]["t2"]["b"][()]
idx_b_true = df_test["TARGETS"]["t2"]["b"][()]
idx_tlep_pred = ak.unflatten(idx_b_pred, ak.ones_like(idx_b_pred))
idx_tlep_true = ak.unflatten(idx_b_true, ak.ones_like(idx_b_pred))
idx_tlep_pred

In [17]:
is_correct_tlep = ak.sum(idx_tlep_pred == idx_tlep_true, axis=1) == 1
is_correct_tlep

In [18]:
n_correct = ak.sum(is_correct_tlep)
n_tot = len(is_correct_tlep)
eff_tlep = n_correct / n_tot
eff_tlep

0.7325311733270594

## Resulting Higgs and top reconstruction efficiencies
The Higgs and top efficiencies are defined as the number of events with all the jets correctly assigned over the total number of events:

$\epsilon = \frac{N_{assigned}}{N_{tot}}$

In [19]:
eff_h, eff_thad, eff_tlep

(0.6181561523207744, 0.19003125735219978, 0.7325311733270594)