In [37]:
# This notebook is for creating a sample of fully matched events from all_jets_fullRun2_v1.parquet

In [None]:
import awkward as ak
import numba
import numpy as np
import pandas as pd
import awkward as ak
import h5py
import vector
# Register the vector library's Numba and Awkward Array integrations.
vector.register_numba()
vector.register_awkward()

import matplotlib.pyplot as plt
from matplotlib.colors import LogNorm
import mplhep as hep
# Apply mplhep's ROOT plotting style to matplotlib.
hep.style.use(hep.style.ROOT)

In [38]:
# Read the input Parquet file into an Awkward Array and display a summary of it.
parquet_path = "/eos/user/d/dvalsecc/www/ttHbbAnalysis/training_dataset/all_jets_fullRun2_v1.parquet"
df = ak.from_parquet(parquet_path)
df

<Array [{jets: [{pt: 68.7, ... m: 125}}] type='1414130 * {"jets": var * {"pt": f...'>

In [39]:
# Take the per-event jet collection by field name rather than by position:
# this does not depend on how many other fields the table has or on their
# order, and it avoids rebinding `_`, which IPython uses for the last output.
jets = df["jets"]

In [40]:
# Display the jet collection extracted from the input table (before any selection).
jets

<Array [[{pt: 68.7, eta: 0.853, ... prov: 1}]] type='1414130 * var * {"pt": floa...'>

In [41]:
# Step 1: keep only events with at least six jets flagged as matched.
mask_fullymatched = ak.sum(jets.matched == True, axis=1)>=6
jets = jets[mask_fullymatched]

# Step 2: require the exact per-event multiplicities that the target-index
# extraction further down relies on. `prov` codes as used in this notebook:
#   1 = b from H, 2 = b from hadronic top (t1), 3 = b from leptonic top (t2),
#   5 = quark from the hadronic W.
higgs = jets[jets.prov == 1]
mask_match = ak.num(higgs) == 2

# Count prov 5 and prov 2 separately. Requiring only that their sum is 3 would
# also accept e.g. three prov-5 jets and no prov-2 jet, and the later cells
# index exactly two prov-5 jets and one prov-2 jet per event.
w_jets = jets[jets.prov == 5]
had_b_jets = jets[jets.prov == 2]
# Kept so that any other cell referring to the combined collection still works.
w_or_t_jets = jets[(jets.prov == 5)|(jets.prov == 2)]
mask_match = mask_match & (ak.num(w_jets) == 2) & (ak.num(had_b_jets) == 1)

lep_top = jets[jets.prov == 3]
mask_match = mask_match & (ak.num(lep_top) == 1)

jets = jets[mask_match]

In [42]:
# Display the jet collection after the fully-matched selection.
jets

<Array [[{pt: 145, eta: 1.5, ... prov: 1}]] type='159299 * var * {"pt": float32,...'>

In [44]:
# Largest number of jets found in any selected event.
jets_per_event = ak.num(jets)
ak.max(jets_per_event)

16

In [45]:
# Open the HDF5 output file in write mode. The handle stays open across the
# following cells and is closed by the explicit output_file.close() call later.
output_file = h5py.File("fulltth_matched.h5", "w")

In [46]:
# One HDF5 group per reconstruction target, created in this order:
#   t1: hadronic top -> q1 q2 b
#   t2: leptonic top -> b
#   h : higgs -> b1 b2
target_paths = ("TARGETS/t1", "TARGETS/t2", "TARGETS/h")
t1, t2, h = (output_file.create_group(path) for path in target_paths)

In [47]:
# For every jet, build a tuple of its local index along each axis of `jets`,
# i.e. (event index, position of the jet inside the event).
per_axis_indices = [ak.local_index(jets, axis) for axis in range(jets.ndim)]
multiindex = ak.zip(per_axis_indices)

In [48]:
# Preview the index tuples of the first two events.
multiindex[:2].tolist()

[[(0, 0), (0, 1), (0, 2), (0, 3), (0, 4), (0, 5)],
 [(1, 0), (1, 1), (1, 2), (1, 3), (1, 4), (1, 5), (1, 6)]]

# Create arrays for H -> b1 b2

In [49]:
# Per-jet boolean mask selecting the jets with prov == 1 (the Higgs b-jets).
mask = jets.prov == 1

In [50]:
# Keep only the (event, jet) index tuples of the jets selected by `mask`.
multiindex2 = multiindex[mask]

In [51]:
# Preview the selected index tuples of the first five events.
multiindex2[:5].tolist()

[[(0, 1), (0, 5)],
 [(1, 2), (1, 4)],
 [(2, 1), (2, 4)],
 [(3, 0), (3, 4)],
 [(4, 0), (4, 4)]]

In [52]:
# Display a summary of the selected index tuples for all events.
multiindex2

<Array [[(0, 1), (0, 5), ... 1), (159298, 5)]] type='159299 * var * (int64, int64)'>

In [53]:
# Position, inside each event, of the two jets selected in `multiindex2`.
# `multiindex2` holds (event, jet) tuples; field "1" is the jet's position
# within its event. Slicing that column once replaces the per-event Python loop
# and yields the same plain lists of ints.
higgs_jet_positions = multiindex2["1"]
b1_array = higgs_jet_positions[:, 0].tolist()  # first selected jet of every event
b2_array = higgs_jet_positions[:, 1].tolist()  # second selected jet of every event

# Create arrays for t1 -> q1 q2 b

In [54]:
# Per-jet boolean mask selecting the jets with prov == 5.
mask = jets.prov == 5 # W -> q1 q2 from t1 (the hadronic top)

In [55]:
# Keep only the (event, jet) index tuples of the jets selected by `mask`.
multiindex2 = multiindex[mask]

In [56]:
# Position, inside each event, of the two jets selected in `multiindex2`.
# `multiindex2` holds (event, jet) tuples; field "1" is the jet's position
# within its event. Slicing that column once replaces the per-event Python loop
# and yields the same plain lists of ints.
w_jet_positions = multiindex2["1"]
q1_array = w_jet_positions[:, 0].tolist()  # first selected jet of every event
q2_array = w_jet_positions[:, 1].tolist()  # second selected jet of every event

In [57]:
# Per-jet boolean mask selecting the jets with prov == 2.
mask = jets.prov == 2 # t1 -> W b (b-jet from the hadronic top)

In [58]:
# Keep only the (event, jet) index tuples of the jets selected by `mask`.
multiindex2 = multiindex[mask]

In [59]:
# Position, inside each event, of the first jet selected in `multiindex2`.
# `multiindex2` holds (event, jet) tuples; field "1" is the jet's position
# within its event. Slicing that column once replaces the per-event Python loop
# and yields the same plain list of ints.
had_b_array = multiindex2["1"][:, 0].tolist()


# Create arrays for t2 -> b

In [60]:
# Per-jet boolean mask selecting the jets with prov == 3.
mask = jets.prov == 3 # t2 -> b (b-jet from the leptonic top)

In [61]:
# Keep only the (event, jet) index tuples of the jets selected by `mask`.
multiindex2 = multiindex[mask]

In [62]:
# Position, inside each event, of the first jet selected in `multiindex2`.
# `multiindex2` holds (event, jet) tuples; field "1" is the jet's position
# within its event. Slicing that column once replaces the per-event Python loop
# and yields the same plain list of ints.
lep_b_array = multiindex2["1"][:, 0].tolist()


In [63]:
def _store_indices(path, indices):
    """Write one per-event jet-index sequence to `path` in the output file as int64."""
    return output_file.create_dataset(path, np.shape(indices), dtype='int64', data=indices)


# Hadronic top (t1): the two W quarks and the b.
t1_q1 = _store_indices("TARGETS/t1/q1", q1_array)
t1_q2 = _store_indices("TARGETS/t1/q2", q2_array)
t1_b = _store_indices("TARGETS/t1/b", had_b_array)

# Leptonic top (t2): only the b.
t2_b = _store_indices("TARGETS/t2/b", lep_b_array)

# Higgs: the two b-jets.
h_b1 = _store_indices("TARGETS/h/b1", b1_array)
h_b2 = _store_indices("TARGETS/h/b2", b2_array)

In [64]:
# Number of jet slots per event in the fixed-size INPUTS arrays.
# NOTE(review): pad_none(..., clip=True) drops any jet beyond this length, while
# the TARGETS indices are written unclipped; confirm that no selected event has
# more than MAX_JETS jets.
MAX_JETS = 16


def _padded_feature(values):
    """Return `values` as a dense (n_events, MAX_JETS) NumPy array.

    Events with fewer than MAX_JETS jets are padded with 0; events with more
    are truncated to the first MAX_JETS jets.
    """
    padded = ak.pad_none(values, MAX_JETS, clip=True)
    return ak.to_numpy(ak.fill_none(padded, 0))


inputs = output_file.create_group("INPUTS")
source = output_file.create_group("INPUTS/Source")

pt_array = _padded_feature(jets.pt)
# True for slots holding a real jet, False for padding. Built from the jet
# multiplicity rather than from `pt == 0`, so a real jet whose pt happens to be
# exactly 0 is not mistaken for padding.
n_jets = ak.to_numpy(ak.num(jets, axis=1))
mask = np.arange(MAX_JETS) < n_jets[:, np.newaxis]
mask_ds = output_file.create_dataset("INPUTS/Source/MASK", np.shape(mask), dtype='bool', data=mask)
pt_ds = output_file.create_dataset("INPUTS/Source/pt", np.shape(pt_array), dtype='float32', data=pt_array)

phi_array = _padded_feature(jets.phi)
phi_ds = output_file.create_dataset("INPUTS/Source/phi", np.shape(phi_array), dtype='float32', data=phi_array)

eta_array = _padded_feature(jets.eta)
eta_ds = output_file.create_dataset("INPUTS/Source/eta", np.shape(eta_array), dtype='float32', data=eta_array)

btag = _padded_feature(jets.btag)
btag_ds = output_file.create_dataset("INPUTS/Source/btag", np.shape(btag), dtype='float32', data=btag)


In [65]:
# Close the HDF5 output file so everything written above is flushed to disk.
output_file.close()

In [66]:
# Sanity check: show the prov codes of the jets in the first 20 selected events.
jets.prov[:20].tolist()

[[5.0, 1.0, 5.0, 3.0, 2.0, 1.0],
 [5.0, 4.0, 1.0, 3.0, 1.0, 2.0, 5.0],
 [2.0, 1.0, 5.0, 3.0, 1.0, 5.0],
 [1.0, 5.0, 5.0, 3.0, 1.0, 2.0],
 [1.0, 2.0, 5.0, 3.0, 1.0, 5.0, -1.0],
 [1.0, 3.0, 5.0, 1.0, 5.0, 2.0],
 [2.0, 4.0, 5.0, 5.0, 1.0, 3.0, 1.0],
 [1.0, 5.0, 2.0, 3.0, 5.0, 1.0, 4.0],
 [1.0, 5.0, 3.0, 2.0, 5.0, 1.0],
 [2.0, 5.0, 1.0, 5.0, 3.0, 1.0, 4.0, -1.0],
 [2.0, 1.0, 5.0, 5.0, 3.0, 1.0, 4.0],
 [1.0, 2.0, 3.0, 1.0, 5.0, 5.0],
 [1.0, 2.0, 5.0, 5.0, 3.0, 1.0],
 [4.0, 2.0, 3.0, 5.0, 5.0, 1.0, -1.0, 1.0, -1.0, -1.0],
 [2.0, 5.0, 1.0, 3.0, 1.0, 5.0],
 [5.0, 1.0, 5.0, 3.0, 2.0, 1.0],
 [1.0, 1.0, 5.0, 2.0, 5.0, 3.0],
 [5.0, 5.0, 1.0, 1.0, 3.0, 2.0],
 [1.0, 4.0, 2.0, 5.0, 3.0, 5.0, 1.0],
 [2.0, 3.0, 5.0, 5.0, 1.0, 1.0]]