In [1]:
# This notebook is for creating a training sample and a separate testing sample
# that includes lepton pt,eta,phi, met and ht from event 
# file all_jets_fullRun2_v1.parquet

In [104]:
from collections import defaultdict
import awkward as ak
import numba
import numpy as np
import pandas as pd
import h5py
import vector
vector.register_numba()
vector.register_awkward()

#import matplotlib.pyplot as plt
#from matplotlib.colors import LogNorm
#import mplhep as hep
#hep.style.use(hep.style.ROOT)

In [3]:
filename = "/afs/cern.ch/work/m/mmarcheg/ttHbb/ttHbb_SPANet/test_dataset/output_test_v3/ttHTobb_ttToSemiLep.parquet"
df = ak.from_parquet(filename)

In [4]:
df.JetGood

In [5]:
jets = ak.with_name(df.JetGood, name="Momentum4D")
leptons = ak.with_name(df.LeptonGood, name="Momentum4D")
met = ak.with_name(df.MET, name="Momentum4D")
jets

In [6]:
frac_train = 0.8
ntot = ak.count(met)
index_train_max = int(np.ceil(frac_train*ntot))
ntot

1277812

In [7]:
jets_train = jets[:index_train_max]
leptons_train = leptons[:index_train_max]
met_train = met[:index_train_max]

jets_test = jets[index_train_max:]
leptons_test = leptons[index_train_max:]
met_test = met[index_train_max:]

ak.count(met_train), ak.count(met_test)

(1022250, 255562)

In [33]:
def create_groups(file):
    file.create_group("TARGETS/t1") # hadronic top -> q1 q2 b
    file.create_group("TARGETS/t2") # leptonic top -> b
    file.create_group("TARGETS/h") # higgs -> b1 b2
    file.create_group("INPUTS")
    file.create_group("INPUTS/Source")
    file.create_group("INPUTS/Lepton")
    file.create_group("INPUTS/Met")
    file.create_group("INPUTS/ht")
    return file

def create_targets(file, particle, jets):
    multiindex = ak.zip([ak.local_index(jets, i) for i in range(jets.ndim)])
    
    if particle == "h":
        mask = jets.prov == 1 # H->b1b2
        multiindex2 = multiindex[mask]
        
        b1_array = []
        b2_array = []

        for index,i in enumerate(multiindex2):
            if len(i) == 0:
                b1_array.append(-1)
                b2_array.append(-1)
            elif len(i) == 1:
                b1_array.append(i[0].tolist()[1])
                b2_array.append(-1)
            elif len(i) == 2:
                b1_array.append(i[0].tolist()[1])
                b2_array.append(i[1].tolist()[1])
        
        file.create_dataset("TARGETS/h/b1", np.shape(b1_array), dtype='int64', data=b1_array)
        file.create_dataset("TARGETS/h/b2", np.shape(b2_array), dtype='int64', data=b2_array)
        
    elif particle == "t1":
        mask = jets.prov == 5 # W->q1q2 from t1
        multiindex2 = multiindex[mask]
        
        q1_array = []
        q2_array = []

        for index,i in enumerate(multiindex2):
            if len(i) == 0:
                q1_array.append(-1)
                q2_array.append(-1)
            elif len(i) == 1:
                q1_array.append(i[0].tolist()[1])
                q2_array.append(-1)
            elif len(i) == 2:
                q1_array.append(i[0].tolist()[1])
                q2_array.append(i[1].tolist()[1])
                
        mask = jets.prov == 2 # t1->Wb 
        multiindex2 = multiindex[mask]
        
        had_b_array = []

        for index,i in enumerate(multiindex2):
            if len(i) == 0:
                had_b_array.append(-1)
            elif len(i) == 1:
                had_b_array.append(i[0].tolist()[1])
                
        file.create_dataset("TARGETS/t1/q1", np.shape(q1_array), dtype='int64', data=q1_array)
        file.create_dataset("TARGETS/t1/q2", np.shape(q2_array), dtype='int64', data=q2_array)
        file.create_dataset("TARGETS/t1/b", np.shape(had_b_array), dtype='int64', data=had_b_array)
                
    elif particle == "t2":
        mask = jets.prov == 3 # t2->b 
        multiindex2 = multiindex[mask]
        
        lep_b_array = []

        for index,i in enumerate(multiindex2):
            if len(i) == 0:
                lep_b_array.append(-1)
            elif len(i) == 1:
                lep_b_array.append(i[0].tolist()[1])

        file.create_dataset("TARGETS/t2/b", np.shape(lep_b_array), dtype='int64', data=lep_b_array)

def get_object_features(objects, features=["MASK", "pt", "eta", "sin_phi", "cos_phi", "btag"]):

    features_dict = {}
    for feat in features:
        if feat == "MASK": continue
        if feat in ["sin_phi", "cos_phi"]:
            phi = objects["phi"]
            if feat == "sin_phi":
                values = np.sin(phi)
            elif feat == "cos_phi":
                values = np.cos(phi)
        else:
            values = objects[feat]
        if objects.ndim == 1:
            features_dict[feat] = ak.to_numpy(values)
        elif objects.ndim == 2:
            features_dict[feat] = ak.to_numpy(ak.fill_none(ak.pad_none(values, 16, clip=True), 0))
        else:
            raise NotImplementedError

    if "MASK" in features:
        if not "pt" in features:
            raise NotImplementedError
        features_dict["MASK"] = ~(features_dict["pt"] == 0)
    return features_dict

def create_inputs(file, jets, lep, met):
    features = defaultdict(dict)
    features["Jet"] = get_object_features(jets, ["MASK", "pt", "eta", "sin_phi", "cos_phi", "btag"])
    features["Lepton"] = get_object_features(lep, ["pt", "eta", "sin_phi", "cos_phi"])
    features["Met"] = get_object_features(met, ["pt", "eta", "sin_phi", "cos_phi"])
    features["Event"]["ht"] = ak.sum(features["Jet"]["pt"], axis=1)

    for obj, feats in features.items():
        for feat, val in feats.items():
            if feat == "MASK":
                dtype = 'bool'
            else:
                dtype = 'float32'
            ds = file.create_dataset(f"INPUTS/{obj}/{feat}", np.shape(val), dtype=dtype, data=val)

In [34]:
mask_fullymatched = ak.sum(jets_test.matched == True, axis=1)>=6
jets_test_fullymatched = jets_test[mask_fullymatched]

In [35]:
higgs = jets_test_fullymatched[jets_test_fullymatched.prov == 1]
mask_match = ak.num(higgs) == 2

w_or_t_jets = jets_test_fullymatched[(jets_test_fullymatched.prov == 5)|(jets_test_fullymatched.prov == 2)]
mask_match = mask_match & (ak.num(w_or_t_jets) == 3)

lep_top = jets_test_fullymatched[jets_test_fullymatched.prov == 3]
mask_match = mask_match & (ak.num(lep_top) == 1)

jets_test_fullymatched = jets_test_fullymatched[mask_match]
jets_test_fullymatched

In [41]:
# Prepare files for inputs and targets
test_file = h5py.File("test_lep_met_ht_matched_v5.h5", "w")
test_file = create_groups(test_file)

In [43]:
create_targets(test_file, "h", jets_test_fullymatched)
create_targets(test_file, "t1", jets_test_fullymatched)
create_targets(test_file, "t2", jets_test_fullymatched)

In [44]:
# Create input arrays in the files
create_inputs(test_file, jets_test_fullymatched, leptons_test, met_test)

In [45]:
test_file

<HDF5 file "test_lep_met_ht_matched_v5.h5" (mode r+)>

In [46]:
# Print the index of b-jets
test_file["TARGETS"]["t1"]["b"][:5]

array([6, 5, 0, 1, 5])

In [47]:
# Print the btag score of jets
test_file["INPUTS"]["Jet"]["btag"][:5]

array([[0.49682617, 0.43969727, 0.02072144, 0.00827789, 0.0051384 ,
        0.9663086 , 0.8642578 , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        ],
       [0.5883789 , 0.328125  , 0.9995117 , 0.00226212, 0.03013611,
        0.8149414 , 0.40551758, 0.00651169, 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        ],
       [0.9765625 , 0.99316406, 0.9995117 , 0.83251953, 0.05593872,
        0.0138855 , 0.01021576, 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        ],
       [0.10681152, 0.9790039 , 0.0066452 , 0.00436401, 0.9995117 ,
        0.99609375, 0.00643921, 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        ],
       [0.02229309, 0.9980469 , 0.99560547, 0.00672913, 0.06240845,
        0.95751953, 0.00355911, 

In [37]:
test_file.close()

In [19]:
feats = get_object_features(jets, features=["MASK", "pt", "eta", "sin_phi", "cos_phi", "btag"])
feats

{'pt': array([[110.25   ,  78.     ,  69.125  , ...,   0.     ,   0.     ,
           0.     ],
        [157.25   , 155.125  , 107.3125 , ...,   0.     ,   0.     ,
           0.     ],
        [ 65.4375 ,  42.65625,  41.40625, ...,   0.     ,   0.     ,
           0.     ],
        ...,
        [217.25   , 167.5    , 113.5625 , ...,   0.     ,   0.     ,
           0.     ],
        [ 87.25   ,  83.125  ,  78.4375 , ...,   0.     ,   0.     ,
           0.     ],
        [112.125  , 105.4375 ,  67.25   , ...,   0.     ,   0.     ,
           0.     ]]),
 'eta': array([[ 0.09004211, -1.77514648, -1.21435547, ...,  0.        ,
          0.        ,  0.        ],
        [ 1.18603516,  1.58496094, -0.38067627, ...,  0.        ,
          0.        ,  0.        ],
        [-0.17422485,  0.19692993, -0.95397949, ...,  0.        ,
          0.        ,  0.        ],
        ...,
        [ 1.37890625,  2.11181641,  0.83276367, ...,  0.        ,
          0.        ,  0.        ],
        [-1

In [25]:
feats = get_object_features(met, features=["MASK", "pt", "eta", "sin_phi", "cos_phi"])
feats

{'pt': array([ 69.345604,  62.67269 ,  53.48528 , ..., 134.18373 , 110.67311 ,
        130.79652 ], dtype=float32),
 'eta': array([0., 0., 0., ..., 0., 0., 0.], dtype=float32),
 'sin_phi': array([ 0.8207763,  0.9999924,  0.7120113, ..., -0.8824544, -0.611303 ,
         0.9743884], dtype=float32),
 'cos_phi': array([ 0.5712498 ,  0.00390179,  0.70216805, ...,  0.47039792,
        -0.7913967 , -0.22487144], dtype=float32),
 'MASK': array([ True,  True,  True, ...,  True,  True,  True])}

In [21]:
jets_test_fullymatched.ndim

2

In [22]:
leptons_test.ndim

1

In [54]:
test_file["INPUTS"]["ht"].keys()

<KeysViewHDF5 []>

In [55]:
ak.local_index(jets)

In [60]:
jets.prov

In [59]:
ak.local_index(jets)[jets.prov == 1]

In [62]:
multiindex = ak.zip([ak.local_index(jets, i) for i in range(jets.ndim)])
multiindex[:5]

In [70]:
mask = jets.prov == 1 # H->b1b2
multiindex2 = multiindex[mask]
multiindex2[:10]

In [91]:
def get_indices_b(obj):

    multiindex = ak.zip([ak.local_index(obj, i) for i in range(obj.ndim)])
    
    b1_array = []
    b2_array = []
    
    for index,i in enumerate(multiindex2):
        if len(i) == 0:
            b1_array.append(-1)
            b2_array.append(-1)
        elif len(i) == 1:
            b1_array.append(i[0].tolist()[1])
            b2_array.append(-1)
        elif len(i) == 2:
            b1_array.append(i[0].tolist()[1])
            b2_array.append(i[1].tolist()[1])
    return b1_array, b2_array

In [92]:
b1, b2 = get_indices_b(jets[:10])
b1

[1,
 2,
 2,
 3,
 3,
 1,
 1,
 2,
 0,
 1,
 1,
 1,
 1,
 0,
 3,
 0,
 2,
 2,
 3,
 0,
 2,
 1,
 0,
 1,
 2,
 0,
 1,
 0,
 1,
 0,
 2,
 1,
 1,
 0,
 0,
 2,
 1,
 0,
 3,
 2,
 1,
 1,
 0,
 0,
 0,
 1,
 3,
 3,
 3,
 0,
 -1,
 0,
 2,
 3,
 0,
 2,
 3,
 0,
 0,
 2,
 2,
 1,
 0,
 3,
 5,
 1,
 1,
 1,
 1,
 3,
 1,
 2,
 0,
 3,
 0,
 0,
 2,
 1,
 1,
 1,
 1,
 1,
 1,
 4,
 0,
 2,
 0,
 0,
 0,
 2,
 1,
 2,
 2,
 1,
 0,
 2,
 1,
 0,
 0,
 2,
 4,
 1,
 1,
 3,
 1,
 2,
 0,
 -1,
 2,
 2,
 0,
 3,
 4,
 0,
 2,
 0,
 4,
 1,
 0,
 1,
 0,
 4,
 1,
 0,
 2,
 3,
 1,
 2,
 2,
 3,
 3,
 4,
 0,
 2,
 2,
 0,
 0,
 0,
 0,
 3,
 3,
 -1,
 3,
 0,
 3,
 0,
 0,
 1,
 1,
 1,
 1,
 0,
 5,
 2,
 0,
 1,
 2,
 1,
 3,
 1,
 3,
 2,
 1,
 1,
 2,
 3,
 3,
 0,
 1,
 1,
 0,
 0,
 3,
 2,
 1,
 3,
 0,
 0,
 0,
 1,
 0,
 2,
 0,
 1,
 0,
 4,
 1,
 3,
 1,
 1,
 0,
 0,
 2,
 1,
 0,
 0,
 1,
 1,
 3,
 1,
 6,
 3,
 0,
 1,
 2,
 2,
 0,
 0,
 2,
 0,
 3,
 -1,
 0,
 0,
 0,
 1,
 2,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 3,
 2,
 2,
 3,
 3,
 3,
 1,
 0,
 1,
 2,
 1,
 2,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 3,
 3,
 1,
 1,
 0,
 1,


In [87]:
b2[:10]

[6, -1, 3, 5, -1, 2, 3, 5, 3, 3]

In [89]:
ak.fill_none(i2, -1)[:10]

In [90]:
ak.local_index(jets[:10])

In [68]:
b1_array[:10]

[1, 2, 2, 3, 3, 1, 1, 2, 0, 1]

In [69]:
b2_array[:10]

[6, -1, 3, 5, -1, 2, 3, 5, 3, 3]

In [79]:
indices = ak.local_index(jets)[mask]
indices

In [80]:
indices_padded = ak.pad_none(indices, 2)

In [81]:
i1 = indices_padded[:,0]
i2 = indices_padded[:,1]
i1

In [82]:
ak.fill_none(i2, -1)

In [96]:
def create_targets_optimized(file, particle, jets):
    indices = ak.local_index(jets)
    
    if particle == "h":
        mask = jets.prov == 1 # H->b1b2
        # We select the local indices of jets matched with the Higgs
        # The indices are padded with None such that there are 2 entries per event
        # The None values are filled with -1 (a nan value).
        indices_prov = ak.fill_none(ak.pad_none(indices[mask], 2), -1)

        index_b1 = indices_prov[:,0]
        index_b2 = indices_prov[:,1]
        
        file.create_dataset("TARGETS/h/b1", np.shape(index_b1), dtype='int64', data=index_b1)
        file.create_dataset("TARGETS/h/b2", np.shape(index_b2), dtype='int64', data=index_b2)
        
    elif particle == "t1":
        mask = jets.prov == 5 # W->q1q2 from t1
        indices_prov = ak.fill_none(ak.pad_none(indices[mask], 2), -1)

        index_q1 = indices_prov[:,0]
        index_q2 = indices_prov[:,1]

        mask = jets.prov == 2 # t1->Wb
        index_b_hadr = ak.fill_none(ak.pad_none(indices[mask], 1), -1)
                
        file.create_dataset("TARGETS/t1/q1", np.shape(index_q1), dtype='int64', data=index_q1)
        file.create_dataset("TARGETS/t1/q2", np.shape(index_q2), dtype='int64', data=index_q2)
        file.create_dataset("TARGETS/t1/b", np.shape(index_b_hadr), dtype='int64', data=index_b_hadr)
                
    elif particle == "t2":
        mask = jets.prov == 3 # t2->b
        index_b_lep = ak.fill_none(ak.pad_none(indices[mask], 1), -1)

        file.create_dataset("TARGETS/t2/b", np.shape(index_b_lep), dtype='int64', data=index_b_lep)

In [100]:
%%time
# Prepare files for inputs and targets
file_old = h5py.File("test_lep_met_ht_matched_old.h5", "w")
file_old = create_groups(file_old)
create_targets(file_old, "h", jets_test_fullymatched)
create_targets(file_old, "t1", jets_test_fullymatched)
create_targets(file_old, "t2", jets_test_fullymatched)
# Create input arrays in the files
create_inputs(file_old, jets_test_fullymatched, leptons_test, met_test)

CPU times: user 13.1 s, sys: 39.3 ms, total: 13.1 s
Wall time: 13.4 s


In [103]:
%%time
# Prepare files for inputs and targets
file_optimized = h5py.File("test_lep_met_ht_matched_optimized.h5", "w")
file_optimized = create_groups(file_optimized)
create_targets_optimized(file_optimized, "h", jets_test_fullymatched)
create_targets_optimized(file_optimized, "t1", jets_test_fullymatched)
create_targets_optimized(file_optimized, "t2", jets_test_fullymatched)
# Create input arrays in the files
create_inputs(file_optimized, jets_test_fullymatched, leptons_test, met_test)

CPU times: user 87.8 ms, sys: 43.9 ms, total: 132 ms
Wall time: 132 ms


In [125]:
mask = jets_test_fullymatched.prov == 5
ak.local_index(jets_test_fullymatched)[mask]

In [126]:
len(df)

1277812

In [128]:
dir(file_optimized)

['_MutableMapping__marker',
 '__abstractmethods__',
 '__bool__',
 '__class__',
 '__class_getitem__',
 '__contains__',
 '__delattr__',
 '__delitem__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__enter__',
 '__eq__',
 '__exit__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__getnewargs__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__iter__',
 '__le__',
 '__len__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__nonzero__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__reversed__',
 '__setattr__',
 '__setitem__',
 '__sizeof__',
 '__slots__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_abc_impl',
 '_d',
 '_e',
 '_gcpl_crt_order',
 '_id',
 '_ipython_key_completions_',
 '_lapl',
 '_lcpl',
 '_libver',
 'attrs',
 'build_virtual_dataset',
 'clear',
 'close',
 'copy',
 'create_dataset',
 'create_dataset_like',
 'create_group',
 'create_virtual_dataset',
 'driver',
 'file',
 'filename',
 'flush',
 'get',
 'id',
 'items',
 'key

In [149]:
for key in file_optimized.keys():
    print(f"{key}/")
    for collection in file_optimized[key].keys():
        print(f"{key}/{collection}")
        gap+= len(f"/{collection}")
        for var in file_optimized[key][collection].keys():
            print(f"{key}/{collection}/{var}")

INPUTS/
INPUTS/Event
INPUTS/Event/ht
INPUTS/Jet
INPUTS/Jet/MASK
INPUTS/Jet/btag
INPUTS/Jet/cos_phi
INPUTS/Jet/eta
INPUTS/Jet/pt
INPUTS/Jet/sin_phi
INPUTS/Lepton
INPUTS/Lepton/cos_phi
INPUTS/Lepton/eta
INPUTS/Lepton/pt
INPUTS/Lepton/sin_phi
INPUTS/Met
INPUTS/Met/cos_phi
INPUTS/Met/eta
INPUTS/Met/pt
INPUTS/Met/sin_phi
INPUTS/Source
INPUTS/ht
TARGETS/
TARGETS/h
TARGETS/h/b1
TARGETS/h/b2
TARGETS/t1
TARGETS/t1/b
TARGETS/t1/q1
TARGETS/t1/q2
TARGETS/t2
TARGETS/t2/b


In [142]:
file_optimized["INPUTS"]["Jet"].keys()

<KeysViewHDF5 ['MASK', 'btag', 'cos_phi', 'eta', 'pt', 'sin_phi']>