In [1]:
# This notebook creates a training sample and a separate testing sample
# that include the lepton pt, eta and phi, the MET and the HT of each event
# in the file all_jets_fullRun2_v1.parquet.

In [2]:
import awkward as ak
import numba
import numpy as np
import pandas as pd
import awkward as ak
import h5py
import vector
# Register vector's numba and awkward integrations once, at import time.
# NOTE(review): presumably this is what lets arrays tagged "Momentum4D" further
# down expose kinematic accessors such as .pt/.eta/.phi -- confirm.
vector.register_numba()
vector.register_awkward()

import matplotlib.pyplot as plt
from matplotlib.colors import LogNorm
import mplhep as hep
# Select mplhep's ROOT style for any figure drawn in this notebook.
hep.style.use(hep.style.ROOT)

In [3]:
# Absolute EOS location of the full Run 2 jet dataset read by this notebook.
input_parquet = "/eos/user/d/dvalsecc/www/ttHbbAnalysis/training_dataset/all_jets_fullRun2_v1.parquet"
df = ak.from_parquet(input_parquet)

In [4]:
# Show the (truncated) repr of the loaded array to check its length and record layout.
df

<Array [{jets: [{pt: 68.7, ... m: 125}}] type='1414130 * {"jets": var * {"pt": f...'>

In [5]:
# The record is unpacked by position: field 0 is the jet collection, fields 5
# and 6 are the reconstructed lepton and the MET; the other five are discarded.
# The unpacking fails if the record does not have exactly eight fields.
(jets,_,_,_,_,lepton_reco, met,_) = ak.unzip(df)

# Tag the three collections as Momentum4D in a single pass.
jets, lepton_reco, met = (
    ak.with_name(collection, name="Momentum4D")
    for collection in (jets, lepton_reco, met)
)

In [6]:
# Show the (truncated) repr of the jet collection after tagging it as Momentum4D.
jets

<MomentumArray4D [[{pt: 68.7, eta: 0.853, ... prov: 1}]] type='1414130 * var * M...'>

In [7]:
# Index of the first test event: everything before it (the first 80% of the
# events, rounded up) is left out, the remainder forms the test sample.
maxlen = int(np.ceil(len(jets) * 0.8))
test_jets, test_lep, test_met = (
    collection[maxlen:] for collection in (jets, lepton_reco, met)
)

In [8]:
def create_groups(file):
    """Create the TARGETS and INPUTS group hierarchy in ``file``.

    Parameters
    ----------
    file : object exposing ``create_group(name)`` (an open, writable h5py file
        in this notebook).

    Returns
    -------
    The same ``file`` object, to allow ``f = create_groups(f)``.
    """
    group_paths = (
        "TARGETS/t1",  # hadronic top -> q1 q2 b
        "TARGETS/t2",  # leptonic top -> b
        "TARGETS/h",  # higgs -> b1 b2
        "INPUTS",
        "INPUTS/Source",
        "INPUTS/Lepton",
        "INPUTS/Met",
        "INPUTS/ht",
    )
    for path in group_paths:
        file.create_group(path)
    return file

def create_targets(file, particle, jets):
    """Write the per-event jet-index targets of one particle into ``file``.

    For every event, the positions (within the event's jet list) of the jets
    whose ``prov`` code belongs to ``particle`` are stored as int64 datasets
    under ``TARGETS/<particle>/``:

    * ``"h"``  -> ``b1``, ``b2``      (prov == 1, H -> b1 b2)
    * ``"t1"`` -> ``q1``, ``q2``, ``b`` (prov == 5 for W -> q1 q2, prov == 2 for t1 -> W b)
    * ``"t2"`` -> ``b``              (prov == 3, t2 -> b)

    A slot with no matching jet is filled with -1.  Every dataset always has
    exactly one entry per event: if an event has more matching jets than
    slots, the first ones (in jet order) are kept, so the targets can never
    become shorter than, or shifted with respect to, the inputs.

    Parameters
    ----------
    file : open, writable h5py file (or group) in which the ``TARGETS/...``
        groups already exist.
    particle : str
        One of ``"h"``, ``"t1"`` or ``"t2"``.
    jets : awkward array of jets per event, with a ``prov`` field.

    Raises
    ------
    ValueError
        If ``particle`` is not one of the supported names.
    """
    # particle -> list of (prov code, dataset names filled from that code),
    # in the order the datasets are created.
    target_layout = {
        "h": [(1, ("b1", "b2"))],
        "t1": [(5, ("q1", "q2")), (2, ("b",))],
        "t2": [(3, ("b",))],
    }
    if particle not in target_layout:
        raise ValueError(
            f"unknown particle {particle!r}; expected one of {sorted(target_layout)}"
        )

    # Position of every jet inside its own event.
    jet_index = ak.local_index(jets, axis=1)

    for prov_code, leaves in target_layout[particle]:
        matched = jet_index[jets.prov == prov_code]
        # Pad/clip to exactly one column per target slot; missing slots -> -1.
        slots = ak.to_numpy(
            ak.fill_none(ak.pad_none(matched, len(leaves), clip=True), -1)
        )
        for column, leaf in enumerate(leaves):
            values = np.ascontiguousarray(slots[:, column])
            file.create_dataset(
                f"TARGETS/{particle}/{leaf}",
                np.shape(values),
                dtype='int64',
                data=values,
            )
        
def create_inputs(file, jets, lep, met, max_jets=16):
    """Write the jet, lepton, MET and HT input datasets into ``file``.

    Jet quantities are padded with zeros (or clipped) to ``max_jets`` entries
    per event and stored under ``INPUTS/Source`` together with a boolean
    ``MASK`` that is True wherever the stored pt is non-zero.  Lepton and MET
    pt/phi/eta are stored one value per event under ``INPUTS/Lepton`` and
    ``INPUTS/Met``.  ``INPUTS/ht/ht`` is the sum of the stored jet pt values of
    each event, i.e. it is computed from the same ``max_jets`` jets that are
    written to ``INPUTS/Source/pt``.

    Parameters
    ----------
    file : open, writable h5py file (or group) in which the ``INPUTS/...``
        groups already exist.
    jets : awkward array of jets per event with ``pt``, ``phi``, ``eta`` and
        ``btag`` accessible as attributes.
    lep, met : per-event arrays with ``pt``, ``phi`` and ``eta`` accessible as
        attributes.  They must describe the same events, in the same order,
        as ``jets``.
    max_jets : int, optional
        Number of jet slots per event (default 16).
    """
    def _padded(values):
        # Rectangular (n_events, max_jets) array; empty slots are set to 0.
        return ak.to_numpy(ak.fill_none(ak.pad_none(values, max_jets, clip=True), 0))

    def _write(path, array, dtype='float32'):
        file.create_dataset(path, np.shape(array), dtype=dtype, data=array)

    # Jets: the mask is derived from pt, so a padded slot is one with pt == 0.
    jet_pt = _padded(jets.pt)
    _write("INPUTS/Source/MASK", jet_pt != 0, dtype='bool')
    _write("INPUTS/Source/pt", jet_pt)
    for feature in ("phi", "eta", "btag"):
        _write(f"INPUTS/Source/{feature}", _padded(getattr(jets, feature)))

    # Lepton and MET: one value per event, no padding needed.
    for group, obj in (("Lepton", lep), ("Met", met)):
        for feature in ("pt", "phi", "eta"):
            _write(f"INPUTS/{group}/{feature}", ak.to_numpy(getattr(obj, feature)))

    # HT: scalar sum of the jet pt over the same slots written above.
    _write("INPUTS/ht/ht", np.sum(jet_pt, axis=1))

In [9]:
mask_fullymatched = ak.sum(test_jets.matched == True, axis=1)>=6
test_jets = test_jets[mask_fullymatched]

In [11]:
higgs = test_jets[test_jets.prov == 1]
mask_match = ak.num(higgs) == 2

w_or_t_jets = test_jets[(test_jets.prov == 5)|(test_jets.prov == 2)]
mask_match = mask_match & (ak.num(w_or_t_jets) == 3)

lep_top = test_jets[test_jets.prov == 3]
mask_match = mask_match & (ak.num(lep_top) == 1)

test_jets = test_jets[mask_match]

In [13]:
# Prepare files for inputs and targets
# Open the output file for writing (an existing file is truncated) and lay
# out the empty TARGETS / INPUTS groups in it.
test_file = h5py.File("test_lep_met_ht_matched.h5", mode="w")
test_file = create_groups(test_file)

In [14]:
# Write the jet-index targets of the Higgs, the hadronic top and the leptonic top.
for particle in ("h", "t1", "t2"):
    create_targets(test_file, particle, test_jets)

In [15]:
# Create input arrays in the files
create_inputs(file=test_file, jets=test_jets, lep=test_lep, met=test_met)

In [16]:
# Close the HDF5 file so that everything written above is flushed to disk.
# The file was opened in an earlier cell, so it stays open if any cell in
# between fails before this one is run.
test_file.close()