In [2]:
# This notebook creates the training and testing samples from the all_jets_fullRun2_v1.parquet file.

In [3]:
import awkward as ak
import numba
import numpy as np
import pandas as pd
import awkward as ak
import h5py
import vector
vector.register_numba()
vector.register_awkward()

import matplotlib.pyplot as plt
from matplotlib.colors import LogNorm
import mplhep as hep
hep.style.use(hep.style.ROOT)

In [4]:
# Read the jets dataset from EOS into an awkward Array.
parquet_path = "/eos/user/d/dvalsecc/www/ttHbbAnalysis/training_dataset/all_jets_fullRun2_v1.parquet"
df = ak.from_parquet(parquet_path)

In [5]:
# Display the loaded array's summary (record type and number of events).
df

<Array [{jets: [{pt: 68.7, ... m: 125}}] type='1414130 * {"jets": var * {"pt": f...'>

In [6]:
# Keep only the per-event jet collection; the other top-level fields of `df`
# are not used in this notebook. Selecting the field by name (instead of
# positionally unpacking ak.unzip) does not depend on how many fields the
# file has or on their order, and does not overwrite IPython's `_`.
jets = df["jets"]

In [7]:
# Display the jet collection: one variable-length list of jet records per event.
jets

<Array [[{pt: 68.7, eta: 0.853, ... prov: 1}]] type='1414130 * var * {"pt": floa...'>

In [8]:
# Largest jet multiplicity found in any single event.
jets_per_event = ak.num(jets)
ak.max(jets_per_event)

16

In [9]:
# Ordered 80/20 split (no shuffling): the leading 80% of the events, rounded
# up, become the training sample and the remaining events the testing sample.
maxlen = int(np.ceil(len(jets) * 0.8))
train_jets, test_jets = jets[:maxlen], jets[maxlen:]

In [10]:
# Display the testing sample (events from index `maxlen` onwards).
test_jets

<Array [[{pt: 90.6, eta: 1.28, ... prov: 1}]] type='282826 * var * {"pt": float3...'>

In [11]:
# Display the training sample (the first `maxlen` events).
train_jets

<Array [[{pt: 68.7, eta: 0.853, ... prov: 3}]] type='1131304 * var * {"pt": floa...'>

In [12]:
def create_groups(file):
    """Create the TARGETS and INPUTS group hierarchy in ``file``.

    Parameters
    ----------
    file
        Object exposing ``create_group(name)`` (here an open, writable
        ``h5py.File``).

    Returns
    -------
    The same ``file`` object, to allow ``file = create_groups(file)``.
    """
    group_paths = (
        "TARGETS/t1",     # hadronic top -> q1 q2 b
        "TARGETS/t2",     # leptonic top -> b
        "TARGETS/h",      # higgs -> b1 b2
        "INPUTS",
        "INPUTS/Source",
    )
    for group_path in group_paths:
        file.create_group(group_path)
    return file

In [13]:
# For each particle: a sequence of (prov code, dataset names) pairs. The number
# of dataset names is the number of jet slots reserved for that provenance.
_TARGET_LAYOUT = {
    "h": ((1, ("b1", "b2")),),                # prov 1: H -> b1 b2
    "t1": ((5, ("q1", "q2")), (2, ("b",))),   # prov 5: W -> q1 q2 from t1; prov 2: t1 -> W b
    "t2": ((3, ("b",)),),                     # prov 3: t2 -> b
}


def _matched_jet_indices(jets, prov_code, n_slots):
    """Return an (n_events, n_slots) integer array of matched jet positions.

    Row ``e`` holds, in jet order, the positions inside event ``e`` of the jets
    whose ``prov`` equals ``prov_code``; unused slots are filled with -1.
    """
    jet_position = ak.local_index(jets.prov, axis=1)
    matched = jet_position[jets.prov == prov_code]
    # clip=True keeps only the first `n_slots` matches, so every event yields
    # exactly one row and the output stays aligned with the event index.
    padded = ak.fill_none(ak.pad_none(matched, n_slots, clip=True), -1)
    return ak.to_numpy(padded)


def create_targets(file, particle, jets):
    """Write the per-event target jet indices of one particle into ``file``.

    For every event, the position (within that event's jet list) of each jet
    whose ``prov`` code matches a decay product of ``particle`` is stored in an
    ``int64`` dataset with one entry per event; -1 means "no matching jet".

    Datasets written:

    * ``"h"``  : ``TARGETS/h/b1``, ``TARGETS/h/b2`` (prov == 1)
    * ``"t1"`` : ``TARGETS/t1/q1``, ``TARGETS/t1/q2`` (prov == 5) and
      ``TARGETS/t1/b`` (prov == 2)
    * ``"t2"`` : ``TARGETS/t2/b`` (prov == 3)

    If an event has more matching jets than there are slots, the first ones
    (in jet order) are kept. Previously such an event produced no entry at all,
    which made the datasets shorter than the number of events and shifted every
    later target onto the wrong event.

    Parameters
    ----------
    file
        Open, writable ``h5py.File`` (or group) providing ``create_dataset``.
    particle : str
        One of ``"h"``, ``"t1"`` or ``"t2"``.
    jets
        Awkward array of events, each a variable-length list of jet records
        with a ``prov`` field.

    Raises
    ------
    ValueError
        If ``particle`` is not one of the supported names (the original
        silently wrote nothing in that case).
    """
    if particle not in _TARGET_LAYOUT:
        raise ValueError(
            "unknown particle {!r}; expected one of {}".format(particle, sorted(_TARGET_LAYOUT))
        )

    for prov_code, dataset_names in _TARGET_LAYOUT[particle]:
        indices = _matched_jet_indices(jets, prov_code, len(dataset_names))
        for slot, dataset_name in enumerate(dataset_names):
            # Copy the column into a contiguous 1-D array before writing it.
            column = np.ascontiguousarray(indices[:, slot])
            file.create_dataset(
                "TARGETS/{}/{}".format(particle, dataset_name),
                column.shape,
                dtype='int64',
                data=column,
            )

In [14]:
def create_inputs(file, jets, max_jets=16):
    """Write the zero-padded jet features and the padding mask into ``file``.

    Each feature (``pt``, ``phi``, ``eta``, ``btag``) is padded or clipped to
    ``max_jets`` jets per event, padding slots are filled with 0, and the
    result is stored as a ``float32`` dataset ``INPUTS/Source/<feature>`` of
    shape ``(n_events, max_jets)``. ``INPUTS/Source/MASK`` is a boolean dataset
    of the same shape that is True for real jets and False for padding.

    The mask is built from the number of jets in each event rather than from
    ``pt != 0``, so a real jet whose pt happens to be exactly 0 is not
    mistaken for padding.

    Parameters
    ----------
    file
        Open, writable ``h5py.File`` (or group) providing ``create_dataset``.
    jets
        Awkward array of events, each a variable-length list of jet records
        with ``pt``, ``phi``, ``eta`` and ``btag`` fields.
    max_jets : int, optional
        Number of jet slots per event. Defaults to 16, the value that was
        previously hard-coded.
    """
    def _padded(values):
        # Fixed-width (n_events, max_jets) array with zeros in the padding slots.
        return ak.to_numpy(ak.fill_none(ak.pad_none(values, max_jets, clip=True), 0))

    # Slot j of event e is a real jet exactly when j < (number of jets in e).
    n_jets = ak.to_numpy(ak.num(jets.pt, axis=1))
    mask = np.arange(max_jets) < n_jets[:, np.newaxis]
    file.create_dataset("INPUTS/Source/MASK", np.shape(mask), dtype='bool', data=mask)

    for feature in ("pt", "phi", "eta", "btag"):
        feature_array = _padded(jets[feature])
        file.create_dataset(
            "INPUTS/Source/" + feature,
            np.shape(feature_array),
            dtype='float32',
            data=feature_array,
        )

In [15]:
# Prepare files for inputs and targets
train_file = h5py.File("train.h5", "w")
test_file = h5py.File("test.h5", "w")

train_file = create_groups(train_file)
test_file = create_groups(test_file)

In [16]:
# Preview, for the first two training events, the (event index, jet index)
# pair attached to every jet. The jet index is the quantity stored as a target.
per_axis_index = [ak.local_index(train_jets, axis) for axis in range(train_jets.ndim)]
multiindex = ak.zip(per_axis_index)
multiindex[:2].tolist()

[[(0, 0), (0, 1), (0, 2), (0, 3), (0, 4)],
 [(1, 0), (1, 1), (1, 2), (1, 3), (1, 4), (1, 5)]]

In [17]:
# Create target arrays in the files. This will take a few minutes
create_targets(train_file, "h", train_jets)
create_targets(train_file, "t1", train_jets)
create_targets(train_file, "t2", train_jets)

In [18]:
# Same target datasets for the testing sample.
for particle in ("h", "t1", "t2"):
    create_targets(test_file, particle, test_jets)

In [19]:
# Create input arrays in the files
create_inputs(train_file, train_jets)
create_inputs(test_file, test_jets)

In [20]:
# Close both output files, training file first.
for h5_file in (train_file, test_file):
    h5_file.close()