In [1]:
# This notebook creates training and testing samples from the v2_sig_bkg/*.parquet files.

In [1]:
import awkward as ak
import numba
import numpy as np
import pandas as pd
import awkward as ak
import h5py
import vector
vector.register_numba()
vector.register_awkward()

import matplotlib.pyplot as plt
from matplotlib.colors import LogNorm
import mplhep as hep
hep.style.use(hep.style.ROOT)

In [2]:
def create_groups(file):
    """Create the empty group layout of an output file and return the file.

    Parameters
    ----------
    file : object exposing ``create_group(path)`` (an open, writable
        ``h5py.File`` in this notebook).

    Returns
    -------
    The same ``file`` object, so the call can be chained.
    """
    group_paths = (
        "TARGETS/t1",     # hadronic top -> q1 q2 b
        "TARGETS/t2",     # leptonic top -> b
        "TARGETS/h",      # higgs -> b1 b2
        "INPUTS",
        "INPUTS/Source",
    )
    for group_path in group_paths:
        file.create_group(group_path)
    return file

def _matched_jet_indices(jets, prov, n_slots):
    """Return, per event, the in-event positions of jets with provenance ``prov``.

    The result is an int64 NumPy array of shape ``(n_events, n_slots)``.
    Events with fewer than ``n_slots`` matching jets are padded with -1;
    events with more are clipped to the first ``n_slots`` matches, so the
    output always has exactly one row per event.
    """
    # Position of every jet inside its own event, restricted to the matches.
    positions = ak.local_index(jets.prov, axis=1)[jets.prov == prov]
    padded = ak.fill_none(ak.pad_none(positions, n_slots, clip=True), -1)
    return ak.to_numpy(padded).astype(np.int64)


def create_targets(file, particle, jets):
    """Write the jet-assignment targets of one particle into ``file``.

    For every event the index (position inside the event's jet list) of each
    jet matched to a decay product is stored; -1 marks a missing match.

    Parameters
    ----------
    file : open, writable ``h5py.File`` (only ``create_dataset`` is used).
    particle : str
        ``"h"``  -> ``TARGETS/h/b1``, ``TARGETS/h/b2``      (``prov == 1``)
        ``"t1"`` -> ``TARGETS/t1/q1``, ``TARGETS/t1/q2``    (``prov == 5``)
                    and ``TARGETS/t1/b``                    (``prov == 2``)
        ``"t2"`` -> ``TARGETS/t2/b``                        (``prov == 3``)
    jets : jagged awkward array of jets with a ``prov`` field.

    Raises
    ------
    ValueError
        If ``particle`` is not one of the names above (previously this was a
        silent no-op that left the file without targets).

    Notes
    -----
    The earlier per-event Python loop appended nothing for events with more
    matches than expected, which made the target arrays shorter than the
    number of events and shifted every later entry. Padding/clipping keeps one
    entry per event and removes the slow loop.
    """
    if particle == "h":
        higgs = _matched_jet_indices(jets, 1, 2)    # H->b1b2
        targets = [("TARGETS/h/b1", higgs[:, 0]),
                   ("TARGETS/h/b2", higgs[:, 1])]
    elif particle == "t1":
        w_quarks = _matched_jet_indices(jets, 5, 2)  # W->q1q2 from t1
        had_b = _matched_jet_indices(jets, 2, 1)     # t1->Wb
        targets = [("TARGETS/t1/q1", w_quarks[:, 0]),
                   ("TARGETS/t1/q2", w_quarks[:, 1]),
                   ("TARGETS/t1/b", had_b[:, 0])]
    elif particle == "t2":
        lep_b = _matched_jet_indices(jets, 3, 1)     # t2->b
        targets = [("TARGETS/t2/b", lep_b[:, 0])]
    else:
        raise ValueError(
            "unknown particle %r; expected 'h', 't1' or 't2'" % (particle,)
        )

    for dataset_path, column in targets:
        # Column slices are strided views; hand h5py a contiguous copy.
        values = np.ascontiguousarray(column)
        file.create_dataset(dataset_path, values.shape, dtype='int64', data=values)

def create_inputs(file, jets):
    """Write the zero-padded jet features and their mask into ``file``.

    Every feature is padded with 0 up to the largest jet multiplicity found
    in ``jets`` and stored as a 2-D ``(n_events, max_jets)`` dataset under
    ``INPUTS/Source``: ``MASK`` (bool) plus ``pt``, ``phi``, ``eta`` and
    ``btag`` (float32).

    Parameters
    ----------
    file : open, writable ``h5py.File`` (only ``create_dataset`` is used).
    jets : jagged awkward array of jets exposing ``pt``, ``phi``, ``eta``
        and ``btag``.
    """
    # The padding length is the same for every feature: compute it once
    # instead of once per feature.
    max_jets = ak.max(ak.num(jets))
    if max_jets is None:
        # ak.max yields None when there are no events at all.
        max_jets = 0

    def _padded(feature):
        """Pad one jagged feature to ``max_jets`` columns, filling with 0."""
        return ak.to_numpy(ak.fill_none(ak.pad_none(feature, max_jets, clip=True), 0))

    pt_array = _padded(jets.pt)
    # A slot counts as a real jet when its pt is non-zero (padding has pt == 0).
    mask = pt_array != 0
    file.create_dataset("INPUTS/Source/MASK", np.shape(mask), dtype='bool', data=mask)
    file.create_dataset("INPUTS/Source/pt", np.shape(pt_array), dtype='float32', data=pt_array)

    for feature_name in ("phi", "eta", "btag"):
        # Attribute access (not item access) to keep the original lookup path.
        values = _padded(getattr(jets, feature_name))
        file.create_dataset("INPUTS/Source/" + feature_name, np.shape(values),
                            dtype='float32', data=values)

In [3]:
# Directory holding the input parquet files. The trailing "/" is required
# because file names are appended to this string by plain concatenation.
# NOTE(review): this is a hard-coded absolute path; adjust it when running elsewhere.
basedir = "/eos/user/y/ymaidann/eth_project/Spanet_project/v2_sig_bkg/"

## Signal files

In [70]:
# Signal (ttH, H->bb) samples, one per data-taking period.
names = [
    "ttHTobb_2016_PostVFP",
    "ttHTobb_2016_PreVFP",
    "ttHTobb_2017",
    "ttHTobb_2018",
]
# Matching parquet file name for every sample, in the same order as `names`.
files = ["all_jets_fullRun2_" + sample + "_v2.parquet" for sample in names]

In [66]:
# Convert every signal sample into a "<sample>_matched.h5" file that contains
# only fully matched events.
for sample_name, parquet_name in zip(names, files):
    df = ak.from_parquet(basedir + parquet_name)
    # The jets are the first field of the record; the other fields are not
    # needed here, so do not depend on how many of them there are.
    jets = ak.unzip(df)[0]

    # Keep only fully matched events: two Higgs b-jets (prov 1), two W jets
    # (prov 5) and one b-jet (prov 2) from the hadronic top, and one b-jet
    # (prov 3) from the leptonic top. The W jets and the hadronic b are
    # counted separately so that e.g. three W jets and no b cannot pass.
    n_higgs_b = ak.num(jets[jets.prov == 1])
    n_w_jets = ak.num(jets[jets.prov == 5])
    n_had_b = ak.num(jets[jets.prov == 2])
    n_lep_b = ak.num(jets[jets.prov == 3])
    fully_matched = (n_higgs_b == 2) & (n_w_jets == 2) & (n_had_b == 1) & (n_lep_b == 1)
    jets = jets[fully_matched]

    # The context manager closes the file even if one of the steps fails.
    with h5py.File(sample_name + "_matched.h5", "w") as output_file:
        create_groups(output_file)

        # Create target arrays in the files. This will take a few minutes
        create_targets(output_file, "h", jets)
        create_targets(output_file, "t1", jets)
        create_targets(output_file, "t2", jets)

        # Create input arrays in the files
        create_inputs(output_file, jets)


## Backgrounds

In [4]:
# All semi-leptonic ttbar background samples that are available.
names = [
    "TTbbSemiLeptonic_Powheg_2016_PostVFP",
    "TTbbSemiLeptonic_Powheg_2016_PreVFP",
    "TTbbSemiLeptonic_Powheg_2017",
    "TTbbSemiLeptonic_Powheg_2018",
    "TTToSemiLeptonic_2016_PostVFP",
    "TTToSemiLeptonic_2016_PreVFP",
    "TTToSemiLeptonic_2017",
    "TTToSemiLeptonic_2018",
]
# Only the first sample is processed for now; the full list above is
# overridden on purpose.
names = names[:1]
# Matching parquet file name for every selected sample.
files = ["all_jets_fullRun2_" + sample + "_v2.parquet" for sample in names]

In [6]:
def create_targets_bkg(file, particle, jets):
    """Write the targets of ``particle`` for a background sample.

    For ``particle == "h"`` both ``TARGETS/h/b1`` and ``TARGETS/h/b2`` are
    filled with -1 for every event (one entry per element of ``jets``).
    Any other particle is delegated unchanged to ``create_targets``.
    """
    if particle != "h":
        create_targets(file, particle, jets)
        return

    for dataset_path in ("TARGETS/h/b1", "TARGETS/h/b2"):
        # One "-1" entry per event; stored as int64 by the dataset dtype.
        unmatched = np.full(len(jets), -1.0)
        file.create_dataset(dataset_path, np.shape(unmatched), dtype='int64', data=unmatched)

In [None]:
# Convert every selected background sample into a "<sample>2.h5" file.
for sample_name, parquet_name in zip(names, files):
    df = ak.from_parquet(basedir + parquet_name)
    # The jets are the first field of the record; the other fields are not
    # needed here, so do not depend on how many of them there are.
    jets = ak.unzip(df)[0]

    # Skip the first 1,000,000 events and keep the rest.
    # NOTE(review): presumably the first million events go into a separate
    # file and the "2" suffix marks this second part - confirm.
    jets = jets[1000000:]

    # The context manager closes the file even if one of the steps fails.
    with h5py.File(sample_name + "2.h5", "w") as output_file:
        create_groups(output_file)

        # Create target arrays in the files. This will take a few minutes
        create_targets_bkg(output_file, "h", jets)
        create_targets_bkg(output_file, "t1", jets)
        create_targets_bkg(output_file, "t2", jets)

        # Create input arrays in the files
        create_inputs(output_file, jets)