In [1]:
%load_ext autoreload
%autoreload 2

import copy
import torch
import numpy as np
import awkward as ak
import uproot
import pandas as pd
import dask
import vector
import particle
import hepunits

from torch.utils.data import DataLoader

from memflow.dataset.data import RootData,ParquetData,get_intersection_indices
from memflow.dataset.dataset import CombinedDataset
from memflow.dataset.HH import HHRecoDataset,HHGenDataset
from memflow.read_data import utils

# One-time setup: register the `vector` library's behaviors with Awkward Array.
vector.register_awkward()

In [2]:
# Open the generator-level HH tree of the two signal files.
# With lazy=True only the branches accessed below end up in the
# "Loaded branches" list printed at the end of this cell.
gen_data = RootData(
    files=[
        'GluGluToHHTo2B2VTo2L2Nu_node_cHHH1.root',
        'GluGluToHHTo2B2VTo2L2Nu_node_cHHH5.root',
    ],
    treenames=['gen_HH;1'],
    lazy=True,
    # N = 1000,   (left disabled; presumably caps the number of events read - TODO confirm)
)

# Generator-level particles whose `<name>_E` branch is tested by the mask below.
branches_bbWW_DL = [
    'H1', 'H2',
    'W_plus', 'W_minus',
    'bottom', 'antibottom',
    'lep_plus_from_W', 'lep_minus_from_W',
    'neutrino_from_W', 'antineutrino_from_W',
]

# Keep an event only if every listed particle has a non-negative energy
# (presumably a negative energy flags a missing particle - TODO confirm).
mask_bbWW_DL = np.logical_and.reduce(
    [gen_data[f'{br}_E'] >= 0 for br in branches_bbWW_DL]
)
print(f'Out of {len(mask_bbWW_DL)} events, {mask_bbWW_DL.sum()} are bbWW DL')

# Apply the selection on the data object itself, then show its content.
gen_data.cut(mask_bbWW_DL)
print(gen_data)

Out of 791595 events, 756500 are bbWW DL
Data object
Loaded branches:
   ... H1_E: 756500
   ... H2_E: 756500
   ... W_minus_E: 756500
   ... W_plus_E: 756500
   ... antibottom_E: 756500
   ... antineutrino_from_W_E: 756500
   ... bottom_E: 756500
   ... file: 756500
   ... lep_minus_from_W_E: 756500
   ... lep_plus_from_W_E: 756500
   ... neutrino_from_W_E: 756500
   ... sample: 756500
   ... tree: 756500
Branch in files not loaded:
   ... H1_Px
   ... H1_Py
   ... H1_Pz
   ... H1_eta
   ... H1_idx
   ... H1_mass
   ... H1_pdgId
   ... H1_phi
   ... H1_pt
   ... H1_sum_E
   ... H2_Px
   ... H2_Py
   ... H2_Pz
   ... H2_eta
   ... H2_idx
   ... H2_mass
   ... H2_pdgId
   ... H2_phi
   ... H2_pt
   ... H2_sum_E
   ... ISR_10_E
   ... ISR_10_Px
   ... ISR_10_Py
   ... ISR_10_Pz
   ... ISR_10_eta
   ... ISR_10_idx
   ... ISR_10_mass
   ... ISR_10_parent
   ... ISR_10_pdgId
   ... ISR_10_phi
   ... ISR_10_pt
   ... ISR_11_E
   ... ISR_11_Px
   ... ISR_11_Py
   ... ISR_11_Pz
   ... ISR_11_e

In [3]:
# Build the generator-level dataset on top of `gen_data`.
gen_dataset = HHGenDataset(
    data=gen_data,
    # Object collections handed out in each batch, in this order.
    selection=['leptons', 'bquarks'],
    coordinates='cartesian',
    apply_boost=False,
    preprocessing=False,
    build=True,
)
print(gen_dataset)

Saving objects to /home/lw23382/Documents/MEM_DNN_project/MEMFlow/memflow/dataset/hh_gen
No final state recorded with `final_states_object_name` property, will not compute PS points
Parton dataset with 756500 events
 Initial states pdgids : [21, 21]
 Final states pdgids   : [6, -6, 11, -12, -11, 12]
 Final states masses   : [172.5, 172.5, 0.0005109989499999999, 0.0, 0.0005109989499999999, 0.0]
Containing the following tensors
leptons  : data ([756500, 4, 5]), mask ([756500, 4])
           Presences : [100.00%, 100.00%, 100.00%, 100.00%]
           Features : ['px', 'py', 'pz', 'E', 'pdgId']
           Selected for batches : True
higgs    : data ([756500, 2, 5]), mask ([756500, 2])
           Presences : [100.00%, 100.00%]
           Features : ['px', 'py', 'pz', 'E', 'pdgId']
           Selected for batches : False
bquarks  : data ([756500, 2, 5]), mask ([756500, 2])
           Presences : [100.00%, 100.00%]
           Features : ['px', 'py', 'pz', 'E', 'pdgId']
           Selected for

In [4]:
# Wrap the generator-level dataset in a loader and pull a single batch.
loader_parton = DataLoader(gen_dataset, batch_size=32)
batch = next(iter(loader_parton))

# One tensor per selected object type: print each name next to its batch shape.
for sel, obj in zip(loader_parton.dataset.selection, batch):
    print(sel, obj.shape)

leptons torch.Size([32, 4, 5])
bquarks torch.Size([32, 2, 5])


In [5]:
# Open the reconstructed dilepton tree from the same two files as the
# generator-level data.
reco_data = RootData(
    files=[
        'GluGluToHHTo2B2VTo2L2Nu_node_cHHH1.root',
        'GluGluToHHTo2B2VTo2L2Nu_node_cHHH5.root',
    ],
    treenames=['reco_DL;1'],
    lazy=True,
    # N = 1000,   (left disabled; presumably caps the number of events read - TODO confirm)
)
print('Initial :', reco_data.events)

# Keep only events whose `flag_resolved` branch equals 1.
mask_resolved = reco_data['flag_resolved'] == 1
reco_data.cut(mask_resolved)
print('Resolved :', reco_data.events)
print(reco_data)


Initial : 791595
Resolved : 585473
Data object
Loaded branches:
   ... file: 585473
   ... flag_resolved: 585473
   ... sample: 585473
   ... tree: 585473
Branch in files not loaded:
   ... VBF1_E
   ... VBF1_Px
   ... VBF1_Py
   ... VBF1_Pz
   ... VBF1_eta
   ... VBF1_idx
   ... VBF1_mass
   ... VBF1_phi
   ... VBF1_pt
   ... VBF1_sel
   ... VBF2_E
   ... VBF2_Px
   ... VBF2_Py
   ... VBF2_Pz
   ... VBF2_eta
   ... VBF2_idx
   ... VBF2_mass
   ... VBF2_phi
   ... VBF2_pt
   ... VBF2_sel
   ... VBF3_E
   ... VBF3_Px
   ... VBF3_Py
   ... VBF3_Pz
   ... VBF3_eta
   ... VBF3_idx
   ... VBF3_mass
   ... VBF3_phi
   ... VBF3_pt
   ... VBF3_sel
   ... VBF4_E
   ... VBF4_Px
   ... VBF4_Py
   ... VBF4_Pz
   ... VBF4_eta
   ... VBF4_idx
   ... VBF4_mass
   ... VBF4_phi
   ... VBF4_pt
   ... VBF4_sel
   ... VBF5_E
   ... VBF5_Px
   ... VBF5_Py
   ... VBF5_Pz
   ... VBF5_eta
   ... VBF5_idx
   ... VBF5_mass
   ... VBF5_phi
   ... VBF5_pt
   ... VBF5_sel
   ... VBF6_E
   ... VBF6_Px
   ... VBF6_P

In [6]:
# Build the reconstructed-level dataset on top of `reco_data`.
reco_dataset = HHRecoDataset(
    data=reco_data,
    # Object collections handed out in each batch, in this order.
    selection=['jets', 'electrons', 'muons', 'met'],
    coordinates='cartesian',
    apply_boost=False,
    preprocessing=False,
    # Fallback values per feature name; presumably used for features an
    # object type does not provide, so all types share the same feature
    # count - TODO confirm against HHRecoDataset.
    default_features={
        'btag': -1.,
        'btagged': -1.,
        'pdgId': 0.,
        'charge': 0.,
    },
    build=True,
)
print(reco_dataset)

Saving objects to /home/lw23382/Documents/MEM_DNN_project/MEMFlow/memflow/dataset/hh_reco
Reco dataset with 585473 events
Containing the following tensors
boost      : data ([585473, 1, 8]), mask ([585473, 1])
             Presences : [100.00%]
             Features : ['px', 'py', 'pz', 'E', 'btag', 'btagged', 'pdgId', 'charge']
             Selected for batches : False
jets       : data ([585473, 13, 8]), mask ([585473, 13])
             Presences : [100.00%, 96.30%, 63.53%, 31.39%, 12.46%, 4.21%, 1.27%, 0.35%, 0.09%, 0.02%, <0.01%, <0.01%, <0.01%]
             Features : ['px', 'py', 'pz', 'E', 'btag', 'btagged', 'pdgId', 'charge']
             Selected for batches : True
electrons  : data ([585473, 3, 8]), mask ([585473, 3])
             Presences : [32.62%, 3.89%, <0.01%]
             Features : ['px', 'py', 'pz', 'E', 'pdgId', 'charge', 'btag', 'btagged']
             Selected for batches : True
muons      : data ([585473, 3, 8]), mask ([585473, 3])
             Presences : [42.86

In [7]:
# Dump one reconstructed event, object type by object type.
event = 5
for obj_name in ('jets', 'electrons', 'muons', 'met'):
    print(obj_name)
    # Entries 0 and 1 of each `_objects` item are printed in turn for the
    # chosen event (presumably the data tensor and its mask - TODO confirm).
    for idx in (0, 1):
        print(reco_dataset._objects[obj_name][idx][event])

jets
tensor([[ 8.9191e+01,  3.5915e+01,  2.5476e+02,  2.7255e+02,  9.9951e-01,
          1.0000e+00,  0.0000e+00,  0.0000e+00],
        [ 4.0977e+01, -2.4735e+01,  5.6720e+01,  7.5069e+01,  1.2901e-02,
          0.0000e+00,  0.0000e+00,  0.0000e+00],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  0.0000e+00],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  0.0000e+00],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  0.0000e+00],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  0.0000e+00],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  0.0000e+00],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  0.0000e+00],
        [ 0

In [8]:
# Wrap the reconstructed-level dataset in a loader and pull a single batch.
loader_reco = DataLoader(reco_dataset, batch_size=32)
batch = next(iter(loader_reco))

# One tensor per selected object type: print each name next to its batch shape.
for sel, obj in zip(loader_reco.dataset.selection, batch):
    print(sel, obj.shape)

jets torch.Size([32, 13, 8])
electrons torch.Size([32, 3, 8])
muons torch.Size([32, 3, 8])
met torch.Size([32, 1, 8])


In [9]:
# Pair the generator-level and reconstructed-level datasets, using the
# 'event' branch to find the events common to both.
dataset_comb = CombinedDataset(
    gen_dataset=gen_dataset,
    reco_dataset=reco_dataset,
    intersection_branch='event',
)
print(dataset_comb)

Looking into file metadata
	entry 0 : ['GluGluToHHTo2B2VTo2L2Nu_node_cHHH1.root'
 'GluGluToHHTo2B2VTo2L2Nu_node_cHHH5.root']
	entry 1 : ['GluGluToHHTo2B2VTo2L2Nu_node_cHHH1.root'
 'GluGluToHHTo2B2VTo2L2Nu_node_cHHH5.root']
Will only consider common files : ['GluGluToHHTo2B2VTo2L2Nu_node_cHHH1.root', 'GluGluToHHTo2B2VTo2L2Nu_node_cHHH5.root']
(Note : this assumes the files have the same order between the different data objects, and the content of the intersection branch as well)
For entry 0 : from 756500 events, 559975 selected
For entry 1 : from 585473 events, 559975 selected
Combined dataset (extracting 559975 events of the following) :
Parton dataset with 756500 events
 Initial states pdgids : [21, 21]
 Final states pdgids   : [6, -6, 11, -12, -11, 12]
 Final states masses   : [172.5, 172.5, 0.0005109989499999999, 0.0, 0.0005109989499999999, 0.0]
Containing the following tensors
leptons  : data ([756500, 4, 5]), mask ([756500, 4])
           Presences : [100.00%, 100.00%, 100.00%, 10

In [10]:
# Wrap the combined dataset in a loader and pull a single batch.
loader_comb = DataLoader(dataset_comb, batch_size=32)
batch = next(iter(loader_comb))

# Names are the generator-level selection followed by the reconstructed-level
# one; print each next to the shape of the matching batch tensor.
for sel, obj in zip(
    loader_comb.dataset.gen_dataset.selection + loader_comb.dataset.reco_dataset.selection,
    batch,
):
    print(sel, obj.shape)

leptons torch.Size([32, 4, 5])
bquarks torch.Size([32, 2, 5])
jets torch.Size([32, 13, 8])
electrons torch.Size([32, 3, 8])
muons torch.Size([32, 3, 8])
met torch.Size([32, 1, 8])


In [11]:
# Print every selected object of one combined event, generator-level
# selection first and reconstructed-level selection second.
event = 2

# Fetch the event a single time: indexing `dataset_comb` inside the loop
# would repeat the same item lookup once per object type.
event_objects = dataset_comb[event]
selection_names = dataset_comb.gen_dataset.selection + dataset_comb.reco_dataset.selection

for i, sel in enumerate(selection_names):
    print(sel)
    print(event_objects[i])

leptons
tensor([[-14.2627, -22.2602,   2.1776,  26.5270, -11.0000],
        [-41.8196, -81.1029, 214.3930, 233.0042,  12.0000],
        [ -6.5457, -23.7391,   9.4523,  26.3768,  15.0000],
        [ -4.3819,   8.1817,   6.9634,  11.6030, -16.0000]],
       dtype=torch.float64)
bquarks
tensor([[  3.3305,  44.5005,  75.8591,  88.0113,   5.0000],
        [ 46.0525,  61.7103, -39.9548,  86.7490,  -5.0000]],
       dtype=torch.float64)
jets
tensor([[ 3.6043e+00,  4.4373e+01,  7.3704e+01,  8.6591e+01,  9.9609e-01,
          1.0000e+00,  0.0000e+00,  0.0000e+00],
        [-1.5839e+00,  3.1012e+01,  1.1439e+01,  3.4295e+01,  1.1940e-02,
          0.0000e+00,  0.0000e+00,  0.0000e+00],
        [-6.3921e+00, -2.6599e+01,  1.0424e+01,  2.9431e+01,  1.0124e-02,
          0.0000e+00,  0.0000e+00,  0.0000e+00],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  0.0000e+00],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e