In [1]:
%load_ext autoreload
%autoreload 2

import copy
import torch
import numpy as np
import awkward as ak
import uproot
import pandas as pd
import dask
import vector
import particle
import hepunits

from torch.utils.data import DataLoader

from memflow.dataset.data import RootData,ParquetData
from memflow.dataset.dataset import CombinedDataset
from memflow.dataset.tth import ttHRecoDataset,ttHGenDataset
from memflow.read_data import utils

# Register the `vector` library's behaviors with Awkward Array once, up front
# (global side effect; see the vector documentation for what this enables).
vector.register_awkward()

In [2]:
# Open the parquet sample through memflow's ParquetData wrapper.
# NOTE(review): with lazy=True the summary printed below lists most branches
# as "not loaded" — presumably they are read on demand; confirm in ParquetData.
data = ParquetData(
    files=['all_jets_fullRun2_ttHTobb_forTraining_2016_PreVFP_v3.parquet'],
    lazy=True,
    # N=100000,  # disabled; presumably caps the number of events — confirm
)
print(data)

Data object
Loaded branches:
   ... file: 462769
   ... sample: 462769
   ... tree: 462769
Branch in files not loaded:
   ... generator_info
   ... higgs
   ... jets
   ... lepton_partons
   ... lepton_reco
   ... met
   ... partons
   ... partons_matched
   ... weight


In [3]:
# Generator-level (parton) dataset built on top of `data`.
# The names in `selection` are the objects reported as
# "Selected for batches : True" in the summary printed below.
# NOTE(review): build=True appears to trigger the "Saving objects to ..."
# step seen in the output — confirm against ttHGenDataset.
gen_dataset = ttHGenDataset(
    data=data,
    selection=['boost', 'partons', 'higgs', 'top_leptonic', 'top_hadronic', 'ISR'],
    build=True,
)
print(gen_dataset)

  x = um.multiply(x, x, out=x)
  ret = umr_sum(x, axis, dtype, out, keepdims=keepdims, where=where)


Saving objects to /home/lw23382/Documents/MEM_DNN_project/MEMFlow/memflow/dataset/tth_gen
Parton dataset with 462769 events
 Initial states pdgids : [21, 21]
 Final states pdgids   : [25, 6, -6, 21]
 Final states masses   : [125.25, 172.5, 172.5, 0.0]
Containing the following tensors
boost         : data ([462769, 1, 4]), mask ([462769, 1])
                Presences : [100.00%]
                Features : ['x', 'y', 'z', 't']
                Selected for batches : True
partons       : data ([462769, 7, 6]), mask ([462769, 7])
                Presences : [100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 99.99%]
                Features : ['pt', 'eta', 'phi', 'mass', 'pdgId', 'prov']
                Selected for batches : True
leptons       : data ([462769, 2, 5]), mask ([462769, 2])
                Presences : [100.00%, 100.00%]
                Features : ['pt', 'eta', 'phi', 'mass', 'pdgId']
                Selected for batches : False
higgs         : data ([462769, 1, 4]), mask ([

In [4]:
# Wrap the gen-level dataset in a DataLoader and pull one batch as a sanity check.
loader_parton = DataLoader(gen_dataset, batch_size=32)
batch = next(iter(loader_parton))

# Pair every tensor of the batch with the selection name at the same position
# and show its shape. `loader_parton.dataset` is `gen_dataset` itself.
for obj, sel in zip(batch, gen_dataset.selection):
    print(sel, obj.shape)

boost torch.Size([32, 1, 4])
partons torch.Size([32, 7, 6])
higgs torch.Size([32, 1, 4])
top_leptonic torch.Size([32, 1, 4])
top_hadronic torch.Size([32, 1, 4])
ISR torch.Size([32, 1, 4])


In [5]:
# Reconstruction-level dataset built from the same `data` object.
# NOTE(review): in the summary printed below every object ends up with the
# feature list ['pt', 'eta', 'phi', 'mass', 'btag'], so `default_features`
# presumably supplies the value for features an object lacks — confirm in
# ttHRecoDataset.
reco_dataset = ttHRecoDataset(
    data=data,
    selection=['jets', 'lepton', 'met'],
    default_features={'pt': 0., 'eta': 0., 'phi': 0., 'mass': 0., 'btag': -1.},
    build=True,
)
print(reco_dataset)

Saving objects to /home/lw23382/Documents/MEM_DNN_project/MEMFlow/memflow/dataset/tth_reco
Object boost : discarding features ['m']
Object jets : discarding features ['m', 'matched', 'prov']
Object lepton : discarding features ['m']
Object met : discarding features ['m']
Reco dataset with 462769 events
Containing the following tensors
boost   : data ([462769, 1, 5]), mask ([462769, 1])
          Presences : [100.00%]
          Features : ['pt', 'eta', 'phi', 'mass', 'btag']
          Selected for batches : False
jets    : data ([462769, 15, 5]), mask ([462769, 15])
          Presences : [100.00%, 100.00%, 100.00%, 100.00%, 79.92%, 46.55%, 20.02%, 6.95%, 2.11%, 0.57%, 0.13%, 0.03%, <0.01%, <0.01%, <0.01%]
          Features : ['pt', 'eta', 'phi', 'mass', 'btag']
          Selected for batches : True
lepton  : data ([462769, 1, 5]), mask ([462769, 1])
          Presences : [100.00%]
          Features : ['pt', 'eta', 'phi', 'mass', 'btag']
          Selected for batches : True
met     : 

In [6]:
# Wrap the reco-level dataset in a DataLoader and pull one batch as a sanity check.
loader_reco = DataLoader(reco_dataset, batch_size=32)
batch = next(iter(loader_reco))

# Pair every tensor of the batch with the selection name at the same position
# and show its shape. `loader_reco.dataset` is `reco_dataset` itself.
for obj, sel in zip(batch, reco_dataset.selection):
    print(sel, obj.shape)

jets torch.Size([32, 15, 5])
lepton torch.Size([32, 1, 5])
met torch.Size([32, 1, 5])


In [7]:
# Combine the gen-level and reco-level datasets into a single dataset object;
# its printed summary starts with the parton dataset description.
comb_dataset = CombinedDataset(gen_dataset=gen_dataset, reco_dataset=reco_dataset)
print(comb_dataset)

Combined dataset :
Parton dataset with 462769 events
 Initial states pdgids : [21, 21]
 Final states pdgids   : [25, 6, -6, 21]
 Final states masses   : [125.25, 172.5, 172.5, 0.0]
Containing the following tensors
boost         : data ([462769, 1, 4]), mask ([462769, 1])
                Presences : [100.00%]
                Features : ['x', 'y', 'z', 't']
                Selected for batches : True
partons       : data ([462769, 7, 6]), mask ([462769, 7])
                Presences : [100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 99.99%]
                Features : ['pt', 'eta', 'phi', 'mass', 'pdgId', 'prov']
                Selected for batches : True
leptons       : data ([462769, 2, 5]), mask ([462769, 2])
                Presences : [100.00%, 100.00%]
                Features : ['pt', 'eta', 'phi', 'mass', 'pdgId']
                Selected for batches : False
higgs         : data ([462769, 1, 4]), mask ([462769, 1])
                Presences : [100.00%]
                Featu

In [8]:
# Wrap the combined dataset in a DataLoader and pull one batch as a sanity check.
loader_comb = DataLoader(comb_dataset, batch_size=32)
batch = next(iter(loader_comb))

# Label the tensors with the gen-level selection followed by the reco-level
# selection, then show each shape. `loader_comb.dataset` is `comb_dataset` itself.
for obj, sel in zip(
    batch,
    comb_dataset.gen_dataset.selection + comb_dataset.reco_dataset.selection,
):
    print(sel, obj.shape)

boost torch.Size([32, 1, 4])
partons torch.Size([32, 7, 6])
higgs torch.Size([32, 1, 4])
top_leptonic torch.Size([32, 1, 4])
top_hadronic torch.Size([32, 1, 4])
ISR torch.Size([32, 1, 4])
jets torch.Size([32, 15, 5])
lepton torch.Size([32, 1, 5])
met torch.Size([32, 1, 5])
