In [1]:
import os

import numpy as np
import pandas as pd
import awkward as ak

import vector
vector.register_numba()
vector.register_awkward()

from coffea.util import load
from coffea.processor.accumulator import column_accumulator
from coffea.processor import accumulate

# Loading the exported dataset

We open the .coffea file and read the output accumulator. The ntuples for the training are saved under the key `columns`.

In [2]:
# Select which exported dataset to open: data-taking year and sample name.
year = "2017"
sample = "ttHTobb_ttToSemiLep"

# Location of the .coffea output file, built from the selection above.
input_path = f"/eos/user/m/mmarcheg/ttHbb/training_datasets/{year}/output_{sample}_{year}.coffea"

# `df` is the object returned by coffea's `load`; the cells below index it
# like a nested dictionary ("cutflow", "sumw", "sum_genweights", "columns").
df = load(input_path)

In [3]:
# Display the "cutflow" entry of the loaded output
# (presumably the number of events passing each selection step -- confirm with the processor).
df["cutflow"]

{'initial': {'ttHTobb_ttToSemiLep_2017': 9992000},
 'skim': {'ttHTobb_ttToSemiLep_2017': 2572504},
 'presel': {'ttHTobb_ttToSemiLep_2017': 1361657},
 'baseline': {'ttHTobb_ttToSemiLep_2017': {'ttHTobb_ttToSemiLep': 1361657}},
 'semilep_LHE': {'ttHTobb_ttToSemiLep_2017': {'ttHTobb_ttToSemiLep': 1277812}}}

In [4]:
# Display the "sumw" entry of the loaded output
# (presumably the sum of event weights per category -- confirm with the processor).
df["sumw"]

{'baseline': {'ttHTobb_ttToSemiLep_2017': {'ttHTobb_ttToSemiLep': 646.6523174590126}},
 'semilep_LHE': {'ttHTobb_ttToSemiLep_2017': {'ttHTobb_ttToSemiLep': 606.8207020523254}}}

In [5]:
# Display the per-dataset sum of genweights; the `weight` column is divided by this value further down.
df["sum_genweights"]

{'ttHTobb_ttToSemiLep_2017': 2249755.0}

## Normalize the genweights

The `weight` array is filled on the fly with the weight of each event, so it does not include the overall scaling by the sum of genweights (`sum_genweights`).
To correct for this, we rescale the `weight` array by hand, dividing it by the sum of genweights.

In [6]:
# The keys of `sum_genweights` are the dataset names; they are reused below
# to index `df["columns"][sample]` and to pick the matching normalisation.
datasets = df["sum_genweights"].keys()
print("Datasets: ", datasets)

Datasets:  dict_keys(['ttHTobb_ttToSemiLep_2017'])


In [7]:
# Event weights of the "semilep_LHE" category *before* normalisation, kept in `w` for comparison.
w = df["columns"][sample][f"{sample}_{year}"]["semilep_LHE"]["weight"]
w

column_accumulator(array([1500.0080289 , 1089.91516367, 1150.18742552, ..., 1094.61906636,
        902.0117143 , 1089.56201334]))

In [8]:
# List the columns saved for the "semilep_LHE" category of this dataset.
df["columns"][sample][f"{sample}_{year}"]["semilep_LHE"].keys()

dict_keys(['weight', 'Parton_N', 'Parton_pt', 'Parton_eta', 'Parton_phi', 'Parton_mass', 'Parton_pdgId', 'Parton_provenance', 'PartonMatched_N', 'PartonMatched_pt', 'PartonMatched_eta', 'PartonMatched_phi', 'PartonMatched_mass', 'PartonMatched_pdgId', 'PartonMatched_provenance', 'PartonMatched_dRMatchedJet', 'JetGood_N', 'JetGood_pt', 'JetGood_eta', 'JetGood_phi', 'JetGood_hadronFlavour', 'JetGood_btagDeepFlavB', 'JetGoodMatched_N', 'JetGoodMatched_pt', 'JetGoodMatched_eta', 'JetGoodMatched_phi', 'JetGoodMatched_hadronFlavour', 'JetGoodMatched_btagDeepFlavB', 'JetGoodMatched_dRMatchedJet', 'LeptonGood_pt', 'LeptonGood_eta', 'LeptonGood_phi', 'LeptonGood_pdgId', 'LeptonGood_charge', 'LeptonGood_mvaTTH', 'MET_phi', 'MET_pt', 'MET_significance', 'Generator_x1', 'Generator_x2', 'Generator_id1', 'Generator_id2', 'Generator_xpdf1', 'Generator_xpdf2', 'LeptonParton_N', 'LeptonParton_pt', 'LeptonParton_eta', 'LeptonParton_phi', 'LeptonParton_mass', 'LeptonParton_pdgId', 'HiggsParton_pt', 'Higg

In [9]:
# Divide the `weight` column of every dataset by that dataset's sum of genweights.
# NOTE: the column is overwritten in place, so running this cell a second time
# divides the weights again -- reload `df` before re-running it.
for dataset in datasets:
    selection = df["columns"][sample][dataset]["semilep_LHE"]
    normalised = selection["weight"].value / df["sum_genweights"][dataset]
    selection["weight"] = column_accumulator(normalised)

In [10]:
# Same column as `w` above, read again after the division by the sum of genweights.
w_new = df["columns"][sample][f"{sample}_{year}"]["semilep_LHE"]["weight"]
w_new

column_accumulator(array([0.00066674, 0.00048446, 0.00051125, ..., 0.00048655, 0.00040094,
       0.0004843 ]))

## Accumulate ntuples from different data-taking eras

In order to enlarge our training sample, we merge ntuples coming from different data-taking eras.

In [11]:
# Merge the "semilep_LHE" columns of all datasets into a single set of columns.
cs = accumulate([df["columns"][sample][dataset]["semilep_LHE"] for dataset in datasets])


def _col(name):
    """Return the flat array stored in the accumulated column `name`."""
    return cs[name].value


def _momentum(fields, counts=None):
    """Zip `fields` into `Momentum4D` records; if `counts` is given, regroup them per event."""
    records = ak.zip(fields, with_name='Momentum4D')
    if counts is None:
        return records
    return ak.unflatten(records, counts)


# All generated partons, grouped per event with the `Parton_N` counts.
partons = _momentum(
    {
        "pt": _col("Parton_pt"),
        "eta": _col("Parton_eta"),
        "phi": _col("Parton_phi"),
        "mass": _col("Parton_mass"),
        "pdgId": _col("Parton_pdgId"),
        "prov": _col("Parton_provenance"),
    },
    counts=_col("Parton_N"),
)

# Partons from the `PartonMatched_*` columns.
partons_matched = _momentum(
    {
        "pt": _col("PartonMatched_pt"),
        "eta": _col("PartonMatched_eta"),
        "phi": _col("PartonMatched_phi"),
        "mass": _col("PartonMatched_mass"),
        "pdgId": _col("PartonMatched_pdgId"),
        "prov": _col("PartonMatched_provenance"),
    },
    counts=_col("PartonMatched_N"),
)

# Reconstructed jets; the mass component is set to zero.
jets = _momentum(
    {
        "pt": _col("JetGood_pt"),
        "eta": _col("JetGood_eta"),
        "phi": _col("JetGood_phi"),
        "btag": _col("JetGood_btagDeepFlavB"),
        "m": np.zeros_like(_col("JetGood_pt")),
    },
    counts=_col("JetGood_N"),
)

# Jets from the `JetGoodMatched_*` columns. The provenance is read from the
# `PartonMatched_provenance` column, so the two collections must have the same layout.
jets_matched = _momentum(
    {
        "pt": _col("JetGoodMatched_pt"),
        "eta": _col("JetGoodMatched_eta"),
        "phi": _col("JetGoodMatched_phi"),
        "btag": _col("JetGoodMatched_btagDeepFlavB"),
        "prov": _col("PartonMatched_provenance"),
        "m": np.zeros_like(_col("JetGoodMatched_pt")),
    },
    counts=_col("JetGoodMatched_N"),
)

# Generator-level information: one plain record per entry (no vector behaviour attached).
generator_info = ak.zip(
    {
        "pdgid1": _col("Generator_id1"),
        "pdgid2": _col("Generator_id2"),
        "x1": _col("Generator_x1"),
        "x2": _col("Generator_x2"),
    }
)

# Parton-level leptons, grouped per event with the `LeptonParton_N` counts.
lepton_partons = _momentum(
    {
        "pt": _col("LeptonParton_pt"),
        "eta": _col("LeptonParton_eta"),
        "phi": _col("LeptonParton_phi"),
        "mass": _col("LeptonParton_mass"),
        "pdgId": _col("LeptonParton_pdgId"),
    },
    counts=_col("LeptonParton_N"),
)

# Reconstructed lepton (kept flat, not regrouped); the mass component is set to zero.
lepton = _momentum(
    {
        "pt": _col("LeptonGood_pt"),
        "eta": _col("LeptonGood_eta"),
        "phi": _col("LeptonGood_phi"),
        "pdgId": _col("LeptonGood_pdgId"),
        "m": np.zeros_like(_col("LeptonGood_pt")),
    }
)

# Missing transverse momentum: only pt and phi are read from the columns; eta and mass are zero.
met = _momentum(
    {
        "pt": _col("MET_pt"),
        "eta": np.zeros_like(_col("MET_pt")),
        "phi": _col("MET_phi"),
        "m": np.zeros_like(_col("MET_pt")),
    }
)

# Parton-level Higgs boson (kept flat, not regrouped).
higgs = _momentum(
    {
        "pt": _col("HiggsParton_pt"),
        "eta": _col("HiggsParton_eta"),
        "phi": _col("HiggsParton_phi"),
        "m": _col("HiggsParton_mass"),
    }
)


In [12]:
# Fraction of `jets_matched` entries whose pt equals -999
# (the value that the next cell turns into None, i.e. the not-matched entries).
ak.sum(jets_matched.pt == -999) / ak.count(jets_matched.pt)

0.14270152939384279

In [13]:
# Entries with pt == -999 are the not-matched ones: replace them with None.
# `valid_when=False` states explicitly that True entries of the boolean array are
# the ones to mask out. The previous call passed `None` positionally in that slot,
# which only worked because it was implicitly treated as False, and which is
# rejected where `valid_when` is keyword-only or strictly type-checked.
jets_matched = ak.mask(jets_matched, jets_matched.pt == -999, valid_when=False)
partons_matched = ak.mask(partons_matched, partons_matched.pt == -999, valid_when=False)

# Per-jet boolean flag: True where the corresponding `jets_matched` entry is not None.
is_jet_matched = ~ak.is_none(jets_matched, axis=1)
jets = ak.with_field(jets, is_jet_matched, "matched")

# Copy the provenance onto the jets, using -1 for the not-matched ones.
jets = ak.with_field(jets, ak.fill_none(jets_matched.prov, -1), "prov")

In [14]:
# Display the per-jet `matched` flag added in the previous cell.
jets.matched

<Array [[True, True, True, ... True, True]] type='1277812 * var * bool'>

The `jets` and `partons_matched` arrays are **aligned**.

In [15]:
# Bundle all collections into one record per event. `depth_limit=1` zips only
# at the event level, so collections with different per-event multiplicities can coexist.
dfout = ak.zip(
    {
        "jets": jets,
        "partons_matched": partons_matched,
        "partons": partons,
        "generator_info": generator_info,
        "lepton_partons": lepton_partons,
        "lepton_reco": lepton,
        "met": met,
        "higgs": higgs,
    },
    depth_limit=1,
)

# Build the output name from `sample` and `year` instead of hard-coding them, so that
# changing the selection at the top of the notebook cannot silently overwrite the
# file of another sample/year. For the current values the path is unchanged.
output_path = f"/eos/user/m/mmarcheg/ttHbb/training_datasets/test/{sample}_{year}_v2.parquet"
ak.to_parquet(dfout, output_path)