## Feature processing for production mechanism classification

Di-photon event selection based on https://arxiv.org/abs/1802.04146

In [1]:
import os
import time

from itertools import chain
from multiprocessing.dummy import Process
from multiprocessing.dummy import Lock
from multiprocessing.dummy import Value

import numpy as np
import scipy as sp
from scipy import sparse
import pandas as pd
import matplotlib.pyplot as plt

import uproot

In [2]:
# Input location (host + path, later prefixed with "root://") and the local
# directory receiving the extracted features. Both can be overridden through
# environment variables so the notebook is runnable outside the original
# author's account; the defaults are the original hard-coded values.
data_path = os.environ.get("HGG_DATA_PATH", "eosuser.cern.ch//eos/user/a/ananiev/data/")
output_path = os.environ.get("HGG_OUTPUT_PATH", "/afs/cern.ch/user/a/ananiev/cernbox/output/")

We keep working with Monte Carlo simulations, since only they come with labels.

In [3]:
# Catalogue of input samples: process -> sample type -> list of
# (file stem, metadata) pairs. The "tag" metadata is the production-mechanism
# label later attached to every row extracted from that file.
datasets = {
    "GamGam": {
        "MC": [
            (file_stem, {"tag": production_tag})
            for file_stem, production_tag in (
                ("mc_341081.ttH125_gamgam.GamGam", "tt"),
                ("mc_343981.ggH125_gamgam.GamGam", "gg"),
                ("mc_345041.VBFH125_gamgam.GamGam", "VBF"),
                ("mc_345318.WpH125J_Wincl_gamgam.GamGam", "Wp"),
                ("mc_345319.ZH125J_Zincl_gamgam.GamGam", "Z"),
            )
        ]
    }
}

In [4]:
# Open the first catalogued MC sample via its root:// URL for interactive inspection.
first_mc_sample = datasets["GamGam"]["MC"][0][0]
test_mc = uproot.open(os.path.join("root://", data_path, "GamGam", "MC", first_mc_sample + ".root"))

In [5]:
# Take the first batch yielded by iterating over all branches ("*") of the
# "mini" tree; this small sample is what the commented-out smoke calls after
# the process_* functions below operate on.
# NOTE(review): entrystop=10000 presumably limits reading to the first 10000
# entries — confirm against the uproot version in use.
test_mc_events = next(test_mc["mini"].iterate(["*"], entrystop=10000))

In [6]:
def yield_files_with_meta(datasets, base_path=None):
    """Walk the dataset catalogue and yield one record per input file.

    Parameters
    ----------
    datasets : dict
        Nested mapping ``{process_name: {type_name: [entry, ...]}}`` where each
        entry is either a bare file stem (``str``) or a ``(file_stem, meta)``
        pair.
    base_path : str, optional
        Host + directory placed right after the ``root://`` scheme. Defaults to
        the notebook-level ``data_path``.

    Yields
    ------
    tuple
        ``((process_name, type_name, filename), meta, fullpath)`` where ``meta``
        is ``{}`` for bare file stems and ``fullpath`` is the ``root://`` URL
        of ``<filename>.root``.

    Raises
    ------
    ValueError
        If a non-string entry is not a 2-item sequence.
    """
    if base_path is None:
        base_path = data_path
    for process_name, process in datasets.items():
        print("Process: ", process_name)
        for type_name, thetype in process.items():
            print("Type: ", type_name)
            for filedata in thetype:
                # Test for str explicitly: tuple-unpacking a bare two-character
                # string would silently split it into (filename, meta).
                if isinstance(filedata, str):
                    filename, meta = filedata, {}
                else:
                    filename, meta = filedata
                print("File: ", filename)
                fullpath = os.path.join("root://", base_path, process_name, type_name, f"{filename}.root")
                yield (process_name, type_name, filename), meta, fullpath

In [7]:
def dict_apply_mask(d, mask, fields=None):
    """Filter the values of ``d`` in place with ``mask``.

    Every selected entry ``d[name]`` is replaced by ``d[name][mask]``. When
    ``fields`` is None all keys of ``d`` are processed; otherwise only the
    listed ones. Returns None.
    """
    for name in (d.keys() if fields is None else fields):
        column = d[name]
        d[name] = column[mask]

In [8]:
def eta2tg_theta(eta):
    """Convert pseudorapidity ``eta`` into tan(theta) of the polar angle.

    With eta = -ln(tan(theta/2)) we have tan(theta/2) = exp(-eta), and the
    double-angle identity gives tan(theta) = 2*t / (1 - t**2) for
    t = tan(theta/2). (The previous version squared the numerator, which is
    not tan(theta).)

    Works element-wise on arrays. At eta == 0 (theta = pi/2) the result is
    infinite, as expected for a tangent.
    """
    tg_half_theta = np.exp(-eta)
    return 2*tg_half_theta/(1 - tg_half_theta**2)

In [9]:
def atlas_two_cosine(events, p1, p2):
    """Cosine of the opening angle between two particles given (eta, phi).

    Parameters
    ----------
    events : mapping
        Must provide ``<p>_eta`` and ``<p>_phi`` entries (bytes keys) for both
        prefixes.
    p1, p2 : bytes
        Key prefixes of the two particles, e.g. ``b"photon_1lead"``.

    Returns
    -------
    Element-wise cosine of the angle between the two directions.

    Notes
    -----
    For unit vectors n = (sin(theta)cos(phi), sin(theta)sin(phi), cos(theta)):
    n1.n2 = (cos(dphi) + cot(theta1)cot(theta2))
            / sqrt((1 + cot(theta1)**2) * (1 + cot(theta2)**2)),
    and cot(theta) = sinh(eta), so the denominator is cosh(eta1)*cosh(eta2).
    The previous version plugged a (mis-computed) tangent into the cotangent
    slots, which gave wrong angles and NaN at eta == 0.
    """
    eta_1 = events[p1+b"_eta"]
    eta_2 = events[p2+b"_eta"]
    cos_delta_phi = np.cos(events[p2+b"_phi"] - events[p1+b"_phi"])
    return (cos_delta_phi + np.sinh(eta_1)*np.sinh(eta_2))/(np.cosh(eta_1)*np.cosh(eta_2))

We still follow the "macro" / "micro" feature classification, where "micro" features describe particles within the event (pT of the 1st, 2nd, 3rd photon...). We extract features from an event batch in groups, one per particle type. We begin with photons. Basically, we repeat the cuts applied in the Higgs di-photon analysis, but here we use a narrower window around the Higgs mass and we don't care much about the rapidity cuts. Since we train on MC, we don't need to check whether events hit the detector.

In [10]:
def process_photons(events, mask):
    """Apply the di-photon selection and build per-event photon features.

    Parameters
    ----------
    events : mapping
        Raw event batch keyed by bytes branch names.
        # assumes per-photon branches are jagged arrays that support
        # ``argsort(ascending=False)`` and ``[rows, cols]`` indexing — TODO confirm
    mask : boolean array
        Events (over the full batch) to consider. Not modified.

    Returns
    -------
    (macro_events, macro_mask)
        ``macro_events`` holds one value per surviving event: ``photon_n``,
        kinematics and ``etcone20`` of the two highest-pT photons, and the
        di-photon invariant mass ``h_mass``. ``macro_mask`` is a copy of
        ``mask`` narrowed to the surviving events.
    """
    macro_mask = mask.copy()

    macro_events = {}
    micro_events = {}

    # --- preselection: at least two photons and the photon trigger fired ---
    macro_events[b"photon_n"] = events[b"photon_n"][macro_mask]
    macro_events[b"trigP"] = events[b"trigP"][macro_mask]
    n_threshold = macro_events[b"photon_n"] >= 2
    is_diphoton = macro_events[b"trigP"]
    preselection = n_threshold & is_diphoton
    # Reduce everything collected so far with the *preselection* (which has the
    # same length as these arrays). The previous code re-indexed photon_n with
    # the full-length macro_mask and left trigP unreduced, which only worked
    # when every event of an all-True input mask passed.
    dict_apply_mask(macro_events, preselection)
    macro_mask[macro_mask] = preselection

    photon_fields = (b"pt", b"eta", b"phi", b"E",
                     b"isTightID", b"trigMatched", b"ptcone30", b"etcone20")
    for field in photon_fields:
        micro_events[b"photon_" + field] = events[b"photon_" + field][macro_mask]

    # Per-event photon indices ordered by descending pT.
    pts = micro_events[b"photon_pt"].argsort(ascending=False)
    row_indices = np.arange(pts.shape[0])
    lead_pts = pts[:, 0]
    sublead_pts = pts[:, 1]

    # Promote the leading / sub-leading photon to flat per-event features
    # (insertion order defines the output column order: all 1lead, then 2lead).
    for rank, photon_index in ((b"1lead", lead_pts), (b"2lead", sublead_pts)):
        for field in photon_fields:
            macro_events[b"photon_" + rank + b"_" + field] = \
                micro_events[b"photon_" + field][row_indices, photon_index]

    # NOTE(review): the isolation cuts compare the raw cone variables with
    # 0.065; the reference di-photon analysis presumably cuts on the ratio
    # cone/pT < 0.065 — confirm which is intended.
    macro_filter = (  (macro_events[b"photon_1lead_pt"] > 25000)
                    &
                      (macro_events[b"photon_2lead_pt"] > 25000)
                    &
                      (macro_events[b"photon_1lead_isTightID"])
                    &
                      (macro_events[b"photon_2lead_isTightID"])
                    &
                      (macro_events[b"photon_1lead_trigMatched"])
                    &
                      (macro_events[b"photon_2lead_trigMatched"])
                    &
                      (macro_events[b"photon_1lead_ptcone30"] < 0.065)
                    &
                      (macro_events[b"photon_1lead_etcone20"] < 0.065)
                    &
                      (macro_events[b"photon_2lead_ptcone30"] < 0.065)
                    &
                      (macro_events[b"photon_2lead_etcone20"] < 0.065)
                   )

    dict_apply_mask(macro_events, macro_filter)
    macro_mask[macro_mask] = macro_filter

    # Invariant mass of two massless particles: m^2 = 2 E1 E2 (1 - cos(angle)).
    macro_events[b"h_mass"] = np.sqrt(2.*macro_events[b"photon_1lead_E"]
                                        *macro_events[b"photon_2lead_E"]
                                        *(1. - atlas_two_cosine(macro_events, b"photon_1lead", b"photon_2lead"))
                                     )

    # NOTE(review): the relative cuts use E/m here; confirm whether pT/m was intended.
    mass_cutoff =   (macro_events[b"photon_1lead_E"]/macro_events[b"h_mass"] > 0.35) \
                  & (macro_events[b"photon_2lead_E"]/macro_events[b"h_mass"] > 0.25) \
                  & (macro_events[b"h_mass"] >= 115000.) \
                  & (macro_events[b"h_mass"] <= 135000)

    dict_apply_mask(macro_events, mass_cutoff)
    macro_mask[macro_mask] = mass_cutoff

    # Drop selection-only columns so they do not end up as features.
    del macro_events[b"photon_1lead_isTightID"]
    del macro_events[b"photon_2lead_isTightID"]
    del macro_events[b"photon_1lead_trigMatched"]
    del macro_events[b"photon_2lead_trigMatched"]
    del macro_events[b"photon_1lead_ptcone30"]  # both null
    del macro_events[b"photon_2lead_ptcone30"]
    del macro_events[b"trigP"]

    return macro_events, macro_mask
#process_photons(test_mc_events, np.ones_like(test_mc_events[b"photon_n"], dtype=np.bool))

Weight extraction follows the di-photon analysis exactly.

In [11]:
def process_weights(events, mask):
    """Compute the per-event weight for the events selected by ``mask``.

    The weight is the product of ``mcWeight`` and all scale factors, divided by
    the first ``SumWeights`` entry and multiplied by the first ``XSection``
    entry of the batch.

    Returns ``({b"weight": weights}, mask)``; ``mask`` is passed through
    unchanged.
    """
    factor_branches = (
        b"mcWeight",
        b'scaleFactor_PILEUP',
        b'scaleFactor_ELE',
        b'scaleFactor_MUON',
        b'scaleFactor_PHOTON',
        b'scaleFactor_TAU',
        b'scaleFactor_BTAG',
        b'scaleFactor_LepTRIGGER',
        b'scaleFactor_PhotonTRIGGER',
    )
    sum_of_weights = events[b"SumWeights"][0]
    cross_section = events[b"XSection"][0]

    # Multiply left to right, in the same order as the branches are listed.
    combined = events[factor_branches[0]]
    for branch in factor_branches[1:]:
        combined = combined * events[branch]

    selected = combined[mask]/sum_of_weights*cross_section
    return {b"weight": selected}, mask
#process_weights(test_mc_events, np.ones_like(test_mc_events[b"photon_n"], dtype=np.bool))

In [12]:
def extract_descriptive(events, field):
    """Summarise ``events[field]`` with min, max, mean, sum and std.

    Returns a dict keyed ``field + b"_min"``, ``b"_max"``, ``b"_mean"``,
    ``b"_sum"`` and ``b"_std"`` (in that order), each holding the result of the
    corresponding method called without arguments on the values.
    # presumably these are per-event reductions when the values are jagged
    # arrays — verify against the array library in use
    """
    values = events[field]
    reducers = (
        (b"_min", values.min),
        (b"_max", values.max),
        (b"_mean", values.mean),
        (b"_sum", values.sum),
        (b"_std", values.std),
    )
    return {field + suffix: reducer() for suffix, reducer in reducers}

For jets and leptons, in addition to their macro features we also extract `descriptive` features — min, max, mean, sum and std — computed on the micro features. For instance: the max pT of leptons within the event. In this way we avoid the problem of a variable number of features, which would otherwise become an issue for ML algorithms. There are several ways to tackle that problem. While we follow only one of them, knowing the alternatives might be helpful when developing more advanced solutions.

Variable feature number handling options:

* Feature aggregation (the one we follow due to its simplicity)
* Data imputation. Assume an event always has "15" leptons, and fill in the gaps with the mean or with values drawn from the distribution if the number of leptons < "15".
* Set kernels, which allow computing a "distance" between sets irrespective of the order of elements and allow sets of different sizes to be compared. One of the famous ones is [Pyramid Match](http://jmlr.csail.mit.edu/papers/volume8/grauman07a/grauman07a.pdf), which compares overlaps between histograms built from feature sets.
* Neural nets that find an embedding of sets of variable size in a fixed-dimensional space. Example: [DeepSets](https://papers.nips.cc/paper/6931-deep-sets)

In [13]:
def process_jets(events, mask):
    """Build per-event jet features for the events selected by ``mask``.

    Besides ``jet_n``, each per-jet quantity (pt, phi, E, theta, MV2c10) is
    summarised with ``extract_descriptive`` (min/max/mean/sum/std).

    Returns ``(macro_events, macro_mask)``; jets apply no further selection,
    so ``macro_mask`` is an unchanged copy of ``mask``.
    """
    macro_mask = mask.copy()

    macro_events = {b"jet_n": events[b"jet_n"][macro_mask]}

    # Polar angle straight from the definition eta = -ln(tan(theta/2)), i.e.
    # theta = 2*arctan(exp(-eta)), which lies in (0, pi). The previous
    # arctan(tan(theta)) route relied on a wrong tangent formula and, even with
    # a correct tangent, would fold eta < 0 into negative angles.
    jet_eta = events[b"jet_eta"][macro_mask]
    micro_events = {
        b"jet_pt": events[b"jet_pt"][macro_mask],
        b"jet_theta": 2*np.arctan(np.exp(-jet_eta)),
        b"jet_phi": events[b"jet_phi"][macro_mask],
        b"jet_E": events[b"jet_E"][macro_mask],
        b"jet_MV2c10": events[b"jet_MV2c10"][macro_mask],
    }

    # Order matters: it defines the order of the output columns.
    for field in (b"jet_pt", b"jet_phi", b"jet_E", b"jet_theta", b"jet_MV2c10"):
        macro_events.update(extract_descriptive(micro_events, field))

    return macro_events, macro_mask
#process_jets(test_mc_events, np.ones_like(test_mc_events[b"jet_n"], dtype=np.bool))

In [14]:
def process_lep(events, mask):
    """Build per-event lepton features for the events selected by ``mask``.

    Besides ``lep_n``, each per-lepton quantity (pt, phi, E, theta, charge, z0,
    ptcone30, etcone20) is summarised with ``extract_descriptive``
    (min/max/mean/sum/std).

    Returns ``(macro_events, macro_mask)``; leptons apply no further selection,
    so ``macro_mask`` is an unchanged copy of ``mask``.
    """
    macro_mask = mask.copy()

    macro_events = {b"lep_n": events[b"lep_n"][macro_mask]}

    # Polar angle straight from the definition eta = -ln(tan(theta/2)), i.e.
    # theta = 2*arctan(exp(-eta)), which lies in (0, pi). The previous
    # arctan(tan(theta)) route relied on a wrong tangent formula and, even with
    # a correct tangent, would fold eta < 0 into negative angles.
    lep_eta = events[b"lep_eta"][macro_mask]
    micro_events = {
        b"lep_pt": events[b"lep_pt"][macro_mask],
        b"lep_theta": 2*np.arctan(np.exp(-lep_eta)),
        b"lep_phi": events[b"lep_phi"][macro_mask],
        b"lep_E": events[b"lep_E"][macro_mask],
        b"lep_z0": events[b"lep_z0"][macro_mask],
        b"lep_charge": events[b"lep_charge"][macro_mask],
        b"lep_ptcone30": events[b"lep_ptcone30"][macro_mask],
        b"lep_etcone20": events[b"lep_etcone20"][macro_mask],
    }

    # Order matters: it defines the order of the output columns.
    for field in (b"lep_pt", b"lep_phi", b"lep_E", b"lep_theta",
                  b"lep_charge", b"lep_z0", b"lep_ptcone30", b"lep_etcone20"):
        macro_events.update(extract_descriptive(micro_events, field))

    return macro_events, macro_mask
#process_lep(test_mc_events, np.ones_like(test_mc_events[b"lep_n"], dtype=np.bool))

In [15]:
def mask_backprop(mask_seq, obj_seq):
    """Shrink earlier stage outputs to the events kept by the last stage.

    ``mask_seq[i]`` is the full-length event mask after stage ``i`` and
    ``obj_seq[i]`` the dict of arrays that stage produced. The last pair
    supplies the reference mask and is itself left untouched; every earlier
    dict is filtered in place (via ``dict_apply_mask``) to the events that are
    set in both its own mask and the last mask.
    """
    stages = zip(reversed(mask_seq), reversed(obj_seq))
    final_mask, _ = next(stages)
    for stage_mask, stage_obj in stages:
        # Express "kept by the last stage" in the index space of this stage's arrays.
        surviving = (stage_mask & final_mask)[stage_mask]
        dict_apply_mask(stage_obj, surviving)

In [16]:
def process_event_batch(events):
    """Turn one raw event batch into a flat feature table.

    Runs the photon, jet, lepton and weight extractors in sequence (each
    receives the mask produced by the previous one), aligns all outputs to the
    final mask, adds the missing-ET features and returns everything as a
    ``pandas.DataFrame`` with one row per surviving event and bytes column
    names.
    """
    mask_seq = []
    obj_seq = []

    # Builtin bool instead of the ``np.bool`` alias, which was removed in NumPy 1.24.
    mask = np.ones_like(events[b"photon_n"], dtype=bool)

    # Looping over the extractors stores each stage's own output; the previous
    # code appended ``photons`` a second time after process_jets, so no jet
    # feature ever reached the output table.
    for extractor in (process_photons, process_jets, process_lep, process_weights):
        obj, mask = extractor(events, mask)
        mask_seq.append(mask)
        obj_seq.append(obj)

    mask_backprop(mask_seq, obj_seq)

    other_features = {
          b"met_et": events[b"met_et"]
        , b"met_phi": events[b"met_phi"]
    }
    dict_apply_mask(other_features, mask)

    batch = {}
    for obj in chain(obj_seq, [other_features]):
        batch.update(obj)
    return pd.DataFrame(batch)
    
#process_event_batch(test_events, bin_edges)

In [17]:
def touch(path):
    """Make sure ``path`` exists, creating parent directories and an empty file.

    Returns True when the path already existed (nothing is modified) and False
    when a new empty file had to be created.
    """
    if os.path.exists(path):
        return True
    dirpath = os.path.dirname(path)
    # A bare filename has an empty dirname, and os.makedirs("") raises.
    if dirpath:
        os.makedirs(dirpath, exist_ok=True)
    with open(path, "w"):
        pass
    return False

In [18]:
def process_file(filepath, label, fout, flock, entrysteps, write_header):
    """Extract features from one ROOT file and append them to ``fout`` as TSV.

    Parameters
    ----------
    filepath : str
        Location passed to ``uproot.open``.
    label : str
        Value stored in the ``label`` column of every emitted row.
    fout : file object
        Open text handle receiving the rows.
    flock : lock
        Held while writing to ``fout``.
    entrysteps : int
        Batch size passed to the tree iterator; also used for the progress print.
    write_header : object with a ``.value`` attribute
        Non-zero while the header still has to be written; reset to 0 after
        the first write.

    Returns True once the file has been processed.
    """
    print(filepath)
    with uproot.open(filepath) as root_file:
        batches = root_file["mini"].iterate(["*"], entrysteps=entrysteps)
        for batch_no, raw_batch in enumerate(batches, start=1):
            print(label, "Processing: " + str(batch_no*entrysteps) + "\n")
            frame = process_event_batch(raw_batch)
            # Branch names arrive as bytes; the TSV header should be text.
            frame.columns = [name.decode("utf-8") for name in frame.columns]
            frame["label"] = label
            with flock:
                emit_header = bool(write_header.value)
                frame.to_csv(fout, sep="\t", header=emit_header, index=False)
                if write_header.value:
                    write_header.value = 0
                fout.flush()
    return True

In [19]:
def events_per_file(datasets, output_file):
    """Extract features from every catalogued file into one TSV, one thread per file.

    Parameters
    ----------
    datasets : dict
        Catalogue understood by ``yield_files_with_meta``.
    output_file : str
        Path of the TSV to append to; created (with parent directories) if
        missing.

    Each file's rows are labelled with its ``"tag"`` metadata, falling back to
    ``"<process>.<type>.<name>"``. Prints the elapsed wall time when done.
    """
    entrysteps = 100000

    output_lock = Lock()
    touch(output_file)
    # Only an empty file needs a header: appending to an earlier run's output
    # must not inject a second header line in the middle of the table.
    needs_header = os.path.getsize(output_file) == 0
    write_header = Value("b", int(needs_header))

    processes = []
    time_start = time.perf_counter()
    # The with-block guarantees the handle is closed even if spawning fails.
    with open(output_file, "a") as output_file_fd:
        try:
            for (process, thetype, name), meta, fullpath in yield_files_with_meta(datasets):
                label = meta.get("tag") or f"{process}.{thetype}.{name}"
                p = Process(target=process_file, args=(fullpath, label, output_file_fd, output_lock, entrysteps, write_header))
                processes.append(p)
                p.start()
        finally:
            # Always wait for the workers before the file handle is closed.
            for p in processes:
                p.join()

    time_now = time.perf_counter()
    print("Done!", "Time spent: ", time_now - time_start)

Finally, we apply all the feature extraction functions mentioned above to event batches. We store the outputs from all the MC files in the same output file with features. The production mechanism is now stored in a separate column as a textual label for each row. We spawn one thread per MC file: since most of the time is spent on IO operations, threads don't introduce overhead to the process.

In [20]:
# Run the extraction over every catalogued sample; all rows go to one TSV under output_path.
features_path = os.path.join(output_path, "hgg_features.tsv")
events_per_file(datasets, features_path)

Process:  GamGam
Type:  MC
File:  mc_341081.ttH125_gamgam.GamGam
root://eosuser.cern.ch//eos/user/a/ananiev/data/GamGam/MC/mc_341081.ttH125_gamgam.GamGam.root
File:  mc_343981.ggH125_gamgam.GamGam
root://eosuser.cern.ch//eos/user/a/ananiev/data/GamGam/MC/mc_343981.ggH125_gamgam.GamGam.root
File:  mc_345041.VBFH125_gamgam.GamGam
root://eosuser.cern.ch//eos/user/a/ananiev/data/GamGam/MC/mc_345041.VBFH125_gamgam.GamGam.root
File:  mc_345318.WpH125J_Wincl_gamgam.GamGam
root://eosuser.cern.ch//eos/user/a/ananiev/data/GamGam/MC/mc_345318.WpH125J_Wincl_gamgam.GamGam.root
File:  mc_345319.ZH125J_Zincl_gamgam.GamGam
root://eosuser.cern.ch//eos/user/a/ananiev/data/GamGam/MC/mc_345319.ZH125J_Zincl_gamgam.GamGam.root
Z Processing: 100000

VBF Processing: 100000

tt Processing: 100000

gg Processing: 100000

Wp Processing: 100000

Z Processing: 200000

Wp Processing: 200000

VBF Processing: 200000

tt Processing: 200000

Z Processing: 300000

gg Processing: 200000

VBF Processing: 300000

gg Proces