In [1]:
import copy
import glob
import json
import os
import re
import warnings

import awkward as ak
import hist
import numpy as np

# import matplotlib.pyplot as plt
# import mplhep as hep
# from cycler import cycler

# plt.style.use(hep.style.CMS)
# plt.rcParams.update({'font.size': 20})
# cmap_petroff10 = ["#3f90da", "#ffa90e", "#bd1f01", "#94a4a2", "#832db6", "#a96b59", "#e76300", "#b9ac70", "#717581", "#92dadd"]
# plt.rcParams.update({"axes.prop_cycle": cycler("color", cmap_petroff10)})

In [2]:
# Root of the merged HiggsDNA parquet production on the LPC EOS area.
lpc_fileprefix = "/eos/uscms/store/group/lpcdihiggsboost/tsievert/HiggsDNA_parquet/v3.1/"


def lpc_filegroup(s):
    """Return the per-year production directory name for year string `s`."""
    # Alternative directory without the trailing "_flag":
    #   f'Run3_{s}_mergedFullAllVars_MultiBDT_output'
    return f'Run3_{s}_mergedFullAllVars_MultiBDT_output_flag'


def _lpc_prefix(year, kind):
    """Join the production root, the year directory and `kind`, ending with a path separator."""
    return os.path.join(lpc_fileprefix, lpc_filegroup(year), kind, '')


MC_FILEPREFIX_22 = _lpc_prefix('2022', 'sim')
MC_FILEPREFIX_23 = _lpc_prefix('2023', 'sim')
MC_FILEPREFIX_24 = _lpc_prefix('2024', 'sim')
DATA_FILEPREFIX_22 = _lpc_prefix('2022', 'data')
DATA_FILEPREFIX_23 = _lpc_prefix('2023', 'data')
DATA_FILEPREFIX_24 = _lpc_prefix('2024', 'data')


# Each helper below maps an era string to the "<era>/<sample directory>"
# sub-path of one simulated process.
def signal_filegroup(s):
    """Sub-path of the GluGlutoHH (kl=1, kt=1, c2=0) sample for era `s`."""
    return f'{s}/GluGlutoHH_kl-1p00_kt-1p00_c2-0p00'


def ttH_filegroup(s):
    """Sub-path of the ttH sample for era `s`; the 'postEE' directory is spelled 'ttHToGG'."""
    if s == 'postEE':
        return f'{s}/ttHToGG'
    return f'{s}/ttHtoGG'


def bbH_filegroup(s):
    """Sub-path of the bbH sample for era `s`."""
    return f'{s}/bbHtoGG'


def VH_filegroup(s):
    """Sub-path of the VH sample for era `s`."""
    return f'{s}/VHtoGG'


def ggFH_filegroup(s):
    """Sub-path of the GluGluH sample for era `s`."""
    return f'{s}/GluGluHtoGG'


def VBFH_filegroup(s):
    """Sub-path of the VBFH sample for era `s`; the 'postEE' directory is spelled 'VBFHToGG'."""
    if s == 'postEE':
        return f'{s}/VBFHToGG'
    return f'{s}/VBFHtoGG'


# Glob pattern selecting the parquet files to read inside each sample directory.
END_FILEPATH = '*boostedCat.parquet'

# (MC prefix, era) pairs that are globbed, in this order, for every simulated sample.
_MC_ERAS = [
    (MC_FILEPREFIX_22, 'preEE'),
    (MC_FILEPREFIX_22, 'postEE'),
    (MC_FILEPREFIX_23, 'preBPix'),
    (MC_FILEPREFIX_23, 'postBPix'),
]

# FILEPATHS[sample name] is a list with one entry per era; each entry is the
# list of files matched by glob for that era.
FILEPATHS = {}
for name, filegroup in {
    'ggF HH': signal_filegroup,
    # 'ttH': ttH_filegroup, 'bbH': bbH_filegroup,
    # 'VH': VH_filegroup, 'ggF H': ggFH_filegroup, 'VBF H': VBFH_filegroup,
}.items():
    FILEPATHS[name] = [
        glob.glob(os.path.join(mc_prefix, filegroup(era), 'nominal', END_FILEPATH))
        for mc_prefix, era in _MC_ERAS
    ]

# Data files sit directly under the per-year data prefix.
FILEPATHS['Data'] = [
    glob.glob(os.path.join(data_prefix, END_FILEPATH))
    for data_prefix in (DATA_FILEPREFIX_22, DATA_FILEPREFIX_23)
]

# Names of the MultiBDT output dimensions, in column order.
order = ['ggF HH', 'ttH + bbH', 'VH', 'non-res + ggFH + VBFH']

In [3]:
# Reference event lists to cross-check against. Each CSV is read skipping its
# first line and keeping only columns 1 and 2, which are used below as the
# lumi and event identifiers respectively.
# ndmin=2 keeps the result two-dimensional even when a file holds a single
# data row; without it np.loadtxt returns a 1-D array and the `[:, 0]` /
# `[:, 1]` slices below raise IndexError.
brunella_signal = np.loadtxt(
    '/uscms/home/tsievert/nobackup/XHYbbgg/HHtobbyy/signal_pass_boosted.csv',
    delimiter=',', skiprows=1, usecols=(1,2), ndmin=2,
)
brunella_data = np.loadtxt(
    '/uscms/home/tsievert/nobackup/XHYbbgg/HHtobbyy/pass_boosted_data.csv',
    delimiter=',', skiprows=1, usecols=(1,2), ndmin=2,
)

# First kept column: lumi values, keyed by sample name.
brunella_pass_boosted_lumis = {
    'ggF HH': brunella_signal[:, 0], 'Data': brunella_data[:, 0]
}
# Second kept column: event values, keyed by sample name.
brunella_pass_boosted_events = {
    'ggF HH': brunella_signal[:, 1], 'Data': brunella_data[:, 1]
}

In [12]:
def get_ttH_score(multibdt_output):
    """Return column 0 divided by the sum of columns 0 and 1, row by row.

    `multibdt_output` is indexed as a 2-D array (rows, output dimensions).
    NOTE(review): columns presumably follow the notebook's `order` list
    (0 = 'ggF HH', 1 = 'ttH + bbH') -- confirm against the caller.
    """
    first_col = multibdt_output[:, 0]
    second_col = multibdt_output[:, 1]
    return first_col / (first_col + second_col)

def get_QCD_score(multibdt_output):
    """Return column 0 divided by the sum of columns 0, 2 and 3, row by row.

    `multibdt_output` is indexed as a 2-D array (rows, output dimensions).
    NOTE(review): columns presumably follow the notebook's `order` list
    (0 = 'ggF HH', 2 = 'VH', 3 = 'non-res + ggFH + VBFH') -- confirm against
    the caller.
    """
    first_col = multibdt_output[:, 0]
    third_col = multibdt_output[:, 2]
    fourth_col = multibdt_output[:, 3]
    return first_col / (first_col + third_col + fourth_col)

# Category thresholds, one row per category. In pass_category the first entry
# of a row is compared with the ttH score and the second with the QCD score.
CATS = [
    [0.979, 0.9983],  # index 0
    [0.96, 0.9951],   # index 1
    [0.89, 0.9868],   # index 2
]

def pass_category(multibdt_output, cat_i, verbose=True):
    """Return a boolean mask of the rows that fall into category `cat_i`.

    A row is selected when
      * its ttH score is above ``CATS[cat_i][0]``,
      * its QCD score is above ``CATS[cat_i][1]``, and
      * its QCD score does not exceed the QCD threshold of the previous
        category (``CATS[cat_i - 1][1]``); category 0 has no previous
        category, so its upper bound is 1.

    Parameters
    ----------
    multibdt_output : 2-D array, passed to get_ttH_score / get_QCD_score.
    cat_i : int, 0-based index into CATS.
    verbose : bool, default True. When True, print the indices passing each
        individual cut and the min/max ttH score (the original debugging
        output); pass False to silence it.

    Returns
    -------
    Boolean array with one entry per row of `multibdt_output`.
    """
    # Compute each score once instead of once per use.
    ttH_score = get_ttH_score(multibdt_output)
    QCD_score = get_QCD_score(multibdt_output)
    ttH_cut, QCD_cut = CATS[cat_i]

    pass_ttH = ttH_score > ttH_cut
    pass_QCD = QCD_score > QCD_cut

    # Categories are 0-indexed, so every category after the first (cat_i > 0)
    # is capped by its predecessor's QCD threshold. The former `cat_i > 1`
    # test left category 1 uncapped, letting it overlap with category 0.
    prev_QCD_cut = CATS[cat_i - 1][1] if cat_i > 0 else 1
    pass_prevQCD = QCD_score <= prev_QCD_cut

    if verbose:
        print(np.nonzero(pass_ttH))
        print(np.max(ttH_score))
        print(np.min(ttH_score))
        print(np.nonzero(pass_QCD))
        print(np.nonzero(pass_prevQCD))

    return pass_ttH & pass_QCD & pass_prevQCD

# Lumi and event values of the entries passing the boosted selection, keyed by
# sample name.
pass_boosted_lumis, pass_boosted_events = {}, {}
for name, filepaths in FILEPATHS.items():
    # Only the signal sample and the data are compared.
    if name not in ('ggF HH', 'Data'):
        continue

    # One awkward array per FILEPATHS entry, merged into a single sample.
    sample = ak.concatenate([ak.from_parquet(filepath) for filepath in filepaths])

    # Boosted selection: boosted flag set, mass strictly inside (100, 180),
    # and both mvaID values above -0.7.
    in_mass_window = (sample['mass'] > 100) & (sample['mass'] < 180)
    both_mvaID_ok = (sample['lead_mvaID'] > -0.7) & (sample['sublead_mvaID'] > -0.7)
    pass_boosted = (sample['is_boosted'] == 1) & in_mass_window & both_mvaID_ok
    # Extra requirements that are currently switched off:
    #   & (sample['weight'] > 0)
    #   & (sample['nonResReg_DNNpair_dijet_mass_DNNreg'] > 70) & (sample['nonResReg_DNNpair_dijet_mass_DNNreg'] < 190)

    pass_boosted_lumis[name] = ak.to_numpy(sample['lumi'][pass_boosted])
    pass_boosted_events[name] = ak.to_numpy(sample['event'][pass_boosted])

    heavy_rule = '=' * 60
    print('\n'.join([heavy_rule, heavy_rule, heavy_rule]))
    print(name)
    print('-' * 60)
    print(f"Num {name} that pass boosted: {ak.sum(pass_boosted, axis=0)}")

    # Per-category MultiBDT selection, currently disabled:
    # for cat_i in range(0, 3):
    #     MultiBDT_output = np.array([
    #         sample['MultiBDT_output_' + "".join(output_dim.split())] for output_dim in order
    #     ]).T
    #     print(np.max(MultiBDT_output[:, 0]))
    #     print(np.min(MultiBDT_output[:, 0]))
    #     passed_cat = pass_category(MultiBDT_output, cat_i)
    #     print(np.nonzero(passed_cat))
    #     pass_cat = (
    #         pass_category(MultiBDT_output, cat_i)
    #         & sample['MultiBDT_flag']
    #     )
    #     pass_both = (pass_cat & pass_boosted)

    #     print('-'*60)
    #     print(f"Num {name} that pass cat {cat_i+1}: {ak.sum(pass_cat, axis=0)}")
    #     print(f"Num {name} that pass both: {ak.sum(pass_both, axis=0)}")

ggF HH
------------------------------------------------------------
Num ggF HH that pass boosted: 13538
Data
------------------------------------------------------------
Num Data that pass boosted: 3


In [18]:
def _counts_match_on_shared(unique_a, counts_a, unique_b, counts_b):
    """Return True when every value present in both samples has the same multiplicity.

    `unique_a` / `unique_b` are the sorted unique values returned by np.unique
    and `counts_a` / `counts_b` the matching counts. Only values common to both
    samples are compared, so samples with different sets of values no longer
    raise IndexError or compare misaligned entries; the values missing from one
    side are reported separately via np.setdiff1d.
    """
    _, idx_a, idx_b = np.intersect1d(
        unique_a, unique_b, assume_unique=True, return_indices=True
    )
    return bool(np.array_equal(counts_a[idx_a], counts_b[idx_b]))


# NOTE(review): lumi and event values are compared independently here, not as
# (lumi, event) pairs -- confirm this is the intended level of matching.
for key in brunella_pass_boosted_lumis.keys():
    print("="*60)
    print(key)
    print(f"Num FNAL-Caltech-Purdue {key} events that pass boosted: {np.size(pass_boosted_lumis[key])}")
    print(f"Num SnT {key} events that pass boosted: {np.size(brunella_pass_boosted_lumis[key])}")

    # np.unique returns the distinct values already sorted, plus how often each occurs.
    # The "repeated" figures below count distinct values that occur at least twice.
    unique_FCP_lumis, unique_FCP_lumi_counts = np.unique(pass_boosted_lumis[key], return_counts=True)
    print(f"Num F-C-P events with repeated lumis: {np.sum(unique_FCP_lumi_counts >= 2)}")
    unique_SnT_lumis, unique_SnT_lumi_counts = np.unique(brunella_pass_boosted_lumis[key], return_counts=True)
    print(f"Num SnT events with repeated lumis: {np.sum(unique_SnT_lumi_counts >= 2)}")

    unique_FCP_events, unique_FCP_event_counts = np.unique(pass_boosted_events[key], return_counts=True)
    print(f"Num F-C-P events with repeated events: {np.sum(unique_FCP_event_counts >= 2)}")
    unique_SnT_events, unique_SnT_event_counts = np.unique(brunella_pass_boosted_events[key], return_counts=True)
    print(f"Num SnT events with repeated events: {np.sum(unique_SnT_event_counts >= 2)}")

    # Multiplicity agreement on the shared values; printed after each
    # set-difference report to keep the original output layout.
    lumi_counts_match = _counts_match_on_shared(
        unique_FCP_lumis, unique_FCP_lumi_counts, unique_SnT_lumis, unique_SnT_lumi_counts
    )
    event_counts_match = _counts_match_on_shared(
        unique_FCP_events, unique_FCP_event_counts, unique_SnT_events, unique_SnT_event_counts
    )

    print(f"Lumis in FNAL-Caltech-Purdue boosted that are not in SnT boosted: \n{np.setdiff1d(unique_FCP_lumis, unique_SnT_lumis)}")
    print(f"    Counts of repeated Lumis same in FCP and SnT? {lumi_counts_match}")

    print(f"Lumis in SnT boosted that are not in FNAL-Caltech-Purdue boosted: \n{np.setdiff1d(unique_SnT_lumis, unique_FCP_lumis)}")
    print(f"    Counts of repeated Lumis same in FCP and SnT? {lumi_counts_match}")

    print(f"Events in FNAL-Caltech-Purdue boosted that are not in SnT boosted: \n{np.setdiff1d(unique_FCP_events, unique_SnT_events)}")
    print(f"    Counts of repeated Events same in FCP and SnT? {event_counts_match}")

    print(f"Events in SnT boosted that are not in FNAL-Caltech-Purdue boosted: \n{np.setdiff1d(unique_SnT_events, unique_FCP_events)}")
    print(f"    Counts of repeated Events same in FCP and SnT? {event_counts_match}")

ggF HH
Num FNAL-Caltech-Purdue ggF HH events that pass boosted: 13538
Num SnT ggF HH events that pass boosted: 13538
Num F-C-P events with repeated lumis: 2362
Num SnT events with repeated lumis: 2362
Num F-C-P events with repeated events: 181
Num SnT events with repeated events: 181
Lumis in FNAL-Caltech-Purdue boosted that are not in SnT boosted: 
[]
    Counts of repeated Lumis same in FCP and SnT? True
Lumis in SnT boosted that are not in FNAL-Caltech-Purdue boosted: 
[]
    Counts of repeated Lumis same in FCP and SnT? True
Events in FNAL-Caltech-Purdue boosted that are not in SnT boosted: 
[]
    Counts of repeated Events same in FCP and SnT? True
Events in SnT boosted that are not in FNAL-Caltech-Purdue boosted: 
[]
    Counts of repeated Events same in FCP and SnT? True
Data
Num FNAL-Caltech-Purdue Data events that pass boosted: 3
Num SnT Data events that pass boosted: 3
Num F-C-P events with repeated lumis: 0
Num SnT events with repeated lumis: 0
Num F-C-P events with repeated