In [1]:
# Stdlib packages
import copy
import datetime
import glob
import json
import os
import re
from pathlib import Path

# Common Py packages
import numpy as np
from matplotlib import pyplot as plt
from prettytable import PrettyTable

# HEP packages
import gpustat
import h5py
import hist
import mplhep as hep
import xgboost as xgb
from cycler import cycler


# ML packages
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import auc, roc_curve

# Module packages
from data_processing_BDT import process_data

# Print the current GPU occupancy before anything is trained.
gpustat.print_gpustat()

# Colour palette installed below as the default matplotlib colour cycle.
cmap_petroff10 = [
    "#3f90da", "#ffa90e", "#bd1f01", "#94a4a2", "#832db6",
    "#a96b59", "#e76300", "#b9ac70", "#717581", "#92dadd",
]

# Apply the CMS style first, then override the font size and colour cycle on top of it.
plt.style.use(hep.style.CMS)
plt.rcParams.update({
    'font.size': 20,
    "axes.prop_cycle": cycler("color", cmap_petroff10),
})

cmslpcgpu3.fnal.gov      Thu Nov 14 16:05:22 2024  555.42.06
[0] Tesla P100-PCIE-12GB | 60°C, 100 % | 11104 / 12288 MB | ckapsiak(11102M)


In [None]:
lpc_fileprefix = "/eos/uscms/store/group/lpcdihiggsboost/tsievert/HiggsDNA_parquet/v1"


def _era_globs(*sample_dirs):
    """Return the preEE then postEE nominal glob pattern for each sample directory, in the order given."""
    return [
        f"{lpc_fileprefix}/Run3_2022{era}_merged_v3/{sample_dir}/nominal/*"
        for sample_dir in sample_dirs
        for era in ("preEE", "postEE")
    ]


# Class name -> list of glob patterns whose files make up that class.
FILEPATHS_DICT = {
    'ggF HH': _era_globs("GluGluToHH"),
    # 'VBF HH': _era_globs("VBFHHto2B2G_CV_1_C2V_1_C3_1"),
    'ttH': _era_globs("ttHToGG"),
    'non-res + single-H': _era_globs(
        # non-Resonant #
        "GGJets",        # GG + 3Jets
        "GJetPt20To40",  # GJet pT 20-40
        "GJetPt40",      # GJet pT 40-inf
        # single-H #
        "GluGluHToGG",   # ggF H
        "VBFHToGG",      # VBF H
        "VHToGG",        # VH
    ),
    # 'VH': _era_globs("VHToGG"),
}

# Output location: <cwd>/MultiClassBDT_model_outputs/<VERSION>/<VARS>
CURRENT_DIRPATH = str(Path().absolute())
VERSION = 'v1'
MOD_VALS = (5, 5)
VARS = 'nonres_and_ttH_vars'
# CURRENT_TIME = '2024-11-08_13-13-20'
# OUTPUT_DIRPATH = os.path.join(CURRENT_DIRPATH, f"MultiClassBDT_model_outputs/{VERSION}/{VARS}", CURRENT_TIME)
OUTPUT_DIRPATH = os.path.join(
    CURRENT_DIRPATH, f"MultiClassBDT_model_outputs/{VERSION}/{VARS}"
)

if not os.path.exists(OUTPUT_DIRPATH):
    os.makedirs(OUTPUT_DIRPATH)

# Run-level settings.
SEED = 21
OPTIMIZE_SPACE = False
NUM_EPOCHS = 150

In [None]:
def training_weights(event_weights, labels, order=None):
    """Rescale signal event weights so their sum equals the summed background weight.

    Parameters
    ----------
    event_weights : 1-D numpy array
        Per-event weights, aligned row-by-row with ``labels``.
    labels : 2-D numpy array
        Per-event class indicators, indexed as ``labels[:, class_index]``; an entry
        equal to 1 marks membership of that class and 0 marks non-membership.
    order : sequence of str, optional
        Class names in column order. The signal column is the first name matching
        the pattern ``'ggF HH'``. When omitted, column 0 is taken as the signal.

    Returns
    -------
    numpy array
        Absolute value of the rescaled weights (rows whose signal entry is 0 keep
        their original weight, all other rows are multiplied by bkg_sum / sig_sum).

    Raises
    ------
    ValueError
        If ``order`` is given but none of its entries matches ``'ggF HH'``.
        Previously this case silently fell through to column ``-1`` (the last
        class) and rescaled the wrong sample.
    """
    if order is None:
        sig_idx = 0
    else:
        sig_idx = next(
            (i for i, sample_name in enumerate(order) if re.search('ggF HH', sample_name) is not None),
            None,
        )
        if sig_idx is None:
            raise ValueError(
                f"No sample matching 'ggF HH' found in order={list(order)!r}; cannot identify the signal class."
            )

    sig_column = labels[:, sig_idx]
    sig_sum = np.sum(event_weights[sig_column == 1])
    bkg_sum = np.sum(event_weights[sig_column == 0])

    # Factor that makes the total signal weight equal to the total background weight.
    sig_scale_factor = bkg_sum / sig_sum

    scaled_weights = np.where(
        sig_column == 0,
        event_weights,  # if bkg, do nothing
        event_weights * sig_scale_factor  # if sig, rescale to equal sum of all bkgs
    )

    # The sums above use the signed weights; only the returned weights are made non-negative.
    return np.abs(scaled_weights)

def xgb_labels(labels):
    """Turn per-event class-indicator rows into one number per event.

    Each row of ``labels`` is multiplied element-wise by the column indices
    ``0 .. n_classes-1`` and summed, so a one-hot row yields the index of its
    hot column.
    """
    n_classes = np.shape(labels)[1]
    column_indices = np.arange(n_classes)  # broadcasts across every row
    return np.sum(column_indices * labels, axis=1)

In [None]:
# Class order; position i in this list is class index i everywhere below.
order = ['ggF HH', 'ttH', 'non-res + single-H']

# Only ask process_data to save when no CURRENT_TIME has been defined in this session.
save_processed = 'CURRENT_TIME' not in globals()

(
    data_df_dict, data_test_df_dict,
    data_hlf_dict, label_dict,
    data_hlf_test_dict, label_test_dict,
    hlf_vars_columns_dict,
    data_aux_dict, data_test_aux_dict
) = process_data(
    FILEPATHS_DICT, OUTPUT_DIRPATH, order=order, seed=SEED, mod_vals=MOD_VALS, k_fold_test=True,
    save=save_processed
)

# One "fold_<n>" key per entry of the test aux dict.
fold_keys = [f"fold_{fold_idx}" for fold_idx in range(len(data_test_aux_dict))]

# Integer class labels per fold (train/val and test).
xgb_label_dict = {
    fold_key: copy.deepcopy(xgb_labels(label_dict[fold_key])) for fold_key in fold_keys
}
xgb_label_test_dict = {
    fold_key: copy.deepcopy(xgb_labels(label_test_dict[fold_key])) for fold_key in fold_keys
}

# Training weights are rescaled by training_weights(); test weights are the raw eventWeight column.
weight_train_dict = {
    fold_key: copy.deepcopy(
        training_weights(
            data_aux_dict[fold_key].loc[:, "eventWeight"].to_numpy(),
            label_dict[fold_key],
            order=order,
        )
    )
    for fold_key in fold_keys
}
weight_test_dict = {
    fold_key: copy.deepcopy(data_test_aux_dict[fold_key].loc[:, "eventWeight"].to_numpy())
    for fold_key in fold_keys
}

Data HLF: (1093974, 48)
num ggF HH = 136530
num ttH = 277205
num non-res + single-H = 680239
Data HLF test: (273612, 48)
num ggF HH = 34224
num ttH = 69297
num non-res + single-H = 170091
Data HLF: (1094678, 48)
num ggF HH = 136466
num ttH = 277452
num non-res + single-H = 680760
Data HLF test: (272908, 48)
num ggF HH = 34288
num ttH = 69050
num non-res + single-H = 169570
Data HLF: (1093463, 48)
num ggF HH = 136638
num ttH = 276627
num non-res + single-H = 680198
Data HLF test: (274123, 48)
num ggF HH = 34116
num ttH = 69875
num non-res + single-H = 170132
Data HLF: (1093487, 48)
num ggF HH = 136671
num ttH = 277054
num non-res + single-H = 679762
Data HLF test: (274099, 48)
num ggF HH = 34083
num ttH = 69448
num non-res + single-H = 170568
Data HLF: (1094742, 48)
num ggF HH = 136711
num ttH = 277670
num non-res + single-H = 680361
Data HLF test: (272844, 48)
num ggF HH = 34043
num ttH = 68832
num non-res + single-H = 169969


In [7]:
# Choose the feature source once (it does not depend on the fold): the dataframes
# converted to numpy when VARS contains 'no_std', otherwise the *_hlf_* dicts.
if re.search('no_std', VARS) is not None:
    print('no standardization')
    train_val_data_dict = {key: value.to_numpy() for key, value in data_df_dict.items()}
    test_data_dict = {key: value.to_numpy() for key, value in data_test_df_dict.items()}
else:
    train_val_data_dict = data_hlf_dict
    test_data_dict = data_hlf_test_dict

bdt_train_dict, bdt_val_dict, bdt_test_dict = {}, {}, {}
for fold_idx in range(len(data_df_dict)):
    fold_key = f"fold_{fold_idx}"
    feature_names = list(hlf_vars_columns_dict[fold_key])

    # 80/20 train/validation split; seeded from SEED (was a hard-coded 21) so the
    # split follows the notebook-wide seed.
    (
        X_train, X_val, y_train, y_val, weight_train, weight_val
    ) = train_test_split(
        train_val_data_dict[fold_key], xgb_label_dict[fold_key],
        weight_train_dict[fold_key],
        test_size=0.2, random_state=SEED
    )

    # -999.0 is declared to XGBoost as the missing-value marker.
    bdt_train_dict[fold_key] = xgb.DMatrix(
        data=X_train, label=y_train,
        weight=weight_train,
        missing=-999.0, feature_names=feature_names
    )
    bdt_val_dict[fold_key] = xgb.DMatrix(
        data=X_val, label=y_val,
        weight=weight_val,
        missing=-999.0, feature_names=feature_names
    )

    # The test set uses the absolute value of the un-rescaled event weights.
    bdt_test_dict[fold_key] = xgb.DMatrix(
        data=test_data_dict[fold_key], label=xgb_label_test_dict[fold_key],
        weight=np.abs(weight_test_dict[fold_key]),
        missing=-999.0, feature_names=feature_names
    )

    # np.count_nonzero instead of the builtin sum(): same counts without iterating
    # large arrays element by element in Python.
    test_labels = np.asarray(label_test_dict[fold_key])
    print(f"fold {fold_idx}")
    print(f"Num train: {len(y_train)} -> {np.count_nonzero(y_train == 0)} sig & {np.count_nonzero(y_train == 1)} ttH bkg & {np.count_nonzero(y_train == 2)} non-res + single-H bkg")
    print(f"Num val: {len(y_val)} -> {np.count_nonzero(y_val == 0)} sig & {np.count_nonzero(y_val == 1)} ttH bkg & {np.count_nonzero(y_val == 2)} non-res + single-H bkg")
    print(f"Num test: {len(test_labels)} -> {np.count_nonzero(test_labels[:, 0] == 1)} sig & {np.count_nonzero(test_labels[:, 1] == 1)} ttH bkg & {np.count_nonzero(test_labels[:, 2] == 1)} non-res + single-H bkg")
    print('='*60)

    

fold 0
Num train: 875179 -> 109305 sig & 221877 ttH bkg & 543997 non-res + single-H bkg
Num val: 218795 -> 27225 sig & 55328 ttH bkg & 136242 non-res + single-H bkg
Num test: 273612 -> 34224 sig & 69297 ttH bkg & 170091 non-res + single-H bkg
fold 1
Num train: 875742 -> 108939 sig & 222057 ttH bkg & 544746 non-res + single-H bkg
Num val: 218936 -> 27527 sig & 55395 ttH bkg & 136014 non-res + single-H bkg
Num test: 272908 -> 34288 sig & 69050 ttH bkg & 169570 non-res + single-H bkg
fold 2
Num train: 874770 -> 109416 sig & 221546 ttH bkg & 543808 non-res + single-H bkg
Num val: 218693 -> 27222 sig & 55081 ttH bkg & 136390 non-res + single-H bkg
Num test: 274123 -> 34116 sig & 69875 ttH bkg & 170132 non-res + single-H bkg
fold 3
Num train: 874789 -> 109371 sig & 221675 ttH bkg & 543743 non-res + single-H bkg
Num val: 218698 -> 27300 sig & 55379 ttH bkg & 136019 non-res + single-H bkg
Num test: 274099 -> 34083 sig & 69448 ttH bkg & 170568 non-res + single-H bkg
fold 4
Num train: 875793 -> 

In [None]:
# https://stackoverflow.com/questions/57986259/multiclass-classification-with-xgboost-classifier
# https://forecastegy.com/posts/xgboost-multiclass-classification-python/
# https://indico.cern.ch/event/915265/contributions/3848138/attachments/2048174/3432202/kunlinRan_bbyy_20200531.pdf


param = {
    # Booster parameters
    'eta': 0.05,               # learning rate
    'max_depth': 5,            # maximum depth of a tree
    'subsample': 0.6,          # fraction of events to train tree on
    'colsample_bytree': 0.4,   # fraction of features to train tree on
    'num_class': np.shape(label_dict['fold_0'])[1],  # num classes for multi-class training

    # Learning task parameters
    'objective': 'multi:softprob',  # objective function
    'eval_metric': 'merror',        # evaluation metric for cross validation
}
# A dict cannot hold 'eval_metric' twice, so switch to a list of (key, value)
# pairs and append the second metric.
param = [*param.items(), ('eval_metric', 'mlogloss')]

num_trees = 200  # number of trees to make

In [None]:
CURRENT_TIME = datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
# Build the run directory from the config constants instead of appending to
# OUTPUT_DIRPATH: appending made this cell non-idempotent, so re-running it
# nested a new timestamp directory inside the previous one.
OUTPUT_DIRPATH = os.path.join(
    CURRENT_DIRPATH, f"MultiClassBDT_model_outputs/{VERSION}/{VARS}", CURRENT_TIME
)
os.makedirs(OUTPUT_DIRPATH, exist_ok=True)

for fold_idx in range(len(bdt_train_dict)):
    fold_key = f"fold_{fold_idx}"
    print(f"fold {fold_idx}")

    # Train bdt
    # 'val' is deliberately last in the watch list: with several evals, xgb.train
    # bases early stopping on the last entry.
    evallist = [
        (bdt_train_dict[fold_key], 'train'),
        (bdt_test_dict[fold_key], 'test'),
        (bdt_val_dict[fold_key], 'val'),
    ]
    booster = xgb.train(
        param, bdt_train_dict[fold_key], num_boost_round=num_trees,
        evals=evallist, early_stopping_rounds=7, verbose_eval=True
    )
    booster.save_model(os.path.join(OUTPUT_DIRPATH, f'{CURRENT_TIME}_BDT_fold{fold_idx}.model'))

    # Print perf on test dataset
    print(booster.eval(bdt_test_dict[fold_key]))
    print('='*100)

fold 0
[0]	train-merror:0.16728	train-mlogloss:1.08998	test-merror:0.13655	test-mlogloss:1.08899	val-merror:0.16609	val-mlogloss:1.09013
[1]	train-merror:0.17110	train-mlogloss:1.08303	test-merror:0.17512	test-mlogloss:1.08285	val-merror:0.17000	val-mlogloss:1.08313
[2]	train-merror:0.13984	train-mlogloss:1.07403	test-merror:0.14929	test-mlogloss:1.07403	val-merror:0.13843	val-mlogloss:1.07406
[3]	train-merror:0.14257	train-mlogloss:1.06603	test-merror:0.15417	test-mlogloss:1.06658	val-merror:0.14074	val-mlogloss:1.06608
[4]	train-merror:0.14571	train-mlogloss:1.05972	test-merror:0.16261	test-mlogloss:1.06126	val-merror:0.14479	val-mlogloss:1.05974
[5]	train-merror:0.13733	train-mlogloss:1.05088	test-merror:0.14804	test-mlogloss:1.05241	val-merror:0.13652	val-mlogloss:1.05092
[6]	train-merror:0.13990	train-mlogloss:1.04397	test-merror:0.14605	test-mlogloss:1.04580	val-merror:0.13895	val-mlogloss:1.04403
[7]	train-merror:0.13864	train-mlogloss:1.03661	test-merror:0.14508	test-mlogloss:1

In [None]:
# OUTPUT_DIRPATH = os.path.join(OUTPUT_DIRPATH, CURRENT_TIME)
# if not os.path.exists(OUTPUT_DIRPATH):
#     os.makedirs(OUTPUT_DIRPATH)

# Per-fold predictions and one-vs-rest ROC summaries for the train, validation
# and test sets. Each list gets one entry per fold; the fprs/tprs/thresholds
# entries are nested lists laid out as (len(base_tpr), len(order)).
BDT_perf = {
    'preds': [], 'fprs': [], 'tprs': [], 'thresholds': [], 'areas': [],
    'train_preds': [], 'train_fprs': [], 'train_tprs': [], 'train_thresholds': [], 'train_areas': [],
    'val_preds': [], 'val_fprs': [], 'val_tprs': [], 'val_thresholds': [], 'val_areas': [],
    'class_order': copy.deepcopy(order),
}
base_tpr = np.linspace(0, 1, 5000)  # copied from IN evaluate.py file
for fold_idx in range(len(bdt_train_dict)):
    fold_key = f"fold_{fold_idx}"
    booster = xgb.Booster(param)
    booster.load_model(os.path.join(OUTPUT_DIRPATH, f'{CURRENT_TIME}_BDT_fold{fold_idx}.model'))

    # The test set ('' prefix) must stay last: the HDF5 dump below uses the
    # curves left over from the final iteration of these loops.
    for pred_type, dataset in [
        ('train_', bdt_train_dict[fold_key]),
        ('val_', bdt_val_dict[fold_key]),
        ('', bdt_test_dict[fold_key])
    ]:
        # Predict once and keep the array; previously the stored list was converted
        # back to an array for every class. float64 matches the values that the
        # list round-trip produced.
        preds = booster.predict(dataset).astype(np.float64)
        truths = dataset.get_label()
        BDT_perf[pred_type + 'preds'].append(preds.tolist())

        class_fprs, class_tprs, class_thresholds, class_areas = [], [], [], []
        for i, sample_name in enumerate(order):
            # One-vs-rest truth for class i.
            class_truths = np.where(truths == i, 1, 0)
            class_preds = preds[:, i]

            # Resample the ROC curve onto the common TPR grid so folds can be averaged.
            fpr_bdt, tpr_bdt, threshold_bdt = roc_curve(class_truths, class_preds)
            fpr_bdt = np.interp(base_tpr, tpr_bdt, fpr_bdt)
            threshold_bdt = np.interp(base_tpr, tpr_bdt, threshold_bdt)

            class_fprs.append(fpr_bdt)
            class_tprs.append(base_tpr)
            class_thresholds.append(threshold_bdt)
            class_areas.append(float(auc(fpr_bdt, base_tpr)))

        BDT_perf[pred_type + 'fprs'].append(np.column_stack(class_fprs).tolist())
        BDT_perf[pred_type + 'tprs'].append(np.column_stack(class_tprs).tolist())
        BDT_perf[pred_type + 'thresholds'].append(np.column_stack(class_thresholds).tolist())
        BDT_perf[pred_type + 'areas'].append(class_areas)

    # fpr_bdt / threshold_bdt are the test-set curves of the last class in `order`,
    # both sampled on base_tpr. TPR is therefore written as base_tpr; the raw
    # roc_curve tpr written before had a different length and grid.
    with h5py.File(os.path.join(OUTPUT_DIRPATH, CURRENT_TIME+f"_BDT_ROC_fold{fold_idx}.h5"), "w") as out:
        out['FPR'] = fpr_bdt
        out['TPR'] = base_tpr
        out['Thresholds'] = threshold_bdt

with open(os.path.join(OUTPUT_DIRPATH, CURRENT_TIME+"_BDT_perf.json"), 'w') as f:
    json.dump(BDT_perf, f)

num ggF HH = (109096,), num not ggF HH = (1141457,)
num ttH = (222178,), num not ttH = (1028375,)
num non-res + single-H = (919279,), num not non-res + single-H = (331274,)
num ggF HH = (27434,), num not ggF HH = (285205,)
num ttH = (55027,), num not ttH = (257612,)
num non-res + single-H = (230178,), num not non-res + single-H = (82461,)
num ggF HH = (34224,), num not ggF HH = (357561,)
num ttH = (69297,), num not ttH = (322488,)
num non-res + single-H = (288264,), num not non-res + single-H = (103521,)
num ggF HH = (109298,), num not ggF HH = (1142038,)
num ttH = (221622,), num not ttH = (1029714,)
num non-res + single-H = (920416,), num not non-res + single-H = (330920,)
num ggF HH = (27168,), num not ggF HH = (285667,)
num ttH = (55830,), num not ttH = (257005,)
num non-res + single-H = (229837,), num not non-res + single-H = (82998,)
num ggF HH = (34288,), num not ggF HH = (356518,)
num ttH = (69050,), num not ttH = (321756,)
num non-res + single-H = (287468,), num not non-res + s

In [None]:
def plot_filepath(plot_name, plot_dirpath, plot_prefix, plot_postfix, format='png'):
    """Build the output path ``<plot_dirpath>/[<prefix>_]<plot_name>[_<postfix>].<format>``.

    Parameters
    ----------
    plot_name : str
        Base name of the plot file (no extension).
    plot_dirpath : str
        Directory the file lives in.
    plot_prefix, plot_postfix : str
        Optional decorations; an empty string adds nothing (and no separator).
    format : str
        File extension without the leading dot.

    Returns
    -------
    str
        The joined file path.
    """
    prefix_part = plot_prefix + '_' if plot_prefix != '' else ''
    # The separator goes BEFORE the postfix; it used to be appended after it,
    # producing names such as "namepost_.png".
    postfix_part = '_' + plot_postfix if plot_postfix != '' else ''

    return os.path.join(plot_dirpath, prefix_part + plot_name + postfix_part + f'.{format}')

def plot_rocs(
    fprs, tprs, labels, plot_name, plot_dirpath,
    plot_prefix='', plot_postfix='', close=True, log=None
):
    """Draw one ROC curve per (fpr, tpr, label) triple and save it as PNG and PDF.

    Parameters
    ----------
    fprs, tprs : iterables of array-like
        x (background contamination) and y (signal efficiency) values per curve.
    labels : iterable of str
        Legend entry per curve. A label containing 'IN' is drawn solid, one
        containing 'BDT' dashed, anything else dotted.
    plot_name, plot_dirpath, plot_prefix, plot_postfix
        Forwarded to ``plot_filepath`` to build the output file names.
    close : bool
        Close the figure after saving.
    log : str, optional
        Axes to draw in log scale: include 'x' and/or 'y' (e.g. 'x', 'y', 'xy').
    """
    plt.figure(figsize=(9,7))

    for fpr, tpr, label in zip(fprs, tprs, labels):
        if re.search('IN', label) is not None:
            linestyle = 'solid'
        elif re.search('BDT', label) is not None:
            linestyle = 'dashed'
        else:
            linestyle = 'dotted'
        plt.plot(fpr, tpr, label=label, linestyle=linestyle)

    plt.legend(bbox_to_anchor=(1, 1))
    plt.xlabel('Background contamination')
    plt.ylabel('Signal efficiency')
    if log is not None:
        # Two independent checks: with the former if/elif, log='xy' only ever
        # switched the x axis to log scale.
        if re.search('x', log) is not None:
            plt.xscale('log')
        if re.search('y', log) is not None:
            plt.yscale('log')

    for file_format in ('png', 'pdf'):
        plt.savefig(
            plot_filepath(plot_name, plot_dirpath, plot_prefix, plot_postfix, format=file_format),
            bbox_inches='tight'
        )
    if close:
        plt.close()

def plot_output_scores(
    sigs_and_bkgs, order, plot_name, plot_dirpath,
    plot_prefix='', plot_postfix='', bins=50, weights=None, log=False
):
    """Histogram the output scores of every sample in ``order`` on [0, 1] and save as PNG and PDF.

    ``sigs_and_bkgs`` maps sample name -> scores and ``weights`` (optional) maps
    sample name -> per-event weights. With weights the histograms are drawn with
    error bars and un-normalised; without weights every event counts as 1 and
    the histograms are drawn as densities. ``log`` switches the y axis to log scale.
    """
    weighted = weights is not None

    plt.figure(figsize=(9,7))

    score_axis = hist.axis.Regular(bins, 0., 1., name='var', growth=False, underflow=False, overflow=False)
    score_hists = []
    for sample_name in order:
        scores = sigs_and_bkgs[sample_name]
        if weighted:
            fill_weights = weights[sample_name]
        else:
            fill_weights = np.ones_like(scores)
        score_hists.append(
            hist.Hist(score_axis, storage='weight').fill(var=scores, weight=fill_weights)
        )

    hep.histplot(
        score_hists,
        yerr=weighted,
        alpha=0.7, density=not weighted, histtype='step',
        label=list(order)
    )

    plt.legend(bbox_to_anchor=(1, 1))
    plt.xlabel('Output score')
    if log:
        plt.yscale('log')

    # Same figure written twice: default format (png) first, then pdf.
    png_path = plot_filepath(plot_name, plot_dirpath, plot_prefix, plot_postfix)
    plt.savefig(png_path, bbox_inches='tight')
    pdf_path = plot_filepath(plot_name, plot_dirpath, plot_prefix, plot_postfix, format='pdf')
    plt.savefig(pdf_path, bbox_inches='tight')
    plt.close()

def plot_s_over_root_b(
    sig, bkg, sample_name, plot_name, plot_dirpath,
    plot_prefix='', plot_postfix='', bins=50, weights={'sig': None, 'bkg': None},
    lines=None, lines_labels=None, line_colors=None
):
    """Plot the per-bin s/√b of an output score and save it as PNG and PDF.

    Parameters
    ----------
    sig, bkg : array-like
        Output scores of signal and background events, histogrammed on [0, 1].
    sample_name : str
        Name used in the curve's legend entry.
    plot_name, plot_dirpath, plot_prefix, plot_postfix
        Forwarded to ``plot_filepath`` to build the output file names.
    bins : int
        Number of equal-width bins on [0, 1].
    weights : dict
        ``{'sig': ..., 'bkg': ...}`` per-event weights; a ``None`` entry means
        unit weights. The default dict is only read, never modified.
    lines : sequence of float, optional
        Score values at which to draw vertical lines.
    lines_labels : sequence of str, optional
        Legend text appended to each line's label.
    line_colors : sequence, optional
        Colours for the lines; reused cyclically when there are more lines than
        colours. ``None`` leaves the colour to matplotlib's default.
    """
    plt.figure(figsize=(9,7))

    hist_axis = hist.axis.Regular(bins, 0., 1., name='var', growth=False, underflow=False, overflow=False)
    sig_weights = weights['sig'] if weights['sig'] is not None else np.ones_like(sig)
    bkg_weights = weights['bkg'] if weights['bkg'] is not None else np.ones_like(bkg)
    sig_hist = hist.Hist(hist_axis, storage='weight').fill(var=sig, weight=sig_weights)
    bkg_hist = hist.Hist(hist_axis, storage='weight').fill(var=bkg, weight=bkg_weights)
    s_over_root_b_points = sig_hist.values().flatten() / np.sqrt(bkg_hist.values().flatten())

    # Lower bin edges. np.arange(bins) / bins always has exactly `bins` entries,
    # whereas np.arange(0., 1., 1/bins) can gain an extra point through float rounding.
    bin_lower_edges = np.arange(bins) / bins
    plt.plot(
        bin_lower_edges, s_over_root_b_points,
        label=f'{sample_name} - s/√b', alpha=0.8
    )

    if lines is not None:
        # Line height: the largest finite point, so bins with zero or negative
        # background (inf / nan) do not blow up the y range.
        finite_points = s_over_root_b_points[np.isfinite(s_over_root_b_points)]
        line_top = np.max(finite_points) if finite_points.size > 0 else 1.0
        for i in range(len(lines)):
            # Wrap around the palette instead of raising IndexError when there are
            # more lines than colours.
            line_color = line_colors[i % len(line_colors)] if line_colors else None
            plt.vlines(
                lines[i], 0, line_top,
                label='s/√b'+(' - '+lines_labels[i] if lines_labels is not None else ''),
                alpha=0.5, colors=line_color
            )

    plt.legend(bbox_to_anchor=(1, 1))
    plt.xlabel('Output score')
    plt.ylabel('s/√b')

    for file_format in ('png', 'pdf'):
        plt.savefig(
            plot_filepath(plot_name, plot_dirpath, plot_prefix, plot_postfix, format=file_format),
            bbox_inches='tight'
        )
    plt.close()

In [None]:
def optimize_cut_boundaries(sigs, bkgs, weights, bins=50):
    """Greedily split the score range [0, 1] into categories based on s/√b.

    For every (sig, bkg) pair the scores are histogrammed into ``bins`` equal
    bins. Starting at the highest-score bin, a window is grown downward one bin
    at a time. While s/√b of the window strictly increases the scan continues.
    At the first bin where it does not increase, the window is closed just above
    that bin: the boundary, the best s/√b seen and the window's yields are
    recorded, and a new window is started that contains that bin. After the
    scan a final category from score 0 up to the last boundary is appended,
    together with the s/√b of the still-open window.

    Parameters
    ----------
    sigs, bkgs : array-like
        Either one 1-D array of scores each, or a sequence of such arrays.
    weights : dict
        ``{'sig': ..., 'bkg': ...}`` per-event weights. The same weights are
        used for every (sig, bkg) pair.
    bins : int
        Number of histogram bins on [0, 1].

    Returns
    -------
    tuple of four lists, each with one entry per (sig, bkg) pair:
        * cut boundaries as scores (bin index / bins), highest first, ending with 0.0
        * the s/√b recorded for each category
        * signal yields per category: ``{'value': sum of bin contents,
          'w2': sqrt(sum of bin variances)}``
        * background yields per category, same layout
    """
    def _window_yield(values, variances, lower, upper):
        """Summed content and sqrt of summed variance of bins [lower, upper)."""
        return {
            'value': np.sum(values[lower:upper]),
            'w2': np.sqrt(np.sum(variances[lower:upper])),
        }

    if len(np.shape(sigs)) == 1:
        sigs, bkgs = [sigs], [bkgs]

    cut_boundaries_fold = []
    cut_s_over_root_bs_fold = []
    sig_weights_fold = []
    bkg_weights_fold = []
    for sig, bkg in zip(sigs, bkgs):
        hist_axis = hist.axis.Regular(bins, 0., 1., name='var', growth=False, underflow=False, overflow=False)
        sig_hist = hist.Hist(hist_axis, storage='weight').fill(var=sig, weight=weights['sig'])
        bkg_hist = hist.Hist(hist_axis, storage='weight').fill(var=bkg, weight=weights['bkg'])

        # The histograms no longer change, so extract their arrays once
        # instead of on every scan step.
        sig_values, sig_variances = sig_hist.values().flatten(), sig_hist.variances().flatten()
        bkg_values, bkg_variances = bkg_hist.values().flatten(), bkg_hist.variances().flatten()

        fold_idx_cuts_bins_inclusive = []
        fold_idx_sig_weights = []
        fold_idx_bkg_weights = []
        fold_idx_prev_s_over_root_b = []
        prev_s_over_root_b = 0
        upper = bins  # exclusive upper bin of the open window (= last cut, or bins before any cut)
        for i in range(bins):
            lower = (bins - 1) - i
            s = np.sum(sig_values[lower:upper])
            sqrt_b = np.sqrt(np.sum(bkg_values[lower:upper]))
            s_over_root_b = s / sqrt_b
            if prev_s_over_root_b < s_over_root_b:
                prev_s_over_root_b = s_over_root_b
                continue

            # No improvement: close the category just above the current bin.
            cut_bin = bins - i
            fold_idx_sig_weights.append(_window_yield(sig_values, sig_variances, cut_bin, upper))
            fold_idx_bkg_weights.append(_window_yield(bkg_values, bkg_variances, cut_bin, upper))
            fold_idx_cuts_bins_inclusive.append(cut_bin)
            fold_idx_prev_s_over_root_b.append(prev_s_over_root_b)
            prev_s_over_root_b = 0
            upper = cut_bin

        # Remaining low-score category: everything below the last cut.
        fold_idx_sig_weights.append(_window_yield(sig_values, sig_variances, 0, upper))
        fold_idx_bkg_weights.append(_window_yield(bkg_values, bkg_variances, 0, upper))
        fold_idx_cuts_bins_inclusive.append(0)
        fold_idx_prev_s_over_root_b.append(prev_s_over_root_b)

        cut_boundaries_fold.append([bin_i / bins for bin_i in fold_idx_cuts_bins_inclusive])
        cut_s_over_root_bs_fold.append(fold_idx_prev_s_over_root_b)
        sig_weights_fold.append(fold_idx_sig_weights)
        bkg_weights_fold.append(fold_idx_bkg_weights)
    return cut_boundaries_fold, cut_s_over_root_bs_fold, sig_weights_fold, bkg_weights_fold

In [None]:
# OUTPUT_DIRPATH = os.path.join(OUTPUT_DIRPATH, CURRENT_TIME)
plot_dirpath = os.path.join(OUTPUT_DIRPATH, "plots")
if not os.path.exists(plot_dirpath):
    os.makedirs(plot_dirpath)

# Reload the performance summary from disk.
perf_json_path = os.path.join(OUTPUT_DIRPATH, CURRENT_TIME+"_BDT_perf.json")
with open(perf_json_path, 'r') as f:
    BDT_perf = json.load(f)
# Every TPR column holds the same grid, so take the first class of the first fold.
base_tpr = np.array(BDT_perf['tprs'][0])[:, 0]
n_classes = len(order)

# plot ROCs
# 1) one figure per fold, collecting the per-fold curves/AUCs for the average
avg_fprs, avg_aucs = [], []
for fold_idx in range(len(bdt_train_dict)):
    fold_fpr_table = BDT_perf['fprs'][fold_idx]
    fold_areas = BDT_perf['areas'][fold_idx]
    avg_fprs.append(copy.deepcopy(fold_fpr_table))
    avg_aucs.append(copy.deepcopy(fold_areas))

    fold_fpr_array = np.array(fold_fpr_table)
    fprs = [fold_fpr_array[:, i] for i in range(n_classes)]
    tprs = [base_tpr] * n_classes
    labels = [f"{sample_name}, AUC = {fold_areas[i]:.4f}" for i, sample_name in enumerate(order)]

    plot_rocs(fprs, tprs, labels, f"BDT_roc_testData_fold{fold_idx}", plot_dirpath)

# 2) fold-averaged curves and AUCs
mean_aucs = np.mean(avg_aucs, axis=0)
plot_rocs(
    np.mean(avg_fprs, axis=0).T, [base_tpr] * n_classes,
    [f"{sample_name}, AUC = {mean_aucs[i]:.4f}" for i, sample_name in enumerate(order)],
    f"BDT_roc_testData_Avg", plot_dirpath
)

# 3) curves recomputed on the test predictions of all folds pooled together
flat_preds = np.concatenate(BDT_perf['preds'], axis=0)
flat_truths = np.concatenate(
    [bdt_test_dict[f"fold_{fold_idx}"].get_label() for fold_idx in range(len(bdt_train_dict))],
    axis=0
)
fprs, aucs = [], []
for i, sample_name in enumerate(order):
    fpr, tpr, threshold = roc_curve(flat_truths == i, flat_preds[:, i])
    fpr = np.interp(base_tpr, tpr, fpr)
    fpr[0] = 0.0  # force the curve to start at FPR = 0
    fprs.append(copy.deepcopy(fpr))
    aucs.append(float(auc(fpr, base_tpr)))
plot_rocs(
    fprs, [base_tpr] * n_classes,
    [f"{sample_name}, AUC = {aucs[i]:.4f}" for i, sample_name in enumerate(order)],
    f"BDT_roc_testData_sum", plot_dirpath
)

# plot Output scores and s/√b curves
# Outer key: which class's output score is plotted; inner key: true class of the events.
weights_sum, sigs_and_bkgs_sum = {}, {}
for fold_idx in range(len(bdt_train_dict)):
    fold_truths = xgb_label_test_dict[f"fold_{fold_idx}"]
    fold_weights = weight_test_dict[f"fold_{fold_idx}"]
    fold_preds = np.array(BDT_perf['preds'][fold_idx])

    # The weights depend only on the true class, so each outer entry holds the same values.
    weights_plot = {
        sample_name: {
            sample_name_: fold_weights[fold_truths == i] for i, sample_name_ in enumerate(order)
        } for sample_name in order
    }

    sigs_and_bkgs = {
        sample_name: {
            sample_name_: fold_preds[:, j][fold_truths == i] for i, sample_name_ in enumerate(order)
        } for j, sample_name in enumerate(order)
    }

    # Accumulate across folds for the pooled ("sum") plots.
    if fold_idx != 0:
        for sample_name in order:
            for sample_name_ in order:
                weights_sum[sample_name][sample_name_] = np.concatenate((weights_sum[sample_name][sample_name_], weights_plot[sample_name][sample_name_]))
                sigs_and_bkgs_sum[sample_name][sample_name_] = np.concatenate((sigs_and_bkgs_sum[sample_name][sample_name_], sigs_and_bkgs[sample_name][sample_name_]))
    else:
        weights_sum = copy.deepcopy(weights_plot)
        sigs_and_bkgs_sum = copy.deepcopy(sigs_and_bkgs)

    # Per fold: weighted (log-y) first, then unweighted density.
    for sample_name in order:
        for plot_kind, plot_kwargs in (
            ("Weighted", {'weights': weights_plot[sample_name], 'log': True}),
            ("Density", {}),
        ):
            plot_output_scores(
                sigs_and_bkgs[sample_name], order, f"BDT_outputScore{plot_kind}_testData_fold{fold_idx}_{sample_name}",
                plot_dirpath, **plot_kwargs
            )

# Same two plots on the folds pooled together.
for sample_name in order:
    for plot_kind, plot_kwargs in (
        ("Weighted", {'weights': weights_sum[sample_name], 'log': True}),
        ("Density", {}),
    ):
        plot_output_scores(
            sigs_and_bkgs_sum[sample_name], order, f"BDT_outputScore{plot_kind}_testData_sum_{sample_name}",
            plot_dirpath, **plot_kwargs
        )

# plot s/√b curves
def _plot_s_over_root_b_with_cuts(scores, score_weights, sample_name, tag):
    """Plot s/√b for one class, then plot it again with the optimized cut boundaries overlaid.

    ``scores`` and ``score_weights`` are ``{'sig': ..., 'bkg': ...}`` dicts and
    ``tag`` identifies the data subset in the file name (e.g. "fold0" or "sum").
    """
    plot_s_over_root_b(
        scores['sig'], scores['bkg'], sample_name, f"BDT_sOverRootb_testData_{tag}_{sample_name}",
        plot_dirpath, weights=score_weights
    )

    (
        cut_boundaries_fold, cut_s_over_root_bs_fold, sig_weights_fold, bkg_weights_fold
    ) = optimize_cut_boundaries(scores['sig'], scores['bkg'], score_weights)

    # 1-D inputs give exactly one entry per returned list, hence the [0].
    line_labels = [
        f"s/√b={cut_s_over_root_bs_fold[0][cut_idx]:.4f}, s={sig_weights_fold[0][cut_idx]['value']:.4f}±{sig_weights_fold[0][cut_idx]['w2']:.4f}, b={bkg_weights_fold[0][cut_idx]['value']:.4f}±{bkg_weights_fold[0][cut_idx]['w2']:.4f}" for cut_idx in range(len(cut_s_over_root_bs_fold[0]))
    ]
    lines = cut_boundaries_fold[0]
    # One colour per line, cycling through the palette: the number of cuts is not
    # limited to the 10 palette entries, and plot_s_over_root_b indexes colours by line.
    line_colors = [cmap_petroff10[line_idx % len(cmap_petroff10)] for line_idx in range(len(lines))]

    plot_s_over_root_b(
        scores['sig'], scores['bkg'], sample_name,
        f"BDT_sOverRootb_withCuts_testData_{tag}_{sample_name}", plot_dirpath,
        weights=score_weights,
        lines=lines, lines_labels=line_labels, line_colors=line_colors
    )


# Accumulators for the fold-pooled plots. They use their own *_sob names; this
# section used to overwrite the weights_sum / sigs_and_bkgs_sum dicts built for
# the output-score plots, which have a different inner layout.
weights_sum_sob, sigs_and_bkgs_sum_sob = {}, {}
for fold_idx in range(len(bdt_train_dict)):
    fold_truths = xgb_label_test_dict[f"fold_{fold_idx}"]
    fold_weights = weight_test_dict[f"fold_{fold_idx}"]
    fold_preds = np.array(BDT_perf['preds'][fold_idx])

    # One-vs-rest per class j: 'sig' = events whose true class is j, 'bkg' = all others.
    weights_plot = {
        sample_name: {
            'sig': fold_weights[fold_truths == j],
            'bkg': fold_weights[fold_truths != j],
        } for j, sample_name in enumerate(order)
    }

    sigs_and_bkgs = {
        sample_name: {
            'sig': fold_preds[:, j][fold_truths == j],
            'bkg': fold_preds[:, j][fold_truths != j],
        } for j, sample_name in enumerate(order)
    }

    if fold_idx == 0:
        weights_sum_sob = copy.deepcopy(weights_plot)
        sigs_and_bkgs_sum_sob = copy.deepcopy(sigs_and_bkgs)
    else:
        for sample_name in order:
            for sample_name_ in weights_plot[sample_name].keys():
                weights_sum_sob[sample_name][sample_name_] = np.concatenate((weights_sum_sob[sample_name][sample_name_], weights_plot[sample_name][sample_name_]))
                sigs_and_bkgs_sum_sob[sample_name][sample_name_] = np.concatenate((sigs_and_bkgs_sum_sob[sample_name][sample_name_], sigs_and_bkgs[sample_name][sample_name_]))

    for sample_name in order:
        _plot_s_over_root_b_with_cuts(
            sigs_and_bkgs[sample_name], weights_plot[sample_name], sample_name, f"fold{fold_idx}"
        )

# Same plots on the folds pooled together.
for sample_name in order:
    _plot_s_over_root_b_with_cuts(
        sigs_and_bkgs_sum_sob[sample_name], weights_sum_sob[sample_name], sample_name, "sum"
    )


In [None]:
# Feature-importance plot for whichever Booster is currently bound to `booster`.
# NOTE(review): when the notebook is run top to bottom this is the model loaded
# for the last fold in the evaluation loop — confirm that is the intended fold.
xgb.plot_importance(booster)

In [None]:
# Render a tree of the Booster currently bound to `booster` (see the note on
# the cell above about which fold that is).
# NOTE(review): this is the only import outside the top import cell; consider
# moving it there if graphviz is a hard requirement of the notebook.
import graphviz
xgb.to_graphviz(booster)