In [46]:
import sys
sys.path.append("..")

from utils import data_dir

import os
import random
import numpy as np
from pprint import pprint
import matplotlib as mpl
import matplotlib.pyplot as plt
parameters = {'axes.labelsize': 12,
          'xtick.labelsize': 12,
          'ytick.labelsize': 12,
          'legend.fontsize': 12,
          'lines.linewidth' : 2,
          'lines.markersize' : 7}
plt.rcParams.update(parameters)

In [47]:
import seaborn as sns
import pandas as pd

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import roc_curve, confusion_matrix, ConfusionMatrixDisplay, auc, precision_recall_curve
from sklearn.preprocessing import label_binarize

In [48]:
task_name = "axion1"

In [49]:
def findBin(bins, var, ibin):
    for i in range(len(bins)-1):
        if var >= bins[i] and var < bins[i+1]:
            ibin.append(i)

In [50]:
# get root files and convert them to array
branch_labels = {"frac_first": "$f_{1}$",
                 "first_lateral_width_eta_w20": "$w_{s20}$",
                 "first_lateral_width_eta_w3": "$w_{s3}$",
                 "first_fraction_fside": "$f_{side}$",
                 "first_dEs": "$\Delta E_{s}$",
                 "first_Eratio": "$E_{ratio}$",
                 "second_R_eta": "$R_{\eta}$",
                 "second_R_phi": "$R_{\phi}$",
                 "second_lateral_width_eta_weta2": "$w_{\eta2}$"
                 }
branch_names = list(branch_labels.keys())
pprint(branch_names)
branch_ene = """total_e""".split(",")

['frac_first',
 'first_lateral_width_eta_w20',
 'first_lateral_width_eta_w3',
 'first_fraction_fside',
 'first_dEs',
 'first_Eratio',
 'second_R_eta',
 'second_R_phi',
 'second_lateral_width_eta_weta2']


In [29]:
bg0Legend = "$\gamma$"
bg1Legend = "$\pi^0$"
sigLegend = {
    "scalar1": r"$h_2\rightarrow\pi^0\pi^0$",
    "axion1": r"$a\rightarrow\gamma\gamma$",
    "axion2": r"$a\rightarrow3\pi^0$"
}[task_name]

In [18]:
def as_matrix(tree, columns):
    """
    tree is an npz object containing string keys (columns) and np.array values    """
    return np.stack([tree[col] for col in columns])


In [86]:
n_train = 70000
signal0_tree = np.load(f"{data_dir}/processed/bdt_vars/{task_name}_bdt_vars.npz")
signal0 = as_matrix(signal0_tree, columns=branch_names)
signal0_ene = as_matrix(signal0_tree, columns=branch_ene)
train_signal0 = signal0[:n_train].T
test_signal0 = signal0[n_train:].T
train_signal0_ene = signal0_ene[:n_train].T
test_signal0_ene = signal0_ene[n_train:].T

background0_tree = np.load(f"{data_dir}/processed/bdt_vars/gamma_bdt_vars.npz")
background0 = as_matrix(background0_tree, columns=branch_names)
background0_ene = as_matrix(background0_tree, columns=branch_ene)
train_background0 = background0[:n_train].T
test_background0 = background0[n_train:].T
train_background0_ene = background0_ene[:n_train].T
test_background0_ene = background0_ene[n_train:].T

background1_tree = np.load(f"{data_dir}/processed/bdt_vars/pi0_bdt_vars.npz")
background1 = as_matrix(background1_tree, columns=branch_names)
background1_ene = as_matrix(background1_tree, columns=branch_ene)
train_background1 = background1[:n_train].T
test_background1 = background1[n_train:].T
train_background1_ene = background1_ene[:n_train].T
test_background1_ene = background1_ene[n_train:].T

In [125]:
def plot_inputs(outdir, vars, branch_labels, sig, sig_w, bkg, bkg_w, bkg2, bkg2_w, sigLegend, bg0Legend, bg1Legend):    
    for n, var in enumerate(vars):
        _, bins = np.histogram(np.concatenate(
            (sig[:, n], bkg[:, n], bkg2[:, n])), bins=40)
        sns.distplot(sig[:, n], hist_kws={'weights': sig_w}, bins=bins, kde=False,
                     norm_hist=True, color='orange', label='{}'.format(sigLegend))
        sns.distplot(bkg[:, n], hist_kws={'weights': bkg_w}, bins=bins,
                     kde=False, norm_hist=True, color='b', label='{}'.format(bg0Legend))
        sns.distplot(bkg2[:, n], hist_kws={'weights': bkg2_w}, bins=bins,
                     kde=False, norm_hist=True, color='g', label='{}'.format(bg1Legend))
        
        plt.legend()
        if var == "first_dEs":
            plt.subplots_adjust(left=0.15)
        plt.xlabel('{}'.format(branch_labels[var]), loc='right', fontsize=24)
        plt.ylabel('Entries', fontsize=24)
        
        plt.xticks(fontsize=17)
        plt.yticks(fontsize=17)
        # https://stackoverflow.com/questions/42281851/how-to-add-padding-to-a-plot-in-python
        plt.tight_layout()
        
        plt.savefig(os.path.join(outdir, 'input_{}.pdf'.format(var)))
        plt.close()

In [126]:
import warnings
with warnings.catch_warnings():
    warnings.simplefilter(action='ignore', category=FutureWarning)

    output_dir = f"./gbdt_results_9var/{task_name}"
    os.makedirs(output_dir, exist_ok=True)
    plot_inputs(output_dir, branch_names, branch_labels, train_signal0,
                None, train_background0, None, train_background1, None, sigLegend, bg0Legend, bg1Legend)


`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  sns.distplot(sig[:, n], hist_kws={'weights': sig_w}, bins=bins, kde=False,

`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  sns.distplot(bkg[:, n], hist_kws={'weights': bkg_w}, bins=bins,

`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `dis