# A simple Jupyter notebook to get acquainted with the main functions available in this repository

In [None]:
# Necessary imports
from utils import (load_lhco_rd, add_gaussian_features, train_model_multirun)
from plot_utils import plot_sic_curve_comparison
from os.path import exists, join
from os import mkdir
from matplotlib import pyplot as plt

In [None]:
# Load the LHCO R&D dataset from the path below
lhco_rd_path = "./treebased_ad_files/lhco_rd"
data = load_lhco_rd(lhco_rd_path)

In [None]:
# General settings used by the training calls in this notebook

# Number of times the entire ensemble training procedure is re-run
num_runs = 2

# Number of models that constitute a single ensemble
ensembles_per_model = 5

# Maximum number of iterations. Since early stopping is used later on,
# this number will probably not be reached (usually the minimum validation
# loss occurs within the first 20 iterations)
max_iters = 100

The cell below takes around 1 minute to run on a modern CPU.

In [None]:
# Train histogram-based gradient boosting classifiers (HGB)

# train_model_multirun wraps the entire training procedure. It returns the
# losses of all models and runs together with the model instances
# themselves, and the models are saved in the directory specified below.
#
# The "0G" naming convention indicates that the original dataset is used,
# i.e. without any added Gaussian noise features.
full_losses_hgb_0G, models_hgb_0G = train_model_multirun(
    data,
    num_runs=num_runs,
    ensembles_per_model=ensembles_per_model,
    max_iters=max_iters,
    model_type="HGB",
    compute_val_weights=True,
    save_model_dir="./models/models_hgb_0G",
    cv_mode="random",
    early_stopping=True,
)

In [None]:
# Create a copy of the data with Gaussian noise features added
num_gaussian_features = 10
data_10G = add_gaussian_features(data, num_gaussian_features)

The cell below takes around 2 minutes to run on a modern CPU.

In [None]:
# Repeat the training with exactly the same overall settings, this time on
# the dataset that includes the Gaussian noise features ("10G").
full_losses_hgb_10G, models_hgb_10G = train_model_multirun(
    data_10G,
    num_runs=num_runs,
    ensembles_per_model=ensembles_per_model,
    max_iters=max_iters,
    model_type="HGB",
    compute_val_weights=True,
    save_model_dir="./models/models_hgb_10G",
    cv_mode="random",
    early_stopping=True,
)

In [None]:
# Create the plots directory if it does not exist yet.
# makedirs(..., exist_ok=True) replaces the separate "exists" check followed
# by "mkdir": the cell stays safe to re-run, there is no window between the
# check and the creation, and a regular file named "plots" is reported
# immediately as an error instead of being silently accepted.
from os import makedirs

makedirs("./plots", exist_ok=True)

In [None]:
# Apply the recommended matplotlib RC params in a single update
plt.rcParams.update({
    # Fonts and number formatting
    'pgf.rcfonts': False,
    'font.serif': [],
    'axes.formatter.useoffset': False,
    # Line, error bar and grid appearance
    'lines.linewidth': 2,
    'errorbar.capsize': 2,
    'grid.linewidth': 0.5,
    # Font sizes of axis labels, titles and tick labels
    'axes.labelsize': 14,
    'axes.titlesize': 16,
    'xtick.labelsize': 12,
    'ytick.labelsize': 12,
    # Legend appearance
    'legend.title_fontsize': 12,
    'legend.fontsize': 12,
    'legend.frameon': False,
})

In [None]:
# Finally, create a significance improvement characteristic curve (SIC curve)
# for the two sets of models.

# Legend labels, one per set of models
labels = ["Baseline", "Baseline + 10G"]

# Model type of each set (HGB in both cases here, but different BDT
# algorithms, or a DNN and a BDT, could be compared as well)
model_types = ["HGB", "HGB"]

# Line colors
color_list = ["black", "red"]

# Line styles (matplotlib linestyles)
linestyles = ["solid", "solid"]

# Models and the datasets passed alongside them, in the same order as the
# labels above
models_to_compare = [models_hgb_0G, models_hgb_10G]
datasets_to_compare = [data, data_10G]

# Output file for the comparison plot
sic_plot_path = join("plots", "gauss_compare_HGB.pdf")

plot_sic_curve_comparison(
    models_to_compare,
    datasets_to_compare,
    model_types=model_types,
    out_filename=sic_plot_path,
    color_list=color_list,
    linestyles=linestyles,
    labels=labels,
    xlabel=r"$\epsilon_{S}$",
    ylabel=r"$\epsilon_S/\sqrt{\epsilon_B}$",
    max_y=20,
    title="BDT",
)