To-dos:
- clean up things
- allow wrapped models to continue training for an extra n epochs whenever fit is called
- maybe ensembling?
- documentation

Sanity checks:
- does the epoch number in a checkpoint's name match that epoch's position in the loss array?

In [None]:
import numpy as np
import matplotlib.pyplot as plt

from os.path import join
from sklearn.metrics import roc_curve
from sklearn.neighbors import KernelDensity
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

from conditional_normalizing_flow import ConditionalNormalizingFlow
from neural_network_classifier import NeuralNetworkClassifier
from preprocessing import LogitScaler

In [None]:
# Dark theme for all figures; the white ("w:") reference curve in the final
# plot is only visible on a dark background.
plt.style.use("dark_background")

In [None]:
# Load the five pre-split arrays (outer train/val, inner train/val/test)
# from the input directory, in the order listed below.
data_path = "./input_data/"
(outerdata_train,
 outerdata_val,
 innerdata_train,
 innerdata_val,
 innerdata_test) = [
    np.load(join(data_path, file_name))
    for file_name in (
        "outerdata_train.npy",
        "outerdata_val.npy",
        "innerdata_train.npy",
        "innerdata_val.npy",
        "innerdata_test.npy",
    )
]

In [None]:
# Option A: train a new flow model from scratch.

# The preprocessing (logit, then standardization) is bundled into a single
# sklearn pipeline. Ideally the flow would be wrapped as well, but out of the
# box the pipeline class does not wrap sample() and predict_log_proba().
outer_scaler = make_pipeline(LogitScaler(), StandardScaler())

# Column 0 is the conditional m; the columns after it, excluding the last
# one, are the features. The scaler is fitted on the training split only and
# then applied unchanged to the validation split.
m_train, m_val = outerdata_train[:, 0], outerdata_val[:, 0]
x_train = outer_scaler.fit_transform(outerdata_train[:, 1:-1])
x_val = outer_scaler.transform(outerdata_val[:, 1:-1])

flow_savedir = "./trained_flows_sklearn_new/"
flow_model = ConditionalNormalizingFlow(save_path=flow_savedir)
flow_model.fit(m_train, x_train, m_val, x_val, epochs=50, verbose=True)

# After training, return to the checkpoint of the optimal epoch.
flow_model.load_best_model()

In [None]:
# Option B: load an existing flow model instead of training one.

# Only the flow is loaded from the save directory here, so the preprocessing
# pipeline is fitted again on the outer training features.
outer_scaler = make_pipeline(LogitScaler(), StandardScaler()).fit(
    outerdata_train[:, 1:-1]
)

flow_savedir = "./trained_flows_sklearn/"
flow_model = ConditionalNormalizingFlow(save_path=flow_savedir)
flow_model.load_best_model()

In [None]:
# Fit a KDE to the mass distribution (column 0) of the inner training set.

# A logit transform is applied first to stretch out the hard boundaries.
m_scaler = LogitScaler(epsilon=1e-8)
inner_mass_column = innerdata_train[:, 0].reshape(-1, 1)
m_train = m_scaler.fit_transform(inner_mass_column)

kde_model = KernelDensity(bandwidth=0.01, kernel="gaussian").fit(m_train)

# Draw four times as many mass values as there are training rows, cast them
# to float32, and map them back through the inverse of the logit scaler.
n_mass_samples = 4 * len(m_train)
m_samples = m_scaler.inverse_transform(
    kde_model.sample(n_mass_samples).astype(np.float32)
)

In [None]:
# Sample features from the flow, conditioned on the KDE mass samples, and
# undo the outer preprocessing pipeline on the result.
x_samples = outer_scaler.inverse_transform(flow_model.sample(m_samples))

# Sampled events get label 0 in an extra last column, giving the layout
# [m, features..., label].
sample_labels = np.zeros((m_samples.shape[0], 1))
samples = np.hstack([m_samples, x_samples, sample_labels])

In [None]:
# Compare the sampled events to the inner background, column by column
# (idealized sanity check: it selects test rows whose label column is 0).

# Select the background rows once instead of on every loop iteration.
background_test = innerdata_test[innerdata_test[:, -1] == 0]

# Plot every column of `samples` except its trailing label column, rather
# than assuming a fixed number of columns.
for i in range(samples.shape[1] - 1):
    _, binning, _ = plt.hist(background_test[:, i],
                             bins=100, label="data background",
                             density=True, histtype="step")
    # Reuse the bin edges of the first histogram so both share one binning.
    _ = plt.hist(samples[:, i],
                 bins=binning, label="sampled background",
                 density=True, histtype="step")
    plt.legend()
    # Add 20% headroom above the current upper limit (presumably so the
    # legend does not overlap the histograms).
    plt.ylim(0, plt.gca().get_ylim()[1] * 1.2)
    plt.xlabel("feature {}".format(i))
    # density=True normalizes both histograms, so the axis does not show
    # raw counts.
    plt.ylabel("normalized counts")
    plt.show()

In [None]:
# Real inner data gets "signal" label 1 in its last column. Copies are used
# so the loaded arrays keep their original labels.
clsf_train_data = innerdata_train.copy()
clsf_train_data[:, -1] = 1

clsf_val_data = innerdata_val.copy()
clsf_val_data[:, -1] = 1

# Split the samples between train and val in the same proportion as the data.
n_train, n_val = len(clsf_train_data), len(clsf_val_data)
n_samples_train = int(n_train / (n_train + n_val) * len(samples))
samples_train, samples_val = (samples[:n_samples_train],
                              samples[n_samples_train:])

# Stack data and samples, then shuffle the rows of each set in place
# (train first, then val).
clsf_train_set = np.vstack([clsf_train_data, samples_train])
clsf_val_set = np.vstack([clsf_val_data, samples_val])
for mixed_set in (clsf_train_set, clsf_val_set):
    np.random.shuffle(mixed_set)

In [None]:
# Option A: train a new NN classifier to distinguish real inner data from
# samples.

# The scaler parameters are derived from the real data only, so they stay the
# same even if the samples are redrawn.
inner_scaler = StandardScaler().fit(clsf_train_data[:, 1:-1])

# Features are the columns between the first and the last one; the last
# column holds the label.
x_train = inner_scaler.transform(clsf_train_set[:, 1:-1])
x_val = inner_scaler.transform(clsf_val_set[:, 1:-1])
y_train, y_val = clsf_train_set[:, -1], clsf_val_set[:, -1]

classifier_savedir = "trained_classifiers_sklearn_new/"
classifier_model = NeuralNetworkClassifier(
    save_path=classifier_savedir,
    n_inputs=x_train.shape[1],
)
classifier_model.fit(
    x_train, y_train, x_val, y_val,
    epochs=100, verbose=True,
)

# After training, return to the checkpoint of the optimal epoch.
classifier_model.load_best_model()

In [None]:
# Option B: load an existing classifier model instead of training one.

# Only the classifier is loaded from the save directory here, so the scaler
# is fitted again on the real inner training features.
inner_scaler = StandardScaler().fit(clsf_train_data[:, 1:-1])

classifier_savedir = "trained_classifiers_sklearn/"
n_classifier_inputs = clsf_train_set[:, 1:-1].shape[1]
classifier_model = NeuralNetworkClassifier(
    save_path=classifier_savedir,
    n_inputs=n_classifier_inputs,
)
classifier_model.load_best_model()

In [None]:
# Evaluate the signal extraction performance on the inner test set.

x_test = inner_scaler.transform(innerdata_test[:, 1:-1])
y_test = innerdata_test[:, -1]
preds_test = classifier_model.predict(x_test)

# Both ratios below are undefined where their denominator is 0 (for example
# at random_tpr == 0), so numpy's divide/invalid warnings are silenced.
with np.errstate(divide='ignore', invalid='ignore'):
    # Reference curve obtained by using the same rate as numerator and
    # denominator, plotted as the "random" baseline.
    random_tpr = np.linspace(0, 1, 300)
    random_sic = random_tpr / np.sqrt(random_tpr)

    # Significance improvement of the classifier: tpr / sqrt(fpr).
    fpr, tpr, _ = roc_curve(y_test, preds_test)
    sic = tpr / np.sqrt(fpr)

plt.plot(tpr, sic, label="CATHODE")
plt.plot(random_tpr, random_sic, "w:", label="random")
plt.xlabel("True Positive Rate")
plt.ylabel("Significance Improvement")
plt.legend(loc="upper right")
plt.show()