In [None]:
import os
import numpy as np
import tensorflow as tf
import h5py
from tensorflow.keras.models import Model
from tensorflow.keras.layers import (
    Input,
    Dense,
    Embedding,
    BatchNormalization,
    Activation,
    Concatenate,
    GlobalAveragePooling1D,
)
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import Sequence
import matplotlib.pyplot as plt

2025-05-21 18:12:26.588051: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [None]:
def dense_embedding(
    n_features=6,
    n_features_cat=2,
    activation="relu",
    number_of_pupcandis=128,
    embedding_input_dim={0: 13, 1: 3},
    emb_out_dim=8,
    units=[32, 16],
):
    """Build the per-candidate dense/embedding Keras model.

    The model takes, in this order:

    * ``input_cont``  -- shape ``(number_of_pupcandis, n_features - 2)``
    * ``input_pxpy``  -- shape ``(number_of_pupcandis, 2)``
    * ``input_cat{i}`` -- shape ``(number_of_pupcandis,)``, one per categorical
      feature, each fed through its own ``Embedding``.

    The continuous features and the embeddings are concatenated per candidate
    and passed through a stack of ``Dense -> BatchNormalization -> Activation``
    blocks. The result is reduced to one scalar weight per candidate, which
    scales ``input_pxpy``; the weighted values are averaged over candidates and
    mapped to the 2-component output by a final linear layer.

    Args:
        n_features: Feature count used to size ``input_cont``
            (``n_features - 2`` columns).
        n_features_cat: Number of categorical inputs / embedding tables.
        activation: Activation applied after each batch-normalization layer.
        number_of_pupcandis: Number of candidates per event (sequence length).
        embedding_input_dim: Vocabulary size of each categorical input,
            indexed by ``0 .. n_features_cat - 1``.
        emb_out_dim: Output dimension of every embedding.
        units: Width of each hidden dense layer, in order.

    Returns:
        An uncompiled ``tf.keras`` ``Model`` with a ``(batch, 2)`` output
        named ``"output"``.

    Note:
        The mutable defaults (``embedding_input_dim``, ``units``) are only
        read here, never modified, so sharing them between calls is safe.
    """
    inputs_cont = Input(shape=(number_of_pupcandis, n_features - 2), name="input_cont")
    pxpy = Input(shape=(number_of_pupcandis, 2), name="input_pxpy")
    inputs = [inputs_cont, pxpy]

    # Per-candidate feature tensors to be merged along the last axis:
    # the continuous block first, then one embedding per categorical input.
    per_candidate = [inputs_cont]
    for i_emb in range(n_features_cat):
        input_cat = Input(shape=(number_of_pupcandis,), name=f"input_cat{i_emb}")
        inputs.append(input_cat)
        per_candidate.append(
            Embedding(
                input_dim=embedding_input_dim[i_emb], output_dim=emb_out_dim
            )(input_cat)
        )

    # With no categorical inputs there is nothing to merge; skip the
    # Concatenate layer instead of calling it on a degenerate list.
    x = Concatenate()(per_candidate) if len(per_candidate) > 1 else inputs_cont

    for units_layer in units:
        # Keep the Dense layer linear: the non-linearity is applied exactly
        # once, after batch normalization, and honours `activation`.
        x = Dense(units_layer, activation="linear")(x)
        x = BatchNormalization()(x)
        x = Activation(activation)(x)

    # Multiply with pxpy before pooling
    # CAUTION: This is according to Simon's diagram, removed in other versions
    # The hidden representation has units[-1] channels while pxpy has 2, which
    # Multiply cannot broadcast. Reduce to a single weight per candidate so the
    # product is well defined for any `units`.
    weight = Dense(1, activation="linear", name="met_weight")(x)
    x = tf.keras.layers.Multiply()([weight, pxpy])

    x = GlobalAveragePooling1D()(x)
    outputs = Dense(2, activation="linear", name="output")(x)

    return Model(inputs=inputs, outputs=outputs)

In [None]:
class METDataLoader(Sequence):
    """Keras ``Sequence`` serving shuffled mini-batches from an HDF5 file.

    The file must contain two datasets, ``"X"`` and ``"Y"``, which are read
    fully into memory on construction. ``X`` is indexed as
    ``(sample, candidate, feature)`` and needs at least 8 feature columns.
    Each batch is returned as ``([input_cont, input_pxpy, input_cat0,
    input_cat1], Y_batch)``.
    """

    def __init__(self, h5_file, batch_size=32, maxNPF=128, **kwargs):
        """Load the data and prepare a shuffled sample order.

        Args:
            h5_file: Path to the HDF5 file holding the ``"X"`` and ``"Y"``
                datasets.
            batch_size: Number of samples per batch; must be positive.
            maxNPF: Stored on the instance; not otherwise used by this class.
            **kwargs: Forwarded to the base-class constructor.

        Raises:
            ValueError: If ``batch_size`` is not positive, if ``X`` and ``Y``
                have different lengths, or if ``X`` is not 3-D with at least
                8 feature columns.
        """
        # Run the base constructor so that Keras versions which keep state on
        # the Sequence/PyDataset base class are initialised properly.
        super().__init__(**kwargs)

        if batch_size <= 0:
            raise ValueError(f"batch_size must be positive, got {batch_size!r}")

        self.h5_file = h5_file
        self.batch_size = batch_size
        self.maxNPF = maxNPF

        # Load the data from the HDF5 file
        with h5py.File(h5_file, "r") as f:
            self.X = f["X"][:]
            self.Y = f["Y"][:]

        # Fail early with a clear message rather than with an IndexError on
        # the first batch.
        if len(self.X) != len(self.Y):
            raise ValueError(
                f"X and Y must have the same number of samples, "
                f"got {len(self.X)} and {len(self.Y)}"
            )
        if self.X.ndim != 3 or self.X.shape[-1] < 8:
            raise ValueError(
                f"X must have shape (samples, candidates, >=8 features), "
                f"got {self.X.shape}"
            )

        # Shuffle the data indices
        self.indices = np.arange(len(self.X))
        np.random.shuffle(self.indices)

    def __len__(self):
        """Returns the number of batches per epoch (last batch may be partial)."""
        n_samples = len(self.X)
        return (n_samples + self.batch_size - 1) // self.batch_size

    def __getitem__(self, idx):
        """Generates one batch of data.

        Args:
            idx: Batch index in ``[0, len(self))``.

        Returns:
            ``(inputs, Y_batch)`` where ``inputs`` is the list
            ``[input_cont, input_pxpy, input_cat0, input_cat1]``.

        Raises:
            IndexError: If ``idx`` is outside ``[0, len(self))``.
        """
        if not 0 <= idx < len(self):
            raise IndexError(
                f"batch index {idx} out of range for {len(self)} batches"
            )

        # Get batch indices
        start = idx * self.batch_size
        batch_indices = self.indices[start : start + self.batch_size]

        # Extract batch data
        X_batch = self.X[batch_indices]
        Y_batch = self.Y[batch_indices]

        # Prepare inputs for the model
        # NOTE(review): this passes 6 columns (including columns 1:3, which
        # are also sent separately as input_pxpy), whereas dense_embedding
        # with its default n_features=6 declares n_features - 2 = 4 columns
        # for input_cont -- confirm the model is built with n_features=8, or
        # adjust this slice.
        input_cont = X_batch[:, :, :6]  # Continuous features
        input_pxpy = X_batch[:, :, 1:3]  # px and py
        input_cat0 = X_batch[:, :, 6]  # Encoded pdgId
        input_cat1 = X_batch[:, :, 7]  # Encoded charge

        # Return inputs and targets
        inputs = [input_cont, input_pxpy, input_cat0, input_cat1]
        return inputs, Y_batch

    def on_epoch_end(self):
        """Shuffles the data at the end of each epoch."""
        np.random.shuffle(self.indices)