In [1]:
import numpy as np
#import awkward
import awkward as ak

In [2]:
import logging
# Timestamped, INFO-level log records for the rest of the notebook.
logging.basicConfig(
    format='[%(asctime)s] %(levelname)s: %(message)s',
    level=logging.INFO,
)

In [3]:
# Earlier `stack_arrays` helper, written against the `awkward.JaggedArray` API.
# It is kept only as an inert string literal: nothing inside it is executed and
# no `stack_arrays` function is defined by this cell.
"""
def stack_arrays(a, keys, axis=-1):
    flat_arr = np.stack([a[k].flatten() for k in keys], axis=axis)
    return awkward.JaggedArray.fromcounts(a[keys[0]].counts, flat_arr)
"""


'\ndef stack_arrays(a, keys, axis=-1):\n    flat_arr = np.stack([a[k].flatten() for k in keys], axis=axis)\n    return awkward.JaggedArray.fromcounts(a[keys[0]].counts, flat_arr)\n'

In [None]:
# Earlier loop-based `pad_array`, kept only as an inert string literal for
# reference. It is never executed; the `def pad_array` that follows in this
# cell is the implementation actually used.
"""
def pad_array(a, maxlen, value=0., dtype='float32'):
    x = (np.ones((len(a), maxlen)) * value).astype(dtype)
    for idx, s in enumerate(a):
        if not len(s):
            continue
        trunc = s[:maxlen].astype(dtype)
        x[idx, :len(trunc)] = trunc
    return x
"""
def pad_array(a, maxlen, value=0., dtype='float32'):
    """Turn a jagged array into a dense, fixed-width NumPy array.

    Every row of ``a`` is cut to at most ``maxlen`` entries, and rows that
    are shorter are extended up to ``maxlen`` with ``value``.

    Args:
        a: jagged ``ak.Array`` (one variable-length list per row).
        maxlen: number of entries each row has in the result.
        value: filler written into the padded slots.
        dtype: dtype specification the values are cast to.

    Returns:
        ``np.ndarray`` of shape ``(N, maxlen)``.
    """
    # clip=True truncates rows longer than `maxlen`; the slots that padding
    # adds to shorter rows come back as None and are replaced by `value`.
    fixed_width = ak.fill_none(ak.pad_none(a, maxlen, clip=True), value)
    # values_astype is the awkward counterpart of ndarray.astype.
    return ak.to_numpy(ak.values_astype(fixed_width, np.dtype(dtype)))



In [None]:
class Dataset(object):
    """In-memory dataset of padded feature arrays and labels read from Parquet.

    Construction immediately loads ``filepath``. Every column listed in
    ``feature_dict`` is padded/truncated to ``pad_len`` with ``pad_array`` and
    the columns of each group are stacked into one NumPy array.

    Args:
        filepath: path (or glob pattern) handed to ``ak.from_parquet``.
        feature_dict: mapping ``group name -> column name(s)``. A value may be
            a list/tuple of column names or a single name. ``None`` or an
            empty mapping selects the default 'points' / 'features' / 'mask'
            groups. The mapping is copied, never modified.
        label: name of the label column; also the key under which
            ``dataset[label]`` returns the labels.
        pad_len: fixed length every jagged column is padded/truncated to.
        data_format: 'channel_first' stacks the columns of a group on axis 1,
            'channel_last' stacks them on the last axis.
    """

    def __init__(self, filepath, feature_dict=None, label='label', pad_len=100, data_format='channel_first'):
        self.filepath = filepath
        if feature_dict:
            # Copy so that this instance never aliases the caller's dict.
            self.feature_dict = dict(feature_dict)
        else:
            # A fresh dict per instance: the former `feature_dict={}` default
            # was filled in place, which mutated a caller-supplied empty dict
            # and shared one dict object between all default-built instances.
            self.feature_dict = {
                'points': ['part_etarel', 'part_phirel'],
                'features': ['part_pt_log', 'part_e_log', 'part_etarel', 'part_phirel'],
                'mask': ['part_pt_log'],
            }
        self.label = label
        self.pad_len = pad_len
        assert data_format in ('channel_first', 'channel_last'), \
            "data_format must be 'channel_first' or 'channel_last', got %r" % (data_format,)
        self.stack_axis = 1 if data_format=='channel_first' else -1
        self._values = {}
        self._label = None
        self._load()

    @staticmethod
    def _as_list(cols):
        """Return ``cols`` as a list, wrapping a single column name."""
        if isinstance(cols, (list, tuple)):
            return list(cols)
        return [cols]

    def _column_names(self):
        """Return the label column followed by each feature column, once each."""
        names = [self.label]
        for cols in self.feature_dict.values():
            names.extend(self._as_list(cols))
        # dict.fromkeys removes duplicates while keeping first-seen order.
        return list(dict.fromkeys(names))

    def _load(self):
        """Read the Parquet data and fill ``self._label`` / ``self._values``."""
        logging.info('Start loading file %s', self.filepath)

        # Reads a single Parquet file or a pattern
        # (e.g. 'converted/train_file_*.parquet').
        a = ak.from_parquet(self.filepath, columns=self._column_names())
        self._label = ak.to_numpy(a[self.label]).astype("float32")

        counts = None
        # Columns shared by several groups are checked and padded only once.
        padded = {}
        for k, cols in self.feature_dict.items():
            arrs = []
            for col in self._as_list(cols):
                if col not in padded:
                    col_counts = ak.to_numpy(ak.num(a[col], axis=1))
                    if counts is None:
                        counts = col_counts
                    else:
                        # All jagged columns must have the same per-row lengths.
                        assert np.array_equal(counts, col_counts), \
                            'column %r has different per-row counts' % (col,)
                    padded[col] = pad_array(a[col], self.pad_len)
                arrs.append(padded[col])
            # np.stack copies, so groups never share memory with the cache.
            self._values[k] = np.stack(arrs, axis=self.stack_axis)
        logging.info('Finished loading file %s', self.filepath)


    def __len__(self):
        """Number of entries, taken from the label array."""
        return len(self._label)

    def __getitem__(self, key):
        """Return the labels for ``key == self.label``, else the named group."""
        if key==self.label:
            return self._label
        else:
            return self._values[key]

    @property
    def X(self):
        """Dict mapping each group name to its stacked array."""
        return self._values

    @property
    def y(self):
        """Label array."""
        return self._label

    def shuffle(self, seed=None):
        """Apply one random permutation to all groups and the labels.

        Note: a non-None ``seed`` reseeds NumPy's *global* random state.
        """
        if seed is not None:
            np.random.seed(seed)
        shuffle_indices = np.arange(len(self))
        np.random.shuffle(shuffle_indices)
        for k in self._values:
            self._values[k] = self._values[k][shuffle_indices]
        self._label = self._label[shuffle_indices]

In [6]:
# Both splits are read from glob patterns over the converted Parquet files
# (these replace the former single-file '.awkd' inputs).
train_dataset = Dataset(
    filepath='converted/train_file_*.parquet',
    data_format='channel_last',
)
val_dataset = Dataset(
    filepath='converted/val_file_*.parquet',
    data_format='channel_last',
)


[2025-08-09 00:31:37,938] INFO: Start loading file converted/train_file_*.parquet
[2025-08-09 00:31:38,030] INFO: Note: NumExpr detected 64 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 16.
[2025-08-09 00:31:38,031] INFO: NumExpr defaulting to 16 threads.
[2025-08-09 00:31:56,658] INFO: Finished loading file converted/train_file_*.parquet
[2025-08-09 00:31:56,671] INFO: Start loading file converted/val_file_*.parquet
[2025-08-09 00:32:00,143] INFO: Finished loading file converted/val_file_*.parquet


In [7]:
import importlib, tf_keras_model
# Re-execute the local tf_keras_model module so that edits made to it since the
# first import are picked up by the `from tf_keras_model import ...` that follows.
importlib.reload(tf_keras_model)


2025-08-09 00:32:00.353585: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


<module 'tf_keras_model' from '/home/diego.vasquez/Documents/ParticleNet-update/tf-keras/tf_keras_model.py'>

In [None]:
import tensorflow as tf
from tensorflow import keras
from tf_keras_model import get_particle_net, get_particle_net_lite

In [9]:
model_type = 'particle_net_lite'  # choose between 'particle_net' and 'particle_net_lite'

# Number of classes is the width of the label array.
num_classes = train_dataset.y.shape[1]
# Shape of each model input with the leading axis dropped.
input_shapes = {k: train_dataset[k].shape[1:] for k in train_dataset.X}

# Any model_type containing 'lite' selects the lite architecture.
model = (get_particle_net_lite if 'lite' in model_type else get_particle_net)(num_classes, input_shapes)

W0000 00:00:1754713921.778313  406452 gpu_device.cc:2342] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...


In [None]:
# Training parameters: the lite model is trained with a larger batch.
if 'lite' in model_type:
    batch_size = 1024
else:
    batch_size = 384
epochs = 30

In [None]:
def lr_schedule(epoch):
    """Step learning-rate schedule.

    Args:
        epoch: zero-based epoch index.

    Returns:
        1e-3 for epochs 0-10, 1e-4 for epochs 11-20 and 1e-5 afterwards.
    """
    lr = 1e-3
    # The larger threshold must be tested first: with `epoch > 10` first, the
    # `epoch > 20` branch could never run and the second decay was never applied.
    if epoch > 20:
        lr *= 0.01
    elif epoch > 10:
        lr *= 0.1
    logging.info('Learning rate: %f', lr)
    return lr

In [12]:
# The optimizer starts at the schedule's epoch-0 learning rate.
model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=lr_schedule(0)),
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
model.summary()

[2025-08-09 00:32:01,940] INFO: Learning rate: 0.001000


In [None]:
# Prepare the directory that receives the model checkpoints.
import os
save_dir = 'model_checkpoints'
model_name = '%s_model.{epoch:03d}.h5' % model_type
# exist_ok=True replaces the isdir() check followed by makedirs().
os.makedirs(save_dir, exist_ok=True)
filepath = os.path.join(save_dir, model_name)

# Prepare callbacks for model saving and for learning rate adjustment.
# The model is compiled with metrics=['accuracy'], so the validation metric is
# logged as 'val_accuracy'. Monitoring the non-existent 'val_acc' key together
# with save_best_only=True meant that no checkpoint was ever written.
checkpoint = keras.callbacks.ModelCheckpoint(filepath=filepath,
                             monitor='val_accuracy',
                             verbose=1,
                             save_best_only=True)

lr_scheduler = keras.callbacks.LearningRateScheduler(lr_schedule)
progress_bar = keras.callbacks.ProgbarLogger()
callbacks = [checkpoint, lr_scheduler, progress_bar]

In [None]:
# Shuffle the training set once up front; fit() also reshuffles every epoch.
train_dataset.shuffle()
model.fit(
    train_dataset.X,
    train_dataset.y,
    validation_data=(val_dataset.X, val_dataset.y),
    batch_size=batch_size,
    # epochs=epochs,
    epochs=1,  # --- train only for 1 epoch here for demonstration ---
    callbacks=callbacks,
    shuffle=True,
)

[2025-08-09 00:32:09,619] INFO: Learning rate: 0.001000


[1m  22/2268[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m1:19:41[0m 2s/step - accuracy: 0.6811 - loss: 0.6291