In [1]:
import numpy as np
import awkward as ak
import uproot_methods

In [2]:
import logging
logging.basicConfig(level=logging.INFO, format='[%(asctime)s] %(levelname)s: %(message)s')

In [3]:
def stack_arrays(a, keys, axis=-1):
    flat_arr = np.stack([a[k].flatten() for k in keys], axis=axis)
    return awkward.JaggedArray.fromcounts(a[keys[0]].counts, flat_arr)

In [4]:
def pad_array(a, maxlen, value=0., dtype='float32'):
    x = (np.ones((len(a), maxlen)) * value).astype(dtype)
    for idx, s in enumerate(a):
        if not len(s):
            continue
        trunc = s[:maxlen].astype(dtype)
        x[idx, :len(trunc)] = trunc
    return x

In [5]:
##and Professor suggests that we could use mass, classifacation for later application
def SetAKArr(filepath):
    with open(filepath, 'r') as file:
        lines = file.readlines()
    px_lists = []
    px_ls = []
    py_lists = []
    py_ls = []
    pz_lists = []
    pz_ls = []
    e_lists = []
    e_ls = []
    mass_lists = []
    mass_ls = []
    _label1 = []
    _label2 = []
    for line in lines:
        if line.startswith('E'):
            exp_inf = line.split()
            _label1.append(int(exp_inf[1]))
            _label2.append(1-int(exp_inf[1]))
            if px_ls:
                px_lists.append(px_ls)
            ##clear the data in px list
#             print(px_ls)
            if py_ls:
                py_lists.append(py_ls)
            if pz_ls:
                pz_lists.append(pz_ls)
            if e_ls:
                e_lists.append(e_ls)
            if mass_ls:
                mass_lists.append(mass_ls)
            px_ls = []
            py_ls = []
            pz_ls = []
            e_ls = []
            mass_ls = []
        else:
            par = line.split()
            px_ls.append(float(par[1]))
            py_ls.append(float(par[2]))
            pz_ls.append(float(par[3]))
            e_ls.append(float(par[4]))
            mass_ls.append(float(par[5]))
            
#     print(_label)
    px = ak.JaggedArray.fromiter(np.array(px_lists))
    py = ak.JaggedArray.fromiter(py_lists)
    pz = ak.JaggedArray.fromiter(pz_lists)
    energy = ak.JaggedArray.fromiter(e_lists)
    mass = ak.JaggedArray.fromiter(mass_lists)
    p4 = uproot_methods.TLorentzVectorArray.from_cartesian(px, py, pz, energy)
    ##Create an Order Dic
    from collections import OrderedDict
    v = OrderedDict()
    v['part_px'] = px
    v['part_py'] = py
    v['part_pz'] = pz
    v['part_energy'] = energy
    v['part_mass'] = mass
#     ls1 = [1,2,3,4]
#     ls2 = [5,6,7,8]
    v['label'] = np.stack((_label1, _label2), axis=-1)
#     print(v['label'])
    return v

In [6]:
class Dataset2(object):
    def __init__(self, filepath, feature_dict = {}, label = 'E', pad_len=100, data_format='channel_first'):
        self.filepath = filepath
        self.feature_dict = feature_dict
        if len(feature_dict) == 0:
            feature_dict['points'] = ['part_px','part_py','part_pz']
            feature_dict['features'] = ['part_energy', 'part_mass']
            feature_dict['mask'] = ['part_energy']
        ##currently we use 'E' for experiments
        self.label = label
        self.pad_len = pad_len
        assert data_format in ('channel_first', 'channel_last')
        self.stack_axis = 1 if data_format=='channel_first' else -1
        self._values = {}
        self._label = None
        self._load()
        
    def _load(self):
        logging.info('Start loading file %s' % self.filepath)
        counts = None
        a = SetAKArr(self.filepath)
        for k in self.feature_dict:
                cols = self.feature_dict[k]
                if not isinstance(cols, (list, tuple)):
                    cols = [cols]
                arrs = []
                for col in cols:
                    arrs.append(pad_array(a[col], self.pad_len))
                    ##check the dimesion of a[col], and it should be array.
                self._values[k] = np.stack(arrs, axis=self.stack_axis)
        logging.info('Finished loading file %s' % self.filepath)
        
        
        
    def __len__(self):
        return len(self._label)

    def __getitem__(self, key):
        if key==self.label:
            return self._label
        else:
            return self._values[key]
    
    @property
    def X(self):
        return self._values
    
    @property
    def y(self):
        return self._label

    def shuffle(self, seed=None):
        if seed is not None:
            np.random.seed(seed)
        shuffle_indices = np.arange(self.__len__())
        np.random.shuffle(shuffle_indices)
        for k in self._values:
            self._values[k] = self._values[k][shuffle_indices]
        self._label = self._label[shuffle_indices]

In [7]:
train_dataset = Dataset2('train.txt', data_format='channel_last')
val_dataset = Dataset2('val.txt', data_format='channel_last')

[2024-02-03 19:54:45,438] INFO: Start loading file train.txt
[2024-02-03 19:54:45,525] INFO: Finished loading file train.txt
[2024-02-03 19:54:45,526] INFO: Start loading file val.txt
[2024-02-03 19:54:45,551] INFO: Finished loading file val.txt


In [8]:
import tensorflow as tf
from tensorflow import keras
from tf_keras_model import get_particle_net, get_particle_net_lite

In [9]:
model_type = 'particle_net_lite' # choose between 'particle_net' and 'particle_net_lite'
num_classes = train_dataset.y.shape[1]
input_shapes = {k:train_dataset[k].shape[1:] for k in train_dataset.X}
if 'lite' in model_type:
    model = get_particle_net_lite(num_classes, input_shapes)
else:
    model = get_particle_net(num_classes, input_shapes)

AttributeError: 'NoneType' object has no attribute 'shape'

In [10]:
# Training parameters
batch_size = 1024 if 'lite' in model_type else 384
epochs = 30

In [11]:
def lr_schedule(epoch):
    lr = 1e-3
    if epoch > 10:
        lr *= 0.1
    elif epoch > 20:
        lr *= 0.01
    logging.info('Learning rate: %f'%lr)
    return lr

In [12]:
model.compile(loss='categorical_crossentropy',
              optimizer=keras.optimizers.Adam(learning_rate=lr_schedule(0)),
              metrics=['accuracy'])
model.summary()

NameError: name 'model' is not defined

In [12]:
# Prepare model model saving directory.
import os
save_dir = 'model_checkpoints'
model_name = '%s_model.{epoch:03d}.h5' % model_type
if not os.path.isdir(save_dir):
    os.makedirs(save_dir)
filepath = os.path.join(save_dir, model_name)

# Prepare callbacks for model saving and for learning rate adjustment.
checkpoint = keras.callbacks.ModelCheckpoint(filepath=filepath,
                             monitor='val_acc',
                             verbose=1,
                             save_best_only=True)

lr_scheduler = keras.callbacks.LearningRateScheduler(lr_schedule)
progress_bar = keras.callbacks.ProgbarLogger()
callbacks = [checkpoint, lr_scheduler, progress_bar]

In [13]:
train_dataset.shuffle()
model.fit(train_dataset.X, train_dataset.y,
          batch_size=batch_size,
#           epochs=epochs,
          epochs=1, # --- train only for 1 epoch here for demonstration ---
          validation_data=(val_dataset.X, val_dataset.y),
          shuffle=True,
          callbacks=callbacks)

2023-11-27 13:36:58.829210: W tensorflow/core/framework/cpu_allocator_impl.cc:80] Allocation of 1937600000 exceeds 10% of free system memory.
2023-11-27 13:36:59.744611: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:176] None of the MLIR Optimization Passes are enabled (registered 2)
2023-11-27 13:36:59.744872: I tensorflow/core/platform/profile_utils/cpu_utils.cc:114] CPU Frequency: 2688000000 Hz
[2023-11-27 13:36:59,767] INFO: Learning rate: 0.001000


      0/Unknown - 1617s 0s/sample - loss: 0.2187 - accuracy: 0.9116





<tensorflow.python.keras.callbacks.History at 0x7fb3f6a6a190>