# Create Dataset for Jet Tagging @ L1 studies

In [1]:
import tensorflow as tf
print(f"TensorFlow {tf.__version__}")
print(tf.config.list_physical_devices())
gpus = tf.config.list_physical_devices("GPU")
if gpus:
    print(f"Number of available GPUs : {len(gpus)}")
    tf.config.set_visible_devices(gpus[0],"GPU")
    tf.config.experimental.set_memory_growth(gpus[0],True)
else:
    print("No GPU available, using CPU !!!")    

# To disable GPU use
tf.config.set_visible_devices([], 'GPU')


TensorFlow 2.8.0
[PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU'), PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]
Number of available GPUs : 1


## Define Standization Methods for HLS Jet Data 
### From Patrick :   
### https://github.com/bb511/know_dist/blob/main/preprocessing/standardisation.py

In [2]:
# Standardisation methods for the jet data.

import numpy as np
from sklearn.preprocessing import RobustScaler


def apply_standardisation(choice: str, x_data: np.ndarray, feat_range: tuple = (0, 1)):
    """Choose the type of normalisation to apply to the data.

    Args:
        choice: The choice of the user with repsect to the type of norm to apply.
        x_data: Array containing the data to normalise.
        feat_range: The range that each feature will be normalised to, e.g., in minmax.

    Returns:
        Normalised x_data numpy array.
    """
    if choice == "nonorm":
        print("Skipping normalisation...")
        return x_data

    print(f"Applying {choice} normalisation...")
    switcher = {
        "minmax": lambda: minmax(x_data, feat_range),
        "robust": lambda: robust(x_data),
        "standard": lambda: standard(x_data),
    }

    x_data = switcher.get(choice, lambda: None)()

    if x_data is None:
        raise NameError(
            "Type of normalisation does not exist! Please choose from "
            f"the following list: {list(switcher.keys())}"
        )

    return x_data


def minmax(x: np.ndarray, feature_range: tuple = (0, 1)) -> np.ndarray:
    """Applies minmax normalisation to the data, i.e., every feature of every sample
    is divided by the maximum for that respective feature.

    Args:
        x: Data array.
        feature_range: Minimum and maximum of features after the normalisation.
    """
    min_feats = x.min(axis=0).min(axis=0)
    max_feats = x.max(axis=0).max(axis=0)
    x_norm = (x - min_feats) / (max_feats - min_feats)
    x_norm = x_norm * (feature_range[1] - feature_range[0]) + feature_range[0]

    return x_norm


def robust(x: np.ndarray, percentiles: list = [95, 5]) -> np.ndarray:
    """Applies robust normalisation to the data, i.e., the median of every feature is
    subtracted from every respective sample belonging to that feature and then each
    feature is scaled with respect to the respective inter-quantile range between
    the 1st and 3rd quantiles.

    Args:
        x: Data array.
        percentiles: Between which percentiles to normalise. The default is from the
            google interaction network paper. The sklearn standard is [75, 25].
    """
    x_median = []
    interquantile_range = []

    for feature_idx in range(x.shape[-1]):
        x_feature = x[:, :, feature_idx].flatten()
        x_median.append(np.nanmedian(x_feature, axis=0))
        quantile_high, quantile_low = np.nanpercentile(x_feature, percentiles)
        interquantile_range.append(quantile_high - quantile_low)

    x_norm = (x - x_median) / interquantile_range

    return x_norm


def standard(x: np.ndarray) -> np.ndarray:
    """Applies standard normalisation to the data, i.e., the mean of each feature is
    subtracted from every sample belonging to the respective feature and then divided
    by the corresponding standard deviation.
    """
    x_mean = []
    x_std = []

    for feature_idx in range(x.shape[-1]):
        x_feature = x[:, :, feature_idx].flatten()
        x_mean.append(x_feature.mean(axis=0))
        x_std.append(x_feature.std(axis=0))

    x_norm = (x - x_mean) / x_std

    return x_norm

# Load HLS4ML dataset 

Here, we load the numpy arrays containing the 4D tensors of "jet-images" (see https://arxiv.org/pdf/1511.05190.pdf)

https://github.com/pierinim/tutorials/blob/master/GGI_Jan2021/Lecture1/Notebook1_ExploreDataset.ipynb

 * `jetImage` contains the image representation of the jets (more later)
 * `jetImageECAL` and `jetImageHCAL` are the ECAL- and HCAL-only equivalent images. We will not use them (but you are more than welcome to play with it)
 * `jetConstituentList` is the list of particles cointained in the jet. For each particle, a list of relevant quantities is stored
 * `particleFeatureNames` is the list of the names corresponding to the quantities contained in `jetConstituentList`; `jets` is the dataset we consider for the moment
 * `jetFeatureNames` is the list of the names corresponding to the quantities contained in `jets`

In [3]:
import numpy as np
import h5py
import os

# for pT, eta_rel, phi_rel
#   myJetConstituentList = np.array(f.get("jetConstituentList")[:,:,[5,8,11]])
# for px, py, pz
#   myJetConstituentList = np.array(f.get("jetConstituentList")[:,:,[0,1,2]])
#   myJetConstituentList = np.array(f.get("jetConstituentList"))
#
# Jet Constituents Features =  [0='j1_px', 1='j1_py', 2='j1_pz', 3='j1_e', 4='j1_erel', 5='j1_pt', 6='j1_ptrel',
#                         7='j1_eta', 8='j1_etarel', 9='j1_etarot', 10='j1_phi', 11='j1_phirel', 12='j1_phirot',
#                         13='j1_deltaR', 14='j1_costheta', 15='j1_costhetarel', 16='j1_pdgid']

#Data PATH
TRAIN_PATH = '/Users/sznajder/WorkM1/workdir/data/hls4ml_LHCjet_150p/'



first=True
for file in os.listdir(TRAIN_PATH):
  print("Appending %s" %file)

  with h5py.File(TRAIN_PATH+file, 'r') as data:
    if first : 
        first=False
        jetConstituent = data['jetConstituentList'][:,:,[5,8,11]]
        target = data['jets'][:,-6:-1]
        jet_pt = data['jets'][:,1]
        print("Keys in H5PY files = ",list( data.keys() ))
        print(" ")
        featurenames = data.get('jetFeatureNames')
        print("Jets Features = ",featurenames[:])
        print(" ")
        featurenames = data.get('particleFeatureNames')
        print("Jet Constituents Features = ",featurenames[:])
        print(" ")
        images = data.get('jetImage')
        print("Jet Images = ",images[:])        
        print("Jet Image Shape = ",images.shape)   
        print(" ")
        print("Jet Pt=", jet_pt)
        print(" ")
    else:
         # Read (Pt,Etarel,Phirel)
        jetConstituent = np.concatenate( [ jetConstituent, data['jetConstituentList'][:,:,[5,8,11]] ] , axis=0 )
        target   = np.concatenate( [ target, data['jets'][:,-6:-1] ] , axis=0 )
        jet_pt   = np.concatenate( [ jet_pt, data['jets'][:,1]     ] , axis=0 )

print("Target shape =", target.shape)
print("Jet Constituents shape =", jetConstituent.shape)
print("Jet Pt shape =", jet_pt.shape)


Appending jetImage_6_150p_40000_50000.h5
Keys in H5PY files =  ['jetConstituentList', 'jetFeatureNames', 'jetImage', 'jetImageECAL', 'jetImageHCAL', 'jets', 'particleFeatureNames']
 
Jets Features =  [b'j_ptfrac' b'j_pt' b'j_eta' b'j_mass' b'j_tau1_b1' b'j_tau2_b1'
 b'j_tau3_b1' b'j_tau1_b2' b'j_tau2_b2' b'j_tau3_b2' b'j_tau32_b1'
 b'j_tau32_b2' b'j_zlogz' b'j_c1_b0' b'j_c1_b1' b'j_c1_b2' b'j_c2_b1'
 b'j_c2_b2' b'j_d2_b1' b'j_d2_b2' b'j_d2_a1_b1' b'j_d2_a1_b2' b'j_m2_b1'
 b'j_m2_b2' b'j_n2_b1' b'j_n2_b2' b'j_tau1_b1_mmdt' b'j_tau2_b1_mmdt'
 b'j_tau3_b1_mmdt' b'j_tau1_b2_mmdt' b'j_tau2_b2_mmdt' b'j_tau3_b2_mmdt'
 b'j_tau32_b1_mmdt' b'j_tau32_b2_mmdt' b'j_c1_b0_mmdt' b'j_c1_b1_mmdt'
 b'j_c1_b2_mmdt' b'j_c2_b1_mmdt' b'j_c2_b2_mmdt' b'j_d2_b1_mmdt'
 b'j_d2_b2_mmdt' b'j_d2_a1_b1_mmdt' b'j_d2_a1_b2_mmdt' b'j_m2_b1_mmdt'
 b'j_m2_b2_mmdt' b'j_n2_b1_mmdt' b'j_n2_b2_mmdt' b'j_mass_trim'
 b'j_mass_mmdt' b'j_mass_prun' b'j_mass_sdb2' b'j_mass_sdm1'
 b'j_multiplicity' b'j_g' b'j_q' b'j_w' b'j_z' b'

# The Image dataset labels ( ONE HOT ENCODING )

Jets can be converted to images considering the (&eta;, &phi;) plane, centered along the axis direction and binned.
In our case, we consider a square of 1.6x1.6 in size (because the jet size is R=0.8) binned in 100x100 equal-size 'cells'


The ground truth is incorporated in the `['g', 'q', 'w', 'z', 't']` vector of boolean, taking the form
- `[1, 0, 0, 0, 0]` for gluons
- `[0, 1, 0, 0, 0]` for quarks
- `[0, 0, 1, 0, 0]` for Ws
- `[0, 0, 0, 1, 0]` for Zs
- `[0, 0, 0, 0, 1]` for tops

This is what is called 'one-hot' encoding of a descrete label (typical of ground truth for classification problems)

## Filter out Pt<2GeV constituents and Shuffles constituents

In [4]:
from einops import rearrange, reduce, repeat

# Convert target format from one-hot encoding to single neuron
#target = np.argmax(target, axis=1)

# The dataset is N_jets x N_constituents x N_features
njet     = jetConstituent.shape[0]
nconstit = jetConstituent.shape[1]
nfeat    = jetConstituent.shape[2]


# Filter out constituents with Pt<2GeV
Ptmin =2. 
constituents = np.zeros((njet, nconstit, nfeat) , dtype=np.float32) 
ij=0
max_constit=0
for j in range(njet):
    ic=0
    for c in range(nconstit):
        if ( jetConstituent[j,c,0] < Ptmin ):
            continue
        constituents[ij,ic,:] = jetConstituent[j,c,:] 
        ic+=1
    if (ic > 0):
        if ic > max_constit: max_constit=ic
        target[ij,:]=target[j,:] # assosicate the correct target a given graph 
        jet_pt[ij]=jet_pt[j]
        ij+=1


# Resizes the jets constituents and target arrays        
jetConstituent = constituents[0:ij,0:max_constit,:]
target = target[0:ij,:]
jet_pt = jet_pt[0:ij]

# Restric the number of constituents to a maximum of NMAX
nmax = 8
jetConstituent = jetConstituent[:,0:nmax,:]

# The dataset is N_jets x N_constituents x N_features
njet     = jetConstituent.shape[0]
nconstit = jetConstituent.shape[1]
nfeat    = jetConstituent.shape[2]


print('Number of jets =',njet)
print('Number of constituents =',nconstit)
print('Number of features =',nfeat)


# Shuffles jet constituents
print("Before Shuffling --->> jetConstituent[0,0:4,0] = ",jetConstituent[0,0:4,0])
for i in range(jetConstituent.shape[0]):
  jetConstituent[i] = jetConstituent[i, np.random.permutation(nconstit), :]
print("After Shuffling  --->> jetConstituent[0,0:4,0] = ",jetConstituent[0,0:4,0])


# Normalize data features using Patrick's code
norm = "none"
#norm = "robust"
#norm = "standard"
#norm = "minmax"
#norm = "rescale"

print("Using Feature Normalization  --->> {} ".format(norm))

if ( norm=="rescale" ):
# Rescale constituents features by 
  for j in range(njet):
#    jetConstituent[j,:,0]=jetConstituent[j,:,0]/jet_pt[j]
    jetConstituent[j,:,0]=jetConstituent[j,:,0]/100.
    jetConstituent[j,:,1]=jetConstituent[j,:,1]/10.
    jetConstituent[j,:,2]=jetConstituent[j,:,2]/6.5  
  print("After Normalization {} --->> jetConstituent[0,0:4,0] = ".format(norm),jetConstituent[0,0:4,0])
elif ( norm!="none" ):
  jetConstituent = apply_standardisation( norm, jetConstituent )
  print("After Normalization {} --->> jetConstituent[0,0:4,0] = ".format(norm),jetConstituent[0,0:4,0])

    


Number of jets = 880000
Number of constituents = 8
Number of features = 3
Before Shuffling --->> jetConstituent[0,0:4,0] =  [118.65741  113.409935 113.007545 104.92594 ]
After Shuffling  --->> jetConstituent[0,0:4,0] =  [ 45.74336  118.65741  104.92594  113.409935]
Using Feature Normalization  --->> none 


## Dividing the data into testing and training dataset

We will split the data into two parts (one for training+validation and one for testing) 

In [5]:
from sklearn.model_selection import train_test_split

X = jetConstituent
Y = target
del jetConstituent , target

X_train_val, X_test, Y_train_val, Y_test = train_test_split(X, Y, test_size=0.33, random_state=7)

print(X_train_val.shape, X_test.shape, Y_train_val.shape, Y_test.shape)

print('number of G jets for training/validation: %i'%np.sum( np.argmax(Y_train_val, axis=1)==0 ))
print('number of Q jets for training/validation: %i'%np.sum( np.argmax(Y_train_val, axis=1)==1 ))
print('number of W jets for training/validation: %i'%np.sum( np.argmax(Y_train_val, axis=1)==2 ))
print('number of Z jets for training/validation: %i'%np.sum( np.argmax(Y_train_val, axis=1)==3 ))
print('number of T jets for training/validation: %i'%np.sum( np.argmax(Y_train_val, axis=1)==4 ))


print('number of G jets for testing: %i'%np.sum( np.argmax(Y_test, axis=1)==0 ))
print('number of Q jets for testing: %i'%np.sum( np.argmax(Y_test, axis=1)==1 ))
print('number of W jets for testing: %i'%np.sum( np.argmax(Y_test, axis=1)==2 ))
print('number of Z jets for testing: %i'%np.sum( np.argmax(Y_test, axis=1)==3 ))
print('number of T jets for testing: %i'%np.sum( np.argmax(Y_test, axis=1)==4 ))

(589600, 8, 3) (290400, 8, 3) (589600, 5) (290400, 5)
number of G jets for training/validation: 118789
number of Q jets for training/validation: 114175
number of W jets for training/validation: 118845
number of Z jets for training/validation: 118615
number of T jets for training/validation: 119176
number of G jets for testing: 58463
number of Q jets for testing: 56504
number of W jets for testing: 58327
number of Z jets for testing: 58337
number of T jets for testing: 58769


## Saves the DATASET

In [6]:
np.save("../../data/X_train_val_nconst_{}_norm_{}".format(nconstit,norm), X_train_val)
np.save("../../data/X_test_nconst_{}_norm_{}".format(nconstit,norm)     , X_test)
np.save("../../data/Y_train_val_nconst_{}_norm_{}".format(nconstit,norm), Y_train_val)
np.save("../../data/Y_test_nconst_{}_norm_{}".format(nconstit,norm)     , Y_test)