# Create DATASET for training 

In [7]:
import numpy as np
import h5py
import os

#os.system("zenodo_get 10.5281/zenodo.3602260 -o data")
#os.system("tar xvzf data/hls4ml_LHCjet_150p_train.tar.gz -C data/")

# Data PATH
TRAIN_PATH = "/Users/sznajder/WorkM1/workdir/data/hls4ml_LHCjet_150p_train/"


# Load every HDF5 file found in TRAIN_PATH.  From each file keep
#   - columns [5, 8, 11] of "jetConstituentList"  -> (Pt, Etarel, Phirel)
#   - columns -6:-1 of "jets"                     -> target
# The per-file pieces are collected in lists and concatenated once after the
# loop, so the already-loaded data is not re-copied for every new file.
first = True
constituent_chunks = []
target_chunks = []
# os.listdir() returns entries in arbitrary order; sorting makes the order of
# the samples in the final arrays reproducible from run to run.
for file in sorted(os.listdir(TRAIN_PATH)):
    file_path = os.path.join(TRAIN_PATH, file)
    # Skip sub-directories and stray non-HDF5 entries (e.g. ".DS_Store" or a
    # leftover tarball) instead of letting h5py.File crash on them.
    if not (os.path.isfile(file_path) and h5py.is_hdf5(file_path)):
        print("Skipping %s (not an HDF5 file)" % file)
        continue
    print("Appending %s" % file)

    with h5py.File(file_path, "r") as data:
        if first:
            # One-off dump of the file layout, printed for the first file only.
            first = False
            print("Keys in H5PY files = ", list(data.keys()))
            print(" ")
            featurenames = data.get("jetFeatureNames")
            print("Jets Features = ", featurenames[:])
            print(" ")
            featurenames = data.get("particleFeatureNames")
            print("Jet Constituents Features = ", featurenames[:])
            print(" ")
            images = data.get("jetImage")
            print("Jet Images = ", images[:])
            print("Jet Image Shape = ", images.shape)
            print(" ")
        # Read (Pt,Etarel,Phirel)
        constituent_chunks.append(data["jetConstituentList"][:, :, [5, 8, 11]])
        target_chunks.append(data["jets"][:, -6:-1])

if not constituent_chunks:
    # Fail here with a clear message rather than with a NameError further down.
    raise FileNotFoundError("No HDF5 files found in %s" % TRAIN_PATH)

jetConstituent = np.concatenate(constituent_chunks, axis=0)
target = np.concatenate(target_chunks, axis=0)
# Drop the per-file copies so only the concatenated arrays stay in memory.
del constituent_chunks, target_chunks

# Report the size of what was loaded
print("Target shape =", target.shape)
print("Jet Constituents shape =", jetConstituent.shape)

# jetConstituent is laid out as N_jets x N_constituents x N_features;
# unpack the three axis lengths in one go.
njet, nconstit, nfeat = jetConstituent.shape


# Filter out constituents with Pt<2GeV
#
# For every jet, the constituents that pass the cut are packed (original order
# preserved) at the front of the corresponding row of `constituents`; the rest
# of the row stays zero.  Jets with no surviving constituent are dropped, and
# `target` is compacted in step so row k of both arrays describes the same jet.
Ptmin = 2.0
constituents = np.zeros((njet, nconstit, nfeat), dtype=np.float32)
ij = 0  # next free output row == number of jets kept so far
max_constit = 0  # largest number of surviving constituents found in one jet
for j in range(njet):
    jet = jetConstituent[j]
    # Whole-jet boolean mask replaces the per-constituent Python loop.  It is
    # written as "not (Pt < Ptmin)" so it keeps exactly the entries that the
    # element-wise test `if Pt < Ptmin: continue` would have kept.
    survivors = jet[~(jet[:, 0] < Ptmin)]
    ic = survivors.shape[0]
    if ic == 0:
        continue  # nothing above threshold: drop this jet entirely
    constituents[ij, :ic, :] = survivors
    max_constit = max(max_constit, ic)
    # associate the correct target to a given graph; ij <= j always holds, so
    # this in-place move never overwrites a target row that is still to be read
    target[ij, :] = target[j, :]
    ij += 1


# Resizes the jets constituents and target arrays
jetConstituent = constituents[0:ij, 0:max_constit, :]
target = target[0:ij, :]

# transform pt -> log(pt+1)
# jetConstituent[:, :, 0] = np.log(jetConstituent[:, :, 0] + 1)

# Shuffles jet constituents
#
# Each jet gets its own random reordering of its constituent slots; all
# features of a constituent move together.  Zero-padded slots take part in the
# shuffle too, so after this step padding is no longer grouped at the end.
#
# The permutation length must be the CURRENT size of axis 1.  The array was
# trimmed to `max_constit` slots after the Pt filter, which can be fewer than
# `nconstit`; permuting `nconstit` indices would then raise IndexError.
n_slots = jetConstituent.shape[1]
print("Before --->> jetConstituent[0,0:4,0] = ",jetConstituent[0,0:4,0])
for i in range(jetConstituent.shape[0]):
    jetConstituent[i] = jetConstituent[i, np.random.permutation(n_slots), :]
print("After --->> jetConstituent[0,0:4,0] = ",jetConstituent[0,0:4,0])


# Saves dataset for training / validation / test
#
# np.save does not create missing directories, so make sure the relative
# "data" output directory exists before writing; otherwise all the processing
# above would be lost to a FileNotFoundError at the very last step.
os.makedirs("data", exist_ok=True)
np.save("data/jetConstituent_150_3f.npy", jetConstituent)
np.save("data/jetConstituent_target_150_3f.npy", target)



Appending jetImage_6_150p_40000_50000.h5
Keys in H5PY files =  ['jetConstituentList', 'jetFeatureNames', 'jetImage', 'jetImageECAL', 'jetImageHCAL', 'jets', 'particleFeatureNames']
 
Jets Features =  [b'j_ptfrac' b'j_pt' b'j_eta' b'j_mass' b'j_tau1_b1' b'j_tau2_b1'
 b'j_tau3_b1' b'j_tau1_b2' b'j_tau2_b2' b'j_tau3_b2' b'j_tau32_b1'
 b'j_tau32_b2' b'j_zlogz' b'j_c1_b0' b'j_c1_b1' b'j_c1_b2' b'j_c2_b1'
 b'j_c2_b2' b'j_d2_b1' b'j_d2_b2' b'j_d2_a1_b1' b'j_d2_a1_b2' b'j_m2_b1'
 b'j_m2_b2' b'j_n2_b1' b'j_n2_b2' b'j_tau1_b1_mmdt' b'j_tau2_b1_mmdt'
 b'j_tau3_b1_mmdt' b'j_tau1_b2_mmdt' b'j_tau2_b2_mmdt' b'j_tau3_b2_mmdt'
 b'j_tau32_b1_mmdt' b'j_tau32_b2_mmdt' b'j_c1_b0_mmdt' b'j_c1_b1_mmdt'
 b'j_c1_b2_mmdt' b'j_c2_b1_mmdt' b'j_c2_b2_mmdt' b'j_d2_b1_mmdt'
 b'j_d2_b2_mmdt' b'j_d2_a1_b1_mmdt' b'j_d2_a1_b2_mmdt' b'j_m2_b1_mmdt'
 b'j_m2_b2_mmdt' b'j_n2_b1_mmdt' b'j_n2_b2_mmdt' b'j_mass_trim'
 b'j_mass_mmdt' b'j_mass_prun' b'j_mass_sdb2' b'j_mass_sdm1'
 b'j_multiplicity' b'j_g' b'j_q' b'j_w' b'j_z' b'