In [1]:
import sys
import uproot
import os
import numpy as np
import pandas as pd

In [2]:
# Particle sample to process (other sample names noted by the author: "gamma" "electron" "muon")
part = "pion_c"

# Location of the tree inside the ROOT file: <dir_>/<tree>
dir_ = "ana"
tree = "hgc"

# Input directory and the two output directories (unpadded / padded datasets)
path = "/data/user/adipilat/ParticleID/genEvts/"
unpad_path = "/data/user/adipilat/ParticleID/genEvts/new_datasets/unpadded/"
pad_path = "/data/user/adipilat/ParticleID/genEvts/new_datasets/padded/"

# Layout of the padded dataset: slots reserved per layer, and layers per trackster
number_layers = 50
max_perlayer = 10

In [3]:
# Branches read from the input tree
variableName = [
    'event',
    # per-cluster quantities
    'cluster2d_layer', 'cluster2d_energy',
    'cluster2d_eta', 'cluster2d_phi', 'cluster2d_pt',
    'cluster2d_x', 'cluster2d_y', 'cluster2d_z',
    # generator-level energy
    'gen_energy',
    # best-matching CP of each cluster (PDG code and id)
    'cluster2d_best_cpPdg', 'cluster2d_best_cpId',
    # cluster indices belonging to each trackster, per trackster type
    'tracksterEM_clusters', 'tracksterMIP_clusters', 'tracksterHAD_clusters',
]

# Column names of the output dataset, in the order the columns are stacked
newVars = [
    "event", "tracksterID", "trackster",
    "layer", "x", "y", "z",
    "phi", "eta", "E", "pt",
    "genE", "pid",
]

In [4]:
# Input file: <path>/<part>/4_<part>_new.root
name = "".join(["4_", part, "_new"])
file = "".join([path, part, "/", name, ".root"])

# Report which sample and which file are about to be processed
print("Starting data production for ", part, sep="")
print(file)

Starting data production for pion_c
/data/user/adipilat/ParticleID/genEvts/pion_c/4_pion_c_new.root


In [5]:
# Read the requested branches into a DataFrame. With flatten=False the
# per-cluster columns hold one array per row (they are indexed as xs[i][item] below).
df = uproot.open(file)[dir_][tree].pandas.df(variableName,flatten=False)

# The loops further down walk the rows by position, so the bound has to be the
# number of rows. Counting distinct event numbers instead would silently skip
# the trailing rows whenever an event number is repeated in the file.
num_events = len(df)

# Per-row arrays of cluster quantities
xs = df["cluster2d_x"].values
ys = df["cluster2d_y"].values
zs = df["cluster2d_z"].values
es = df["cluster2d_energy"].values
ps = df["cluster2d_pt"].values
ll = df["cluster2d_layer"].values
ee = df["event"].values
cphi = df["cluster2d_phi"].values
ceta = df["cluster2d_eta"].values
cp = df["cluster2d_best_cpPdg"].values
cpid = df["cluster2d_best_cpId"].values

# Number of clusters in every row
sizes = [x.shape[0] for x in xs]
# Broadcast the generated energy of a row to one entry per cluster of that row
gen = [np.full(nClusters, genEnergy) for nClusters, genEnergy in zip(sizes, df["gen_energy"].values)]

# Per-row lists of tracksters; each trackster is a collection of cluster indices
trEM = df["tracksterEM_clusters"].values
trMIP = df["tracksterMIP_clusters"].values
trHAD = df["tracksterHAD_clusters"].values

In [6]:
# Trackster collections to process. The position in this list is the label later
# stored in the "tracksterID" column: 0 = EM, 1 = HAD. MIP tracksters (which would
# be label 2) are not used now.
typeTr = [trEM,trHAD] #,trMIP]

In [7]:
# Reconstructed energy of every CP, row by row.
# cprecoEn[i] holds one summed cluster energy per distinct CP id of row i,
# in the (sorted) order returned by np.unique(cpid[i]).

cprecoEn = []
for i in range(num_events):
    cprecoEn.append([sum(es[i][cpid[i] == cpLabel]) for cpLabel in np.unique(cpid[i])])

In [8]:
# Declare new lists: one entry per cluster, except pidTr which gets one list per trackster

evId,trId,trNum,xTr,yTr,zTr,lTr,ptTr,enTr,etaTr,phiTr,pidTr,genTr = [],[],[],[],[],[],[],[],[],[],[],[],[]

# Loop over tracksters and append the LC info to the respective lists.
# Each trackster is then labelled with the PDG code of the CP for which it
# contains more than half of that CP's reconstructed energy, or -1 if there is none.

for i in range(num_events):
    # cprecoEn[i] is ordered like np.unique(cpid[i]); pair them up so that a CP's
    # total energy is looked up by CP id. Indexing cprecoEn[i] with a position in
    # the trackster's own CP list picks the wrong CP whenever the trackster does
    # not contain every CP of the event.
    cpTotalEn = dict(zip(np.unique(cpid[i]).tolist(), cprecoEn[i]))

    for num, key in enumerate(typeTr):
        for j in range(len(key[i])):
            clusters = key[i][j]
            cpidTr = []
            for item in clusters:
                evId.append(ee[i])
                trId.append(num)
                trNum.append(j)
                xTr.append(xs[i][item])
                yTr.append(ys[i][item])
                zTr.append(zs[i][item])
                lTr.append(ll[i][item])
                ptTr.append(ps[i][item])
                enTr.append(es[i][item])
                etaTr.append(ceta[i][item])
                phiTr.append(cphi[i][item])
                genTr.append(gen[i][item])

                cpidTr.append(cpid[i][item])

            if len(cpidTr) == 0:
                # Empty trackster: no cluster rows were appended, so it contributes
                # no labels either (np.argmax on an empty list would raise).
                pidTr.append([])
                continue

            cpidTr = np.asarray(cpidTr)
            cpIdx = np.unique(cpidTr)

            # Fraction of each CP's total reconstructed energy found in this trackster
            fracEn = []
            for k in range(len(cpIdx)):
                indices = np.where(cpidTr==cpIdx[k])
                track_idx = [clusters[l] for l in indices[0]]
                frac = float(sum(es[i][track_idx])/cpTotalEn[cpIdx[k].item()])
                fracEn.append(frac)

            maxfracIdx = np.argmax(fracEn)
            if(fracEn[maxfracIdx] > 0.5):
                # Take the PDG code from the first cluster matched to the dominant CP
                indices = np.where(cpidTr==cpIdx[maxfracIdx])
                track_idx = [clusters[l] for l in indices[0]]
                pidTr.append([cp[i][track_idx[0]]]*len(clusters))
            else:
                pidTr.append([-1]*len(clusters))

In [9]:
# Turn the per-cluster lists into numpy arrays for the dataset
EVID, TRID, TRNUM = np.array(evId), np.array(trId), np.array(trNum)
XTR, YTR, ZTR = np.array(xTr), np.array(yTr), np.array(zTr)
LTR = np.array(lTr)
PTTR, ENTR = np.array(ptTr), np.array(enTr)
ETATR, PHITR = np.array(etaTr), np.array(phiTr)
GENTR = np.array(genTr)

# pidTr holds one list of labels per trackster: concatenate them in order
flatPid = []
for perTrackster in pidTr:
    flatPid.extend(perTrackster)
PIDTR = np.array(flatPid)

In [10]:
# Build the dataset: one row per cluster, columns in the order given by newVars
stackedColumns = [EVID,TRID,TRNUM,LTR,XTR,YTR,ZTR,PHITR,ETATR,ENTR,PTTR,GENTR,PIDTR]
datas = np.vstack(stackedColumns).T

# Order rows by event, trackster type, trackster and layer; within a layer the
# most energetic cluster comes first
sortKeys = ["event","tracksterID","trackster","layer","E"]
sortAscending = [True,True,True,True,False]
df = (
    pd.DataFrame(datas,columns=newVars)
    .sort_values(sortKeys,ascending=sortAscending)
    .reset_index(drop=True)
)

In [11]:
# Write the unpadded per-cluster dataset to HDF5 without compression.
# The key is passed by keyword: recent pandas releases deprecate positional
# arguments after the path, and the keyword form works on older releases as well.
df.to_hdf(unpad_path + part + "_aT.h5", key="data", complevel=0)

In [12]:
# Next, number the tracksters consecutively across the whole dataset: training works
# per trackster, so the event and trackster-type information is no longer needed.

In [13]:
# Number of rows in every (event, trackster type, trackster) group, in sorted key order
trackster_sizes = df.groupby(['event', 'tracksterID', 'trackster']).size().values.tolist()
num_tracksters = len(trackster_sizes)

# Consecutive ids 1..num_tracksters, each repeated once per row of its group.
# This relies on the rows already being ordered by the same three keys.
track_id = np.arange(1,num_tracksters+1)
new_tracks = np.repeat(track_id, np.asarray(trackster_sizes, dtype=int))
df['trackster'] = new_tracks

In [14]:
# The global trackster number replaces these two columns
for droppedColumn in ('event', 'tracksterID'):
    del df[droppedColumn]

In [15]:
# Preview the first rows of the unpadded dataset
df.head(50)

Unnamed: 0,trackster,layer,x,y,z,phi,eta,E,pt,genE,pid
0,1,3.0,74.818108,-26.049902,325.072754,-0.335049,2.119453,0.046474,0.011004,367.569397,211.0
1,1,4.0,75.16272,-25.949078,326.017273,-0.332427,2.11869,0.407647,0.096594,367.569397,211.0
2,1,5.0,75.733452,-26.873775,328.042755,-0.340986,2.114448,0.274669,0.065353,367.569397,211.0
3,1,6.0,75.668083,-26.670002,328.987244,-0.338865,2.118809,0.967133,0.229141,367.569397,211.0
4,1,6.0,75.861526,-28.90715,328.987244,-0.364066,2.107354,0.175063,0.041941,367.569397,211.0
5,1,6.0,75.225067,-24.576128,328.987244,-0.31577,2.132137,0.046474,0.010869,367.569397,211.0
6,1,7.0,76.822479,-26.598392,331.012726,-0.333314,2.111947,1.766114,0.421239,367.569397,211.0
7,1,8.0,76.769814,-26.714674,331.957245,-0.334878,2.114853,3.271002,0.777974,367.569397,211.0
8,1,8.0,74.569885,-29.01684,331.957275,-0.371094,2.130139,0.112203,0.026293,367.569397,211.0
9,1,8.0,79.980705,-28.05938,331.957245,-0.337411,2.074245,0.065096,0.016104,367.569397,211.0


In [16]:
# Keep at most max_perlayer clusters per (trackster, layer). The padded layout
# built afterwards reserves exactly max_perlayer slots per layer, so the cut must
# use the same constant rather than a separate hard-coded number.
# Row positions inside a group are ascending; the frame was sorted by decreasing E
# within a layer, so the clusters kept are the most energetic ones.
theIndex = list(df.groupby(["trackster","layer"]).indices.values())
# dtype=int keeps the array usable with .iloc even when no rows are selected
theIndex = np.array([item for sublist in theIndex for item in sublist[:max_perlayer]], dtype=int)
df = df.iloc[theIndex]

In [17]:
# Give every cluster the row position it will occupy in the padded dataset:
#   slot within its layer
#   + (layer - 1) * max_perlayer
#   + (trackster - 1) * max_perlayer * number_layers
# Assumes the rows of each (trackster, layer) group are contiguous and in sorted
# key order, so the running count within a group is the slot number.
slotInLayer = df.groupby(["trackster","layer"]).cumcount().values
layerOffset = (df["layer"].values - 1)*max_perlayer
tracksterOffset = max_perlayer*number_layers*(df["trackster"].values - 1)
hitIds = slotInLayer + layerOffset + tracksterOffset

# Store the position as a column and use its integer value as the index
df.loc[:,"hitIds"] = hitIds
df = df.set_index(hitIds.astype(int))

In [18]:
# Create the big zero-filled frame and copy the old dataset into it to obtain the padded one
num_tracksters = df.trackster.max()
slotsPerTrackster = number_layers*max_perlayer
totalSlots = num_tracksters*slotsPerTrackster

bigMask = np.zeros((totalSlots,len(df.columns)))
bigDF = pd.DataFrame(bigMask,columns=df.columns)

# Row position of every slot: 0 .. totalSlots-1
fakeHit = np.arange(totalSlots)

# Layer of every slot: 1..number_layers, each max_perlayer times, repeated for every trackster
fakeLayer = np.tile(np.repeat(np.arange(1,number_layers+1), max_perlayer), num_tracksters)

# Trackster of every slot: 1..num_tracksters, each filling slotsPerTrackster consecutive rows
fakeTrackster = np.repeat(np.arange(1,num_tracksters+1), slotsPerTrackster)

bigDF["layer"] = fakeLayer
bigDF["trackster"] = fakeTrackster
bigDF["hitIds"] = fakeHit

# df is indexed by hitIds, i.e. by the target row position in bigDF
bigDF.iloc[df.index] = df
del bigDF['hitIds']

In [19]:
# Write the padded dataset to HDF5 without compression.
# The key is passed by keyword: recent pandas releases deprecate positional
# arguments after the path, and the keyword form works on older releases as well.
bigDF.to_hdf(pad_path + part + "_aT_Padded.h5", key="data", complevel=0)

In [26]:
# Preview the first rows of the padded dataset
bigDF.head(50)

Unnamed: 0,trackster,layer,x,y,z,phi,eta,E,pt,genE,pid
0,1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
