In [53]:
import sys
import uproot
import os
import numpy as np
import pandas as pd
from numpy.linalg import eig
from sklearn.decomposition import PCA

In [54]:
part = "gamma" #"electron" "muon" "pion_c"
path = "/data/user/adipilat/ParticleID/genEvts/"
unpad_path = "/data/user/adipilat/ParticleID/genEvts/new_datasets/unpadded/"
pad_path = "/data/user/adipilat/ParticleID/genEvts/new_datasets/padded/"
dir_ = "ana"
tree = "hgc"
max_perlayer = 10
number_layers = 50

In [55]:
variableName = [
            'event',
            'cluster2d_layer',
            'cluster2d_energy',
            'cluster2d_eta',
            'cluster2d_phi',
            'cluster2d_pt',
            'cluster2d_x',
            'cluster2d_y',
            'cluster2d_z',
            'cluster2d_nhitCore',
            'cluster2d_nhitAll',
            'gen_energy',
            'gen_pdgid',
            'gen_daughters',
            'gen_phi',
            'gen_eta',
            'cluster2d_best_cpPdg',
            'trackster_clusters',
            ]
newVars =["event","trackster","x","y","z","r","layer","E","nCore","nHits","id","genDR","gen_phi","gen_eta","phi","eta","genPID","genE","cpID"]

In [56]:
name = "4_" + part
file = path + part + "/" + name + ".root"
print("Starting data production for "+ part)

Starting data production for gamma


In [57]:
df = uproot.open(file)[dir_][tree].pandas.df(variableName,flatten=False)


num_events = np.unique(df["event"].values).shape[0]
xs = df["cluster2d_x"].values
ys = df["cluster2d_y"].values
zs = df["cluster2d_z"].values
es = df["cluster2d_energy"].values
ps = df["cluster2d_pt"].values
nh = df["cluster2d_nhitAll"].values
nc = df["cluster2d_nhitCore"].values
ll = df["cluster2d_layer"].values
ee = df["event"].values
    
sizes = [x.shape[0] for x in xs]
indices = [np.full((a[0]),a[1]) for a in zip(sizes,range(len(sizes)))]

cphi = df["cluster2d_phi"].values
ceta = df["cluster2d_eta"].values
gpid = df["gen_pdgid"].values
gen = df["gen_energy"].values

gphi = [np.full((a[0]),a[1]) for a in zip(sizes,df["gen_phi"].values)]
geta = [np.full((a[0]),a[1]) for a in zip(sizes,df["gen_eta"].values)]
gpid = [np.full((a[0]),a[1]) for a in zip(sizes,df["gen_pdgid"].values)]
gen = [np.full((a[0]),a[1]) for a in zip(sizes,df["gen_energy"].values)]

cp = df["cluster2d_best_cpPdg"].values
tr = df["trackster_clusters"].values

In [58]:
idtrlist = []
# LayerClusters that don't belong to any Trackster will have TracksterId = 0. Real Tracksters have the TracksterId > 0
for i in range(len(sizes)):
    idtrlist.append(np.array([0]*sizes[i]))
for i in range(len(tr)):
    for j in range(len(tr[i])):
        for item in tr[i][j]:
            idtrlist[i][item] = j + 1
idtr = np.array(idtrlist)
# print(idtr.shape)

In [59]:
rs = [np.sqrt(f[0]**2+f[1]**2) for f in zip(xs,ys)]
drs = [np.sqrt((a[0]-a[1])**2 + (a[2]-a[3])**2) for a in zip(gphi,cphi,geta,ceta)]

In [60]:
XS = np.array([item for sublist in xs for item in sublist])
YS = np.array([item for sublist in ys for item in sublist])
ZS = np.array([item for sublist in zs for item in sublist])
RS = np.array([item for sublist in rs for item in sublist])
LL = np.array([item for sublist in ll for item in sublist])
ES = np.array([item for sublist in es for item in sublist])
NC = np.array([item for sublist in nc for item in sublist])
NH = np.array([item for sublist in nh for item in sublist])
II = np.array([item for sublist in indices for item in sublist])
DRS = np.array([item for sublist in drs for item in sublist])
GPHI = np.array([item for sublist in gphi for item in sublist])
GETA = np.array([item for sublist in geta for item in sublist])
GPID = np.array([item for sublist in gpid for item in sublist])
GEN = np.array([item for sublist in gen for item in sublist])
CPHI = np.array([item for sublist in cphi for item in sublist])
CETA = np.array([item for sublist in ceta for item in sublist])


SS = [np.full((s,),s) for s in sizes]
EE = [np.full((s,),i) for i,s in zip(ee,sizes)]

SS = np.array([item for sublist in SS for item in sublist])
EE = np.array([item for sublist in EE for item in sublist])

CP = np.array([item for sublist in cp for item in sublist])
TR = np.array([item for sublist in idtr for item in sublist])

datas = np.vstack((EE,TR,XS,YS,ZS,RS,LL,ES,NC,NH,II,DRS,GPHI,GETA,CPHI,CETA,GPID,GEN,CP)).T

In [87]:
df = pd.DataFrame(datas,columns=newVars)
df = df.sort_values(["event","trackster","layer","E"],ascending=[True,True,True,False]).reset_index(drop=True)

In [46]:
# drop Trackster 0 which is fake
indexNames = df[df['trackster'] == 0].index
df.drop(indexNames, inplace=True)
df = df.reset_index(drop=True)

In [11]:
df.to_hdf(unpad_path + part + "_new.h5","data",complevel=0)

In [47]:
#get the tracksters' energy sums
energysums = df.groupby(['event', 'trackster'], as_index=False)['E'].sum()
energysums 

Unnamed: 0,event,trackster,E
0,1.0,1.0,61.541413
1,2.0,1.0,334.034832
2,2.0,2.0,1.378449
3,2.0,3.0,18.225733
4,3.0,1.0,187.648956
5,3.0,2.0,12.587460
6,3.0,3.0,20.442947
7,4.0,1.0,85.755085
8,5.0,1.0,121.511375
9,5.0,2.0,15.411405


In [48]:
#get the most energetic trackster id for each event 
maxen = energysums.loc[energysums.groupby('event', as_index=False)['E'].idxmax()]['trackster'].values

In [49]:
#get the index of all LC not belonging to the most energetic tracksters
indexNames = []
for i in range(int(df['event'].max())):
    indexNames.append(df[(df['event'] == i+1) & (df['trackster'] != maxen[i])].index)

In [51]:
indices = np.array([item for sublist in indexNames for item in sublist])
df.drop(indices, inplace=True)
df = df.reset_index(drop=True)
df.head()

Unnamed: 0,event,trackster,x,y,z,r,layer,E,nCore,nHits,id,genDR,gen_phi,gen_eta,phi,eta,genPID,genE,cpID
42,2.0,1.0,74.192642,-26.592834,323.047272,78.814514,2.0,1.063933,4.0,4.0,14.0,0.004974,-0.339637,2.116348,-0.344165,2.118408,22.0,367.569397,22.0
43,2.0,1.0,74.477066,-28.977863,323.047272,79.915894,2.0,0.040499,2.0,2.0,14.0,0.033433,-0.339637,2.116348,-0.371061,2.104931,22.0,367.569397,22.0
44,2.0,1.0,74.766670,-26.483694,325.072754,79.318604,3.0,2.946281,12.0,12.0,14.0,0.002093,-0.339637,2.116348,-0.340427,2.118286,22.0,367.569397,22.0
45,2.0,1.0,73.590805,-28.771975,325.072754,79.015396,3.0,0.064400,2.0,2.0,14.0,0.033543,-0.339637,2.116348,-0.372700,2.122007,22.0,367.569397,22.0
46,2.0,1.0,74.959366,-26.738337,326.017273,79.585457,4.0,5.852627,12.0,12.0,14.0,0.003349,-0.339637,2.116348,-0.342635,2.117842,22.0,367.569397,22.0
47,2.0,1.0,78.639038,-26.206095,326.017303,82.890633,4.0,0.088000,2.0,2.0,14.0,0.042023,-0.339637,2.116348,-0.321671,2.078358,22.0,367.569397,22.0
48,2.0,1.0,75.373734,-26.627161,328.042755,79.938759,5.0,9.105394,16.0,16.0,14.0,0.003209,-0.339637,2.116348,-0.339584,2.119556,22.0,367.569397,22.0
49,2.0,1.0,74.140472,-28.230791,328.042755,79.333389,5.0,0.242331,3.0,3.0,14.0,0.026405,-0.339637,2.116348,-0.363824,2.126943,22.0,367.569397,22.0
50,2.0,1.0,76.423637,-26.762690,331.012726,80.974159,7.0,23.673971,33.0,33.0,14.0,0.002846,-0.339637,2.116348,-0.336843,2.115809,22.0,367.569397,22.0
51,2.0,1.0,77.986023,-20.977562,331.012726,80.758148,7.0,0.115324,2.0,2.0,14.0,0.076893,-0.339637,2.116348,-0.262771,2.118404,22.0,367.569397,22.0


In [16]:
trackster_sizes = df.groupby(["event"]).size().values.tolist()
trackster_places = np.cumsum(trackster_sizes)
num_tracksters = len(trackster_sizes)
track_startes = np.array( [0] + list(trackster_places[:-1]))
track_finishes = np.array(list(track_startes[1:]) +[len(df)])
track_id = np.arange(1,num_tracksters+1)
track_bounds = np.vstack((track_startes,track_finishes)).T

In [17]:
new_tracks = [[i for j in range(t[1]-t[0])] for i,t in zip(track_id, track_bounds)]
new_tracks = np.array([item for sublist in new_tracks for item in sublist])
df['trackster'] = new_tracks

In [18]:
df.head()

Unnamed: 0,event,trackster,x,y,z,r,layer,E,nCore,nHits,id,genDR,gen_phi,gen_eta,phi,eta,genPID,genE,cpID
0,1.0,1,75.449516,-22.178925,325.072754,78.641808,3.0,0.135014,2.0,2.0,23.0,0.054705,-0.339637,2.116348,-0.285904,2.126613,211.0,367.569397,211.0
1,1.0,1,74.818108,-26.049902,325.072754,79.223396,3.0,0.046474,2.0,2.0,23.0,0.00554,-0.339637,2.116348,-0.335049,2.119453,211.0,367.569397,211.0
2,1.0,1,75.16272,-25.949078,326.017273,79.515968,4.0,0.407647,4.0,4.0,23.0,0.007582,-0.339637,2.116348,-0.332427,2.11869,211.0,367.569397,211.0
3,1.0,1,75.733452,-26.873775,328.042755,80.360161,5.0,0.274669,4.0,4.0,23.0,0.002329,-0.339637,2.116348,-0.340986,2.114448,211.0,367.569397,211.0
4,1.0,1,75.668083,-26.670002,328.987244,80.230591,6.0,0.967133,7.0,7.0,23.0,0.002579,-0.339637,2.116348,-0.338865,2.118809,211.0,367.569397,211.0


In [19]:
# get rid of useless stuff
del df['id']
del df['event']
del df['nCore']

In [20]:
means = []
pca_coordinates = []
for i in range(1, num_tracksters+1):
    pe = np.array([df["phi"][df["trackster"]==i].values, df["eta"][df["trackster"]==i].values])
    en = np.array(df["E"][df["trackster"]==i].values).T

    # compute pca
    pcaVars= np.array([df["x"][df["trackster"]==i].values, df["y"][df["trackster"]==i].values, df["z"][df["trackster"]==i].values]).T  
#     print(pcaVars.shape)
#     M = np.mean(pcaVars.T, axis=1)
#     C = pcaVars - M
#     V = np.cov(C.T)
#     values, vectors = eig(V)
#     pca = vectors.T.dot(C.T)
#     pca_matrix = pca.T

    pca = PCA()
    pca.fit(pcaVars)
    pca_matrix = pca.transform(pcaVars)    

    temp = [np.average(pe, axis=1, weights=en)]    
    means.append(np.array(temp*trackster_sizes[i-1]))
    pca_coordinates.append(pca_matrix)

mean_values = np.array([item for sublist in means for item in sublist])
pca_variables = np.array([item for sublist in pca_coordinates for item in sublist])
print(mean_values.shape)
print(pca_variables.shape)

(1094679, 2)
(1094679, 3)


In [21]:
df.loc[:,"x_pca"] = pca_variables[:,0]
df.loc[:,"y_pca"] = pca_variables[:,1]
df.loc[:,"z_pca"] = pca_variables[:,2]
df.loc[:,"phi_mean"] = mean_values[:,0]
df.loc[:,"eta_mean"] = mean_values[:,1]

In [22]:
df[0:70]

Unnamed: 0,trackster,x,y,z,r,layer,E,nHits,genDR,gen_phi,...,phi,eta,genPID,genE,cpID,x_pca,y_pca,z_pca,phi_mean,eta_mean
0,1,75.449516,-22.178925,325.072754,78.641808,3.0,0.135014,2.0,0.054705,-0.339637,...,-0.285904,2.126613,211.0,367.569397,211.0,-43.092931,-3.174265,1.767477,-0.339771,2.117759
1,1,74.818108,-26.049902,325.072754,79.223396,3.0,0.046474,2.0,0.005540,-0.339637,...,-0.335049,2.119453,211.0,367.569397,211.0,-42.841661,0.571444,0.631840,-0.339771,2.117759
2,1,75.162720,-25.949078,326.017273,79.515968,4.0,0.407647,4.0,0.007582,-0.339637,...,-0.332427,2.118690,211.0,367.569397,211.0,-41.858168,0.384991,0.769738,-0.339771,2.117759
3,1,75.733452,-26.873775,328.042755,80.360161,5.0,0.274669,4.0,0.002329,-0.339637,...,-0.340986,2.114448,211.0,367.569397,211.0,-39.670852,1.091074,0.747959,-0.339771,2.117759
4,1,75.668083,-26.670002,328.987244,80.230591,6.0,0.967133,7.0,0.002579,-0.339637,...,-0.338865,2.118809,211.0,367.569397,211.0,-38.794592,0.758726,0.503891,-0.339771,2.117759
5,1,75.861526,-28.907150,328.987244,81.182480,6.0,0.175063,2.0,0.026031,-0.339637,...,-0.364066,2.107354,211.0,367.569397,211.0,-38.517863,2.984028,0.386829,-0.339771,2.117759
6,1,75.225067,-24.576128,328.987244,79.137833,6.0,0.046474,2.0,0.028617,-0.339637,...,-0.315770,2.132137,211.0,367.569397,211.0,-39.115300,-1.352469,0.360455,-0.339771,2.117759
7,1,76.822479,-26.598392,331.012726,81.296791,7.0,1.766114,12.0,0.007704,-0.339637,...,-0.333314,2.111947,211.0,367.569397,211.0,-36.572750,0.546414,1.181148,-0.339771,2.117759
8,1,75.012619,-24.128363,331.012756,78.797661,7.0,0.118631,2.0,0.038493,-0.339637,...,-0.311206,2.142296,211.0,367.569397,211.0,-37.254241,-2.083648,-0.231291,-0.339771,2.117759
9,1,76.769814,-26.714674,331.957245,81.285164,8.0,3.271002,10.0,0.004989,-0.339637,...,-0.334878,2.114853,211.0,367.569397,211.0,-35.660397,0.530799,0.905867,-0.339771,2.117759


In [23]:
theIndex = list(df.groupby(["trackster","layer"]).indices.values())
theIndex = np.array([item for sublist in theIndex for item in sublist[:min(len(sublist),10)]])

In [24]:
df = df.iloc[theIndex]
df.head()

Unnamed: 0,trackster,x,y,z,r,layer,E,nHits,genDR,gen_phi,...,phi,eta,genPID,genE,cpID,x_pca,y_pca,z_pca,phi_mean,eta_mean
0,1,75.449516,-22.178925,325.072754,78.641808,3.0,0.135014,2.0,0.054705,-0.339637,...,-0.285904,2.126613,211.0,367.569397,211.0,-43.092931,-3.174265,1.767477,-0.339771,2.117759
1,1,74.818108,-26.049902,325.072754,79.223396,3.0,0.046474,2.0,0.00554,-0.339637,...,-0.335049,2.119453,211.0,367.569397,211.0,-42.841661,0.571444,0.63184,-0.339771,2.117759
2,1,75.16272,-25.949078,326.017273,79.515968,4.0,0.407647,4.0,0.007582,-0.339637,...,-0.332427,2.11869,211.0,367.569397,211.0,-41.858168,0.384991,0.769738,-0.339771,2.117759
3,1,75.733452,-26.873775,328.042755,80.360161,5.0,0.274669,4.0,0.002329,-0.339637,...,-0.340986,2.114448,211.0,367.569397,211.0,-39.670852,1.091074,0.747959,-0.339771,2.117759
4,1,75.668083,-26.670002,328.987244,80.230591,6.0,0.967133,7.0,0.002579,-0.339637,...,-0.338865,2.118809,211.0,367.569397,211.0,-38.794592,0.758726,0.503891,-0.339771,2.117759


In [25]:
layer_sizes = df.groupby(["trackster","layer"]).size().values.tolist()
layer_places = np.cumsum(layer_sizes)

In [26]:
startes = np.array( [0] + list(layer_places[:-1]))
layers = df["layer"].values[startes]
ids = df["trackster"].values[startes]
finishes = np.array(list(startes[1:]) +[len(df)])
SSS = np.vstack((startes,finishes)).T

In [27]:
hitIds = [[j +(n-1)*max_perlayer + max_perlayer*number_layers*(e-1) for j in range(s[1]-s[0])] for n,s,e in zip(layers,SSS,ids)]
hitIds = np.array([item for sublist in hitIds for item in sublist])

In [28]:
df.loc[:,"hitIds"] = hitIds
df = df.set_index(hitIds.astype(int))

In [29]:
bigMask = np.zeros((num_tracksters*number_layers*max_perlayer,len(df.columns)))
bigDF = pd.DataFrame(bigMask,columns=df.columns)

In [30]:
fakeHit = [ [(i*max_perlayer + j) for j in range(max_perlayer)] for i in range(number_layers*num_tracksters)]
fakeHit = np.array([item for sublist in fakeHit for item in sublist])

In [31]:
fakeLayer = [ np.full(max_perlayer,i) for j in range(1,num_tracksters+1) for i in range(1,number_layers+1)]
fakeLayer = np.array([item for sublist in fakeLayer for item in sublist])

In [32]:
fakeTrackster = [ np.full(max_perlayer*number_layers,i) for i in range(1,num_tracksters+1)]
fakeTrackster = np.array([item for sublist in fakeTrackster for item in sublist])

In [33]:
bigDF["layer"] = fakeLayer
bigDF["trackster"] = fakeTrackster
bigDF["hitIds"] = fakeHit

In [34]:
bigDF.iloc[df.index] = df

In [35]:
bigDF[100:150]

Unnamed: 0,trackster,x,y,z,r,layer,E,nHits,genDR,gen_phi,...,eta,genPID,genE,cpID,x_pca,y_pca,z_pca,phi_mean,eta_mean,hitIds
100,1,77.271805,-27.428295,336.952759,81.995384,11.0,8.691777,31.0,0.004789,-0.339637,...,2.120913,211.0,367.569397,211.0,-30.640974,0.630024,0.19036,-0.339771,2.117759,100.0
101,1,79.315285,-30.721449,336.952759,85.057167,11.0,0.298018,4.0,0.043085,-0.339637,...,2.085329,211.0,367.569397,211.0,-29.819373,4.096464,1.71661,-0.339771,2.117759,101.0
102,1,74.252701,-33.263153,336.952759,81.36277,11.0,0.106178,3.0,0.082421,-0.339637,...,2.12844,211.0,367.569397,211.0,-30.749168,6.051866,-3.518073,-0.339771,2.117759,102.0
103,1,76.640121,-22.963013,336.952759,80.006302,11.0,0.098804,3.0,0.056249,-0.339637,...,2.14479,211.0,367.569397,211.0,-31.251159,-3.838246,0.186837,-0.339771,2.117759,103.0
104,1,74.120438,-25.14677,336.952759,78.270042,11.0,0.054416,2.0,0.05136,-0.339637,...,2.166149,211.0,367.569397,211.0,-31.618991,-1.959774,-2.5433,-0.339771,2.117759,104.0
105,1,72.027435,-27.12219,336.952759,76.964699,11.0,0.041808,2.0,0.069291,-0.339637,...,2.182538,211.0,367.569397,211.0,-31.907851,-0.240313,-4.83305,-0.339771,2.117759,105.0
106,1,0.0,0.0,0.0,0.0,11.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,106.0
107,1,0.0,0.0,0.0,0.0,11.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,107.0
108,1,0.0,0.0,0.0,0.0,11.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,108.0
109,1,0.0,0.0,0.0,0.0,11.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,109.0


In [36]:
del bigDF['hitIds']
bigDF.head()

Unnamed: 0,trackster,x,y,z,r,layer,E,nHits,genDR,gen_phi,...,phi,eta,genPID,genE,cpID,x_pca,y_pca,z_pca,phi_mean,eta_mean
0,1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [37]:
bigDF.to_hdf(pad_path + part + "_newPadded.h5","data",complevel=0)

In [40]:
images = bigDF.groupby('trackster')['x_pca','y_pca','z_pca'].values

AttributeError: Cannot access attribute 'values' of 'DataFrameGroupBy' objects, try using the 'apply' method

In [39]:
images.shape

AttributeError: Cannot access attribute 'shape' of 'DataFrameGroupBy' objects, try using the 'apply' method