In [3]:
import os
import sys
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import confusion_matrix, accuracy_score
import pandas


import tensorflow as tf
import tf_data

In [4]:
# Glob patterns of the training recordio files, keyed by sample label.
# The same labels are used as keys of the `weights` dict handed to
# tf_data.load_balanced_dataset_batch further down.
_WINDOWS_DATA_DIR = "/eos/user/r/rdfexp/ecal/cluster/output_deepcluster_dumper/windows_data"
_TRAINING_GLOB = "{base}/{particle}/recordio_allinfo_v8/training/{matching}/*.proto"

data_path_train = {
    label: _TRAINING_GLOB.format(base=_WINDOWS_DATA_DIR, particle=particle, matching=matching)
    for label, particle, matching in (
        ("ele_match", "electrons", "calo_matched"),
        ("gamma_match", "gammas", "calo_matched"),
        ("nomatch", "electrons", "no_calo_matched"),
        # Disabled sample (its original path used recordio_allinfo_v2, not v8):
        # ("gamma_nomatch", "gammas", "no_calo_matched"),
    )
}

In [10]:
# Per-cluster feature names. The order matters: parameters() labels column i of the
# cluster tensor with features[i].
features = [
    "en_cluster", "et_cluster",
    "cluster_eta", "cluster_phi",
    "cluster_ieta", "cluster_iphi", "cluster_iz",
    "cluster_deta", "cluster_dphi",
    "cluster_den_seed", "cluster_det_seed",
    "cl_f5_r9", "cl_f5_sigmaIetaIeta", "cl_f5_sigmaIetaIphi",
    "cl_f5_sigmaIphiIphi", "cl_f5_swissCross",
    "cl_r9", "cl_sigmaIetaIeta", "cl_sigmaIetaIphi",
    "cl_sigmaIphiIphi", "cl_swissCross",
    "cl_nxtals", "cl_etaWidth", "cl_phiWidth",
]

# Per-window features: every (statistic, quantity) pair, statistic-major, i.e.
# max_en_cluster, max_et_cluster, ..., min_en_cluster, ..., mean_det_cluster.
window_features = [
    "{}_{}_cluster".format(statistic, quantity)
    for statistic in ("max", "min", "mean")
    for quantity in ("en", "et", "deta", "dphi", "den", "det")
]

# Metadata about the window (true energy, true calo position and other useful info).
window_metadata = [
    "en_true_sim", "et_true_sim", "en_true_gen", "et_true_gen",
    "nclusters_insc",
    "nVtx", "rho", "obsPU", "truePU",
    "sim_true_eta", "sim_true_phi",
    "en_mustache_raw", "et_mustache_raw", "en_mustache_calib", "et_mustache_calib",
    "event_tot_simen_PU", "wtot_simen_PU", "wtot_recoen_PU", "wtot_simen_sig",
]
    
    

In [7]:
# Number of per-cluster features.
# NOTE(review): the recorded output of this cell comes from an earlier execution
# (In [7], whereas `features` was last defined in In [10]); the list as currently
# written has 24 entries, so re-run this cell to refresh the output.
len(features)

15

In [12]:
# Build a balanced, batched dataset from the training paths, keeping only the
# requested cluster features, window features and window metadata.
sample_weights = {"ele_match": 0.33, "gamma_match": 0.33, "nomatch": 0.33}
balanced_batches = tf_data.load_balanced_dataset_batch(
    data_path_train,
    features,
    window_features,
    window_metadata,
    batch_size=100,
    weights=sample_weights,
)

# Convert to the training layout. norm=False presumably keeps the raw,
# un-normalised values needed to derive the normalisation constants - TODO confirm.
train_ds = tf_data.training_format(balanced_batches, norm=False)

# Only the first 10000 dataset elements are used for the statistics below
# (no validation split is created here).
ds_train = train_ds.take(10000)

In [13]:
from training_data import data_tools

In [34]:
def parameters(ds, features):
    '''
    Compute the per-feature mean and standard deviation of the cluster features.

    Both statistics are accumulated in a single pass over `ds` (sum and sum of
    squares, in float64). Iterating the dataset only once keeps the mean, the
    sigma and the cluster count consistent with each other even if the dataset
    does not yield the same elements on every iteration, and halves the I/O.

    Assumptions (same as the sum-based mean has always relied on):
    - padded cluster slots in the cluster tensor are filled with zeros, so they
      contribute nothing to the sums;
    - the fifth entry of the input tuple (`n_cl`) holds the number of real
      clusters, and its total is the normalisation of the sums.

    Args:
    - ds: tensorflow dataset (in the format after tf_data.training_format).
          Each element unpacks as ((cl_X, _, _, _, n_cl), ...), with cl_X indexed
          as [window, cluster, feature].
    - features: list of feature names; features[i] labels column i of cl_X and
                only the first len(features) columns are used.

    Return (4-tuple):
    - m: float32 tensor of shape [1, 1, n_features] with the means.
    - s: float32 tensor of shape [n_features] with the (population) sigmas.
    - mean: dict {feature name: scalar tensor} with the means.
    - sigma: dict {feature name: scalar tensor} with the sigmas.

    Raises:
    - ValueError: if the dataset contains no clusters (the statistics would be NaN).
    '''
    n_features = len(features)

    total_cl = 0.
    # float64 accumulators: float32 running sums lose precision over many batches
    sum_x = tf.zeros(shape=n_features, dtype=tf.float64)
    sum_x2 = tf.zeros(shape=n_features, dtype=tf.float64)

    for el in ds:
        # `*_` swallows whatever follows the input tuple; the parenthesised form
        # `(*_)` is not a valid assignment target for current Python parsers.
        (cl_X, _, _, _, n_cl), *_ = el
        cl_X = tf.cast(cl_X[:, :, 0:n_features], tf.float64)

        sum_x += tf.reduce_sum(cl_X, axis=(0, 1))
        sum_x2 += tf.reduce_sum(tf.math.square(cl_X), axis=(0, 1))
        total_cl += float(tf.reduce_sum(n_cl).numpy())

    if total_cl == 0:
        raise ValueError("parameters(): the dataset contains no clusters, "
                         "cannot compute mean and sigma")

    mean64 = sum_x / total_cl
    # Var[x] = E[x^2] - E[x]^2; clamp tiny negative values caused by rounding
    var64 = tf.maximum(sum_x2 / total_cl - tf.math.square(mean64), 0.)

    m = tf.cast(mean64, tf.float32)
    s = tf.cast(tf.math.sqrt(var64), tf.float32)

    # dictionaries with the feature labels
    mean = dict(zip(features, m))
    sigma = dict(zip(features, s))

    # the mean tensor is returned broadcastable against [window, cluster, feature]
    m = tf.reshape(m, shape=[1, 1, -1])
    return m, s, mean, sigma

In [36]:
def parameters_wind(ds, features):
    '''
    Compute the per-feature mean and standard deviation of the window features.

    Both statistics are accumulated in a single pass over `ds` (sum and sum of
    squares, in float64). Iterating the dataset only once keeps the mean, the
    sigma and the window count consistent with each other even if the dataset
    does not yield the same elements on every iteration, and halves the I/O.

    Args:
    - ds: tensorflow dataset (in the format after tf_data.training_format).
          Each element unpacks as ((_, _, wind_X, _, _), ...), with wind_X indexed
          as [window, feature].
    - features: list of window-feature names; features[i] labels column i of
                wind_X and only the first len(features) columns are used.

    Return (4-tuple):
    - m: float32 tensor of shape [1, n_features] with the means.
    - s: float32 tensor of shape [n_features] with the (population) sigmas.
    - mean: dict {feature name: scalar tensor} with the means.
    - sigma: dict {feature name: scalar tensor} with the sigmas.

    Raises:
    - ValueError: if the dataset contains no windows (the statistics would be NaN).
    '''
    n_features = len(features)

    n_windows = 0
    # float64 accumulators: float32 running sums lose precision over many batches
    sum_x = tf.zeros(shape=n_features, dtype=tf.float64)
    sum_x2 = tf.zeros(shape=n_features, dtype=tf.float64)

    for el in ds:
        # `*_` swallows whatever follows the input tuple; the parenthesised form
        # `(*_)` is not a valid assignment target for current Python parsers.
        (_, _, wind_X, _, _), *_ = el
        wind_X = tf.cast(wind_X[:, 0:n_features], tf.float64)

        sum_x += tf.reduce_sum(wind_X, axis=0)
        sum_x2 += tf.reduce_sum(tf.math.square(wind_X), axis=0)
        # every row of wind_X is one window, so the batch length is the count
        n_windows += len(wind_X)

    if n_windows == 0:
        raise ValueError("parameters_wind(): the dataset contains no windows, "
                         "cannot compute mean and sigma")

    mean64 = sum_x / n_windows
    # Var[x] = E[x^2] - E[x]^2; clamp tiny negative values caused by rounding
    var64 = tf.maximum(sum_x2 / n_windows - tf.math.square(mean64), 0.)

    m = tf.cast(mean64, tf.float32)
    s = tf.cast(tf.math.sqrt(var64), tf.float32)

    # dictionaries with the feature labels
    mean = dict(zip(features, m))
    sigma = dict(zip(features, s))

    # the mean tensor is returned broadcastable against [window, feature]
    m = tf.reshape(m, shape=[1, -1])
    return m, s, mean, sigma

In [22]:
# Cluster-feature statistics: `m` / `s` are the mean and sigma tensors, `mean` / `sigma`
# hold the same values as {feature name: scalar tensor} dictionaries.
m ,s , mean, sigma = parameters(ds_train, features)

In [37]:
# Window-feature statistics: `mw` / `sw` are the mean and sigma tensors, `meanw` / `sigmaw`
# hold the same values as {feature name: scalar tensor} dictionaries.
mw ,sw , meanw, sigmaw = parameters_wind(ds_train, window_features)

In [24]:
# Display the cluster-feature means, keyed by feature name.
mean

{'en_cluster': <tf.Tensor: shape=(), dtype=float32, numpy=22.524303>,
 'et_cluster': <tf.Tensor: shape=(), dtype=float32, numpy=7.1766996>,
 'cluster_eta': <tf.Tensor: shape=(), dtype=float32, numpy=0.0010188665>,
 'cluster_phi': <tf.Tensor: shape=(), dtype=float32, numpy=-0.011558083>,
 'cluster_ieta': <tf.Tensor: shape=(), dtype=float32, numpy=17.339075>,
 'cluster_iphi': <tf.Tensor: shape=(), dtype=float32, numpy=136.76228>,
 'cluster_iz': <tf.Tensor: shape=(), dtype=float32, numpy=0.00061966974>,
 'cluster_deta': <tf.Tensor: shape=(), dtype=float32, numpy=0.020145211>,
 'cluster_dphi': <tf.Tensor: shape=(), dtype=float32, numpy=-0.00013252016>,
 'cluster_den_seed': <tf.Tensor: shape=(), dtype=float32, numpy=56.139122>,
 'cluster_det_seed': <tf.Tensor: shape=(), dtype=float32, numpy=24.030712>,
 'cl_f5_r9': <tf.Tensor: shape=(), dtype=float32, numpy=2.3928623>,
 'cl_f5_sigmaIetaIeta': <tf.Tensor: shape=(), dtype=float32, numpy=0.01664246>,
 'cl_f5_sigmaIetaIphi': <tf.Tensor: shape=(

In [25]:
# Display the cluster-feature sigmas, keyed by feature name.
sigma

{'en_cluster': <tf.Tensor: shape=(), dtype=float32, numpy=76.19602>,
 'et_cluster': <tf.Tensor: shape=(), dtype=float32, numpy=18.7965>,
 'cluster_eta': <tf.Tensor: shape=(), dtype=float32, numpy=1.4732423>,
 'cluster_phi': <tf.Tensor: shape=(), dtype=float32, numpy=1.8075603>,
 'cluster_ieta': <tf.Tensor: shape=(), dtype=float32, numpy=51.906487>,
 'cluster_iphi': <tf.Tensor: shape=(), dtype=float32, numpy=105.55555>,
 'cluster_iz': <tf.Tensor: shape=(), dtype=float32, numpy=0.5823184>,
 'cluster_deta': <tf.Tensor: shape=(), dtype=float32, numpy=0.058878865>,
 'cluster_dphi': <tf.Tensor: shape=(), dtype=float32, numpy=0.27938646>,
 'cluster_den_seed': <tf.Tensor: shape=(), dtype=float32, numpy=89.72599>,
 'cluster_det_seed': <tf.Tensor: shape=(), dtype=float32, numpy=29.370329>,
 'cl_f5_r9': <tf.Tensor: shape=(), dtype=float32, numpy=9.873369>,
 'cl_f5_sigmaIetaIeta': <tf.Tensor: shape=(), dtype=float32, numpy=0.017052714>,
 'cl_f5_sigmaIetaIphi': <tf.Tensor: shape=(), dtype=float32, 

In [26]:
# Persist the cluster-feature normalisation constants.
# As returned by parameters(), `m` has shape [1, 1, n_features] while `s` has
# shape [n_features], so the two saved arrays do not have the same rank.
np.savez("normalization.npz", mean=m, sigma=s)

In [38]:
# Display the window-feature sigmas, keyed by feature name.
sigmaw

{'max_en_cluster': <tf.Tensor: shape=(), dtype=float32, numpy=139.98135>,
 'max_et_cluster': <tf.Tensor: shape=(), dtype=float32, numpy=30.462042>,
 'max_deta_cluster': <tf.Tensor: shape=(), dtype=float32, numpy=0.05672447>,
 'max_dphi_cluster': <tf.Tensor: shape=(), dtype=float32, numpy=0.2092758>,
 'max_den_cluster': <tf.Tensor: shape=(), dtype=float32, numpy=117.98411>,
 'max_det_cluster': <tf.Tensor: shape=(), dtype=float32, numpy=30.259237>,
 'min_en_cluster': <tf.Tensor: shape=(), dtype=float32, numpy=92.93467>,
 'min_et_cluster': <tf.Tensor: shape=(), dtype=float32, numpy=13.695581>,
 'min_deta_cluster': <tf.Tensor: shape=(), dtype=float32, numpy=0.03514311>,
 'min_dphi_cluster': <tf.Tensor: shape=(), dtype=float32, numpy=0.2090637>,
 'min_den_cluster': <tf.Tensor: shape=(), dtype=float32, numpy=6.1971254>,
 'min_det_cluster': <tf.Tensor: shape=(), dtype=float32, numpy=2.021335>,
 'mean_en_cluster': <tf.Tensor: shape=(), dtype=float32, numpy=99.7142>,
 'mean_et_cluster': <tf.Ten

In [40]:
# Display the window-feature means, keyed by feature name.
meanw

{'max_en_cluster': <tf.Tensor: shape=(), dtype=float32, numpy=98.56081>,
 'max_et_cluster': <tf.Tensor: shape=(), dtype=float32, numpy=30.730434>,
 'max_deta_cluster': <tf.Tensor: shape=(), dtype=float32, numpy=0.06807884>,
 'max_dphi_cluster': <tf.Tensor: shape=(), dtype=float32, numpy=0.267591>,
 'max_den_cluster': <tf.Tensor: shape=(), dtype=float32, numpy=76.70408>,
 'max_det_cluster': <tf.Tensor: shape=(), dtype=float32, numpy=27.073963>,
 'min_en_cluster': <tf.Tensor: shape=(), dtype=float32, numpy=21.644217>,
 'min_et_cluster': <tf.Tensor: shape=(), dtype=float32, numpy=3.581526>,
 'min_deta_cluster': <tf.Tensor: shape=(), dtype=float32, numpy=-0.035444207>,
 'min_dphi_cluster': <tf.Tensor: shape=(), dtype=float32, numpy=-0.2677228>,
 'min_den_cluster': <tf.Tensor: shape=(), dtype=float32, numpy=-0.21257907>,
 'min_det_cluster': <tf.Tensor: shape=(), dtype=float32, numpy=-0.07502321>,
 'mean_en_cluster': <tf.Tensor: shape=(), dtype=float32, numpy=44.903072>,
 'mean_et_cluster': 

In [41]:
# Persist the window-feature normalisation constants.
# As returned by parameters_wind(), `mw` has shape [1, n_features] while `sw` has
# shape [n_features], so the two saved arrays do not have the same rank.
np.savez("normalization_wind_features.npz", mean=mw, sigma=sw)