In [1]:
import os
import sys
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import confusion_matrix, accuracy_score
import pandas


import tensorflow as tf
import tf_data

In [2]:
# Glob patterns of the training record files, keyed by sample label.
# Every pattern shares the same root and differs only in the particle
# directory and in the calo-matching sub-directory.
data_path_train = {
    label: (
        "/eos/user/r/rdfexp/ecal/cluster/output_deepcluster_dumper/windows_data/"
        f"{particle}/recordio_allinfo_v6/training/{match_dir}/*.proto"
    )
    for label, particle, match_dir in (
        ("ele_match", "electrons", "calo_matched"),
        ("gamma_match", "gammas", "calo_matched"),
        ("nomatch", "electrons", "no_calo_matched"),
        # Disabled sample (note: its path points at recordio_allinfo_v2, not v6):
        # "gamma_nomatch": "/eos/user/r/rdfexp/ecal/cluster/output_deepcluster_dumper/windows_data/gammas/recordio_allinfo_v2/training/no_calo_matched/*.proto"
    )
}

In [3]:
# Cluster variables requested from the dataset loader. The order matters:
# positions in this list are used as column indexes further down.
features = [
    "en_cluster",
    "et_cluster",
    "cluster_ieta",
    "cluster_iphi",
    "cluster_iz",
    "cluster_deta",
    "cluster_dphi",
    "cl_r9",
    "cl_sigmaIetaIeta",
    "cl_sigmaIetaIphi",
    "cl_sigmaIphiIphi",
    "cl_swissCross",
    "cl_nxtals",
    "cl_etaWidth",
    "cl_phiWidth",
]

# Extra quantities handed to the loader as metadata.
metadata = [
    "en_true_sim",
    "et_true_sim",
    "en_true_gen",
    "et_true_gen",
    "nclusters_insc",
    "max_en_cluster_insc",
    "max_deta_cluster_insc",
    "max_dphi_cluster_insc",
    "max_en_cluster",
    "max_deta_cluster",
    "max_dphi_cluster",
    "event_tot_simen_PU",
    "wtot_simen_PU",
    "wtot_simen_sig",
]

In [4]:
# Display the number of requested cluster features.
len(features)

15

In [5]:
# Load a balanced, batched dataset from the path patterns defined above,
# selecting only the requested cluster features and metadata.
train_ds = tf_data.load_balanced_dataset_batch(
    data_path_train,
    features,
    metadata,
    batch_size=100,
    weights={"ele_match": 0.33, "gamma_match": 0.33, "nomatch": 0.33},
)
# en_index / et_index are positions of the energy and Et entries inside the
# `features` list requested above; the result is then put in training format.
train_ds = tf_data.training_format(
    tf_data.delta_energy_seed(train_ds, en_index=0, et_index=1)
)

# Keep only the first 10000 elements of the dataset as the training sample
# (no validation split is created here).
ds_train = train_ds.take(10000)

In [6]:
from training_data import data_tools

In [7]:
def parameters(ds, features):
    '''
    Compute the per-feature mean and sigma (standard deviation) over all the
    clusters of a dataset.

    The dataset is iterated twice: a first pass accumulates the feature sums
    and the cluster count, a second pass accumulates the squared deviations
    from the mean.
    NOTE(review): if `ds` re-samples or re-shuffles its content on every
    iteration, the two passes will not see the same batches -- TODO confirm
    the input pipeline is deterministic (or cache it) before relying on sigma.

    Args:
    - ds: tensorflow dataset (in the format after tf_data.training_format).
          The first item of every element must be a 4-tuple whose first entry
          is the rank-3 cluster tensor with the features on the last axis
          (presumably [batch, cluster, feature]) and whose last entry holds
          the number of clusters.
    - features: list of the feature names stored in the dataset; only the
          first len(features) columns of the cluster tensor are used.

    Return (in this order):
    - m: float32 tensor of shape (1, 1, n_features), mean of each feature.
    - s: float32 tensor of shape (n_features,), sigma of each feature.
    - mean: dict {feature name: mean as scalar tensor}.
    - sigma: dict {feature name: sigma as scalar tensor}.

    Raises:
    - ValueError: if the dataset contains no clusters (the mean would be 0/0).
    '''

    n_features = len(features)

    # Running totals span the whole dataset, so they are kept in float64 to
    # avoid the rounding drift of a float32 accumulator; the per-batch
    # reductions are still done by tensorflow.
    total_cl = 0.
    feature_sum = np.zeros(n_features, dtype=np.float64)
    sq_dev_sum = np.zeros(n_features, dtype=np.float64)

    # First pass: sum of every feature and total number of clusters.
    for el in ds:
        # Only the first item of the element is needed; the rest is ignored.
        (cl_X, _, _, n_cl), *_ = el
        cl_X = cl_X[:, :, 0:n_features]

        # No mask is applied here: padded rows only contribute to the sum if
        # they hold non-zero values.
        feature_sum += tf.reduce_sum(cl_X, axis=(0, 1)).numpy()
        total_cl += float(tf.reduce_sum(n_cl).numpy())

    if total_cl == 0:
        raise ValueError("parameters(): the dataset contains no clusters, "
                         "cannot compute mean and sigma")

    # Mean of each feature, plus a dictionary keyed by feature label.
    m = tf.constant((feature_sum / total_cl).astype(np.float32))
    mean = dict(zip(features, m))
    # Reshape so that the mean broadcasts against the rank-3 cluster tensor.
    m = tf.reshape(m, shape=[1, 1, -1])

    # Second pass: sum of the squared deviations from the mean.
    for el in ds:
        (cl_X, _, _, _), *_ = el
        cl_X = cl_X[:, :, 0:n_features]
        # Rows whose features sum to exactly zero are treated as padding and
        # excluded, otherwise each of them would add mean**2 to the total.
        mask = tf.expand_dims(
            tf.cast(tf.reduce_sum(cl_X, axis=-1) != 0., tf.float32), axis=-1)

        dif_masked = mask * (cl_X - m)
        sq_dev_sum += tf.reduce_sum(tf.math.pow(dif_masked, 2), axis=(0, 1)).numpy()

    s = tf.constant(np.sqrt(sq_dev_sum / total_cl).astype(np.float32))
    sigma = dict(zip(features, s))
    return m, s, mean, sigma

In [8]:
# Two extra names are appended to the feature list so that `parameters` also
# covers the corresponding columns; presumably these are the columns added by
# tf_data.delta_energy_seed -- TODO confirm.
m, s, mean, sigma = parameters(
    ds_train,
    features + ["delta_seed_en", "delta_seed_et"],
)

In [9]:
# Display the feature means as a tensor (features along the last axis).
m

<tf.Tensor: shape=(1, 1, 17), dtype=float32, numpy=
array([[[2.1569590e+01, 6.8596444e+00, 1.7554193e+01, 1.3628761e+02,
         4.8938330e-04, 1.9939605e-02, 8.8600042e-05, 9.8884904e-01,
         5.2043227e-03, 9.4845706e-07, 6.4802836e-03, 7.8155839e-01,
         2.6977096e+00, 5.7275230e-03, 5.9658368e-03, 5.2845585e+01,
         2.2607422e+01]]], dtype=float32)>

In [10]:
# Display the feature means keyed by feature name.
mean

{'en_cluster': <tf.Tensor: shape=(), dtype=float32, numpy=21.56959>,
 'et_cluster': <tf.Tensor: shape=(), dtype=float32, numpy=6.8596444>,
 'cluster_ieta': <tf.Tensor: shape=(), dtype=float32, numpy=17.554193>,
 'cluster_iphi': <tf.Tensor: shape=(), dtype=float32, numpy=136.28761>,
 'cluster_iz': <tf.Tensor: shape=(), dtype=float32, numpy=0.0004893833>,
 'cluster_deta': <tf.Tensor: shape=(), dtype=float32, numpy=0.019939605>,
 'cluster_dphi': <tf.Tensor: shape=(), dtype=float32, numpy=8.860004e-05>,
 'cl_r9': <tf.Tensor: shape=(), dtype=float32, numpy=0.98884904>,
 'cl_sigmaIetaIeta': <tf.Tensor: shape=(), dtype=float32, numpy=0.0052043227>,
 'cl_sigmaIetaIphi': <tf.Tensor: shape=(), dtype=float32, numpy=9.4845706e-07>,
 'cl_sigmaIphiIphi': <tf.Tensor: shape=(), dtype=float32, numpy=0.0064802836>,
 'cl_swissCross': <tf.Tensor: shape=(), dtype=float32, numpy=0.7815584>,
 'cl_nxtals': <tf.Tensor: shape=(), dtype=float32, numpy=2.6977096>,
 'cl_etaWidth': <tf.Tensor: shape=(), dtype=float

In [12]:
# Display the feature sigmas keyed by feature name.
sigma

{'en_cluster': <tf.Tensor: shape=(), dtype=float32, numpy=74.28398>,
 'et_cluster': <tf.Tensor: shape=(), dtype=float32, numpy=18.324402>,
 'cluster_ieta': <tf.Tensor: shape=(), dtype=float32, numpy=51.565166>,
 'cluster_iphi': <tf.Tensor: shape=(), dtype=float32, numpy=105.40503>,
 'cluster_iz': <tf.Tensor: shape=(), dtype=float32, numpy=0.5844528>,
 'cluster_deta': <tf.Tensor: shape=(), dtype=float32, numpy=0.059056774>,
 'cluster_dphi': <tf.Tensor: shape=(), dtype=float32, numpy=0.279133>,
 'cl_r9': <tf.Tensor: shape=(), dtype=float32, numpy=0.042185422>,
 'cl_sigmaIetaIeta': <tf.Tensor: shape=(), dtype=float32, numpy=0.008838983>,
 'cl_sigmaIetaIphi': <tf.Tensor: shape=(), dtype=float32, numpy=8.4691026e-05>,
 'cl_sigmaIphiIphi': <tf.Tensor: shape=(), dtype=float32, numpy=0.010501431>,
 'cl_swissCross': <tf.Tensor: shape=(), dtype=float32, numpy=0.35644677>,
 'cl_nxtals': <tf.Tensor: shape=(), dtype=float32, numpy=3.96378>,
 'cl_etaWidth': <tf.Tensor: shape=(), dtype=float32, numpy

In [13]:
# Persist the normalization constants to disk. Note that the two arrays are
# saved with different shapes: `m` is (1, 1, n_features) while `s` is
# (n_features,).
np.savez("normalization.npz", mean=m, sigma=s)