In [2]:
import ROOT
import numpy as np
import matplotlib.pyplot as plt
import uproot
import os
import pandas as pd


Welcome to JupyROOT 6.28/04


In [3]:
# Emit a plain-text marker ("prepdata") in the cell output.
print("prepdata")

prepdata


In [4]:
class OneHotEncoder_int(object):
    """One-hot encoder for integer-valued columns with overflow clipping.

    Columns flagged as categorical are clipped to ``[lowerlimit, upperlimit]``
    (values outside the range end up in the first / last category) and then
    expanded into ``upperlimit - lowerlimit + 1`` indicator columns.  Columns
    that are not categorical are passed through unchanged.

    The limits are settled by the first ``encode`` (or ``_encode``) call: the
    user supplied limits are intersected with the range of that data, the
    number of categories per feature is recorded, and from then on the limits
    are frozen so that every later batch is encoded and decoded with the same
    column layout.

    Arguments:
        categorical_features -- sequence of booleans, one per input column;
            True marks a column that is one-hot encoded
        lowerlimit -- optional per-column lower bounds (None: taken from data)
        upperlimit -- optional per-column upper bounds (None: taken from data)

    ``encode``/``decode`` index the limits by input column.  The legacy
    ``_encode``/``_decode`` pair indexes them by categorical column only and
    places all one-hot groups before the pass-through columns; do not mix the
    two pairs on one instance.
    """

    def __init__(self, categorical_features, lowerlimit=None, upperlimit=None):
        self.iscategorical = categorical_features
        self.ncolumns = len(categorical_features)
        self.ncats = 0  # total number of one-hot columns produced
        # encode(): one entry per input column (0 for pass-through columns)
        self.categories_per_feature = []

        # number of columns flagged as categorical
        self.ncatgroups = sum(1 for flag in categorical_features if flag)
        # Stored as given; tightened against the data until the categories are fixed.
        self.lowerlimit = lowerlimit
        self.upperlimit = upperlimit
        self.categories_fixed = False

    def applylimit(self, categoricalinputdata):
        """Clip ``categoricalinputdata`` column-wise to the stored limits.

        While the categories are not fixed yet, each call first tightens the
        stored limits to the intersection of the current limits and the
        column-wise range of the data (or sets them from the data if they are
        None).  Once an encode call has fixed the categories the limits are
        left untouched, so a later batch with a narrower range cannot shift
        the encoding.

        Returns the clipped array (same shape as the input).
        """
        if not self.categories_fixed:
            datamin = np.min(categoricalinputdata, axis=0)
            datamax = np.max(categoricalinputdata, axis=0)

            if self.lowerlimit is None:
                self.lowerlimit = datamin
            else:
                self.lowerlimit = np.maximum(self.lowerlimit, datamin)

            if self.upperlimit is None:
                self.upperlimit = datamax
            else:
                self.upperlimit = np.minimum(self.upperlimit, datamax)

        lowerlimitapp = np.maximum(categoricalinputdata, self.lowerlimit)
        limitapp = np.minimum(lowerlimitapp, self.upperlimit)
        return limitapp

    def _encode(self, inputdata):
        """Legacy encoder: one-hot groups first, pass-through columns last.

        The limits are indexed per categorical column here (not per input
        column as in ``encode``).  Returns a float32 array.
        """
        inputdata = np.asarray(inputdata)
        catmask = np.asarray(self.iscategorical, dtype=bool)
        categorical_columns = inputdata[:, catmask]
        float_columns = inputdata[:, ~catmask]

        # Cast to int so the offsets can index np.eye even for float input.
        cat_limited = (self.applylimit(categorical_columns) - self.lowerlimit).astype(int)

        catshape = categorical_columns.shape

        if not self.categories_fixed:
            for cat in range(catshape[1]):
                ncats = int(self.upperlimit[cat] - self.lowerlimit[cat] + 1)  # number of categories
                self.categories_per_feature.append(ncats)
                self.ncats += ncats
            self.categories_fixed = True

        arraylist = []
        for cat in range(catshape[1]):
            ncats = int(self.upperlimit[cat] - self.lowerlimit[cat] + 1)  # number of categories
            arraylist.append(np.eye(ncats)[cat_limited[:, cat]])
        if float_columns.shape[1] > 0:
            arraylist.append(float_columns)
        encoded = np.concatenate(tuple(arraylist), axis=1).astype(np.float32)
        return encoded

    def encode(self, inputdata):
        """One-hot encode the categorical columns of ``inputdata`` in place order.

        Each categorical column is replaced by its indicator columns at the
        position of the original column; other columns are copied unchanged.
        The first call fixes the limits and the number of categories per
        feature; later calls reuse them.

        Returns a float32 array with one row per input row.
        """
        inputdata = np.asarray(inputdata)
        # Offset of every (clipped) value from the lower limit = category index.
        cat_limited = self.applylimit(inputdata) - self.lowerlimit

        # one hot encoding information
        if not self.categories_fixed:
            for icol, iscat in zip(range(self.ncolumns), self.iscategorical):
                if iscat:
                    ncats = int(self.upperlimit[icol] - self.lowerlimit[icol] + 1)  # number of categories
                    self.categories_per_feature.append(ncats)
                    self.ncats += ncats
                else:
                    self.categories_per_feature.append(0)
            self.categories_fixed = True

        # the actual encoding part
        arraylist = []
        for icol, ncat_feat in zip(range(self.ncolumns), self.categories_per_feature):
            if ncat_feat > 0:
                res = np.eye(ncat_feat)[cat_limited[:, icol].astype(int)]
                arraylist.append(res)
            else:
                arraylist.append(inputdata[:, icol].reshape((inputdata.shape[0], 1)))

        encoded = np.concatenate(tuple(arraylist), axis=1).astype(np.float32)
        return encoded

    def encodedcategories(self):
        """Return the total number of one-hot columns recorded so far."""
        return self.ncats

    def transform(self, inputdata):
        """Alias for ``encode``."""
        return self.encode(inputdata)

    def _decode(self, onehotdata):
        """Legacy inverse of ``_encode`` (one-hot groups first, rest appended)."""
        colstart = 0

        arraylist = []
        for i in range(self.ncatgroups):
            ncats = int(self.upperlimit[i] - self.lowerlimit[i] + 1)  # number of categories
            datatoconvert = onehotdata[:, colstart:colstart + ncats]
            # argmax gives the category index; adding the lower limit restores the value
            converted = np.argmax(datatoconvert, axis=1) + self.lowerlimit[i]
            arraylist.append(converted.reshape(-1, 1))
            colstart += ncats
        if colstart < onehotdata.shape[1]:
            arraylist.append(onehotdata[:, colstart:])
        decoded = np.concatenate(tuple(arraylist), axis=1)
        return decoded

    def decode(self, onehotdata):
        """Inverse of ``encode``: collapse each one-hot group back to one column.

        Uses the per-feature category counts recorded by ``encode``.  Values
        that were clipped during encoding come back as the clipped value.
        """
        current_col = 0  # start from column 0
        arraylist = []
        for ifeat, ncats in enumerate(self.categories_per_feature):
            if ncats > 0:
                datatoconvert = onehotdata[:, current_col:current_col + ncats]
                # argmax gives the category index; adding the lower limit restores the value
                converted = np.argmax(datatoconvert, axis=1) + self.lowerlimit[ifeat]
                arraylist.append(converted.reshape(-1, 1))
                current_col += ncats
            else:
                arraylist.append(onehotdata[:, current_col].reshape((onehotdata.shape[0], 1)))
                current_col += 1
        decoded = np.concatenate(tuple(arraylist), axis=1)
        return decoded


In [5]:
def _roundtrip_demo(encoder, data):
    """Print the clipped, encoded and decoded form of ``data``; return all three."""
    clipped = encoder.applylimit(data)
    print(clipped)
    onehot = encoder.encode(data)
    print(onehot)
    restored = encoder.decode(onehot)
    print(restored)
    return clipped, onehot, restored


x = np.array([[ 0,  1,  2], [ 3,  4,  5], [ 6,  7,  8], [ 9, 10, 11]])

# Encoder with explicit limits: values outside [2, 8] are clipped in columns 0 and 2.
ohe = OneHotEncoder_int(categorical_features=[True, False, True], lowerlimit=[2,0,2], upperlimit=[8,100,8])
xlimited, encodedx, decoded = _roundtrip_demo(ohe, x)
print()

# Encoder without limits: the range is taken from the data itself.
ohe2 = OneHotEncoder_int(categorical_features=[True, False, True])
xlimited, encodedx, decoded = _roundtrip_demo(ohe2, x)

[[ 2  1  2]
 [ 3  4  5]
 [ 6  7  8]
 [ 8 10  8]]
[[ 1.  0.  0.  0.  0.  0.  0.  1.  1.  0.  0.  0.  0.  0.  0.]
 [ 0.  1.  0.  0.  0.  0.  0.  4.  0.  0.  0.  1.  0.  0.  0.]
 [ 0.  0.  0.  0.  1.  0.  0.  7.  0.  0.  0.  0.  0.  0.  1.]
 [ 0.  0.  0.  0.  0.  0.  1. 10.  0.  0.  0.  0.  0.  0.  1.]]
[[ 2.  1.  2.]
 [ 3.  4.  5.]
 [ 6.  7.  8.]
 [ 8. 10.  8.]]

[[ 0  1  2]
 [ 3  4  5]
 [ 6  7  8]
 [ 9 10 11]]
[[ 1.  0.  0.  0.  0.  0.  0.  0.  0.  0.  1.  1.  0.  0.  0.  0.  0.  0.
   0.  0.  0.]
 [ 0.  0.  0.  1.  0.  0.  0.  0.  0.  0.  4.  0.  0.  0.  1.  0.  0.  0.
   0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  1.  0.  0.  0.  7.  0.  0.  0.  0.  0.  0.  1.
   0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  1. 10.  0.  0.  0.  0.  0.  0.  0.
   0.  0.  1.]]
[[ 0.  1.  2.]
 [ 3.  4.  5.]
 [ 6.  7.  8.]
 [ 9. 10. 11.]]


In [6]:
# Names of the tree branches that are read in the next cell.
featurevars = ['met', 'ht', 'pt5', 'pt6', 'njet', 'nbtag']

# Input ROOT file, given as a relative path.
rootfile='ttjjresult.root'

ttjj = uproot.open(rootfile)
ttjjtree = ttjj['mytree']
# One flag per entry of featurevars; only the last two (njet, nbtag) are marked categorical.
iscategorical = [False, False, False, False, True, True]
# Per-feature upper limits passed to OneHotEncoder_int further down. The encoder's
# encode() copies non-categorical columns through unclipped, so only the last two
# entries affect the encoded values.
upperlimit = [10, 10, 10, 10, 9, 3]

In [7]:
# Earlier call, kept for reference:
#inputtmp = ttjjtree.df(featurevars)
# Read the selected branches with library="pd" and wrap the result in a DataFrame.
arrays = ttjjtree.arrays(featurevars, library="pd")
inputtmp = pd.DataFrame(arrays)

In [8]:
# The encoder is built from the hand-written iscategorical list defined earlier.
_onehotencoder = OneHotEncoder_int(iscategorical, upperlimit=upperlimit)
# NOTE(review): iscategorical is re-derived from the column dtypes only AFTER the
# encoder has been constructed, so the encoder does not see this mask; it is not
# used again in the cells visible here.
iscategorical = np.array(inputtmp.dtypes == np.int32)

# All columns are converted to float32 before encoding.
inputnumpy = inputtmp.to_numpy(dtype=np.float32)
inputs = _onehotencoder.encode(inputnumpy)
# Total number of one-hot columns, and the per-feature category counts
# (0 for features that are passed through unchanged).
ncats = _onehotencoder.ncats
ncat_per_feature = _onehotencoder.categories_per_feature

In [9]:
print(inputnumpy)
print(inputs)
print(ncats)
print(ncat_per_feature)

[[  84.002205  765.41595    96.49143    83.413086    7.          2.      ]
 [  95.23368   924.5049     71.028915   68.11747    11.          2.      ]
 [  54.86022  1668.8479     76.10241    46.142303   10.          3.      ]
 ...
 [   8.826685  555.999      38.395134   34.18226     9.          2.      ]
 [  41.213657  687.2156     58.986603   51.859745    9.          2.      ]
 [  56.09024   498.0826     51.162266   47.752556    8.          2.      ]]
[[8.4002205e+01 7.6541595e+02 9.6491432e+01 ... 0.0000000e+00
  1.0000000e+00 0.0000000e+00]
 [9.5233681e+01 9.2450488e+02 7.1028915e+01 ... 1.0000000e+00
  1.0000000e+00 0.0000000e+00]
 [5.4860222e+01 1.6688479e+03 7.6102409e+01 ... 1.0000000e+00
  0.0000000e+00 1.0000000e+00]
 ...
 [8.8266850e+00 5.5599902e+02 3.8395134e+01 ... 1.0000000e+00
  1.0000000e+00 0.0000000e+00]
 [4.1213657e+01 6.8721558e+02 5.8986603e+01 ... 1.0000000e+00
  1.0000000e+00 0.0000000e+00]
 [5.6090240e+01 4.9808261e+02 5.1162266e+01 ... 0.0000000e+00
  1.0000000e

In [10]:
# Build per-column normalisation constants that line up with the columns of the
# one-hot encoded array `inputs`: float features are standardised with their
# mean and standard deviation, one-hot columns are left unchanged (mean 0, sigma 1).
meanslist = []
sigmalist = []
currentcolumn = 0  # column offset into the ENCODED array `inputs`
for ifeat, ncatfeat in enumerate(ncat_per_feature):
    if ncatfeat == 0: # for float features, get mean and sigma
        # Read the column from the encoded array: `currentcolumn` counts encoded
        # columns, so it only matches the un-encoded array while no categorical
        # feature precedes a float feature.
        column = inputs[:, currentcolumn]
        mean = np.mean(column, axis=0, dtype=np.float32).reshape(1,1)
        meanslist.append(mean)
        sigma = np.std(column, axis=0, dtype=np.float32).reshape(1,1)
        if sigma[0, 0] == 0:
            # constant column: avoid dividing by zero below
            sigma = np.ones_like(sigma)
        sigmalist.append(sigma)
        currentcolumn += 1
    else: # categorical features do not get changed
        mean = np.zeros(shape=(1, ncatfeat), dtype=np.float32)
        meanslist.append(mean)
        sigma = np.ones(shape=(1, ncatfeat), dtype=np.float32)
        sigmalist.append(sigma)
        currentcolumn += ncatfeat

# Row vectors with one entry per encoded column; they broadcast over the rows of `inputs`.
inputmeans = np.hstack(meanslist)
inputsigma = np.hstack(sigmalist)

normedinputs = (inputs-inputmeans) / inputsigma

print(inputmeans)
print(meanslist, sigmalist)
print(normedinputs)

[[ 52.657894 698.1734    58.784267  46.991592   0.         0.
    0.         0.         0.      ]]
[array([[52.657894]], dtype=float32), array([[698.1734]], dtype=float32), array([[58.784267]], dtype=float32), array([[46.991592]], dtype=float32), array([[0., 0., 0.]], dtype=float32), array([[0., 0.]], dtype=float32)] [array([[47.973377]], dtype=float32), array([[267.8131]], dtype=float32), array([[22.807163]], dtype=float32), array([[17.591656]], dtype=float32), array([[1., 1., 1.]], dtype=float32), array([[1., 1.]], dtype=float32)]
[[ 0.65336883  0.25108016  1.6533036  ...  0.          1.
   0.        ]
 [ 0.88748777  0.8451098   0.5368773  ...  1.          1.
   0.        ]
 [ 0.04590729  3.6244473   0.7593291  ...  1.          0.
   1.        ]
 ...
 [-0.9136569  -0.5308716  -0.8939794  ...  1.          1.
   0.        ]
 [-0.23855391 -0.04091594  0.00887157 ...  1.          1.
   0.        ]
 [ 0.0715469  -0.7471284  -0.33419332 ...  0.          1.
   0.        ]]


In [11]:
# Number of encoded columns after the first 4 (the four non-categorical features
# come first in featurevars, so this presumably counts the one-hot columns).
normedinputs.shape[1] - 4

5

In [13]:
# Small sample (first 10 rows of the encoded inputs) used for the experiments below.
a = inputs[0:10]
print(a.shape)

# Random ordering of the row indices; no seed is set, so the output differs between runs.
b = np.random.permutation(a.shape[0])
print(b)
# Columns 4 onward of one randomly chosen row (presumably its one-hot part; see the
# feature order in featurevars).
nextconditional = a[b[2],4:]
print(np.random.permutation(4))


(10, 9)
[4 9 2 6 7 0 3 8 5 1]
[2 3 1 0]


In [15]:
import tensorflow as tf

In [18]:
# Show the first column of the sample, the whole sample, and the first column
# reshaped into a single-column tensor (the last expression is the cell output).
print(a[:,0])
print(a)
tf.reshape(a[:,0],[-1,1])

[ 84.002205  95.23368   54.86022   37.8228    29.449005  57.611637
  40.065826  62.580334  59.275562 133.99251 ]
[[8.4002205e+01 7.6541595e+02 9.6491432e+01 8.3413086e+01 1.0000000e+00
  0.0000000e+00 0.0000000e+00 1.0000000e+00 0.0000000e+00]
 [9.5233681e+01 9.2450488e+02 7.1028915e+01 6.8117470e+01 0.0000000e+00
  0.0000000e+00 1.0000000e+00 1.0000000e+00 0.0000000e+00]
 [5.4860222e+01 1.6688479e+03 7.6102409e+01 4.6142303e+01 0.0000000e+00
  0.0000000e+00 1.0000000e+00 0.0000000e+00 1.0000000e+00]
 [3.7822800e+01 3.5504123e+02 3.8004093e+01 3.6965599e+01 0.0000000e+00
  1.0000000e+00 0.0000000e+00 0.0000000e+00 1.0000000e+00]
 [2.9449005e+01 5.5566846e+02 6.2723263e+01 4.5919170e+01 1.0000000e+00
  0.0000000e+00 0.0000000e+00 1.0000000e+00 0.0000000e+00]
 [5.7611637e+01 4.8155209e+02 5.8068291e+01 4.3973988e+01 0.0000000e+00
  1.0000000e+00 0.0000000e+00 1.0000000e+00 0.0000000e+00]
 [4.0065826e+01 9.4344019e+02 9.5859024e+01 9.1362709e+01 0.0000000e+00
  0.0000000e+00 1.0000000e+00

<tf.Tensor: shape=(10, 1), dtype=float32, numpy=
array([[ 84.002205],
       [ 95.23368 ],
       [ 54.86022 ],
       [ 37.8228  ],
       [ 29.449005],
       [ 57.611637],
       [ 40.065826],
       [ 62.580334],
       [ 59.275562],
       [133.99251 ]], dtype=float32)>

In [35]:
def invsigmoid(x):
    """Return log(p / (1 - p)) of ``x`` after clipping it to [1e-6, 1 - 1e-6].

    The clipping keeps the logarithm finite when ``x`` reaches 0 or 1.
    """
    bounded = tf.clip_by_value(x, 1e-6, 1.0-1e-6)
    odds = bounded/(1.0-bounded)
    return tf.math.log(odds)

# Prototype of a conditional transform built feature by feature.
# Symbolic input with 4+5 columns; everything from column 4 on is used as the condition.
xin = tf.keras.layers.Input(shape=(4+5, ))
xcondin = xin[:, 4:]
outlist = []  # one transformed column per processed feature

tfk = tf.keras
nafdim = 16  # width of the w1 / b1 / w2 layers below

for iv in range(2):
    # NOTE(review): the feature column is taken from the concrete sample array `a`
    # (10 rows, defined in an earlier cell), not from the symbolic input `xin`.
    # This is why the printed shapes carry a fixed batch size of 10 -- confirm
    # whether `xin[:, iv]` was intended.
    xiv = tf.reshape(a[:,iv],[-1,1])
    net = xiv
    condnet = xcondin

    print(net)

    # First conditioner: two hidden layers produce a positive scale w1 (softplus)
    # and an unconstrained shift b1, both of width nafdim.
    condnet = tfk.layers.Dense(128, activation=tf.nn.swish)(condnet)
    condnet = tfk.layers.Dense(128, activation=tf.nn.swish)(condnet)
    w1 = tfk.layers.Dense(nafdim, activation=tf.nn.softplus)(condnet)
    b1 = tfk.layers.Dense(nafdim, activation=None)(condnet)

    # nafdim sigmoid units applied to the single feature column (broadcast over columns).
    net1 = tf.nn.sigmoid(w1 * net + b1)
    print(w1)
    print(b1)
    print(net1)
    # Second conditioner (separate layers) produces the positive mixing weights w2.
    condnet = xcondin
    condnet = tfk.layers.Dense(128, activation=tf.nn.swish)(condnet)
    condnet = tfk.layers.Dense(128, activation=tf.nn.swish)(condnet)
    w2 = tfk.layers.Dense(nafdim, activation=tf.nn.softplus)(condnet)
    # Divide by the row sum so the weights add up to (almost) one; the 1e-3 offset
    # keeps the denominator away from zero.
    w2 = w2/ (1.0e-3 + tf.reduce_sum(w2, axis=1,keepdims=True)) # normalize

    # Weighted sum of the sigmoid units, mapped back through the inverse sigmoid.
    net = invsigmoid(tf.reduce_sum(net1 * w2, axis=1, keepdims=True))
    print(w2)
    print(net)
    outlist.append(net)
    print(xcondin,xiv)
    # The feature just processed is appended to the condition used for the next feature.
    xcondin = tf.concat([xcondin, xiv], axis=1)
    print(xcondin)


tf.Tensor(
[[ 84.002205]
 [ 95.23368 ]
 [ 54.86022 ]
 [ 37.8228  ]
 [ 29.449005]
 [ 57.611637]
 [ 40.065826]
 [ 62.580334]
 [ 59.275562]
 [133.99251 ]], shape=(10, 1), dtype=float32)
KerasTensor(type_spec=TensorSpec(shape=(None, 16), dtype=tf.float32, name=None), name='dense_109/Softplus:0', description="created by layer 'dense_109'")
KerasTensor(type_spec=TensorSpec(shape=(None, 16), dtype=tf.float32, name=None), name='dense_110/BiasAdd:0', description="created by layer 'dense_110'")
KerasTensor(type_spec=TensorSpec(shape=(10, 16), dtype=tf.float32, name=None), name='tf.math.sigmoid_15/Sigmoid:0', description="created by layer 'tf.math.sigmoid_15'")
KerasTensor(type_spec=TensorSpec(shape=(None, 16), dtype=tf.float32, name=None), name='tf.math.truediv_29/truediv:0', description="created by layer 'tf.math.truediv_29'")
KerasTensor(type_spec=TensorSpec(shape=(10, 1), dtype=tf.float32, name=None), name='tf.math.log_14/Log:0', description="created by layer 'tf.math.log_14'")
KerasTensor(ty

In [36]:
# Join the per-feature outputs collected in outlist into one tensor, one column per feature.
outputlayer_permuted = tf.concat(outlist, axis=1)
print(outputlayer_permuted)
# Disabled follow-up steps; `permuter` is not defined in the cells visible here.
#outputlayer = permuter.inverse(outputlayer_permuted)
#nextfeature = outputlayer

KerasTensor(type_spec=TensorSpec(shape=(10, 2), dtype=tf.float32, name=None), name='tf.concat_16/concat:0', description="created by layer 'tf.concat_16'")


In [47]:
# A single 5-entry condition row, repeated once per row of the mini-batch.
cond = [[1., 0., 0.,   1., 0., ]]
minibatch = 3
cond_to_append = np.repeat(cond, minibatch, axis=0)
print(cond_to_append)
# NOTE(review): this rebinds `xin` (earlier a Keras Input) to a NumPy slice:
# the first 4 columns of the first 3 sample rows.
xin = a[0:3,:4]
# Place the repeated condition next to the feature columns (cell output).
np.hstack((xin, cond_to_append))

[[1. 0. 0. 1. 0.]
 [1. 0. 0. 1. 0.]
 [1. 0. 0. 1. 0.]]


array([[8.40022049e+01, 7.65415955e+02, 9.64914322e+01, 8.34130859e+01,
        1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 1.00000000e+00,
        0.00000000e+00],
       [9.52336807e+01, 9.24504883e+02, 7.10289154e+01, 6.81174698e+01,
        1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 1.00000000e+00,
        0.00000000e+00],
       [5.48602219e+01, 1.66884790e+03, 7.61024094e+01, 4.61423035e+01,
        1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 1.00000000e+00,
        0.00000000e+00]])