The code below is the model solution to exercises 1, 2 and 3. You need this solution in order to start exercise 4:

In [None]:
import uproot
import numpy as np

#open the signal file and both background files
signal_f = uproot.open('GluGluHToWWTo2L2Nu_M125.root')
bkg_f = tuple( uproot.open( file_name ) for file_name in ( 'DYJetsToLL_M-10to50-LO.root', 'DYJetsToLL_M-50-LO.root' ) )

#retrieve the 'Events' tree from the signal file and from every background file
sig_tree = signal_f[ 'Events' ]
bkg_trees = [ bkg_file[ 'Events' ] for bkg_file in bkg_f ]

#names of the tree branches that are read as input variables
#the order of this list fixes the order in which the branches are requested from the trees
input_variables = [
    'ptll',
    'mth',
    'jetpt1_cut',
    'uperp',
    'upara',
    'PfMetDivSumMet',
    'recoil',
    'mpmet',
    'mtw1',
    'mtw2',
    'PuppiMET_pt',
    'MET_pt',
    'TkMET_pt',
    'projtkmet',
    'projpfmet',
    'dphilljet_cut',
    'dphijet1met_cut',
    'dphillmet',
    'dphilmet1',
    'dphilmet2',
    'jetpt2_cut',
    'dphijet2met_cut',
    'dphilljetjet_cut',
    'dphijjmet_cut',
    'ptTOT_cut',
    'mTOT_cut',
    'PV_npvsGood',
]

#function to convert dictionary of arrays to 2D array
def arrayDictTo2DArray( array_dict ):
    """Stack the arrays stored in a dictionary as the columns of one array.

    Parameters
    ----------
    array_dict : mapping
        Maps keys to arrays that all have the same shape ( typically 1D
        arrays with one entry per event ).

    Returns
    -------
    numpy.ndarray or None
        A new array in which the arrays of the dictionary are stacked along
        axis 1, in the iteration order of the dictionary. For 1D inputs of
        length N and K keys the result has shape ( N, K ). None is returned
        when the dictionary is empty.
    """
    columns = [ array_dict[key] for key in array_dict ]

    #nothing to stack, mirror the 'no keys' result callers may already check for
    if not columns:
        return None

    #a single stack copies every column once, instead of re-copying the
    #growing result for each additional key
    return np.stack( columns, axis = 1 )


#function to normalize array of inputs ( center the distribution of every input variable at 0 and give it unit variance )
def normalizeInputArray( array2D ):
    """Standardize every column of a 2D array in place.

    Parameters
    ----------
    array2D : numpy.ndarray
        Floating point array of shape ( number of events, number of
        variables ). The array is modified in place, so an integer array
        can not be used.

    Returns
    -------
    numpy.ndarray
        The same array object, in which every column has mean 0 and
        standard deviation 1. Columns that are constant are only centered
        ( they become 0 everywhere ).
    """

    #the mean and standard deviation must be computed over the events ( axis 0 ), separately for every variable
    #keepdims retains a leading axis of size one so that the result broadcasts against the input
    array2D -= np.mean( array2D, axis = 0, keepdims = True )
    std = np.std( array2D, axis = 0, keepdims = True )

    #a constant column has zero standard deviation, dividing by it would fill the column with NaN
    std[ std == 0 ] = 1.
    array2D /= std
    return array2D

#build signal and background arrays
#first read the unnormalized inputs of the signal tree and of every background tree
raw_data_list = [ arrayDictTo2DArray( tree.arrays( input_variables ) ) for tree in [ sig_tree ] + bkg_trees ]

#normalize all samples together, so that one and the same transformation is applied to every event
#normalizing each sample on its own would map identical raw values to different network inputs depending on the sample they come from
sample_boundaries = np.cumsum( [ len( raw_data ) for raw_data in raw_data_list ] )[:-1]
normalized_data_list = np.split( normalizeInputArray( np.concatenate( raw_data_list, axis = 0 ) ), sample_boundaries, axis = 0 )

#the signal is the first sample, the remaining ones are the backgrounds ( in the order of bkg_trees )
signal_data = normalized_data_list[0]
background_data_list = normalized_data_list[1:]

#merge arrays for two different backgrounds
background_data = np.concatenate( background_data_list, axis = 0 )


#names of the tree branches whose product gives the total weight of an event
weight_variables = [
    'XSWeight',
    'SFweight2l',
    'LepSF2l__ele_mvaFall17V1Iso_WP90__mu_cut_Tight_HWWW',
    'LepCut2l__ele_mvaFall17V1Iso_WP90__mu_cut_Tight_HWWW',
    'GenLepMatch2l',
    'METFilter_MC',
]

#function to build the total weight array for one tree
def getWeightArray( tree, weight_variables ):
    """Multiply the arrays of several weight branches of one tree.

    Parameters
    ----------
    tree : object
        Must provide an array( name ) method that returns the array of the
        branch with the given name.
    weight_variables : iterable of str
        Names of the branches to multiply.

    Returns
    -------
    numpy.ndarray or None
        Newly allocated float64 array holding the element-wise product of
        all requested branches, or None when weight_variables is empty.
    """
    weight_array = None
    for key in weight_variables:
        if weight_array is None:
            #np.array copies, so the array handed out by the tree is never modified by the in-place product below
            #float64 makes the product independent of the type ( integer, boolean, ... ) of the first branch
            weight_array = np.array( tree.array( key ), dtype = np.float64 )
        else :
            weight_array *= np.asarray( tree.array( key ), dtype = np.float64 )
    return weight_array

#signal : total weight per event, rescaled to a mean of one to avoid large numerical scales,
#and a label of 1 for every event
signal_weights = getWeightArray( sig_tree, weight_variables )
signal_weights /= np.mean( signal_weights )
signal_labels = np.ones( len( signal_weights ) )

#background : same procedure on the weights of all background trees merged into one array,
#with a label of 0 for every event
bkg_weights = np.concatenate( [ getWeightArray( tree, weight_variables ) for tree in bkg_trees ], axis = 0 )
bkg_weights /= np.mean( bkg_weights )
bkg_labels = np.zeros( len( bkg_weights ) )

#the labels defined above are what the neural network will try to predict



Exercise 4: Now randomly shuffle the input data, weights and labels simultaneously, for both signal and background.

Exercise 4: Now split all signal and background information into training, validation and test sets. Do this by writing a function that splits an array, given a validation fraction and a test fraction.

Exercise 5: Now merge signal and background arrays for the training, validation and test sets, and randomize each of them after merging.

Exercise 5: You are now done reading in the data! Now we will feed it into a (currently VERY VERY BAD) neural network, train it and evaluate its performance. Just fill in the names of your own datasets wherever 'train_data', etc. appear below, and run the code.

In [None]:
#Now we have the datasets we need for training a neural network!
#Given below is a ( currently VERY VERY BAD!!!) neural network that will be trained

#keras modules
from keras import models
from keras import layers
from keras import optimizers
from keras import losses

#one hidden layer of 64 nodes with a linear activation, followed by a single sigmoid output node
network = models.Sequential( [
    layers.Dense( 64, activation = 'linear' ),
    layers.Dense( 1, activation = 'sigmoid' ),
] )

#to use auc as a keras metric
import tensorflow as tf

#binary cross-entropy loss minimized with Nadam, while monitoring the accuracy and the AUC
network.compile( optimizer = optimizers.Nadam(), loss = losses.binary_crossentropy, metrics = [ 'accuracy', tf.keras.metrics.AUC() ] )

#train for 10 epochs in batches of 1024 events, weighting every event by its sample weight
#the validation set is evaluated after every epoch, and the per-epoch metrics are kept in history
history = network.fit(
    x = train_data,
    y = train_labels,
    sample_weight = train_weights,
    validation_data = ( val_data, val_labels, val_weights ),
    batch_size = 1024,
    epochs = 10,
    verbose = 1,
)

#network output for signal and background events, separately for the training and validation sets
train_output_signal, train_output_bkg, val_output_signal, val_output_bkg = [
    network.predict( dataset ) for dataset in ( signal_data_train, bkg_data_train, signal_data_val, bkg_data_val )
]


#plot ROC curve and compute AUC
from diagnosticPlotting import (
    plotKerasMetricComparison,
    plotROC,
    computeROC,
    plotOutputShapeComparison,
    areaUnderCurve,
)

#the ROC curve is evaluated from the weighted network outputs of the validation set
sig_eff, bkg_eff = computeROC(
    val_output_signal, signal_weights_val,
    val_output_bkg, bkg_weights_val,
    num_points = 10000
)
plotROC( sig_eff, bkg_eff, 'roc' )

#print the area under the ROC curve between two banner lines
banner = '######################################'
print( banner )
roc_integral = areaUnderCurve( sig_eff, bkg_eff )
print( 'ROC INTEGRAL = {}'.format( roc_integral ) )
print( banner )

#compare the weighted network output between the training and validation sets, for signal and for background
plotOutputShapeComparison(
    train_output_signal, signal_weights_train,
    train_output_bkg, bkg_weights_train,
    val_output_signal, signal_weights_val,
    val_output_bkg, bkg_weights_val,
    'model'
)