In [1]:
import tensorflow as tf
from tensorflow.keras import layers 
from tensorflow.keras.regularizers import l2
from tensorflow.keras import initializers
from tensorflow.keras import optimizers
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
import sklearn
import pandas as pd
import numpy as np
# Record the TensorFlow version this notebook was executed with.
print(tf.__version__)

import matplotlib.pyplot as plt
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline

2.3.1


Deep Learning (p. 5)

We explored the use of deep neural networks as a practical tool for applications in high-energy physics. Hyper-parameters were chosen using a subset of the HIGGS data consisting of 2.6 million training examples and 100,000 validation examples. Due to computational costs, this optimization was not thorough, but included combinations of the pre-training methods, network architectures, initial learning rates, and regularization methods shown in Supplementary Table 1. We selected a five-layer neural network with 300 hidden units in each layer, a learning rate of 0.05, and a weight decay coefficient of $1 \times 10^{-5}$. Pre-training, extra hidden units, and additional hidden layers significantly increased training time without noticeably increasing performance. To facilitate comparison, shallow neural networks were trained with the same hyper-parameters and the same number of units per hidden layer. Additional training details are provided in the Methods section below. The hyper-parameter optimization was performed using the full set of HIGGS features. To investigate whether the neural networks were able to learn the discriminative information contained in the high-level features, we trained separate classifiers for each of the three feature sets described above: low-level, high-level, and combined feature sets. For the SUSY benchmark, the networks were trained with the same hyper-parameters chosen for the HIGGS, as the datasets have similar characteristics and the hyper-parameter search is computationally expensive.

Performance (p. 6)
Classifiers were tested on 500,000 simulated examples generated from the same Monte Carlo procedures as the training sets. We produced Receiver Operating Characteristic (ROC) curves to illustrate the performance of the classifiers. Our primary metric for comparison is the area under the ROC curve (AUC), with larger AUC values indicating higher classification accuracy across a range of threshold choices. This metric is insightful, as it is directly connected to classification accuracy, which is the quantity optimized for in training. In practice, physicists may be interested in other metrics, such as signal efficiency at some fixed background rejection, or discovery significance as calculated by p-value in the null hypothesis. We choose AUC as it is a standard in machine learning, and is closely correlated with the other metrics. In addition, we calculate discovery significance – the standard metric in high-energy physics – to demonstrate that small increases in AUC can represent significant enhancement in discovery significance.

Neural Network Training (p. 9)

In training the neural networks, the following hyperparameters were predetermined without optimization. Hidden units all used the tanh activation function. Weights were initialized from a normal distribution with zero mean and standard deviation 0.1 in the first layer, 0.001 in the output layer, and 0.05 all other hidden layers. Gradient computations were made on mini-batches of size 100. A momentum term increased linearly over the first 200 epochs from 0.9 to 0.99, at which point it remained constant. The learning rate decayed by a factor of 1.0000002 every batch update until it reached a minimum of $10^{-6}$. Training ended when the momentum had reached its maximum value and the minimum error on the validation set (500,000 examples) had not decreased by more than a factor of 0.00001 over 10 epochs. This early stopping prevented overfitting and resulted in each neural network being trained for 200-1000 epochs. Autoencoder pretraining was performed by training a stack of single-hidden-layer autoencoder networks as in [9], then fine-tuning the full network using the class labels. Each autoencoder in the stack used tanh hidden units and linear outputs, and was trained with the same initialization scheme, learning algorithm, and stopping parameters as in the fine-tuning stage. When training with dropout, we increased the learning rate decay factor to 1.0000003, and only ended training when the momentum had reached its maximum value and the error on the validation set had not decreased for 40 epochs.

In [2]:
# Import Data
# The HIGGS CSV ships without a header row, so the 29 column labels are
# assigned by hand: the class label first, then the low-level kinematic
# features, then the seven derived mass features (m_*).
df = pd.read_csv('HIGGS/HIGGS.csv', header=None)
df.columns = [
    'target',
    # Fixed typo: was 'lepton_ph'; this is the lepton transverse momentum,
    # named like the jet_*_pt columns below.
    'lepton_pt', 'lepton_eta', 'lepton_phi',
    'missing_energy_magnitude', 'missing_energy_phi',
    'jet_1_pt', 'jet_1_eta', 'jet_1_phi', 'jet_1_btag',
    'jet_2_pt', 'jet_2_eta', 'jet_2_phi', 'jet_2_btag',
    'jet_3_pt', 'jet_3_eta', 'jet_3_phi', 'jet_3_btag',
    'jet_4_pt', 'jet_4_eta', 'jet_4_phi', 'jet_4_btag',
    'm_jj', 'm_jjj', 'm_lv', 'm_jlv', 'm_bb', 'm_wbb', 'm_wwbb',
]
# Summarise row count, column dtypes and memory footprint.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11000000 entries, 0 to 10999999
Data columns (total 29 columns):
 #   Column                    Dtype  
---  ------                    -----  
 0   target                    float64
 1   lepton_ph                 float64
 2   lepton_eta                float64
 3   lepton_phi                float64
 4   missing_energy_magnitude  float64
 5   missing_energy_phi        float64
 6   jet_1_pt                  float64
 7   jet_1_eta                 float64
 8   jet_1_phi                 float64
 9   jet_1_btag                float64
 10  jet_2_pt                  float64
 11  jet_2_eta                 float64
 12  jet_2_phi                 float64
 13  jet_2_btag                float64
 14  jet_3_pt                  float64
 15  jet_3_eta                 float64
 16  jet_3_phi                 float64
 17  jet_3_btag                float64
 18  jet_4_pt                  float64
 19  jet_4_eta                 float64
 20  jet_4_phi             

# Clean Data

In [12]:
# Draw a reproducible random subset of one million rows from the full frame;
# the fixed random_state makes the same rows come back on every run.
train = df.sample(
    random_state=123,
    n=10**6,
)

In [13]:
# Split the sampled rows into the label column and the feature columns.
y = train['target']
pre_X = train.drop(columns='target')

# Rescale each feature column to the [0, 1] range.
# NOTE(review): the scaler is fitted on the whole sample, i.e. before the
# train/test split, so held-out rows influence the min/max used for scaling.
# Consider fitting on the training split only.
scaler = MinMaxScaler()
scaled_train = scaler.fit_transform(pre_X)

# fit_transform returns a bare array, so the row labels must be restored
# explicitly. Without index=..., X would get a fresh 0..n-1 index while y keeps
# the sampled row labels, and any label-based operation (concat, join, .loc)
# would silently misalign features and targets.
X = pd.DataFrame(data=scaled_train, columns=pre_X.columns, index=pre_X.index)

In [14]:
# Hold out 20% of the rows for validation; the seed keeps the split stable.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=1776,
    test_size=0.20,
)

# Keras Model Definition

In [7]:
# Count the GPUs TensorFlow can see. Uses the stable
# tf.config.list_physical_devices API rather than the deprecated
# tf.config.experimental alias; the returned list is the same.
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

Num GPUs Available:  1


In [8]:
# Dump every device (CPU/GPU/XLA) that the local TensorFlow runtime exposes.
# NOTE(review): tensorflow.python.* is not part of the public API surface and
# may change between releases.
from tensorflow.python.client import device_lib

print(
    device_lib.list_local_devices()
)

[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 6171675237797197555
, name: "/device:XLA_CPU:0"
device_type: "XLA_CPU"
memory_limit: 17179869184
locality {
}
incarnation: 18003363401230734884
physical_device_desc: "device: XLA_CPU device"
, name: "/device:GPU:0"
device_type: "GPU"
memory_limit: 6495077664
locality {
  bus_id: 1
  links {
  }
}
incarnation: 2940008660304954004
physical_device_desc: "device: 0, name: GeForce RTX 2080, pci bus id: 0000:07:00.0, compute capability: 7.5"
, name: "/device:XLA_GPU:0"
device_type: "XLA_GPU"
memory_limit: 17179869184
locality {
}
incarnation: 13234590549812127673
physical_device_desc: "device: XLA_GPU device"
]


In [15]:
# Fully connected binary classifier following the training notes quoted above:
#   * four tanh hidden layers of 300 units plus a single sigmoid output unit
#   * zero-mean normal weight init with stddev 0.1 (first layer),
#     0.05 (other hidden layers) and 0.001 (output layer)
#   * weight decay of 1e-5, applied here as an L2 penalty on every kernel.
#     The original cell imported `l2` but never used it, so the model was
#     trained without any weight decay.
model = tf.keras.models.Sequential([
  # Input is already a flat vector of 28 features; Flatten only declares the
  # input shape here.
  tf.keras.layers.Flatten(input_shape=(28,)),
  tf.keras.layers.Dense(300, activation='tanh',
                        kernel_initializer=initializers.RandomNormal(stddev=0.1),
                        kernel_regularizer=l2(1e-5)),
  tf.keras.layers.Dense(300, activation='tanh',
                        kernel_initializer=initializers.RandomNormal(stddev=0.05),
                        kernel_regularizer=l2(1e-5)),
  tf.keras.layers.Dense(300, activation='tanh',
                        kernel_initializer=initializers.RandomNormal(stddev=0.05),
                        kernel_regularizer=l2(1e-5)),
  tf.keras.layers.Dense(300, activation='tanh',
                        kernel_initializer=initializers.RandomNormal(stddev=0.05),
                        kernel_regularizer=l2(1e-5)),
  tf.keras.layers.Dense(1, activation='sigmoid',
                        kernel_initializer=initializers.RandomNormal(stddev=0.001),
                        kernel_regularizer=l2(1e-5))
])

In [16]:
# Plain SGD with momentum, learning rate 0.05 as quoted in the notes above.
# The previous momentum=1e-5 was the weight-decay coefficient passed to the
# wrong argument, which effectively disabled momentum. 0.9 is the starting
# momentum of the quoted training procedure.
# NOTE(review): the quoted procedure also ramps momentum from 0.9 to 0.99 and
# decays the learning rate per batch; neither schedule is implemented here.
opt = tf.keras.optimizers.SGD(learning_rate=0.05, momentum=0.9)

# Binary cross-entropy matches the single sigmoid output; AUC is the
# comparison metric named in the performance notes.
model.compile(optimizer=opt,
              loss='binary_crossentropy',
              metrics=['AUC'])

In [None]:
# Model Training

In [18]:
# Train for 10 epochs, validating on the held-out split after each epoch.
# batch_size is 100, the mini-batch size given in the training notes. The
# previous value of 13 meant roughly 8x more gradient updates per epoch and a
# correspondingly slow run.
# The returned History object is kept so the per-epoch loss/AUC curves can be
# inspected or plotted afterwards.
history = model.fit(
    x_train,
    y_train,
    epochs=10,
    validation_data=(x_test, y_test),
    batch_size=100,
)

Epoch 1/10
Epoch 2/10

KeyboardInterrupt: 

In [None]:
# Evaluation