In [None]:
import numpy as np
import tensorflow as tf
from matplotlib import pyplot as plt
import scipy

## Ingredients for ML

   * A large, curated dataset
   * A model, taking inputs and making predictions (e.g. a neural network)
   * A loss, evaluating how well the model is performing, including a regularization to constrain the model
   * A minimization procedure, to optimize the loss tuning the model parameters
   * Several metrics, to evaluate the performance of the trained model
   * Powerful hardware

## Automatic differentiation
The key ingredient for optimizing a neural network is the ability to compute the gradient of the loss with respect to the parameters of the model. This is achieved with automatic differentiation.

In [None]:
def f(x):
    """Return the squared distance of ``x`` from 1, i.e. ``(x - 1) ** 2``.

    Only arithmetic operators are used, so ``x`` can be a plain number or any
    array/tensor-like object that supports subtraction and integer powers.
    """
    shifted = x - 1
    return shifted ** 2

### In Tensorflow

In [None]:
# Evaluate f inside a GradientTape so that the operations applied to the
# variable are recorded and can be differentiated afterwards.
x = tf.Variable(3.0)
with tf.GradientTape() as tape:
    y = f(x)

# dy/dx evaluated at the current value of x; shown as the cell output
dy_dx = tape.gradient(y, x)
dy_dx

### In Autograd

In [None]:
import autograd

# autograd.grad wraps f into a new function that returns df/dx
# (argnum=0: differentiate with respect to the first positional argument)
f_dx = autograd.grad(f, argnum=0)
f_dx(3.0)

### In Jax

In [None]:
import jax

# jax.grad wraps f into a new function that returns df/dx
# (argnums=0: differentiate with respect to the first positional argument)
f_dx = jax.grad(f, argnums=0)
f_dx(3.0)

## Using control flow
You can differentiate functions that contain control flow (`if`/`for`/...).

In [None]:
def f(x):
    """Piecewise test function mixing a branch and a loop.

    * If ``x > 2`` does not hold: returns ``cos(x ** 3)``.
    * Otherwise: applies ``x <- x + sqrt(x)`` ten times and returns the
      result divided by 10.

    The branch is chosen with a regular Python ``if`` on the value of ``x``.
    """
    # Guard clause for the "small x" branch
    if not (x > 2):
        return jax.numpy.cos(x ** 3)

    for _ in range(10):
        x += jax.numpy.sqrt(x)
    return x / 10.
    
f_dx = jax.grad(f)

xspace = np.linspace(-2., 5, 500)

# Create the axes *before* drawing anything: a bare plt.plot() issued before
# plt.subplots() draws on a separate implicit figure, so the function curve
# would be missing from this axes and from its legend.
fig, ax = plt.subplots(figsize=(10, 5))

# The function and its derivative are evaluated point by point
yi = np.asarray([f(xx) for xx in xspace])
ax.plot(xspace, yi, label='function')
yi = np.asarray([f_dx(xx) for xx in xspace])
ax.plot(xspace, yi, label='derivative')

ax.set_xlabel('x')
ax.legend()
plt.show()

## Simplest neural network
Consider a fully connected neural network with one single layer. Each neuron $i$ takes a vector $x\in \mathbb{R}^N$ and returns as output $\sigma(W^{(i)} \cdot  x + b^{(i)})$, where $W^{(i)}\in\mathbb{R}^N$ is a vector of weights and $b^{(i)}\in\mathbb{R}$ is the bias. $\sigma$ is the response function and it must be non-linear. If we stack the output of all the $L$ neurons in a vector $y$ (the response of the layer):

$$
y = \sigma(W x + b)
$$

In the formula above, $\sigma$ is applied element-wise to the vector in the parentheses. Here $W\in\mathbb{R}^{L\times N}$, while $y, b\in\mathbb{R}^L$.

## Deep dense neural network

We can stack several layers:

$$
y_1 = \sigma_{L1}(W^{L1} x + b^{L1}) \\
y_2 = \sigma_{L2}(W^{L2} y_1 + b^{L2}) \\
\ldots
$$

## Activation functions

In [None]:
fig, ax = plt.subplots(figsize=(10, 8))
xspace = np.linspace(-2, 2, 100)
# Look up each activation by name in tf.keras.activations. A dedicated variable
# name is used so that the notebook's function `f` is not overwritten by the
# last activation of the loop.
for fname in 'relu', 'elu', 'gelu', 'selu', 'swish', 'tanh':
    activation = getattr(tf.keras.activations, fname)
    ax.plot(xspace, activation(xspace), label=fname)
ax.set_xlabel('x')
ax.set_ylabel('activation(x)')
ax.legend(ncol=2, fontsize=20)

## Keras functional API

In [None]:
# Functional API: every layer is called on the output tensor of the previous one.
# The trailing comma matters: (4) is just the integer 4, (4,) is a 1-tuple shape.
x_input = tf.keras.Input(shape=(4,))
x = tf.keras.layers.Dense(64, activation="relu")(x_input)
x = tf.keras.layers.Dense(32, activation="relu")(x)
# Softmax normalises across the units of the layer, so with a single unit it
# returns 1 for every input. A single-unit output needs a sigmoid instead.
x = tf.keras.layers.Dense(1, activation="sigmoid")(x)

model = tf.keras.Model(inputs=x_input, outputs=x)
model.summary()

In [None]:
# One sample with four features; the leading axis added here is the batch axis,
# so the resulting array has shape (1, 4).
test_input = np.array([1, 2, 3, 4])[np.newaxis, :]
model(test_input)

In [None]:
# A batch made of the same four-feature sample repeated twice: shape (2, 4)
sample = [1, 2, 3, 4]
test_input = np.array([sample, sample])
model(test_input)

## Keras Sequential API

In [None]:
# Sequential API: the layers are listed in the order in which they are applied.
model = tf.keras.Sequential(
    [
        tf.keras.Input(shape=(4, )),
        tf.keras.layers.Dense(64, activation="relu"),
        tf.keras.layers.Dense(32, activation="relu"),
        # Softmax normalises across the units of the layer, so with a single
        # unit it returns 1 for every input. Use a sigmoid for one output unit.
        tf.keras.layers.Dense(1, activation="sigmoid"),
    ]
)

tf.keras.utils.plot_model(model, show_layer_activations=True)

## Not only ML



### Statistics

Let us define the likelihood of a counting experiment with one category, one signal and an uncertainty on the background. The parameters are the POI (the signal strength) and the NP describing the background uncertainty.

In [None]:
import jax.numpy as jnp  # `jnp` is used below and is not imported anywhere else
import pyhf

pyhf.set_backend('jax')

# make a counting experiment
model = pyhf.simplemodels.uncorrelated_background(signal=[5.], bkg=[10.], bkg_uncertainty=[3.5])
pars = jnp.array(model.config.suggested_init())

# generate an Asimov dataset (e.g. 15 events observed)
data = jnp.array(model.expected_data(model.config.suggested_init()))

bestfit = pyhf.infer.mle.fit(data, model)  # not really needed since it is an Asimov
bestfit

In [None]:
# Hessian of the *negative log-likelihood* (-log L) at the best-fit point.
# The [0] drops the leading output axis of logpdf.
# Note: the covariance is the inverse Hessian of -log L, not of -2 log L;
# inverting the Hessian of -2 log L would give half of the covariance.
H = -jax.hessian(model.logpdf)(bestfit, data)[0]

# Covariance matrix of the parameters; the square roots of its diagonal are
# the expected parameter errors.
np.linalg.inv(H)

We are able to compute the expected errors without any minimization!

Plot the likelihood as a function of the parameters with ***the gradient***

In [None]:
# Fine 101 x 101 grid over [0.5, 1.5] x [0.5, 1.5] in the two model parameters
# (presumably POI on x and NP on y, following the parameter order — TODO confirm)
grid = x, y = np.meshgrid(np.linspace(0.5, 1.5, 101), np.linspace(0.5, 1.5, 101))

# Flatten the grid into one (x, y) pair per row
points = np.swapaxes(grid,0,-1).reshape(-1,2)
# Evaluate logpdf for every row of `points` (axis 0) with the same `data` (None)
v = jax.vmap(model.logpdf, in_axes = (0,None))(points,data)
# Back to a 101 x 101 array; the swap undoes the axis swap applied to `grid`
# above so that v is laid out like x and y
v = np.swapaxes(v.reshape(101,101),0,-1)

fig, ax = plt.subplots()
# Filled contours of the log-likelihood with white contour lines on top
ax.contourf(x,y,v, levels = 100)
ax.contour(x,y,v, levels = 20, colors = 'w')


# Coarser 11 x 11 grid on the same range, used for the gradient arrows
grid = x,y = np.meshgrid(np.linspace(0.5, 1.5, 11), np.linspace(0.5, 1.5, 11))
points = np.swapaxes(grid,0,-1).reshape(-1,2)
# value_and_grad needs a scalar output, hence the [0] on logpdf; the gradient
# is taken with respect to the first argument (the parameters p)
values, gradients = jax.vmap(
    jax.value_and_grad(
        lambda p,d: model.logpdf(p,d)[0]
    ), in_axes = (0,None)
)(points,data)

# One arrow per coarse-grid point, with the two gradient components as (u, v)
ax.quiver(
    points[:,0],
    points[:,1],
    gradients[:,0],
    gradients[:,1],
    angles = 'xy',
    scale = 75
)
# Mark the best-fit parameters in red
ax.scatter(bestfit[0],bestfit[1], c = 'r')

ax.set_xlim(0.5,1.5)
ax.set_ylim(0.5,1.5)
ax.set_aspect('equal')

## Heavy number crunching
Even if the interface to most ML libraries is in Python, the expressions (the model, but also the minimization steps, the preprocessing, ...) are represented as a computational graph, which is optimized, compiled and distributed to the optimal hardware (CPU/GPU/TPU).

In [None]:
# Draw a Mandelbrot set (left) and a Julia set (right) on the same grid of the
# complex plane, iterating z <- z*z + c with TensorFlow on the whole grid at once.
ymin, ymax = -1.5, 1.5
xmin, xmax = -1.5, 1.5

nx, ny = 500, 500

X, Y = np.meshgrid(np.linspace(xmin, xmax, nx), np.linspace(ymin, ymax, ny))
Z = X + 1j * Y

# Grid of complex numbers
xs = tf.constant(Z.astype(np.complex64))

# Z-values for determining divergence; initialized at zero
zs = tf.zeros_like(xs)

# N-values count, per grid point, the iterations for which |z| stayed below 4
ns = tf.Variable(tf.zeros_like(xs, tf.float32))

def step(c, z, n):
    """Perform one iteration ``z <- z*z + c`` and update the counter ``n``.

    ``n`` is increased by 1 wherever the new ``|z|`` is below 4.
    Returns the tuple ``(c, z, n)`` with ``c`` unchanged.
    """
    z = z * z + c
    
    not_diverged = tf.abs(z) < 4
    n = tf.add(n, tf.cast(not_diverged, tf.float32))
    
    return c, z, n

fig, axs = plt.subplots(1, 2, figsize=(15, 7))
iterations = 1000

# Mandelbrot: c is the grid (xs) and z starts from zero (zs)
for _ in range(iterations): 
    xs, zs, ns = step(xs, zs, ns)

def shade_fractal(fractal):
    """Turn iteration counts into values to plot.

    Counts equal to 0 are replaced by the module-level ``iterations``, the
    array is divided by its maximum and the base-10 logarithm is returned.
    """
    fractal = np.where(fractal == 0, iterations, fractal)
    fractal = fractal / fractal.max()
    fractal = np.log10(fractal)  
    return fractal

axs[0].pcolormesh(X, Y, shade_fractal(ns), shading='gouraud')    

# Julia: the roles are swapped with respect to the Mandelbrot loop. The
# constant is passed as c and the grid xs as the starting z.
# NOTE: this zs is not read below; the first unpacking in the loop rebinds it.
zs = tf.zeros_like(xs)
ns = tf.Variable(tf.zeros_like(xs, tf.float32))

for _ in range(iterations): 
    # step returns (c, z, n): zs receives the constant, and xs is overwritten
    # with the iterated values (it no longer holds the original grid afterwards)
    zs, xs, ns = step(-0.7269 + 0.1889j, xs, ns)
    
axs[1].pcolormesh(X, Y, shade_fractal(ns), shading='gouraud')    

for ax in axs:
    ax.set_aspect('equal')

## Hardware

<img src="imgs/ai-and-compute-all-error-no-title.png">

from https://arxiv.org/abs/2005.04305
<img src="imgs/ai-and-efficiency-compute.png">

GPT-3 175B model (175B parameters) required $3.14\times 10^{23}$ flop for training. Even at theoretical 28 TFLOPS for V100 GPU (1 = 10k\\$) and lowest 3 year reserved cloud pricing we could find, this will take 355 GPU-years and cost \\$4.6M for a single training.

<img src="imgs/gpt3_table.png">