# Maximum likelihood estimator
This notebook shows how to do maximum likelihood estimation (MLE) in TensorFlow,
based on <https://thelongrun.blog/2019/11/24/maximum-likelihood-with-tensorflow-2-0/>.

In [None]:
import tensorflow as tf
import tensorflow_probability as tfp
tfd = tfp.distributions

# Make no GPU visible to TensorFlow for the rest of the notebook.
tf.config.set_visible_devices([], 'GPU')
visible_devices = tf.config.get_visible_devices()
# Sanity check: none of the devices that remain visible may be a GPU.
for device in visible_devices:
    assert device.device_type != 'GPU'

This notebook shows how to do maximum likelihood estimation (MLE). For demonstration purposes, the data is generated from two distributions; I encourage you to try it with your own data instead.

The main libraries we use for this tutorial are TensorFlow and TensorFlow Probability, chosen because they make computing the maximum likelihood easy.

## Helper functions

In [None]:
# Maps a parameter name to the TensorFlow reduction that plot_curves applies to
# the data in order to draw that parameter's horizontal reference line.
params_to_func = {'mu': tf.math.reduce_mean, # Computes the mean of elements across dimensions of a tensor.
                  'sigma':tf.math.reduce_std, # Computes the standard deviation of elements across dimensions of a tensor.
                  'rate': tf.math.reduce_mean}

# Matplotlib colour codes; plot_curves picks one per parameter by position.
colors_encoding=['b','m','g']

In [None]:
# Functions to perform MLE


# The loss function, which for MLE is built from the likelihood function.
# Since optimization in TensorFlow is a minimization procedure, we use the negative
# of the log-likelihood (here averaged over the samples); minimizing it is equivalent to maximizing the likelihood.
# Working with logarithms avoids the underflow issues that arise when multiplying the likelihoods of hundreds of samples.
# This function depends on the data and on the parameters that have been declared.
def loss(model, data):
    """Return the negative mean log-probability of ``data`` under ``model``.

    Args:
        model: Object exposing ``log_prob(data)`` (in this notebook, a
            TensorFlow Probability distribution).
        data: Samples to score.

    Returns:
        Minus the mean, taken over all elements, of ``model.log_prob(data)``.
    """
    log_likelihoods = model.log_prob(data)
    return -tf.reduce_mean(log_likelihoods)
    
# A function to record the gradients and later use them to update the parameters during the learning process. 
# These gradients are calculated by differentiation of the loss function with respect to the parameters. 
# The tape object in Tensorflow takes care of everything.
# This process is called Automatic Differentiation
def grad(model, inputs):
    """Evaluate the loss and its gradients for the model's trainable variables.

    Args:
        model: Object accepted by ``loss`` that also exposes
            ``trainable_variables``.
        inputs: Data passed through to ``loss``.

    Returns:
        A ``(loss_value, gradients)`` pair, where ``gradients`` follows the
        order of ``model.trainable_variables``.
    """
    # Record the forward computation so it can be differentiated afterwards.
    with tf.GradientTape() as recorder:
        current_loss = loss(model, inputs)
    gradients = recorder.gradient(current_loss, model.trainable_variables)
    return current_loss, gradients


# During the training process, the grad function returns the loss and the gradients. 
# These are then used to update the parameters with the method apply_gradients of the optimizer. 
# The choice for this example is the Adam optimizer. 
# The loss and the parameter values are recorded every 10 iterations (and printed when verbose is set).
def mle_run(data, model, parameters, optimizer, steps=1000, verbose=False):
    """Fit ``model`` to ``data`` by minimising the loss with ``optimizer``.

    Args:
        data: Tensor of observed samples.
        model: Distribution-like object exposing ``log_prob``, ``prob`` and
            ``trainable_variables``.
        parameters: Variables whose values are recorded and printed. Each one
            is read as ``p.numpy()[0]``, so it must be indexable. They are
            expected to be the model's trainable variables.
        optimizer: Optimizer exposing ``apply_gradients`` and ``iterations``.
        steps: Number of optimisation steps to run.
        verbose: When True, print a progress line every 10 steps.

    Returns:
        A tuple ``(update_list, prob_values, value_range)``:
        ``update_list`` holds one ``(step, loss, *parameter_values)`` tuple per
        recorded step, ``prob_values`` holds ``model.prob(value_range)`` at
        those same steps, and ``value_range`` is a 100-point grid between the
        minimum and maximum of ``data``.
    """
    update_list = []
    prob_values = []
    # Fixed grid spanning the observed data, used to snapshot the fitted density.
    value_range = tf.linspace(tf.reduce_min(data), tf.reduce_max(data), 100)

    for i in range(steps):
        loss_value, grads = grad(model, data)
        # grad() differentiates w.r.t. model.trainable_variables, so the
        # gradients must be paired with that same sequence. Zipping them with
        # the caller's list would silently mis-assign gradients if its order
        # or length differed.
        optimizer.apply_gradients(zip(grads, model.trainable_variables))

        if i % 10 == 0:
            step = optimizer.iterations.numpy()
            # The loss was evaluated before this step's update; the parameter
            # values are read after it.
            update_list.append((
                step, loss_value.numpy(),
                *[p.numpy()[0] for p in parameters]))
            if verbose:
                # Only build the progress string when it is actually printed.
                param_str = ", ".join([p.name.split(':')[0]+": "+str(p.numpy()) for p in parameters])
                print(f"Step: {step}, initial loss: {loss_value.numpy()}, {param_str}")
            prob_values.append(model.prob(value_range))

    return update_list, prob_values, value_range

In [None]:
# Functions for plotting
# Code adapted from: https://alexgude.com/blog/matplotlib-blitting-supernova/
%matplotlib inline
from functools import partial
from collections import namedtuple
import matplotlib.pyplot as plt
plt.rcParams["animation.html"] = "jshtml"
from matplotlib import animation

def init_fig(f, ax, artists, data):
    """Draw the static part of the animation and return the animated artists.

    Plots a normalised 50-bin histogram of ``data`` on ``ax``, sets the title,
    axis labels and axis limits, and hands ``artists`` back unchanged so the
    function can serve as the ``init_func`` of ``FuncAnimation``.

    Args:
        f: Figure the axes belong to (not used here; kept for the interface).
        ax: Axes to draw the histogram on.
        artists: Artists updated on every frame; returned as-is.
        data: Tensor of samples exposing ``.numpy()``.

    Returns:
        The ``artists`` argument.
    """
    # Draw on the axes that were passed in rather than on pyplot's implicit
    # "current" axes, which is only the same object by coincidence.
    freq, _, _ = ax.hist(data.numpy(), bins=50, density=True)

    # Set axis and plot titles
    ax.set_title("Fitting data with Maximum Likelihood", fontsize=20)
    ax.set_xlabel("Values of the distribution", fontsize=14)
    ax.set_ylabel(r"$p(D|\theta)$", fontsize=14)

    # Pad the x-range by 10% of the magnitude of each extreme.
    data_min = tf.reduce_min(data)
    data_max = tf.reduce_max(data)
    ax.set_xlim((data_min - 0.1 * tf.math.abs(data_min)).numpy(),
                (data_max + 0.1 * tf.math.abs(data_max)).numpy())
    # The tallest histogram bar sets the top of the y-axis.
    ax.set_ylim(0., freq.max())

    return artists

def frame_iter(update_list, prob_values):
    """Yield one ``(step, probabilities)`` pair per recorded snapshot.

    Args:
        update_list: Sequence of records whose first element is the step.
        prob_values: Sequence of probability snapshots, aligned by position
            with ``update_list``.

    Yields:
        ``(update_list[k][0], prob_values[k])`` for each position ``k`` in
        ``prob_values``.
    """
    for position, probabilities in enumerate(prob_values):
        yield update_list[position][0], probabilities
        
def update_artists(frames, artists, value_range):
    """Refresh the animated artists for a single frame.

    Args:
        frames: A ``(step, probabilities)`` pair for the frame being drawn.
        artists: Object with a ``prob`` artist (supports ``set_data``) and a
            ``step`` artist (supports ``set_text``).
        value_range: Grid of x-values exposing ``.numpy()``.
    """
    step, probabilities = frames
    grid = value_range.numpy()
    artists.prob.set_data(grid, probabilities)
    artists.step.set_text("Step " + str(step))

# Function to plot learning curves
def plot_curves(update_lists, data, params_list):
    """Plot the loss and every parameter against the optimisation step.

    The first subplot shows the loss. Each following subplot shows one
    parameter's recorded values together with a horizontal line at the value
    obtained by applying that parameter's ``params_to_func`` entry to ``data``.

    Args:
        update_lists: Records of the form ``(step, loss, *parameter_values)``.
        data: Tensor of samples passed to the ``params_to_func`` reductions.
        params_list: Variables whose ``name`` (up to the first ``:``) labels
            the columns; each name must be a key of ``params_to_func``.
    """
    from pandas import DataFrame

    names = [param.name.split(':')[0] for param in params_list]
    history = DataFrame(update_lists, columns=["Step", "Loss", *names])

    # One row for the loss plus one row per parameter, sharing the step axis.
    n_rows = 1 + len(params_list)
    fig, axes = plt.subplots(n_rows, 1, sharex=True, figsize=(16, 8))
    history.plot(x="Step", y='Loss', style="r--X", ax=axes[0])

    for idx, name in enumerate(names):
        color = colors_encoding[idx]
        axis = axes[idx + 1]
        history.plot(x="Step", y=name, style=f"{color}--*", linewidth=3, ax=axis)
        # Reference value computed directly from the data.
        reference = params_to_func[name](data).numpy()
        axis.axhline(reference, color=color)
        axis.legend([f"MLE {name}", f"True {name}"])

    plt.tight_layout()
    

## MLE to Normal random variable
The first example generates data by sampling from a Normal distribution.

In [None]:
# Data
x = 0.66+tf.random.normal([5000])
# Two variables
mu = tf.Variable([0.], name="mu") # Mean of the distribution
sigma = tf.Variable([3.], name="sigma") # Standard deviation of the distribution

# The model is a probability distribution
model = tfd.Normal(loc=mu, scale=sigma)

In [None]:
# Adam optimizer constructed with 0.01 as its first argument
optimizer = tf.optimizers.Adam(0.01)
# Run 300 optimisation steps; ul = recorded updates, pv = density snapshots, vr = x-grid of those snapshots
ul, pv, vr = mle_run(x, model, [mu, sigma], optimizer, steps=300, verbose=True)

Plotting an animation of updates in the probability distribution fitted by the model

In [None]:
f, ax = plt.subplots(figsize=(18,10))
# prob: line showing the fitted density; step: text label showing the current step.
Artists = namedtuple("Artists",("prob","step"))
artists = Artists(ax.plot([], [], "r--",animated=True)[0],
                  ax.text(x=0.9*vr[-1].numpy(), y = 0.05, s=""))

# Bind the notebook state so FuncAnimation can call these with its own arguments.
init = partial(init_fig, f=f, ax=ax, artists=artists, data=x)
update = partial(update_artists, artists=artists, value_range=vr)
frame = partial(frame_iter, update_list=ul, prob_values=pv)

# `frames` is a generator function, so Matplotlib cannot infer how many frames
# it yields; pass the count explicitly so every recorded snapshot is rendered.
ani = animation.FuncAnimation(fig=f,
                              func=update,
                              frames = frame,
                              init_func = init,
                              save_count = len(pv))
ani

Now let's plot the learning curves:

In [None]:
# Loss, mu and sigma against the optimisation step, each parameter with a reference line computed from x
plot_curves(ul, x, [mu,sigma])

## MLE to Poisson random variable
The second example estimates the rate parameter from data generated with a discrete distribution, the Poisson.

In [None]:
# Draw 300 samples with tf.random.poisson, passing 0.5 as the rate argument
fake_samples = tf.random.poisson([300], 0.5)
# Round up to whole numbers and make sure the samples are float32
fake_samples = tf.cast(tf.math.ceil(fake_samples), tf.float32)

In [None]:
# The model is a probability distribution
rate = tf.Variable([5.], name="rate") # Give the parameter an initial value
model = tfd.Poisson(rate=rate)

In [None]:
# New Adam optimizer for this fit, constructed with 0.01 as its first argument
optimizer = tf.optimizers.Adam(0.01)
# Run 800 optimisation steps; ulp = recorded updates, pvp = density snapshots, vrp = x-grid of those snapshots
ulp, pvp, vrp = mle_run(fake_samples, model, [rate], optimizer, steps=800, verbose=True)

In [None]:
f, ax = plt.subplots(figsize=(15,10))
# prob: line showing the fitted distribution; step: text label showing the current step.
Artists = namedtuple("Artists",("prob","step"))
artists = Artists(ax.plot([], [], "r--",animated=True)[0],
                  ax.text(x=0.9*vrp[-1].numpy(), y = 0.05, s=""))

# Bind the notebook state so FuncAnimation can call these with its own arguments.
init = partial(init_fig, f=f, ax=ax, artists=artists, data=fake_samples)
update = partial(update_artists, artists=artists, value_range=vrp)
frame = partial(frame_iter, update_list=ulp, prob_values=pvp)

# `frames` is a generator function, so Matplotlib cannot infer how many frames
# it yields; pass the count explicitly so every recorded snapshot is rendered.
ani = animation.FuncAnimation(fig=f,
                              func=update,
                              frames = frame,
                              init_func = init,
                              save_count = len(pvp))

ani

And we plot the learning curves:

In [None]:
# Loss and rate against the optimisation step, with a reference line computed from fake_samples
plot_curves(ulp, fake_samples, [rate])