In [1]:
import numpy as np
import torch as t
from torch.distributions import MultivariateNormal as MvNormal
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
%matplotlib widget
from ipywidgets import FloatSlider, IntSlider, interact, interact_manual

$$
\newcommand{\bracket}[3]{\left#1 #3 \right#2}
\newcommand{\b}{\bracket{(}{)}}
\newcommand{\Bernoulli}{{\rm Bernoulli}\b}
\newcommand{\x}{\mathbf{x}}
\newcommand{\X}{\mathbf{X}}
\newcommand{\m}{\boldsymbol{\mu}}
\newcommand{\P}{{\rm P}\b}
\newcommand{\dd}[2][]{\frac{\partial #1}{\partial #2}}
\newcommand{\S}{\mathbf{\Sigma}}
\newcommand{\Sh}{\mathbf{\hat{\Sigma}}}
\newcommand{\mh}{\boldsymbol{\hat{\mu}}}
\newcommand{\N}{\mathcal{N}\b}
\newcommand{\det}{\bracket{\lvert}{\rvert}}
\newcommand{\sb}{\bracket{[}{]}}
\newcommand{\E}{\mathbb{E}\sb}
\newcommand{\Var}{{\rm Var}\sb}
\newcommand{\Cov}{{\rm Cov}\sb}
\DeclareMathOperator*{\argmax}{arg\,max}
\DeclareMathOperator*{\argmin}{arg\,min}
\newcommand{\ph}{\hat{p}}
\newcommand{\at}{\bracket{.}{\rvert}}
\newcommand{\w}{\mathbf{w}}
\newcommand{\W}{\mathbf{W}}
\newcommand{\Wh}{\mathbf{\hat{W}}}
\newcommand{\Y}{\mathbf{Y}}
\newcommand{\L}{\mathcal{L}}
\newcommand{\wh}{\mathbf{\hat{w}}}
\newcommand{\y}{\mathbf{y}}
\newcommand{\0}{\mathbf{0}}
\newcommand{\I}{\mathbf{I}}
\newcommand{\La}{\mathbf{\Lambda}}
\newcommand{\Sprior}{\S_\text{prior}}
\newcommand{\Spost}{\S_\text{post}}
\newcommand{\mprior}{\m_\text{prior}}
\newcommand{\mpost}{\m_\text{post}}
\newcommand{\Xt}{\tilde{\X}}
\newcommand{\yt}{\tilde{\y}}
$$

<h1> Question sheet 1: maximum likelihood regression </h1>

<h2> Question 1 </h2>

Derive the regularised maximum likelihood solution to the following optimization problem,

\begin{align}
  \L(\w) &= \log \P{\y| \X, \w} - \tfrac{1}{2} \w^T \La \w
\end{align}

<h3> Answer </h3>

We begin by taking the gradient of $\log \P{\y| \X, \w}$ from the notes,

\begin{align}
  \dd[\log \P{\y| \X, \w}]{\w} &= \tfrac{1}{\sigma^2} \X^T \b{\y - \X \w}
\end{align}

Next, we consider the gradient of the second term,

\begin{align}
  \dd{w_\alpha} \sb{-\tfrac{1}{2} \sum_{ij} w_i \Lambda_{ij} w_j} 
  &= -\tfrac{1}{2} \sb{\sum_{ij} \dd[w_i]{w_\alpha} \Lambda_{ij} w_j + \sum_{ij} w_i \Lambda_{ij} \dd[w_j]{w_\alpha}}\\
  &= -\tfrac{1}{2} \sb{\sum_{j} \Lambda_{\alpha j} w_j + \sum_{i} w_i \Lambda_{i \alpha}}
\end{align}

as $\La$ is symmetric,

\begin{align}
  &= -\tfrac{1}{2} \sb{\sum_{j} \Lambda_{\alpha j} w_j + \sum_{i} w_i \Lambda_{\alpha i}}\\
  &= - \sum_{i} \Lambda_{\alpha i} w_i
\end{align}

Putting everything back in matrix notation,

\begin{align}
  \dd{\w} \sb{ - \tfrac{1}{2} \w^T \La \w} &= - \La \w
\end{align}

Combining the first and second terms, we can compute the gradient of the objective,

\begin{align}
  \dd[\L(\w)]{\w} &= \tfrac{1}{\sigma^2} \X^T \b{\y - \X \w} - \La \w.
\end{align}

Finally, we solve for the location, $\wh$, where this gradient is zero,

\begin{align}
  \0 &= \tfrac{1}{\sigma^2} \X^T \b{\y - \X \wh} - \La \wh\\
  \0 &= \X^T \b{\y - \X \wh} - \sigma^2 \La \wh\\
  \0 &= \X^T \y - \X^T \X \wh - \sigma^2 \La \wh\\
  \0 &= \X^T \y - \b{\X^T \X + \sigma^2 \La} \wh\\
  \b{\X^T \X + \sigma^2 \La} \wh &= \X^T \y \\
  \wh &= \b{\X^T \X + \sigma^2 \La}^{-1} \X^T \y
\end{align}

<h2> Question 2 </h2>

For the data sample in the table below, a model of the form $y = w_0 + w_1 x$, a noise level of $\sigma = 1$, and a regulariser $\La = 2 \I$, compute the regularised ML solution, i.e. the $\w$ that maximises
\begin{align}
  \L\b{\w} &= \log \N{\y; \X \w, \sigma^2 \I} - \tfrac{1}{2}\w^T \La \w
\end{align}

| $x$  | $y$  |
|-----:|-----:|
| -2.0 | -6.2 |
| -1.0 | -2.6 |
|  0.0 |  0.5 |
|  1.0 |  2.7 |
|  2.0 |  5.7 |

Do this using a calculator, as if you were in an exam.

<h3> Answer </h3>

First, write down $\X$, $\y$, $\La$ and $\sigma^2$, so that each hand-computed step below can be checked against the exact result.

In [2]:
# Observations copied from the table in the question.
x_obs = [-2., -1., 0., 1., 2.]
y_obs = [-6.2, -2.6, 0.5, 2.7, 5.7]

# Design matrix for y = w_0 + w_1 x: a column of ones (bias) next to the inputs.
X = t.tensor([[1., x_n] for x_n in x_obs])

# Targets as a column vector, one row per observation.
y = t.tensor([[y_n] for y_n in y_obs])

# Regulariser Lambda = 2 I and noise variance sigma^2 (sigma = 1).
La = t.eye(2) * 2
s2 = 1

Begin by computing $\X^T \X$,

In [3]:
# Hand computation of X^T X: every entry is a sum over the five data points.
sum_bias_sq = 1.*1. + 1.*1. + 1.*1. + 1.*1. + 1.*1.                     # ones column with itself
sum_x_sq    = (-2.)*(-2.) + (-1.)*(-1.) + 0.*0. + 1.*1. + 2.*2.         # x column with itself
sum_bias_x  = 1.*(-2.) + 1.*(-1.) + 1.*0. + 1.*1. + 1.*2.               # ones column with x column

# X^T X is symmetric, so the off-diagonal sum is used twice.
XTX = t.tensor([
    [sum_bias_sq, sum_bias_x],
    [sum_bias_x,  sum_x_sq],
])

# Check the hand computation against the exact matrix product.
assert t.allclose(XTX, X.T @ X)
XTX

tensor([[ 5.,  0.],
        [ 0., 10.]])

Next, compute $\X^T \X + \sigma^2 \La$,

In [4]:
# Hand computation of X^T X + sigma^2 Lambda, entry by entry.
# Each entry reads the matching entry of La rather than a hard-coded 2, so the
# calculation stays correct if the regulariser is changed (including to a
# non-diagonal Lambda, which contributes to the off-diagonal entries too).
XTX_s2La = t.zeros(2,2)

XTX_s2La[0,0] = XTX[0,0] + s2*La[0,0]
XTX_s2La[1,1] = XTX[1,1] + s2*La[1,1]
XTX_s2La[1,0] = XTX[1,0] + s2*La[1,0]
XTX_s2La[0,1] = XTX[0,1] + s2*La[0,1]

# Check the hand computation against the exact result.
assert t.allclose(XTX_s2La, X.T@X + s2*La)
XTX_s2La

tensor([[ 7.,  0.],
        [ 0., 12.]])

Now, compute the inverse, $\b{\X^T \X + \sigma^2 \La}^{-1}$, using the standard formula for a $2 \times 2$ matrix,

\begin{align}
  \begin{pmatrix} a &  b \\  c & d \end{pmatrix}^{-1} &=
  \frac{1}{ad-bc}
  \begin{pmatrix} d & -b \\ -c & a \end{pmatrix}
\end{align}

In [5]:
# Name the four entries to mirror the 2x2 inverse formula: [[a, b], [c, d]].
m_a = XTX_s2La[0,0]
m_b = XTX_s2La[0,1]
m_c = XTX_s2La[1,0]
m_d = XTX_s2La[1,1]

# Determinant ad - bc, printed as an intermediate result.
det = m_a*m_d - m_c*m_b
print(det)

# Inverse = 1/det * [[d, -b], [-c, a]].
inv_XTX_s2La = t.stack([
    t.stack([ m_d, -m_b]),
    t.stack([-m_c,  m_a]),
]) / det

# Check the hand computation against torch's matrix inverse.
assert t.allclose(inv_XTX_s2La, t.inverse(X.T@X + s2*La))
inv_XTX_s2La

tensor(84.)


tensor([[0.1429, -0.0000],
        [-0.0000, 0.0833]])

Now, compute $\X^T \y$,

In [6]:
# Hand computation of X^T y: each entry pairs one column of X with the targets.
sum_bias_y = ( 1.)*(-6.2) + ( 1.)*(-2.6) + ( 1.)*(0.5) + ( 1.)*(2.7) + ( 1.)*(5.7)   # ones column
sum_x_y    = (-2.)*(-6.2) + (-1.)*(-2.6) + ( 0.)*(0.5) + ( 1.)*(2.7) + ( 2.)*(5.7)   # x column

XTy = t.tensor([
    [sum_bias_y],
    [sum_x_y],
])

# Check the hand computation against the exact matrix-vector product.
assert t.allclose(XTy, X.T @ y)
XTy

tensor([[ 0.1000],
        [29.1000]])

Finally, we compute $\b{\X^T \X + \sigma^2 \La}^{-1} \X^T \y$ as a matrix-vector multiplication,

In [7]:
# Matrix-vector product written out by hand: row i of the inverse dotted with X^T y.
wh = t.zeros(2, 1)
for row in range(2):
    wh[row, 0] = inv_XTX_s2La[row, 0] * XTy[0, 0] + inv_XTX_s2La[row, 1] * XTy[1, 0]

# Check the final answer against the closed-form regularised ML solution.
assert t.allclose(wh, t.inverse(X.T@X + s2*La) @ X.T@y)
wh

tensor([[0.0143],
        [2.4250]])

In [8]:
# Show the dimensions of the solution (a column vector, one row per weight).
wh.size()

torch.Size([2, 1])