## Lecture 15: Cross-Validation & Information Criteria

In [1]:
import numpy as np
import os

from sklearn import linear_model

import matplotlib.pyplot as plt
from matplotlib import rc

# Global matplotlib styling applied to every figure in this notebook.
plt.rcParams.update({
    'xtick.labelsize': 16,     # tick label size on the x axis
    'ytick.labelsize': 16,     # tick label size on the y axis
    'axes.linewidth': 1,       # line width of the axes frame
    'xtick.major.width': 3,    # major tick line width on the x axis
    'ytick.major.width': 3,    # major tick line width on the y axis
})
rc('text', usetex=False)             # render text with matplotlib itself, not LaTeX
rc('font', family='DejaVu Sans')     # use DejaVu Sans for all plot text

### 1. $k$-fold Cross-Validation

In [None]:
import warnings
warnings.filterwarnings('ignore')  # silence library warnings emitted during the fits below


def poly_design_matrix(pts, n_terms):
  """Return the matrix whose k-th column is pts**k for k = 0, ..., n_terms-1."""
  A_poly = np.zeros((len(pts), n_terms))
  for k in range(n_terms):
    A_poly[:, k] = pts**k
  return A_poly


# Noisy samples of f(x) = x^2 on [0, L]
n = 10000
L = 4
x = np.linspace(0, L, n)
lam = 0.1    # regularization weight; the LASSO fit below uses alpha = 2*lam
mu = 0.0     # mean of the additive Gaussian noise
sigma = 0.1  # standard deviation of the additive Gaussian noise
f = x**2 + np.random.normal(mu, sigma, x.shape)

M = 20 # number of polynomial terms (monomials x^0 ... x^(M-1))

trials = np.array([2, 10, 100])  # values of k: how many training subsets are fitted and averaged
n_test = int(n*0.1)              # 10% of the samples are held out as one contiguous test window

# One column of panels per k, one row per regression method.
fig, axs = plt.subplots(3, len(trials), squeeze=False)
fig.set_size_inches(9, 9)

# Relative L2 test error per k for pinv (E1), lstsq (E2) and LASSO (E3).
E1 = np.zeros(len(trials))
E2 = np.zeros(len(trials))
E3 = np.zeros(len(trials))

for j, trial in enumerate(trials):
  # Column jj holds the coefficients fitted on the jj-th training subset.
  X1 = np.zeros((M, trial))
  X2 = np.zeros((M, trial))
  X3 = np.zeros((M, trial))

  # Hold out a randomly placed contiguous window of n_test samples for testing.
  idx_test_start = np.random.randint(low = 0, high = n - n_test)
  test_window = slice(idx_test_start, idx_test_start + n_test)
  f_test = f[test_window]
  x_test = x[test_window]

  # np.delete returns a new array, so f and x themselves are left untouched.
  f_train = np.delete(f, test_window)
  x_train = np.delete(x, test_window)

  # Split the training samples into `trial` disjoint random subsets of equal size
  # (each row of idx_train_all indexes one subset).
  n_per_subset = int(len(f_train) // trial)
  idx_train_all = np.random.choice(len(f_train), (trial, n_per_subset), replace = False)

  for jj, idx_train in enumerate(idx_train_all):
    A = poly_design_matrix(x_train[idx_train], M)
    f_sub = f_train[idx_train]

    # Method 1: Moore-Penrose pseudo-inverse
    A_inv = np.linalg.pinv(A)
    x1 = A_inv @ f_sub
    f1 = A @ x1  # fit evaluated on the training subset (not used further in this cell)

    # Method 2: least-squares solve (the "backslash" of the plot labels)
    x2 = np.linalg.lstsq(A, f_sub, rcond=None)[0]
    f2 = A @ x2

    # Method 3: LASSO (L1-regularized regression)
    # NOTE(review): Lasso is constructed with its default settings apart from alpha; if it
    # fits an intercept (the scikit-learn default), that intercept is not part of coef_, so
    # A @ x3 leaves it out. Confirm whether fit_intercept=False is intended here.
    regr3 = linear_model.Lasso(alpha = lam*2)
    regr3.fit(A, f_sub)
    x3 = regr3.coef_
    f3 = A @ x3

    X1[:,jj] = x1
    X2[:,jj] = x2
    X3[:,jj] = x3

  # Average the coefficients over the subsets.
  X1m = np.mean(X1, axis=1)
  X2m = np.mean(X2, axis=1)
  X3m = np.mean(X3, axis=1)

  # Evaluate the averaged models on the held-out window. The matrix is sized from x_test;
  # the previous code used `idx_test`, which is never defined and raised a NameError.
  A = poly_design_matrix(x_test, M)
  f_valid_1 = A @ X1m
  f_valid_2 = A @ X2m
  f_valid_3 = A @ X3m

  # Relative L2 error on the test window.
  f_test_norm = np.linalg.norm(f_test, ord=2)
  E1[j] = np.linalg.norm(f_valid_1-f_test, ord=2)/f_test_norm
  E2[j] = np.linalg.norm(f_valid_2-f_test, ord=2)/f_test_norm
  E3[j] = np.linalg.norm(f_valid_3-f_test, ord=2)/f_test_norm

  if j == 0:
    axs[0,j].set_ylabel("pinv", fontsize = 18)
    axs[1,j].set_ylabel("backslash", fontsize = 18)
    axs[2,j].set_ylabel("LASSO", fontsize = 18)

  # Bar charts of the averaged coefficients, one bar per monomial.
  axs[0,j].bar(range(M), X1m)
  axs[0,j].set_title("k="+str(trial), fontsize = 18)
  axs[1,j].bar(range(M), X2m)
  axs[2,j].bar(range(M), X3m)

plt.show()

#### Error for different $k$ of three different methods

In [None]:
methods = ['pinv', 'backslash', 'LASSO']
x_label = np.arange(len(methods))  # the label locations (one group per method)
width = 0.25  # the width of the bars

# Rows of E: one per value of k; columns: one per method (same order as `methods`).
E = np.array([E1, E2, E3]).T
# Legend labels are derived from `trials` so they always match the cross-validation cell.
k = ['k=' + str(t) for t in trials]

fig, ax = plt.subplots(layout='constrained')

# Within each method group, draw one bar per k, shifted by one bar width each time.
for multiplier, (error, kk) in enumerate(zip(E, k)):
  offset = width * multiplier
  ax.bar(x_label + offset, error, width, label=kk)

# Add some text for labels, title and custom x-axis tick labels, etc.
ax.set_ylabel('Error')
# Centre each tick under its group of bars.
ax.set_xticks(x_label + width * (len(k) - 1) / 2, methods)
ax.legend(loc='upper left', ncols=1, fontsize = 18)

plt.show()

#### Visualization of Training Data and Testing Data

In [None]:
# Scatter the training and testing samples left over from the last iteration of the
# cross-validation loop (i.e. the split drawn for the final value of k).
fig, ax = plt.subplots(figsize = (10, 5), dpi = 80)
for xs, ys, series_name in ((x_train, f_train, "Training Data"),
                            (x_test, f_test, "Testing Data")):
  ax.plot(xs, ys, 'o', linewidth=2, label = series_name)
ax.legend(frameon = False, fontsize = 18)
ax.set_xlabel('$x$', fontsize = 18)
ax.set_ylabel('$f(x)$', fontsize = 18)
plt.show()

### 2. Information Criteria

The Kullback-Leibler (KL) divergence quantifies how much one probability density function differs from another (for example, data representing the truth versus a model of it), and it is at the core of modern information-theoretic criteria for evaluating the viability of a model.

The KL divergence between two models $f(\mathrm{X},\mathrm{\beta})$ and $g(\mathrm{X},\mathrm{\mu})$ is defined as:

$$ I(f, g) = \int f(\mathrm{X},\mathrm{\beta})\log\left[\frac{f(\mathrm{X},\mathrm{\beta})}{g(\mathrm{X},\mathrm{\mu})}\right]d\mathrm{X} $$

$\mathrm{\beta}$: parameterizations of model $f$.

$\mathrm{\mu}$: parameterizations of model $g$.

From an information theory perspective, the quantity $I(f,g)$ measures the information lost when $g$ is used to represent $f$. Note that if $f = g$, then the log term is zero (i.e., $\log(1) = 0$) and $I(f, g) = 0$, so that there is no information lost.

In practice, $f$ will represent the *truth*, or measurements of an experiment, while $g$ will be a model proposed to describe $f$.

#### Textbook example (Fig. 4.21)

In [18]:
n = 10000 # number of samples drawn for each distribution
x1 = np.random.randn(n) # "truth" model (data): standard normal samples
x2 = 0.8 * np.random.randn(n) + 1 # model 1: normal samples scaled by 0.8 and shifted by +1
x3 = 0.5 * np.random.randn(n) - 1 # model 2, first component (histogrammed into g2a in the next cell)
x4 = 0.7 * np.random.randn(n) - 3 # model 2, second component (histogrammed into g2b in the next cell)
x5 = 5.0 * np.random.rand(n) - 0.5 # model 3: uniform samples on [-0.5, 4.5)

x = np.arange(-6,6.01,0.01) # range for data
# NOTE: despite its name, this array is passed as `bins=` to np.histogram in the next cell,
# i.e. it is used as the bin edges; it is offset by half a step (0.005) from the points in x.
x_bincenters = np.arange(-6.005,6.01,0.01)

In [None]:
# np.trapz was deprecated in NumPy 2.0 in favour of np.trapezoid; use whichever this NumPy has.
trapz = np.trapezoid if hasattr(np, 'trapezoid') else np.trapz

# Small offset added to the histogram counts, presumably so that empty bins do not lead to
# log(0) or a division by zero in the integrands below.
eps = 0.01

# Histogram the samples on the shared grid to obtain (unnormalized) densities.
f = np.histogram(x1,bins=x_bincenters)[0] + eps # "truth"
g1 = np.histogram(x2,bins=x_bincenters)[0] + eps # model 1
g2a = np.histogram(x3,bins=x_bincenters)[0]
g2b = np.histogram(x4,bins=x_bincenters)[0]
g2 = g2a + 0.3*g2b + eps # model 2: weighted sum of its two components
g3 = np.histogram(x5,bins=x_bincenters)[0] + eps # model 3

# Normalize each curve so that it integrates to one over x.
f = f/trapz(f,x)
g1 = g1/trapz(g1,x)
g2 = g2/trapz(g2,x)
g3 = g3/trapz(g3,x)

# Compute integrand f * log(f / g) for each model
Int1 = f * np.log(np.divide(f,g1))
Int2 = f * np.log(np.divide(f,g2))
Int3 = f * np.log(np.divide(f,g3))

# KL divergence
I1 = trapz(Int1,x)
I2 = trapz(Int2,x)
I3 = trapz(Int3,x)

plt.figure()
for curve, curve_label in ((f, 'f'), (g1, 'g1'), (g2, 'g2'), (g3, 'g3')):
  plt.plot(x,curve,linewidth=2,label=curve_label)
# Annotate the figure with the three divergences.
plt.text(-6, 0.78, 'I1='+str(round(I1, 3)), fontsize = 15)
plt.text(-6, 0.73, 'I2='+str(round(I2, 3)), fontsize = 15)
plt.text(-6, 0.68, 'I3='+str(round(I3, 3)), fontsize = 15)
plt.legend(fontsize = 18)
plt.show()

#### Apply KL Divergence Calculation for $f(x) = x^2$ fitting

In [None]:
# np.trapz was deprecated in NumPy 2.0 in favour of np.trapezoid; use whichever this NumPy has.
trapz = np.trapezoid if hasattr(np, 'trapezoid') else np.trapz

# Apply the KL integrand f * log(f / g) with the held-out data f_test as f and each model's
# prediction on the test window as g. These are raw function values rather than normalized
# densities; np.abs keeps the argument of the logarithm non-negative.
Int_pinv = f_test * np.log(np.abs((f_test/f_valid_1)))
Int_back = f_test * np.log(np.abs((f_test/f_valid_2)))
Int_lasso = f_test * np.log(np.abs((f_test/f_valid_3)))

# Integrate over the test window with the trapezoidal rule.
I_pinv = trapz(Int_pinv, x_test)
I_back = trapz(Int_back, x_test)
I_lasso = trapz(Int_lasso, x_test)

print("KL Divergence for pinv is: ", I_pinv)
print("KL Divergence for backslash is: ", I_back)
print("KL Divergence for LASSO is: ", I_lasso)