In [1]:
import warnings

import ipywidgets as widgets
import matplotlib.pyplot as plt
import numpy as np
from ipywidgets import interact
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF
from sklearn.gaussian_process.kernels import ConstantKernel as C

In [8]:
# Generate data for the polynomial degree demo.
n_polynomial_degrees = 20  # highest polynomial degree that is precomputed
num_samples = 10  # number of noisy training points


class PolynomialDegreeDemo:
    """Namespace holding the training points and the precomputed polynomial fits."""

    # Training points: an even grid on [-10, 10] jittered with unit-variance noise,
    # paired with a ramp from 0 to 1 plus noise of standard deviation 0.1.
    x = np.linspace(-10, 10, num_samples) + np.random.normal(0, 1, num_samples)
    y = np.linspace(0, 1, num_samples) + np.random.normal(0, 0.1, num_samples)
    # Dense evaluation grid, wider than the training range.
    xp = np.linspace(-15, 15, 200)
    # yp[d - 1] is the degree-d fit evaluated on xp; filled by the loop below.
    yp = []


# NumPy 2.0 removed the top-level `np.RankWarning` alias; the class now lives in
# `numpy.exceptions`. Fall back to the old alias on NumPy versions that predate it.
try:
    from numpy.exceptions import RankWarning as _RankWarning
except ImportError:
    _RankWarning = np.RankWarning

for degree in range(1, n_polynomial_degrees + 1):
    with warnings.catch_warnings():
        # Degrees close to or above the number of samples are poorly conditioned;
        # the fit is still wanted for the demo, so only that warning is silenced.
        warnings.simplefilter('ignore', _RankWarning)
        coefficients = np.polyfit(PolynomialDegreeDemo.x, PolynomialDegreeDemo.y, degree)
    PolynomialDegreeDemo.yp.append(np.poly1d(coefficients)(PolynomialDegreeDemo.xp))


In [3]:
# Generate data for the bias and variance demos.
class BiasAndVarianceDemo:
    """Namespace holding the inputs and precomputed predictions for the bias/variance demos."""

    # Six regularisation values, geometrically spaced from 2**3 * 1e-4 to 2**13 * 1e-4.
    alphas = np.geomspace(2**3 * 1e-4, 2**13 * 1e-4, num=6, endpoint=True)
    # Grid on which every fitted model is evaluated.
    xp = np.linspace(0.05, 0.95, 1000)
    # ypss[i] is the list of prediction curves obtained for alphas[i].
    ypss = []
    # yps_mean[i] is the pointwise mean of the curves in ypss[i].
    yps_mean = []
    # Maps an alpha value to its position i in the two lists above.
    alpha_to_index = {}

    @staticmethod
    def func(x):
        """Return the noise-free target function sin(2*pi*x) evaluated at x."""
        # Declared static so the call works on the class and on instances alike.
        return np.sin(2 * np.pi * x)


size = 25  # number of points in each randomly drawn training set
for i, alpha in enumerate(BiasAndVarianceDemo.alphas):
    xp = BiasAndVarianceDemo.xp
    yps = []
    # Fit 20 independent models per alpha, each on a freshly sampled noisy data set.
    for _ in range(20):
        x_train = np.random.uniform(-0.1, 1.1, size)
        clean_targets = BiasAndVarianceDemo.func(x_train)
        y_train = clean_targets + np.random.normal(0, 0.1, size)

        amplitude = C(constant_value=1.0, constant_value_bounds=(1e-3, 1e3))
        smoothness = RBF(length_scale=0.5, length_scale_bounds=(1e-1, 1e1))
        gp = GaussianProcessRegressor(
            kernel=amplitude * smoothness,
            alpha=alpha,
            n_restarts_optimizer=24,
        )

        # Every warning raised while fitting is suppressed.
        with warnings.catch_warnings():
            warnings.simplefilter('ignore')
            gp.fit(x_train[:, np.newaxis], y_train)

        yps.append(gp.predict(xp[:, np.newaxis]))

    # Store the individual curves, their pointwise mean, and the alpha -> index lookup.
    BiasAndVarianceDemo.ypss.append(yps)
    BiasAndVarianceDemo.yps_mean.append(np.mean(yps, axis=0))
    BiasAndVarianceDemo.alpha_to_index[alpha] = i

In [4]:
def _polynomial_degree(degree):
    """Plot the precomputed polynomial fit of the given degree over the training points.

    Args:
        degree: Polynomial degree; valid values run from 1 to the number of
            precomputed fits in ``PolynomialDegreeDemo.yp``.

    Raises:
        IndexError: If ``degree`` is outside that range.
    """
    fits = PolynomialDegreeDemo.yp
    # Without this check a degree of 0 or below would wrap around to the end of
    # the list and silently plot a different polynomial than the one requested.
    if not 1 <= degree <= len(fits):
        raise IndexError(
            'degree must be between 1 and {}, got {}'.format(len(fits), degree)
        )

    x = PolynomialDegreeDemo.x
    y = PolynomialDegreeDemo.y
    xp = PolynomialDegreeDemo.xp
    yp = fits[degree - 1]

    plt.plot(xp, yp)
    plt.scatter(x, y)
    # Fixed y-range so the view stays the same for every degree.
    plt.ylim([-0.1, 1.1])
    plt.xlabel('x')
    plt.ylabel('y')
    plt.show()
    
def _bias(alpha):
    """Plot the mean prediction stored for `alpha` together with the true function.

    Args:
        alpha: A key of ``BiasAndVarianceDemo.alpha_to_index``.
    """
    position = BiasAndVarianceDemo.alpha_to_index[alpha]
    grid = BiasAndVarianceDemo.xp
    mean_curve = BiasAndVarianceDemo.yps_mean[position]

    axes = plt.gca()
    axes.plot(grid, mean_curve)
    # Noise-free target drawn as a thin dashed black line for reference.
    axes.plot(grid, BiasAndVarianceDemo.func(grid), color='k', linestyle='--', linewidth=1)

    axes.set_ylim([-1.5, 1.5])
    axes.set_xlabel('x')
    axes.set_ylabel('y')
    plt.show()
    
def _variance(alpha):
    """Plot every individual prediction stored for `alpha` together with the true function.

    Args:
        alpha: A key of ``BiasAndVarianceDemo.alpha_to_index``.
    """
    position = BiasAndVarianceDemo.alpha_to_index[alpha]
    grid = BiasAndVarianceDemo.xp
    curves = BiasAndVarianceDemo.ypss[position]

    axes = plt.gca()
    # Semi-transparent thin lines so overlapping fits remain distinguishable.
    for curve in curves:
        axes.plot(grid, curve, linewidth=1, alpha=0.5)
    # Noise-free target drawn as a thin dashed black line for reference.
    axes.plot(grid, BiasAndVarianceDemo.func(grid), color='k', linestyle='--', linewidth=1)
    axes.set_ylim([-1.5, 1.5])
    axes.set_xlabel('x')
    axes.set_ylabel('y')
    plt.show()
    
    
class Demos:
    """Entry points that connect an ipywidgets slider to each plotting callback."""

    def polynomial_degree(self):
        """Show the polynomial fit with an integer slider choosing the degree."""
        degree_slider = widgets.IntSlider(
            min=1,
            max=n_polynomial_degrees,
            step=1,
            value=1,
        )
        interact(_polynomial_degree, degree=degree_slider)

    def bias(self):
        """Show the mean prediction with a slider over the available alpha values."""
        alpha_slider = widgets.SelectionSlider(options=BiasAndVarianceDemo.alphas)
        interact(_bias, alpha=alpha_slider)

    def variance(self):
        """Show the individual predictions with a slider over the available alpha values."""
        alpha_slider = widgets.SelectionSlider(options=BiasAndVarianceDemo.alphas)
        interact(_variance, alpha=alpha_slider)


# Single shared instance used by the demo cells further down.
demos = Demos()

# Machine Learning Basics

- *Hyperparameters and Validation Sets*

- *Estimators, Bias & Variance*

## Hyperparameters and Validation Sets




> Notes
> - In this section:
>  - What are hyperparameters and validation sets
>  - How do they relate

### Hyperparameters

- Settings that control a model's behaviour

- Not adapted by the learning algorithm

- There could be a second learning algorithm to learn the hyperparameters

Polynomial regression has a single **capacity** hyperparameter - the degree of the polynomial $D$:

$$\hat{y} = b + \sum_{i=1}^{D}{w_{i} x^{i}}$$

In [15]:
# Interactive plot: the slider selects which precomputed polynomial degree is shown.
demos.polynomial_degree()

interactive(children=(IntSlider(value=1, description='degree', max=20, min=1), Output()), _dom_classes=('widge…

The weight decay strength $\lambda$ is another example of a hyperparameter.

> Notes:
> - An example of polynomial regression.
> - The function that generates the data is a line equation with added Gaussian noise.
> ---
> - It is often inappropriate to learn the hyperparameters from the training dataset.
> - What would happen in the example above if the degree was learnt from the training dataset?
> - If learned on the training set, the hyperparameters would be chosen to maximise capacity, and hence the model
>   would overfit.
> ---
> - As seen above, a high degree polynomial fits the training data perfectly, but a low degree polynomial matches 
>   the true data-generating distribution much better.
> ---
> - How are hyperparameters learned then?

## Train / Validation Split

- Typically an **80/20 split** of the whole dataset.

- Training set used to **learn model parameters**.

- Validation set used to **estimate generalisation error** during or after training.

![assets/capacity_vs_error.png](assets/capacity_vs_error.png)

- <font color='red'>**Problematic if the dataset is small.**</font>

> Notes:
>  - Split the whole dataset into two disjoint datasets.
> 
>  - Validation set helps with tuning the hyperparameters!
>
>  - If the dataset is small, the validation set becomes tiny; this implies statistical uncertainty around the estimated
>    generalisation error, making it harder to compare algorithms.

## Cross-Validation

$$ \mathbb{D} = \left\{ z_i \right\}_{i=1}^{20} $$

![assets/cv0.png](assets/cv0.png)

Example of 5-fold cross-validation:

![assets/cv.gif](assets/cv.gif)

## Estimators, Bias & Variance

*Foundational concepts useful for formalizing generalization, underfitting and overfitting.*

### Point Estimation


### Function Estimation

### Bias
$$ bias\left( \hat{\boldsymbol{\theta}}_m \right) = \mathbb{E} \left( \hat{\boldsymbol{\theta}}_m \right) - \boldsymbol{\theta} $$

### Variance
$$ Var\left( \hat{\boldsymbol{\theta}} \right) $$
                



Illustration of the dependence of bias and variance on model complexity, governed by a regularization parameter $\lambda$, using the sinusoidal data set from Chapter 1. There are $L = 100$ data sets, each having $N = 25$
data points, and there are 24 Gaussian basis functions in the model so that the total number of parameters is
$M = 25$ including the bias parameter. The left column shows the result of fitting the model to the data sets for
various values of $\ln \lambda$ (for clarity, only 20 of the 100 fits are shown). The right column shows the corresponding
average of the 100 fits (red) along with the sinusoidal function from which the data sets were generated.

In [10]:
# Interactive plot: the slider selects alpha; all individual fits for that alpha are drawn.
demos.variance()

interactive(children=(SelectionSlider(description='alpha', options=(0.0008, 0.0032000000000000023, 0.012799999…

In [11]:
# Interactive plot: the slider selects alpha; the mean of the fits for that alpha is drawn.
demos.bias()

interactive(children=(SelectionSlider(description='alpha', options=(0.0008, 0.0032000000000000023, 0.012799999…

![assets/capacity_vs_bias_variance.png](assets/capacity_vs_bias_variance.png)