In [None]:
%matplotlib inline
from __future__ import print_function

Univariate regression
=================
Fit a line, or a polynomial of a specified degree, to a 1-D dataset.

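Make training data
---------------------
Each of the three example cells below defines `x`, `y`, `x_test` and `y_test`. Run only the one you want: whichever was executed last is the dataset used by the rest of the notebook.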

In [None]:
# Example 1: a tiny hand-written dataset
import numpy as np

# Training pairs (x[i], y[i]).
x = np.asarray([5, 3, 0, 4], dtype=float)
y = np.asarray([4, 4, 1, 3], dtype=float)

# Held-out pairs (x_test[i], y_test[i]).
x_test = np.asarray([1, 2, 2.5, 4], dtype=float)
y_test = np.asarray([1.5, 2, 3.5, 4], dtype=float)

In [None]:
# Example 2: the diabetes dataset
import numpy as np
from sklearn import datasets

diabetes = datasets.load_diabetes()

# Rescale the targets so that the largest one equals 1. This is done on a
# derived array: dividing `diabetes.target` in place would silently alter the
# loaded dataset for any other code that reads it afterwards.
target = diabetes.target / diabetes.target.max()

# Use only one feature (column 2). The training set is the last 20 samples;
# every other sample is used for testing.
feature = diabetes.data[:, 2]
x = feature[-20:]
y = target[-20:]
x_test = feature[:-20]
y_test = target[:-20]

In [None]:
# Example 3: artificial dataset, adapted from
# http://www.scipy-lectures.org/packages/scikit-learn/auto_examples/plot_bias_variance.html#bias-and-variance-of-polynomial-fit
import numpy as np


def generating_func(x, err=0.5):
    """Draw noisy samples around the curve 10 - 1 / (x + 0.1).

    Each returned value is normally distributed with mean
    10 - 1 / (x + 0.1) and standard deviation `err`.
    """
    return np.random.normal(10 - 1. / (x + 0.1), err)


np.random.seed(0)  # fixed seed so that the samples below are reproducible
n_samples = 20

# Training inputs: log-spaced between 1e-2 and 1.
x = 10 ** np.linspace(-2, 0, n_samples)
y = generating_func(x)

# Test inputs: evenly spaced on a slightly wider interval. They are drawn
# after the training targets, so the order of these calls fixes the samples.
x_test = np.linspace(-0.2, 1.2, 100)
y_test = generating_func(x_test)

Plot the training points (blue crosses) together with the test points (grey dots).

In [None]:
import matplotlib.pyplot as plt

fig, ax = plt.subplots(figsize=(3, 3.5))

# Test points are drawn first (light grey dots) so that the training points
# (blue crosses) end up on top of them.
ax.scatter(x_test, y_test, marker='o', c='0.75', s=20)
ax.scatter(x, y, marker='x', c='b', s=50)

# Extent of the training data plus a 20 % margin on every side. These six
# names are read again by the later plotting cells.
xmin, xmax = x.min(), x.max()
ymin, ymax = y.min(), y.max()
dx = 0.2 * (xmax - xmin)
dy = 0.2 * (ymax - ymin)
ax.set_xlim(xmin - dx, xmax + dx)
ax.set_ylim(ymin - dy, ymax + dy)

plt.xticks(fontsize=20)
plt.yticks(fontsize=20)
plt.tight_layout()

Compute the model parameters
-------------------------------------------------

In [None]:
from sklearn.preprocessing import PolynomialFeatures

degree = 1

# Polynomial design matrix built from the training inputs, one row per sample.
X = PolynomialFeatures(degree).fit_transform(x[:, np.newaxis])
gram = X.T.dot(X)
if degree < 4:
    # Only print the matrices while they are small enough to read.
    print("X =")
    print(X)
    print("X.T.dot(X) =")
    print(gram)

# Least-squares weights from the normal equations (X^T X) w = X^T y.
# Solving the linear system yields the same w as multiplying by the explicit
# inverse, but avoids forming inv(X^T X), which loses precision when the
# matrix is ill-conditioned.
w = np.linalg.solve(gram, X.T.dot(y))
print("w = inv(X.T.dot(X)).dot(X.T.dot(y)) =", w)

In [None]:
from sklearn.metrics import mean_squared_error

fig, ax = plt.subplots(figsize=(3, 3.5))

# Evaluate the fitted polynomial on a dense grid covering the visible x-range.
x_regr = np.linspace(xmin - dx, xmax + dx, 100)
X_regr = PolynomialFeatures(degree).fit_transform(x_regr.reshape(-1, 1))
y_regr = X_regr.dot(w)

# Fitted curve first, then the test points, then the training points on top.
ax.plot(x_regr, y_regr, 'b-')
ax.scatter(x_test, y_test, marker='o', c='0.75', s=20)
ax.scatter(x, y, marker='x', c='b', s=50)

# Same framing as the plot of the raw training points.
ax.set_xlim(xmin - dx, xmax + dx)
ax.set_ylim(ymin - dy, ymax + dy)
plt.xticks(fontsize=20)
plt.yticks(fontsize=20)
plt.tight_layout()
plt.savefig('univariate_regression.png', transparent=True, dpi=300)

# Mean squared error of the fit on the training data itself.
print("l2 loss (residual) = %.2f" % mean_squared_error(y, X.dot(w)))

Validation of degrees
-------------------------

In [None]:
fig, ax = plt.subplots(figsize=(3, 3.5))
ax.scatter(x_test, y_test, marker='o', c='0.75', s=20)

# Degrees 0 .. dmax-1 are tried; dmax is capped at 16 and at the number of
# training samples.
dmax = min(len(y), 16)
residual = np.zeros(dmax)  # mean squared error on the training data, per degree
mse_test = np.zeros(dmax)  # mean squared error on the test data, per degree

# Column-vector views of the inputs and the dense plotting grid do not depend
# on the degree, so they are built once outside the loop.
x_train_col = x[:, np.newaxis]
x_test_col = x_test[:, np.newaxis]
x_grid = np.linspace(xmin - dx, xmax + dx, 100)

# The loop deliberately uses its own names (d, X_d, w_d, ...) so that it does
# not overwrite `degree`, `X` and `w` from the single-degree fit; otherwise
# re-running an earlier cell would silently pick up the last degree tried here.
for d in range(dmax):
    # regression: solve the normal equations (X^T X) w = X^T y without forming
    # the explicit inverse, which is inaccurate for ill-conditioned X^T X
    poly = PolynomialFeatures(d)
    X_d = poly.fit_transform(x_train_col)
    w_d = np.linalg.solve(X_d.T.dot(X_d), X_d.T.dot(y))

    # mean residual on the training data and mean squared error on the test data
    residual[d] = mean_squared_error(y, X_d.dot(w_d))
    X_test_d = poly.fit_transform(x_test_col)
    mse_test[d] = mean_squared_error(y_test, X_test_d.dot(w_d))

    # plot the regression curve for a selection of degrees only
    if d in (1, 2, 3, 5, 7, 11):
        y_grid = poly.fit_transform(x_grid[:, np.newaxis]).dot(w_d)
        # Both values are clipped to at most 1. Note the precedence: the power
        # is taken first, then the division.
        alpha = min(1, (2 ** mse_test[0]) / mse_test[d])
        val = min(1, (2 ** residual[d]) / residual[0])
        ax.plot(x_grid, y_grid, lw=1, color=[1 - val, 1 - val, 1], alpha=alpha)

ax.scatter(x, y, marker='x', c='b', s=50)
ax.set_xlim(xmin - dx, xmax + dx)
ax.set_ylim(ymin - dy, ymax + dy)
plt.xticks(fontsize=20)
plt.yticks(fontsize=20)
plt.tight_layout()
plt.savefig('polynomial_regression.png', transparent=True, dpi=300)

from matplotlib.ticker import MaxNLocator

fig, ax = plt.subplots()

# Linear y-axis while the test losses span less than 100; otherwise switch to
# a logarithmic y-axis so that both curves stay readable.
use_linear_axis = mse_test.max() - mse_test.min() < 1e+2
draw = ax.plot if use_linear_axis else ax.semilogy
degrees = range(dmax)
draw(degrees, residual, 'bs--', label='Train')
draw(degrees, mse_test, 'ko--', label='Test')

plt.axis('tight')
plt.xlabel('Degree', fontsize=20)
plt.ylabel('$\\ell_2$ loss (residual)', fontsize=20)
plt.legend(loc="upper right", fontsize=16, frameon=True)

# Degrees are integers, so only integer tick positions are allowed on the x-axis.
ax.xaxis.set_major_locator(MaxNLocator(integer=True))
plt.xticks(fontsize=20)
plt.yticks(fontsize=20)
ax.set_xlim(-0.1, dmax - 0.9)
plt.tight_layout()
plt.savefig('univariate_regression_loss_vs_degree.png', transparent=True, dpi=300)