# Machine learning intro

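This notebook fits a polynomial regression model to noisy synthetic data using scikit-learn, then compares the training and validation errors and the recovered coefficients.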

In [None]:
# Import packages
import numpy as np
import matplotlib.pyplot as plt

import sklearn.metrics
import sklearn.linear_model
from sklearn.preprocessing import PolynomialFeatures

# Make the results reproducible: seed NumPy's global random number generator
# so that every later np.random call yields the same values on each run
np.random.seed(42)

In [None]:
# Generate synthetic dataset
n = 100  # 100 data samples
beta_true = [0.1, 0.3142, 0.0618, -0.2718]  # True regression coefficients


def f(x, noise_std=0.0):
    """Evaluate the true degree-three polynomial at ``x`` and add Gaussian noise.

    The polynomial coefficients are read from the module-level ``beta_true``
    (constant term first).

    Args:
        x: Scalar or array-like of inputs, of any shape.
        noise_std: Standard deviation of the zero-mean Gaussian noise that is
            added independently to every output value.

    Returns:
        The noisy polynomial values, with the same shape as ``x``.
    """
    x = np.asarray(x, dtype=float)

    y = (
        beta_true[0]
        + beta_true[1] * x
        + beta_true[2] * x**2
        + beta_true[3] * x**3
    )

    # Draw the noise with the shape of the input instead of a hard-coded
    # (n, 1): otherwise inputs of any other length fail to broadcast and a
    # scalar input silently turns into an (n, 1) array.
    # The draw is made even when noise_std is 0 so that the global random
    # stream advances exactly as it did before.
    noise = np.random.standard_normal(x.shape)
    return y + noise_std * noise


# Training set: n inputs drawn uniformly from [-10, 10) and their noisy outputs
X = 20 * np.random.random_sample((n, 1)) - 10
y = f(X, noise_std=20.0)

# Validation set: generated the same way, from the draws that follow
X_val = 20 * np.random.random_sample((n, 1)) - 10
y_val = f(X_val, noise_std=20.0)

In [None]:
# Show the noisy training samples as unconnected dots
fig = plt.figure()
ax = fig.add_subplot()
ax.plot(X, y, linestyle="none", marker=".")
plt.show()

In [None]:
# Expand every input x into the degree-3 feature vector [1, x, x^2, x^3];
# the leading column of ones (include_bias=True) plays the role of the intercept
poly = PolynomialFeatures(degree=3, include_bias=True)
X_ = poly.fit_transform(X)  # fit the expansion on the training inputs
X_val_ = poly.transform(X_val)  # apply the same expansion to the validation inputs

print(f"X_.shape    : {X_.shape}")
print(f"X_val_.shape: {X_val_.shape}")

In [None]:
# Create model and fit it to the training data.
# X_ already contains a column of ones (include_bias=True in the polynomial
# expansion), so a separately fitted intercept would be redundant: it would
# absorb the constant term and leave the coefficient of the ones column at 0.
# With fit_intercept=False the intercept is estimated as the first entry of
# model.coef_, so coef_ lines up with the full list of polynomial coefficients.
model = sklearn.linear_model.LinearRegression(fit_intercept=False)
_ = model.fit(X_, y)  # Note that I'm using the X_ here, i.e. transformed data

In [None]:
# Predict on the training set and measure the mean squared error
yhat = model.predict(X_)
mse = sklearn.metrics.mean_squared_error(y, yhat)

# Do the same on the validation set
yhat_val = model.predict(X_val_)
mse_val = sklearn.metrics.mean_squared_error(y_val, yhat_val)

# Report both errors side by side
print(f"Training error  : {mse:.3f}")
print(f"Validation error: {mse_val:.3f}")

In [None]:
# Overlay the model's training-set predictions (red) on the training data (blue)
fig = plt.figure()
ax = fig.add_subplot()
ax.plot(X, y, linestyle="none", marker=".", color="blue", label="Training data")
ax.plot(X, yhat, linestyle="none", marker=".", color="red", label="Model Prediction")
ax.legend()
plt.show()

In [None]:
# Print the generating coefficients next to the fitted ones (rounded to 4 decimals)
fitted_coefficients = model.coef_[0].round(4).tolist()
print(f"True coefficients : {beta_true}")
print(f"Model coefficients: {fitted_coefficients}")