# HW5 : Kernel Methods for Regression

Official instructions:

https://www.cs.tufts.edu/comp/135/2020f/hw5.html

This is the starter code.

## Import relevant packages

In [None]:
import numpy as np
import pandas as pd
import os
import sys
import time
import importlib
import itertools

In [None]:
import sklearn.linear_model
import sklearn.metrics
import sklearn.model_selection
import sklearn.pipeline
import sklearn.preprocessing

In [None]:
# Plotting utils
import matplotlib
import matplotlib.pyplot as plt

%matplotlib inline
# Pretty matplotlib plots.
# Matplotlib 3.6 renamed its bundled seaborn style sheets to 'seaborn-v0_8*'
# and later releases dropped the old names, so try the new name first and
# fall back to the old one on older installations.
try:
    plt.style.use('seaborn-v0_8')
except OSError:
    plt.style.use('seaborn')

import seaborn as sns
sns.set('notebook', font_scale=1.25, style='whitegrid')

# Load Training set data for the 'ByYear' split

Make sure you have downloaded the CSV files in the `data_melbourne_daily_min_temp` folder

In [None]:
# Folder that holds the HW5 CSV files, given relative to this notebook
DATA_DIR = os.path.join(os.pardir + "/data_melbourne_daily_min_temp/") # TODO fix me

In [None]:
# Training rows of the 'ByYear' split
train_csv_path = os.path.join(DATA_DIR, 'data_train_ByYear.csv')
data_trByYear_df = pd.read_csv(train_csv_path)

In [None]:
# Preview the leading rows of the training table
data_trByYear_df.head(n=5)

In [None]:
# Preview the trailing rows of the training table
data_trByYear_df.tail(n=5)

In [None]:
# Pull the x (feature) and y (response) columns out of the training table.
# x gains a trailing axis so it is 2D; y stays 1D. Both are copies, so later
# edits to the arrays cannot touch the dataframe.
x_trByYear_N1 = data_trByYear_df['years_since_19850101'].values[:, np.newaxis].copy()
y_trByYear_N = data_trByYear_df['temp_deg_C'].values.copy()

# Quick sanity summary of what was loaded
print("\n".join([
    "Training data (ByYear split strategy)",
    "x_trByYear_N1.shape: %s" % str(x_trByYear_N1.shape),
    "y_trByYear_N.shape : %s" % str(y_trByYear_N.shape),
    "mean(y_trByYear_N) : %.3f" % np.mean(y_trByYear_N),
    ]))

# Load Validation set data for the 'ByYear' split

In [None]:
# Validation rows of the 'ByYear' split
valid_csv_path = os.path.join(DATA_DIR, 'data_valid_ByYear.csv')
data_vaByYear_df = pd.read_csv(valid_csv_path)

# Preview the leading rows
data_vaByYear_df.head(n=5)

In [None]:
# Pull the x (feature) and y (response) columns out of the validation table.
# x gains a trailing axis so it is 2D; y stays 1D. Both are copies.
x_vaByYear_T1 = data_vaByYear_df['years_since_19850101'].values[:, np.newaxis].copy()
y_vaByYear_T = data_vaByYear_df['temp_deg_C'].values.copy()

# Quick sanity summary of what was loaded
print("\n".join([
    "Validation data (ByYear split strategy)",
    "x_vaByYear_T1.shape: %s" % str(x_vaByYear_T1.shape),
    "y_vaByYear_T.shape : %s" % str(y_vaByYear_T.shape),
    "mean(y_vaByYear_T) : %.3f" % np.mean(y_vaByYear_T),
    ]))

# Plot both training and validation sets

In [None]:
# Side-by-side panels with shared axes: training data left, validation right
fig, axgrid = plt.subplots(nrows=1, ncols=2, sharex=True, sharey=True, figsize=(20,5))
train_ax, valid_ax = axgrid

# Left panel: training set, titled with its size and first/last calendar year
train_dates = data_trByYear_df['date'].values
train_ax.plot(x_trByYear_N1, y_trByYear_N, 'bs', alpha=0.4)
train_ax.set_xlabel('x : Fractional Years since 1985');
train_ax.set_title('Training Set (%d examples, %s-%s)' % (
    y_trByYear_N.size,
    pd.to_datetime(train_dates[0]).year,
    pd.to_datetime(train_dates[-1]).year))

# Right panel: validation set, titled the same way
valid_dates = data_vaByYear_df['date'].values
valid_ax.plot(x_vaByYear_T1, y_vaByYear_T, 'cs', alpha=0.4)
valid_ax.set_xlabel('x : Fractional Years since 1985');
valid_ax.set_title('Validation Set (%d examples, %s-%s)' % (
    y_vaByYear_T.size,
    pd.to_datetime(valid_dates[0]).year,
    pd.to_datetime(valid_dates[-1]).year))

# The y axis is shared, so only the left panel gets the label
train_ax.set_ylabel("y : Min Daily Temp (deg C)");

## Prepare to use sklearn splitter tools for hyperparameter search

### Prep a splitter object that knows which examples are 'train' and which are 'validation'

In [None]:
# Stack training rows first, then validation rows, into the full
# development set. The splitter's fold labels rely on this ordering.
x_all_ByYear_L1 = np.concatenate([x_trByYear_N1, x_vaByYear_T1], axis=0)
y_all_ByYear_L = np.concatenate([y_trByYear_N, y_vaByYear_T], axis=0)

In [None]:
# Fold label for every row of the stacked train+valid arrays:
#   -1 means never include this example in any test split (training rows)
#    0 means include in the first test split (validation rows)
test_fold_L = np.concatenate([
    np.full(y_trByYear_N.size, -1.0),
    np.zeros(y_vaByYear_T.size),
    ])

# Create splitter object using Predefined Split
my_ByYear_splitter = sklearn.model_selection.PredefinedSplit(test_fold_L)

# Problem 3: Linear Kernel Regression Baseline



### Load kernel function from your completed `linear_kernel.py` file

In [None]:
# Reload so that any simultaneous edits to linear_kernel.py
# are picked up when this cell is executed again
import linear_kernel
importlib.reload(linear_kernel)

### Establish a pipeline for Linear Kernel Regression

Makes use of sklearn's `FunctionTransformer` (see [docs](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.FunctionTransformer.html)), which will apply kernel function as first 'transformation' step in our pipeline.

This pipeline has two steps: 'linearkernelTransformer' and 'ridgeRegressor'

What happens when we call `fit(x_train_NF)` with this pipeline?

* 1) Transforms input features `x_train_NF` to kernel features `k_NN`
* * Entry i,j is the kernel between the i-th training feature and j-th training feature

* 2) Learns regression weight coefficient for each of the N columns of `k_NN`, by minimizing least squares objective with ridge penalty

$$
\min_{w \in \mathbb{R}^N, b} ~~ \sum_{n=1}^N (y_n - b - {\textstyle \sum_{i=1}^N w_i k(x_n, x_i)} )^2 ~ + \alpha \sum_{i=1}^N w_i^2
$$

What happens when we call `predict(x_QF)` with this pipeline?

* 1) Transform input features x_QF to the kernel features k_QN
* * Entry q,n is the kernel between the q-th test feature and n-th training feature

* 2) Apply the learned ridge regression to the k_QN 'kernel features'
* * Uses the learned weight coefficients for each of the N training examples to make a prediction

$$
\hat{y}(x_*) = b + {\textstyle \sum_{i=1}^N w_i k(x_*, x_i)}
$$

In [None]:
# Two-step pipeline: the kernel function turns raw features into kernel
# features, then ridge regression is fit on those kernel features.
klr = sklearn.pipeline.Pipeline(steps=[
    (
        'linearkernelTransformer',
        sklearn.preprocessing.FunctionTransformer(func=linear_kernel.calc_linear_kernel),
    ),
    (
        'ridgeRegressor',
        sklearn.linear_model.Ridge(alpha=1.0),
    ),
])

### Setup grid search 

Performance metric:

* `neg_mean_absolute_error` ([docs](https://scikit-learn.org/stable/modules/model_evaluation.html#the-scoring-parameter-defining-model-evaluation-rules), uses sklearn's built-in [mean_absolute_error](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.mean_absolute_error.html#sklearn.metrics.mean_absolute_error) metric, with negative sign to follow "higher is better" convention)

Hyperparameters to search:

* `alpha` penalty for RidgeRegression (larger values mean larger strength penalty on sum-of-squares of weights)

In [None]:
# Candidate ridge penalties: 11 log-spaced values from 1e-5 up to 1e+5
alpha_grid = np.logspace(start=-5, stop=5, num=11)

# Show every candidate, one per line
print("Possible Values for ridgeRegressor__alpha")
for alpha in alpha_grid:
    print("% 11.3f" % (alpha,))

In [None]:
# Search space for the grid search, keyed by pipeline parameter name
# ('<step name>__<parameter name>'); each value lists the candidates to try.
# The kernel step has a single candidate: it always compares against the
# training features.
klr_param_grid_by_name = {
    'ridgeRegressor__alpha': alpha_grid,
    'linearkernelTransformer__kw_args': [{'x_train_NF': x_trByYear_N1}],
}

#### TODO create the grid search

Make sure you use:

* The estimator defined above
* The parameter grid defined above
* The 'splitter' object defined above
* The 'scoring' performance metric defined above


In [None]:
# Package up into a sklearn GridSearch object
#
# estimator  : the linear-kernel + ridge pipeline defined above
# param_grid : candidate values, keyed by pipeline parameter name
# scoring    : negative mean absolute error (sign flipped so higher is better)
# cv         : the predefined splitter, so every candidate is fit on the
#              training rows and scored on the validation rows
# refit      : False, because the final model is rebuilt separately below
#              with kernel features computed against train+valid data
klr_grid_searcher = sklearn.model_selection.GridSearchCV(
    estimator=klr,
    param_grid=klr_param_grid_by_name,
    scoring='neg_mean_absolute_error',
    cv=my_ByYear_splitter,
    refit=False,
    return_train_score=True,
    )

### Run the grid search

In [None]:
# Run the search over every candidate configuration and time it
start_time_sec = time.time()
klr_grid_searcher.fit(X=x_all_ByYear_L1, y=y_all_ByYear_L)
elapsed_time_sec = time.time() - start_time_sec

# Tabulate the results, one row per candidate configuration
klr_search_results_df = pd.DataFrame(klr_grid_searcher.cv_results_).copy()
n_configs = len(klr_search_results_df)
print("Grid search of %3d configurations done after %6.1f sec" % (n_configs, elapsed_time_sec))

### Display search results

#### TODO determine the best hyperparameters

In [None]:
# Ridge penalty of the best-scoring candidate found by the grid search
best_alpha = klr_grid_searcher.best_params_['ridgeRegressor__alpha']
print("Best ridgeRegressor__alpha: %.5f" % best_alpha)

### Refit model with best hyperparameters on ALL development data (train+valid)

In [None]:
# Final linear-kernel pipeline: kernel features are computed against ALL
# development data (train + valid), and ridge uses the selected penalty.
best_klr = sklearn.pipeline.Pipeline(steps=[
    (
        'linearkernelTransformer',
        sklearn.preprocessing.FunctionTransformer(
            func=linear_kernel.calc_linear_kernel,
            kw_args={'x_train_NF': x_all_ByYear_L1}),
    ),
    (
        'ridgeRegressor',
        sklearn.linear_model.Ridge(alpha=best_alpha),
    ),
])

#### TODO fit model with the best hyperparameters

In [None]:
# Fit the final pipeline on ALL development data (train + valid)
best_klr.fit(x_all_ByYear_L1, y_all_ByYear_L)

### Make figure of predictions and data side-by-side

In [None]:
plt.plot(x_trByYear_N1, y_trByYear_N, 'bs', alpha=0.4, label='train data');
plt.plot(x_vaByYear_T1, y_vaByYear_T, 'cs', alpha=0.4, label='valid data');

# Evenly spaced grid of x locations at which to draw the fitted curve
L = 101
xgrid_L1 = np.linspace(-5, 6, L).reshape((L,1))

# Predictions of the refit linear kernel regressor at each grid location
yhat_L = best_klr.predict(xgrid_L1)

plt.plot(xgrid_L1, yhat_L, 'r-', linewidth=3, label='Linear Kernel prediction');

plt.legend(bbox_to_anchor=(1.0, 0.5));

# Same axis labels as the train/valid data figure above
plt.xlabel('x : Fractional Years since 1985');
plt.ylabel('y : Min Daily Temp (deg C)');
plt.title('Linear Kernel Regression: predictions vs. train and valid data');

# Problem 4: Squared Exponential Kernel Regression

In [None]:
import sqexp_kernel

# Allow any simultaneous edits to sqexp_kernel.py
# to be reloaded again into this notebook when this cell is executed
importlib.reload(sqexp_kernel)

### Establish a pipeline for SqExp Kernel Regression

Again, makes use of sklearn's `FunctionTransformer` (see [docs](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.FunctionTransformer.html)), which will apply our `calc_sqexp_kernel` kernel function as first 'transformation' step in our pipeline.

In [None]:
# Two-step pipeline: squared-exponential kernel features, then ridge regression
sqexp_klr = sklearn.pipeline.Pipeline(steps=[
    (
        'sqexpKernelTransformer',
        sklearn.preprocessing.FunctionTransformer(func=sqexp_kernel.calc_sqexp_kernel),
    ),
    (
        'ridgeRegressor',
        sklearn.linear_model.Ridge(alpha=1.0),
    ),
])

### Setup grid search for SqExp Kernel hyperparameters

In [None]:
# Candidate kernel length scales: 9 log-spaced values from 1e-2 to 1e0
length_scale_grid = np.logspace(start=-2, stop=0, num=9)
# Candidate ridge penalties: 5 log-spaced values from 1e-2 to 1e+3
alpha_grid = np.logspace(start=-2, stop=3, num=5)

# Show every candidate, one per line
print("Possible Values for sqexpKernelTransformer__kw_args['length_scale']")
for length_scale in length_scale_grid:
    print("% 11.3f" % (length_scale,))

print("Possible Values for ridgeRegressor__alpha")
for alpha in alpha_grid:
    print("% 11.3f" % (alpha,))

In [None]:
# Search space for the grid search, keyed by pipeline parameter name.
# The kernel step gets one kw_args dict per candidate length scale; every
# one of them compares against the training features.
sqexp_param_grid_by_name = {
    'sqexpKernelTransformer__kw_args': [
        {'length_scale': ell, 'x_train_NF': x_trByYear_N1}
        for ell in length_scale_grid
    ],
    'ridgeRegressor__alpha': alpha_grid,
}

In [None]:
# Package up into a sklearn GridSearch object
#
# estimator  : the sqexp-kernel + ridge pipeline defined above
# param_grid : candidate values, keyed by pipeline parameter name
# scoring    : negative mean absolute error (sign flipped so higher is better)
# cv         : the predefined splitter, so every candidate is fit on the
#              training rows and scored on the validation rows
# refit      : False, because the final model is rebuilt separately below
#              with kernel features computed against train+valid data
sqexp_klr_grid_searcher = sklearn.model_selection.GridSearchCV(
    estimator=sqexp_klr,
    param_grid=sqexp_param_grid_by_name,
    scoring='neg_mean_absolute_error',
    cv=my_ByYear_splitter,
    refit=False,
    return_train_score=True,
    )

### Run grid search: find hyperparameters that yield best heldout predictions

In [None]:
# Run the search over every candidate configuration and time it
start_time_sec = time.time()
sqexp_klr_grid_searcher.fit(X=x_all_ByYear_L1, y=y_all_ByYear_L)
elapsed_time_sec = time.time() - start_time_sec

# Tabulate the results, one row per candidate configuration
sqexp_klr_search_results_df = pd.DataFrame(sqexp_klr_grid_searcher.cv_results_).copy()
n_configs = len(sqexp_klr_search_results_df)
print("Grid search of %3d configurations done after %6.1f sec" % (n_configs, elapsed_time_sec))

### Display search results

#### TODO determine the best hyperparameters

In [None]:
# Hyperparameters of the best-scoring candidate found by the grid search.
# The length scale lives inside the kernel step's kw_args dict.
sqexp_best_params = sqexp_klr_grid_searcher.best_params_
best_length_scale = sqexp_best_params['sqexpKernelTransformer__kw_args']['length_scale']
best_alpha = sqexp_best_params['ridgeRegressor__alpha']
print("Best length_scale: %.5f" % best_length_scale)
print("Best ridgeRegressor__alpha: %.5f" % best_alpha)

### Refit model with best hyperparameters on ALL development data (train+valid)

In [None]:
# Final sqexp-kernel pipeline: kernel features are computed against ALL
# development data (train + valid), using the selected length scale and
# ridge penalty.
best_sqexp_klr = sklearn.pipeline.Pipeline(steps=[
    (
        'sqexpKernelTransformer',
        sklearn.preprocessing.FunctionTransformer(
            func=sqexp_kernel.calc_sqexp_kernel,
            kw_args={
                'length_scale': best_length_scale,
                'x_train_NF': x_all_ByYear_L1,
            }),
    ),
    (
        'ridgeRegressor',
        sklearn.linear_model.Ridge(alpha=best_alpha),
    ),
])

In [None]:
# Fit the final pipeline on ALL development data (train + valid)
best_sqexp_klr.fit(x_all_ByYear_L1, y_all_ByYear_L)

### Figure 4: Predictions and actual temperatures for all data

In [None]:
plt.plot(x_trByYear_N1, y_trByYear_N, 'bs', alpha=0.4, label='train data');
plt.plot(x_vaByYear_T1, y_vaByYear_T, 'cs', alpha=0.4, label='valid data');

# Evenly spaced grid of x locations at which to draw the fitted curve
L = 101
xgrid_L1 = np.linspace(-5, 6, L).reshape((L,1))

# Predictions of the refit sqexp kernel regressor at each grid location
yhat_L = best_sqexp_klr.predict(xgrid_L1)

plt.plot(xgrid_L1, yhat_L, 'r-', linewidth=3, label='SqExp Kernel prediction');

plt.legend(bbox_to_anchor=(1.0, 0.5));

# Same axis labels as the train/valid data figure above
plt.xlabel('x : Fractional Years since 1985');
plt.ylabel('y : Min Daily Temp (deg C)');
plt.title('SqExp Kernel Regression: predictions vs. train and valid data');

# Problem 5: Periodic kernel

In [None]:
import periodic_kernel

# Allow any simultaneous edits to periodic_kernel.py
# to be reloaded again into this notebook when this cell is executed
importlib.reload(periodic_kernel)

In [None]:
# Two-step pipeline: periodic kernel features, then ridge regression
periodic_klr = sklearn.pipeline.Pipeline(steps=[
    (
        'periodicKernelTransformer',
        sklearn.preprocessing.FunctionTransformer(func=periodic_kernel.calc_periodic_kernel),
    ),
    (
        'ridgeRegressor',
        sklearn.linear_model.Ridge(alpha=1.0),
    ),
])

In [None]:
# Candidate values for each hyperparameter of the periodic kernel search
periodic_grid = [0.5, 1.0, 1.5, 2.0]          # kernel 'period' values
length_scale_grid = [0.01, 0.03, 0.09, 0.27]  # kernel 'length_scale' values
alpha_grid = [1.0, 10.0]                      # ridge penalty values

# Show every candidate, one per line.
# The label uses 'period' because that is the kw_args key the kernel receives.
print("Possible Values for periodicKernelTransformer__kw_args['period']")
for period in periodic_grid:
    print("% 11.3f" % period)

print("Possible Values for periodicKernelTransformer__kw_args['length_scale']")
for length_scale in length_scale_grid:
    print("% 11.3f" % length_scale)

print("Possible Values for ridgeRegressor__alpha")
for alpha in alpha_grid:
    print("% 11.3f" % alpha)

In [None]:
# Search space for the grid search, keyed by pipeline parameter name.
# The kernel step gets one kw_args dict per (length_scale, period) pair,
# with length_scale varying slowest; every one of them compares against
# the training features.
periodic_param_grid_by_name = {
    'periodicKernelTransformer__kw_args': [
        {'length_scale': ell, 'period': p, 'x_train_NF': x_trByYear_N1}
        for ell in length_scale_grid
        for p in periodic_grid
    ],
    'ridgeRegressor__alpha': alpha_grid,
}

In [None]:
# Create grid search object
#
# estimator  : the periodic-kernel + ridge pipeline defined above
# param_grid : candidate values, keyed by pipeline parameter name
# scoring    : negative mean absolute error (sign flipped so higher is better)
# cv         : the predefined splitter, so every candidate is fit on the
#              training rows and scored on the validation rows
# refit      : False, because the final model is rebuilt separately below
#              with kernel features computed against train+valid data
periodic_klr_grid_searcher = sklearn.model_selection.GridSearchCV(
    estimator=periodic_klr,
    param_grid=periodic_param_grid_by_name,
    scoring='neg_mean_absolute_error',
    cv=my_ByYear_splitter,
    refit=False,
    return_train_score=True,
    )

### Run grid search for 'period' and 'length_scale' and 'alpha' hyperparameters

In [None]:
# Run the search over every candidate configuration and time it
start_time_sec = time.time()
periodic_klr_grid_searcher.fit(X=x_all_ByYear_L1, y=y_all_ByYear_L)
elapsed_time_sec = time.time() - start_time_sec

# Tabulate the results, one row per candidate configuration
periodic_klr_search_results_df = pd.DataFrame(periodic_klr_grid_searcher.cv_results_).copy()
n_configs = len(periodic_klr_search_results_df)
print("Grid search of %3d configurations done after %6.1f sec" % (n_configs, elapsed_time_sec))

### Determine the best hyperparameters

#### TODO identify the best hyperparameter

In [None]:
# Hyperparameters of the best-scoring candidate found by the grid search.
# Length scale and period live inside the kernel step's kw_args dict.
periodic_best_params = periodic_klr_grid_searcher.best_params_
best_alpha = periodic_best_params['ridgeRegressor__alpha']
best_length_scale = periodic_best_params['periodicKernelTransformer__kw_args']['length_scale']
best_period = periodic_best_params['periodicKernelTransformer__kw_args']['period']
print("Best ridgeRegressor__alpha: %.5f" % best_alpha)
print("Best length_scale: %.5f" % best_length_scale)
print("Best period: %.5f" % best_period)

### Retrain model with best hyperparameters on ALL development data

In [None]:
# Final periodic-kernel pipeline: kernel features are computed against ALL
# development data (train + valid), using the selected length scale and period.
best_periodic_klr = sklearn.pipeline.Pipeline([
    ('periodicKernelTransformer', sklearn.preprocessing.FunctionTransformer(
        periodic_kernel.calc_periodic_kernel,
        kw_args=dict(
            length_scale=best_length_scale,
            period=best_period,
            x_train_NF=x_all_ByYear_L1,
            ))),
    # Use the selected ridge penalty rather than a hard-coded 1.0, matching
    # the final linear and sqexp pipelines
    ('ridgeRegressor', sklearn.linear_model.Ridge(alpha=best_alpha)),
    ])

In [None]:
# Fit the final pipeline on ALL development data (train + valid)
best_periodic_klr.fit(x_all_ByYear_L1, y_all_ByYear_L)

In [None]:
plt.plot(x_trByYear_N1, y_trByYear_N, 'bs', alpha=0.4, label='train data');
plt.plot(x_vaByYear_T1, y_vaByYear_T, 'cs', alpha=0.4, label='valid data');

# Evenly spaced grid of x locations at which to draw the fitted curve
L = 101
xgrid_L1 = np.linspace(-5, 6, L).reshape((L,1))

# Predictions of the refit periodic kernel regressor at each grid location
yhat_L = best_periodic_klr.predict(xgrid_L1)

plt.plot(xgrid_L1, yhat_L, 'r-', linewidth=3, label='Periodic Kernel prediction');

plt.legend(bbox_to_anchor=(1.0, 0.5));

# Same axis labels as the train/valid data figure above
plt.xlabel('x : Fractional Years since 1985');
plt.ylabel('y : Min Daily Temp (deg C)');
plt.title('Periodic Kernel Regression: predictions vs. train and valid data');

# Problem 6: Compare Methods on Test Set

### Load test data

In [None]:
# Test rows of the 'ByYear' split
test_csv_path = os.path.join(DATA_DIR, 'data_test_ByYear.csv')
data_te_df = pd.read_csv(test_csv_path)

# x gains a trailing axis so it is a 2D (T,1) feature array
x_te_T1 = data_te_df['years_since_19850101'].values[:, np.newaxis].copy()
# y is the 1D (T,) array of response values
y_te_T = data_te_df['temp_deg_C'].values.copy()

### Implement Baseline: Periodic nearest neighbor

#### TODO implement the method below to make baseline predictions

In [None]:
def predict_periodic_nearest_neighbor(
        x_te_T1,
        x_all_ByYear_L1, y_all_ByYear_L,
        years_back=(2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0)):
    ''' Compute baseline prediction for each day in test set

    For every test location x and every offset d in `years_back`, find the
    train+valid example whose x value is closest to (x - d) and take its y.
    The prediction for x is the plain average of those y values.

    NOTE(review): this averaging rule is the natural reading of the function
    name and arguments; confirm it against the official HW5 instructions.

    Args
    ----
    x_te_T1 : 2D array, shape (T, 1) = (n_test_examples, 1)
        Contains x features for the test set
    x_all_ByYear_L1 : 2D array, shape (n_trainvalid_examples, 1)
        Contains x features for train+valid set
    y_all_ByYear_L : 1D array, shape (n_trainvalid_examples,)
        Contains y response values for train+valid set
    years_back : sequence of floats, optional
        Offsets subtracted from each test x before the neighbor lookup.
        Defaults to 2.0, 3.0, ..., 8.0 (an immutable tuple, so the default
        cannot be altered between calls).

    Returns
    -------
    yhat_T : 1D array, shape (T,)
        Predicted y values for each test example
        Using 'periodic' nearest neighbors method

    Raises
    ------
    ValueError
        If x_te_T1 does not have exactly one feature column, if the
        train+valid set is empty, if its x and y sizes disagree, or if
        `years_back` is empty.
    '''
    x_te_T1 = np.asarray(x_te_T1, dtype=np.float64)
    assert x_te_T1.ndim == 2
    T, F = x_te_T1.shape
    if F != 1:
        raise ValueError("x_te_T1 must have shape (T, 1), got %s" % str(x_te_T1.shape))

    # Flatten the reference data so each example is one scalar x and one y
    x_all_L = np.asarray(x_all_ByYear_L1, dtype=np.float64).reshape(-1)
    y_all_L = np.asarray(y_all_ByYear_L, dtype=np.float64).reshape(-1)
    offsets_K = np.asarray(years_back, dtype=np.float64).reshape(-1)

    if x_all_L.size == 0:
        raise ValueError("train+valid set must contain at least one example")
    if x_all_L.size != y_all_L.size:
        raise ValueError("x_all_ByYear_L1 and y_all_ByYear_L must have the same number of examples")
    if offsets_K.size == 0:
        raise ValueError("years_back must contain at least one offset")

    # Accumulate one nearest-neighbor y per offset, then average.
    # Looping over the few offsets keeps the distance table at (T, L)
    # instead of (T, K, L).
    ysum_T = np.zeros(T)
    for offset in offsets_K:
        target_T1 = x_te_T1 - offset
        dist_TL = np.abs(target_T1 - x_all_L[np.newaxis, :])
        # On an exact tie, argmin keeps the earliest example
        nearest_T = np.argmin(dist_TL, axis=1)
        ysum_T += y_all_L[nearest_T]

    yhat_T = ysum_T / offsets_K.size
    return yhat_T


# Table 6: Comparison of methods on the test set

Please report MEAN ABSOLUTE ERROR on both the train+valid and test sets, to 3 digits of precision

### TODO make a table like this

![image.png](attachment:image.png)