# HW2: Cross-Validation and Regularization

### Configuration and imports


In [None]:
import os
import numpy as np
import pandas as pd
 
import sklearn.preprocessing
import sklearn.pipeline
import sklearn.linear_model
import sklearn.metrics
import sklearn.model_selection


In [2]:
from matplotlib import pyplot as plt

import seaborn as sns
#This sets the default style for all figures. 
sns.set('notebook', font_scale=1.25, style='whitegrid')

You may need to adjust your data directory to point towards the auto data. Please do not adjust the random seed for the version of your results you turn in.  

In [3]:
# Fixed random seed for the assignment; keep this value unchanged in the
# version of the results that is turned in.
SEED = 12345 

# Directory holding the auto dataset CSV files. The loader helpers join this
# with a file name, so adjust it if the data lives elsewhere.
DATA_DIR = 'data_auto/'

## Helper Functions

Here we provide some functions you may find useful for your analysis. You are not **required** to use these helper functions, but you may find them useful.

### Data Loading 

In [4]:
def load_2d_arr_from_csv(fname, include_header=False):
    """Load a numeric CSV file from DATA_DIR as a 2D array.

    The first row of the file is treated as a header and is skipped when
    parsing the numeric values.

    Args:
        fname: File name of the CSV, relative to DATA_DIR.
        include_header: If True, also return the column names read from the
            first row of the file.

    Returns:
        The 2D float array of values, or a tuple ``(x, header_cols)`` when
        ``include_header`` is True, where ``header_cols`` is a list of str.

    Raises:
        ValueError: If the parsed data is not 2D (np.loadtxt squeezes a file
            with a single row or a single column down to fewer dimensions).
    """
    csv_path = os.path.join(DATA_DIR, fname)
    x = np.loadtxt(csv_path, delimiter=',', skiprows=1)
    # Raise instead of assert so the check survives `python -O`, and so both
    # loader helpers report a bad shape with the same exception type.
    if x.ndim != 2:
        raise ValueError(
            "Not 2d: %s has shape %s" % (csv_path, x.shape))
    if not include_header:
        return x
    # Only the first row is needed for the column names; max_rows=1 avoids
    # re-parsing the whole file as strings.
    header_row = np.loadtxt(csv_path, delimiter=',', dtype=str, max_rows=1)
    header_cols = np.atleast_1d(header_row).tolist()
    return x, header_cols
    
def load_1d_arr_from_csv(fname):
    """Load a numeric CSV file from DATA_DIR as a 1D array.

    The first row of the file is treated as a header and is skipped.

    Args:
        fname: File name of the CSV, relative to DATA_DIR.

    Returns:
        A 1D float array of the values in the file.

    Raises:
        ValueError: If the parsed data has more than one dimension.
    """
    csv_path = os.path.join(DATA_DIR, fname)
    # ndmin=1 keeps a file holding a single data value as a length-1 vector.
    # Without it np.loadtxt squeezes that value to a 0-d array, which would
    # be rejected below even though it is a valid one-element vector.
    x = np.loadtxt(csv_path, delimiter=',', skiprows=1, ndmin=1)
    if x.ndim != 1:
        raise ValueError("Not 1d: %s has shape %s" % (csv_path, x.shape))
    return x

These functions can then be used as follows.

In [5]:
# Load the train / validation / test feature matrices. Only the training file
# is asked for its header row, which supplies the feature column names.
# NOTE(review): the name suffixes look like shape hints (M / N / P = number of
# train / valid / test rows, F = number of features) — confirm against the
# course naming convention.
x_tr_MF, xcolnames_F = load_2d_arr_from_csv('x_train.csv', include_header=True)
x_va_NF = load_2d_arr_from_csv('x_valid.csv')
x_te_PF = load_2d_arr_from_csv('x_test.csv')

# Load the matching 1D target vectors for each split.
y_tr_M = load_1d_arr_from_csv('y_train.csv')
y_va_N = load_1d_arr_from_csv('y_valid.csv')
y_te_P = load_1d_arr_from_csv('y_test.csv')

# Quick sanity check: column names, then the first 5 training rows and targets.
print(xcolnames_F)
print(x_tr_MF[:5])
# np.newaxis turns the 1D slice into a column so it prints one value per row.
print(y_tr_M[:5,np.newaxis])

['horsepower', 'weight', 'cylinders', 'displacement']
[[ 115. 2595.    6.  173.]
 [ 180. 4380.    8.  350.]
 [ 150. 4457.    8.  318.]
 [ 105. 3897.    6.  250.]
 [ 193. 4732.    8.  304.]]
[[28.8]
 [16.5]
 [14. ]
 [16. ]
 [ 9. ]]


## Plotting

In [6]:
def plot_train_and_valid_error_vs_hyper(
        hyper_list, err_tr_list=None, err_va_list=None,
        ymax=40,
        leg_loc='upper right',
        xlabel='polynomial degree',
        ylabel='RMSE'):
    """Plot training and/or validation error against a hyperparameter.

    Draws through the module-level pyplot interface, so the curves are added
    to whatever figure/axes pyplot currently considers active.

    Args:
        hyper_list: Hyperparameter values for the x axis.
        err_tr_list: Training errors, one per hyperparameter value; the
            curve is skipped when None.
        err_va_list: Validation errors, one per hyperparameter value; the
            curve is skipped when None.
        ymax: Upper limit of the y axis (the lower limit is fixed at 0).
        leg_loc: Legend location passed to ``plt.legend``.
        xlabel: Label for the x axis.
        ylabel: Label for the y axis.
    """
    # (errors, matplotlib format string, legend label); the validation curve
    # is listed first so it is drawn first and appears first in the legend.
    curves = (
        (err_va_list, 'rs-', 'valid'),
        (err_tr_list, 'bd:', 'train'),
    )
    for err_list, fmt, label in curves:
        if err_list is not None:
            plt.plot(hyper_list, err_list, fmt, label=label)

    plt.ylim([0, ymax])
    plt.legend(loc=leg_loc)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)

## Building a regression pipeline

### Sanitizing output
You should sanitize any model's predictions to help ensure that the predicted values are physically plausible. 
We are predicting MPG, which
* (1) should always be positive, and
* (2) will probably never exceed 120% of the largest value we see in the training data

Predictions should be sanitized before being used to calculate errors or select best-performing models for your report. 

In [7]:
Y_MAX = 60.0 # The max MPG is about 48 in the training set

def sanitize(yhat_N):
    """Clamp predicted MPG values into the plausible range [0, Y_MAX].

    Args:
        yhat_N: Predicted values (array-like or scalar).

    Returns:
        The predictions with values below 0 raised to 0 and values above
        Y_MAX lowered to Y_MAX. The input is not modified.
    """
    return np.clip(yhat_N, 0, Y_MAX)

### Creating multiple pipelines

This function returns a scikit-learn `Pipeline` that fits an unpenalized linear regression on polynomial features of the specified degree. 
You'll need to make a version that takes in an additional parameter, `alpha`, and uses `sklearn.linear_model.Ridge` to create ridge polynomial regression pipelines. 

In [8]:
def make_pipeline__unpenalized_linear_regr_with_poly_feats(degree=1):
    """Build an unfitted pipeline for polynomial linear regression.

    The pipeline chains three named steps:
      * 'rescaler': a MinMaxScaler applied to the raw features,
      * 'poly_transformer': PolynomialFeatures of the given degree, created
        with include_bias=False,
      * 'regr': an ordinary (unpenalized) LinearRegression.

    Args:
        degree: Degree of the polynomial feature expansion.

    Returns:
        A sklearn.pipeline.Pipeline, which can be used like a single
        regressor through its fit and predict methods.
    """
    rescaler = sklearn.preprocessing.MinMaxScaler()
    poly_transformer = sklearn.preprocessing.PolynomialFeatures(
        degree=degree, include_bias=False)
    regr = sklearn.linear_model.LinearRegression()

    # The step names are part of the contract: other code looks steps up by
    # name through pipeline.named_steps.
    return sklearn.pipeline.Pipeline(steps=[
        ('rescaler', rescaler),
        ('poly_transformer', poly_transformer),
        ('regr', regr),
    ])

### Inspecting learned weights

We can access individual steps of the pipeline to extract information, as demonstrated in the function below.

In [9]:
def pretty_print_learned_weights(pipeline, xcolnames_F):
    """Print the intercept and per-feature weights of a fitted pipeline.

    The pipeline must expose the named steps "poly_transformer" (providing
    get_feature_names_out) and "regr" (providing coef_ and intercept_).

    Args:
        pipeline: Fitted pipeline whose learned parameters are printed.
        xcolnames_F: Names of the original input columns, printed as a
            legend mapping x0, x1, ... to column names.
    """
    steps = pipeline.named_steps
    regressor = steps['regr']
    poly_names = steps['poly_transformer'].get_feature_names_out()

    print("intercept: %.2f" % (regressor.intercept_))
    print("")

    # One row per expanded feature: the learned weight, then the feature name.
    print("%9s   %s" % ("weight", "feature var"))
    for name, weight in zip(poly_names, regressor.coef_):
        print("% 9.2f * %s" % (weight, name))

    # Legend tying the generic x<i> names back to the input column names.
    print("where ")
    for idx, colname in enumerate(xcolnames_F):
        print("x%d = %s" % (idx, colname))

# Your Code