# 1. Setting up

Import all required modules

In [37]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score

Read in the weather data and view it. For additional info on the weather dataset we're using, see [this](https://corgis-edu.github.io/corgis/csv/weather/).

In [38]:
# TODO: read in weather data and view
data = pd.read_csv('weather.csv')

# 2. Pre-processing

We want to predict the max temperature for a particular week. **What input columns should we use?** (There are multiple reasonable answers.)

In [39]:
input_cols = ['Date.Month', 'Date.Week of', 'Data.Temperature.Avg Temp', 'Data.Temperature.Min Temp']
output_col = 'Data.Temperature.Max Temp'

Split the weather data into a training and test set and then store the input and target data seperately for each set. Use [train/test split](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html) from sklearn.

In [40]:
from sklearn.model_selection import train_test_split
train, test = train_test_split(data, test_size=0.8)

train_X = train[input_cols]
train_y = train[output_col]

test_X = test[input_cols]
test_y = test[output_col]

Check the shape of each set to make sure they make sense!

In [41]:
print("train input data shape:", train_X.shape)
print("train target data shape:", train_y.shape)
print()
print("test input data shape:", test_X.shape)
print("test target data shape:", test_y.shape)

train input data shape: (3348, 4)
train target data shape: (3348,)

test input data shape: (13395, 4)
test target data shape: (13395,)


Normalize training and test set input data (X) using statistics generated from the training set. To do this, use the [Standard Scaler](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html) from sklearn. (**Conceptual Check**: Why is it important to use statistics generated from the training set?)

In [42]:
scaler = StandardScaler()
scaler.fit(train[input_cols], train[output_col])
train_X_norm = scaler.transform(train_X)

scaler.fit(test[input_cols], test[output_col])
test_X_norm = scaler.transform(test_X)

View the type of the data post-normalization (as well as the data itself).

In [43]:
print("data type after normalizaton:", type(train_X_norm))
pd.DataFrame(train_X_norm)

data type after normalizaton: <class 'numpy.ndarray'>


Unnamed: 0,0,1,2,3
0,1.018106,-1.524898,0.072466,-0.339383
1,1.308547,1.278642,-0.411903,-0.502435
2,0.437223,-0.179199,0.072466,-0.339383
3,0.146782,0.942217,1.364116,1.399837
4,0.727664,-0.515624,1.364116,1.508538
...,...,...,...,...
3343,0.727664,0.269368,0.987385,0.965032
3344,-0.434100,-0.852048,-0.680997,-0.719837
3345,1.308547,1.278642,0.341559,0.312825
3346,-1.305424,1.390784,-0.196628,-0.556785


# 3. Regularization with Ridge

Create a [Ridge](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Ridge.html) linear model with a regularization coefficent of 1. 

Note: This coefficent is referred to as "lambda (λ)" in course material and "alpha" in the sklearn docs. They are the same thing!

In [44]:
from sklearn.linear_model import Ridge
ridge_model = Ridge(alpha=1.0)

Train the model using the training data and output the training error. To do so, define a function rmse(mode, X, y) that calculates the RMSE error for a given model, input, and target data.

In [45]:
def rmse(model, X, y):
    predictions = model.predict(X)
    return mean_squared_error(predictions, y, squared=False)

In [46]:
ridge_model.fit(train_X, train_y)
rmse(ridge_model, test_X_norm, test_y)



68.44204780668359

Perform 5-fold cross validation with your Ridge model. Output the array of errors (length 5) as well as the mean error. You should use [Cross Validation Score](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.cross_val_score.html?highlight=cross_val_scor) from sklearn to do this.

In [47]:
# TODO: fill out parameters for cross_val_score() and print errors
ridge_CV_scores = cross_val_score(ridge_model, train_X_norm, train_y, cv=5, scoring=rmse)

Perform 5-fold cross validation on Ridge models with a range of alpha values. For each alpha, print the alpha value and the corresponding mean CV score.

In [48]:
for reg_coef in [0.1, 1, 10, 100, 1000, 10e4]:
    ridge_model = Ridge(alpha=reg_coef)
    ridge_CV_scores = cross_val_score(ridge_model, train_X_norm, train_y, cv=5, scoring=rmse)
    print(reg_coef, ridge_CV_scores.mean(), sep='\t')

0.1	1.3252714051732424
1	1.332910589536583
10	1.582447909331862
100	3.832236652048197
1000	6.36110043092546
100000.0	18.682624319200322


Take a look at how the weights of Ridge models change as you change the regularization coefficient!

In [49]:
for reg_coef in [1, 100, 10e4, 10e7, 10e12]:
    ridge_model = Ridge(alpha=reg_coef)
    ridge_model.fit(train_X_norm, train_y)
    print(ridge_model.intercept_, ridge_model.coef_)

66.55585424133811 [-2.72691328e-02 -1.43659140e-03  3.56992139e+01 -1.69182130e+01]
66.55585424133811 [ 0.02952249  0.15925421 20.96505634 -2.48285882]
66.55585424133811 [0.11537528 0.04575605 0.60286437 0.56094186]
66.55585424133811 [1.27199023e-04 4.99301174e-05 6.42239492e-04 6.00268369e-04]
66.55585424133811 [1.27211795e-09 4.99346174e-10 6.42281604e-09 6.00310430e-09]


**BONUS**: How would the weights be different if you didn't regularize them? (i.e., use `LinearRegression` instead of `Ridge`.)

# 4. Regularization with LASSO

Create a [LASSO](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Lasso.html) linear model with a regularization coefficent of 1.

In [51]:
from sklearn.linear_model import Lasso
lasso_model = Lasso(alpha=1.0)

Train the model using the training data and output the training error.

In [52]:
lasso_model.fit(train_X_norm, train_y)
rmse(lasso_model, test_X_norm, test_y)

4.090033346755294

Perform 5-fold cross validation with your LASSO model. Output the array of errors (length 5) as well as the mean error.

In [53]:
# TODO: fill out parameters for cross_val_score() and print errors
lasso_CV_scores = cross_val_score(lasso_model, train_X_norm, train_y, cv=5, scoring=rmse)

Perform 5-fold cross validation on LASSO models with a range of alpha values. For each alpha, print the alpha value and the corresponding mean CV score.

In [54]:
for reg_coef in [0.1, 1, 10, 100, 1000, 10e4]:
    lasso_model = Lasso(alpha=reg_coef)
    lasso_CV_scores = cross_val_score(lasso_model, train_X_norm, train_y, cv=5, scoring=rmse)
    print(reg_coef, lasso_CV_scores.mean(), sep='\t')

0.1	1.663222053488721
1	4.100843144438938
10	10.769562567291945
100	19.59484029659118
1000	19.59484029659118
100000.0	19.59484029659118


Take a look at how the weights of LASSO models change as you change the regularization coefficient!

Note: In python, -0 is the same as 0!

In [55]:
for reg_coef in [0, 0.1, 1, 10, 100]:
    lasso_model = Lasso(alpha=reg_coef)
    lasso_model.fit(train_X_norm, train_y)
    print(lasso_model.intercept_, lasso_model.coef_)

66.55585424133811 [-2.78578960e-02 -5.01221518e-03  3.60355838e+01 -1.72513867e+01]
66.55585424133811 [ -0.           0.          31.71964134 -12.94296107]
66.55585424133811 [ 0.          0.         18.18403835  0.        ]
66.55585424133811 [0.         0.         9.18403835 0.        ]
66.55585424133811 [0. 0. 0. 0.]


  lasso_model.fit(train_X_norm, train_y)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


# 5. Computing final test scores

Using the regularization coefficient that leads to the best validation error, compute test scores for a Ridge and LASSO model.

In [59]:
# TODO: choose best alphas from above and calculate test errors
print("Ridge", rmse(Ridge(alpha=100000.0).fit(train_X_norm, train_y), test_X_norm, test_y))
print("LASSO", rmse(Lasso(alpha=100.0).fit(train_X_norm, train_y), test_X_norm, test_y))
print("LinearRegression", rmse(LinearRegression().fit(train_X_norm, train_y), test_X_norm, test_y))

Ridge 18.71119819148913
LASSO 19.84408295677865
LinearRegression 1.5852190216197686
