# Cross-Validation with Ridge and Lasso

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import statsmodels.api as sm
from statsmodels.formula.api import ols
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 200)
#SK LEARN
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import train_test_split
import sklearn.metrics as metrics
from sklearn.metrics import mean_squared_error


We will euse the Kings County housing dataset in this notebook

In [2]:
df = pd.read_csv('https://raw.githubusercontent.com/learn-co-curriculum/dsc-mod-2-project-v2-1/master/kc_house_data.csv',index_col=0)

## Data Cleaning and Prep

In [3]:
df['date'] = pd.to_datetime(df['date'])

df['yr_sold']=  df['date'].map(lambda x: x.year)

df['yrs_old'] =  df['yr_built'].map(lambda x: 2016-x)

df['yr_since_reno'] =  df['yr_renovated'].map(lambda x: 2016-x if x > 0 else np.nan)

df['yrs_since_update'] = df.apply(lambda x: min(x['yrs_old'], x['yr_since_reno']), axis=1)


df['bedrooms']=df['bedrooms'].map(lambda x: x if x < 10 else 10)

df.replace('?', 0, inplace=True)

df['sqft_basement'] = pd.to_numeric(df['sqft_basement'])

df.fillna(0, inplace=True)

In [4]:
zip_df = pd.get_dummies(df['zipcode'], drop_first=True)


In [5]:
df.shape

(21597, 24)

In [6]:
zip_df.shape

(21597, 69)

In [7]:
target= np.log(df.price)

features = df.drop(columns=['date', 'price', 'lat', 'long', 'yr_built', 'yr_renovated', 'yr_since_reno', 'zipcode'])

In [8]:
polynomial_features_2= PolynomialFeatures(degree=2, include_bias=False)
features_poly = polynomial_features_2.fit_transform(features)
poly_columns = polynomial_features_2.get_feature_names(features.columns)

In [9]:
features_poly = pd.DataFrame(features_poly, columns=poly_columns)

In [10]:
zip_df.reset_index(drop=True, inplace=True)

In [11]:
features_poly.head()

Unnamed: 0,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,sqft_living15,sqft_lot15,yr_sold,yrs_old,yrs_since_update,bedrooms^2,bedrooms bathrooms,bedrooms sqft_living,bedrooms sqft_lot,bedrooms floors,bedrooms waterfront,bedrooms view,bedrooms condition,bedrooms grade,bedrooms sqft_above,bedrooms sqft_basement,bedrooms sqft_living15,bedrooms sqft_lot15,bedrooms yr_sold,bedrooms yrs_old,bedrooms yrs_since_update,bathrooms^2,bathrooms sqft_living,bathrooms sqft_lot,bathrooms floors,bathrooms waterfront,bathrooms view,bathrooms condition,bathrooms grade,bathrooms sqft_above,bathrooms sqft_basement,bathrooms sqft_living15,bathrooms sqft_lot15,bathrooms yr_sold,bathrooms yrs_old,bathrooms yrs_since_update,sqft_living^2,sqft_living sqft_lot,sqft_living floors,...,view sqft_living15,view sqft_lot15,view yr_sold,view yrs_old,view yrs_since_update,condition^2,condition grade,condition sqft_above,condition sqft_basement,condition sqft_living15,condition sqft_lot15,condition yr_sold,condition yrs_old,condition yrs_since_update,grade^2,grade sqft_above,grade sqft_basement,grade sqft_living15,grade sqft_lot15,grade yr_sold,grade yrs_old,grade yrs_since_update,sqft_above^2,sqft_above sqft_basement,sqft_above sqft_living15,sqft_above sqft_lot15,sqft_above yr_sold,sqft_above yrs_old,sqft_above yrs_since_update,sqft_basement^2,sqft_basement sqft_living15,sqft_basement sqft_lot15,sqft_basement yr_sold,sqft_basement yrs_old,sqft_basement yrs_since_update,sqft_living15^2,sqft_living15 sqft_lot15,sqft_living15 yr_sold,sqft_living15 yrs_old,sqft_living15 yrs_since_update,sqft_lot15^2,sqft_lot15 yr_sold,sqft_lot15 yrs_old,sqft_lot15 yrs_since_update,yr_sold^2,yr_sold yrs_old,yr_sold yrs_since_update,yrs_old^2,yrs_old yrs_since_update,yrs_since_update^2
0,3.0,1.0,1180.0,5650.0,1.0,0.0,0.0,3.0,7.0,1180.0,0.0,1340.0,5650.0,2014.0,61.0,61.0,9.0,3.0,3540.0,16950.0,3.0,0.0,0.0,9.0,21.0,3540.0,0.0,4020.0,16950.0,6042.0,183.0,183.0,1.0,1180.0,5650.0,1.0,0.0,0.0,3.0,7.0,1180.0,0.0,1340.0,5650.0,2014.0,61.0,61.0,1392400.0,6667000.0,1180.0,...,0.0,0.0,0.0,0.0,0.0,9.0,21.0,3540.0,0.0,4020.0,16950.0,6042.0,183.0,183.0,49.0,8260.0,0.0,9380.0,39550.0,14098.0,427.0,427.0,1392400.0,0.0,1581200.0,6667000.0,2376520.0,71980.0,71980.0,0.0,0.0,0.0,0.0,0.0,0.0,1795600.0,7571000.0,2698760.0,81740.0,81740.0,31922500.0,11379100.0,344650.0,344650.0,4056196.0,122854.0,122854.0,3721.0,3721.0,3721.0
1,3.0,2.25,2570.0,7242.0,2.0,0.0,0.0,3.0,7.0,2170.0,400.0,1690.0,7639.0,2014.0,65.0,25.0,9.0,6.75,7710.0,21726.0,6.0,0.0,0.0,9.0,21.0,6510.0,1200.0,5070.0,22917.0,6042.0,195.0,75.0,5.0625,5782.5,16294.5,4.5,0.0,0.0,6.75,15.75,4882.5,900.0,3802.5,17187.75,4531.5,146.25,56.25,6604900.0,18611940.0,5140.0,...,0.0,0.0,0.0,0.0,0.0,9.0,21.0,6510.0,1200.0,5070.0,22917.0,6042.0,195.0,75.0,49.0,15190.0,2800.0,11830.0,53473.0,14098.0,455.0,175.0,4708900.0,868000.0,3667300.0,16576630.0,4370380.0,141050.0,54250.0,160000.0,676000.0,3055600.0,805600.0,26000.0,10000.0,2856100.0,12909910.0,3403660.0,109850.0,42250.0,58354321.0,15384946.0,496535.0,190975.0,4056196.0,130910.0,50350.0,4225.0,1625.0,625.0
2,2.0,1.0,770.0,10000.0,1.0,0.0,0.0,3.0,6.0,770.0,0.0,2720.0,8062.0,2015.0,83.0,83.0,4.0,2.0,1540.0,20000.0,2.0,0.0,0.0,6.0,12.0,1540.0,0.0,5440.0,16124.0,4030.0,166.0,166.0,1.0,770.0,10000.0,1.0,0.0,0.0,3.0,6.0,770.0,0.0,2720.0,8062.0,2015.0,83.0,83.0,592900.0,7700000.0,770.0,...,0.0,0.0,0.0,0.0,0.0,9.0,18.0,2310.0,0.0,8160.0,24186.0,6045.0,249.0,249.0,36.0,4620.0,0.0,16320.0,48372.0,12090.0,498.0,498.0,592900.0,0.0,2094400.0,6207740.0,1551550.0,63910.0,63910.0,0.0,0.0,0.0,0.0,0.0,0.0,7398400.0,21928640.0,5480800.0,225760.0,225760.0,64995844.0,16244930.0,669146.0,669146.0,4060225.0,167245.0,167245.0,6889.0,6889.0,6889.0
3,4.0,3.0,1960.0,5000.0,1.0,0.0,0.0,5.0,7.0,1050.0,910.0,1360.0,5000.0,2014.0,51.0,51.0,16.0,12.0,7840.0,20000.0,4.0,0.0,0.0,20.0,28.0,4200.0,3640.0,5440.0,20000.0,8056.0,204.0,204.0,9.0,5880.0,15000.0,3.0,0.0,0.0,15.0,21.0,3150.0,2730.0,4080.0,15000.0,6042.0,153.0,153.0,3841600.0,9800000.0,1960.0,...,0.0,0.0,0.0,0.0,0.0,25.0,35.0,5250.0,4550.0,6800.0,25000.0,10070.0,255.0,255.0,49.0,7350.0,6370.0,9520.0,35000.0,14098.0,357.0,357.0,1102500.0,955500.0,1428000.0,5250000.0,2114700.0,53550.0,53550.0,828100.0,1237600.0,4550000.0,1832740.0,46410.0,46410.0,1849600.0,6800000.0,2739040.0,69360.0,69360.0,25000000.0,10070000.0,255000.0,255000.0,4056196.0,102714.0,102714.0,2601.0,2601.0,2601.0
4,3.0,2.0,1680.0,8080.0,1.0,0.0,0.0,3.0,8.0,1680.0,0.0,1800.0,7503.0,2015.0,29.0,29.0,9.0,6.0,5040.0,24240.0,3.0,0.0,0.0,9.0,24.0,5040.0,0.0,5400.0,22509.0,6045.0,87.0,87.0,4.0,3360.0,16160.0,2.0,0.0,0.0,6.0,16.0,3360.0,0.0,3600.0,15006.0,4030.0,58.0,58.0,2822400.0,13574400.0,1680.0,...,0.0,0.0,0.0,0.0,0.0,9.0,24.0,5040.0,0.0,5400.0,22509.0,6045.0,87.0,87.0,64.0,13440.0,0.0,14400.0,60024.0,16120.0,232.0,232.0,2822400.0,0.0,3024000.0,12605040.0,3385200.0,48720.0,48720.0,0.0,0.0,0.0,0.0,0.0,0.0,3240000.0,13505400.0,3627000.0,52200.0,52200.0,56295009.0,15118545.0,217587.0,217587.0,4060225.0,58435.0,58435.0,841.0,841.0,841.0


In [12]:
features_poly = pd.merge(features_poly, zip_df, left_index=True, right_index=True)

In [13]:
#call train_test_split on the data and capture the results
X_train, X_test, y_train, y_test = train_test_split(features_poly, target, random_state=22,test_size=0.25)


In [14]:
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled =pd.DataFrame(data=scaler.transform(X_train), columns=features_poly.columns)
X_test_scaled =pd.DataFrame(data=scaler.transform(X_test), columns=features_poly.columns)

In [15]:
#instantiate a linear regression object
lm = LinearRegression()

#fit the linear regression to the data
lm = lm.fit(X_train_scaled, y_train)


print(lm.intercept_)
print(lm.coef_)

13.049903861156416
[ 8.08025912e+00 -1.01133320e+01  1.04941807e+02 -4.64872845e+00
 -1.43477299e+01 -5.18587475e+00  8.40690391e+00  3.01622411e+01
  8.32723972e-01 -8.63885605e+01 -5.32452667e+01  3.46420622e+00
  2.50311259e+00  2.40279026e-02 -1.00601863e+01 -2.72591225e+01
  1.37126475e-02 -1.17816966e-02 -5.11910932e-03  5.23645615e-02
  6.01561750e-02  4.27629316e-04  6.35957653e-04 -9.73613937e-03
  1.18344701e-01 -1.73332284e-01 -3.83254400e-02 -7.56426404e-03
 -4.47044150e-02 -8.13336132e+00 -1.52639253e-02  2.71755713e-02
 -8.35275072e-02  2.80916440e-01 -3.63185374e-02 -4.81369246e-02
  1.07795136e-02 -2.07636713e-02 -4.68840946e-03 -4.77510679e-02
 -8.28666126e-02 -5.39584846e-02  6.91707057e-04  1.38991241e-02
  1.02420255e+01 -1.48479332e-02 -1.56968574e-02 -3.73297648e-01
 -2.18139194e-02 -4.09708681e-01  1.99289100e-01 -5.55497506e-02
 -5.04298489e-02 -4.07379385e-01  4.16016317e-01  2.57322154e-02
  1.44906786e-01  8.91760897e-03 -1.04313777e+02  1.26553579e-01
 -1.65

In [16]:
y_train_pred = lm.predict(X_train)
y_pred = lm.predict(X_test)

In [17]:
train_mae = metrics.mean_absolute_error(y_train, y_train_pred)
train_mse = metrics.mean_squared_error(y_train, y_train_pred)
train_rmse = np.sqrt(metrics.mean_squared_error(y_train, y_train_pred))


print('Mean Absolute Error:', train_mae )
print('Mean Squared Error:',  train_mse)
print('Root Mean Squared Error:' , train_rmse)

Mean Absolute Error: 105006091.13314073
Mean Squared Error: 5.209109205726952e+17
Root Mean Squared Error: 721741588.5015185


In [18]:
from sklearn.feature_selection import RFECV
ols = LinearRegression()

In [19]:
#recursive wrapper method

# Create recursive feature eliminator that scores features by mean squared errors
selector = RFECV(estimator=ols, step=3, cv=5, scoring='neg_mean_squared_error', verbose =1, n_jobs=-1)

# Fit recursive feature eliminator 
selector.fit(X_train_scaled, y_train)



Fitting estimator with 221 features.
Fitting estimator with 218 features.
Fitting estimator with 215 features.
Fitting estimator with 212 features.
Fitting estimator with 209 features.
Fitting estimator with 206 features.
Fitting estimator with 203 features.
Fitting estimator with 200 features.
Fitting estimator with 197 features.
Fitting estimator with 194 features.


RFECV(cv=5,
      estimator=LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
                                 normalize=False),
      min_features_to_select=1, n_jobs=-1, scoring='neg_mean_squared_error',
      step=3, verbose=1)

In [20]:
selected_columns = X_train_scaled.columns[selector.support_]
removed_columns = X_train_scaled.columns[~selector.support_]

In [21]:
len(selected_columns)

191

In [22]:
lm_rfe = LinearRegression()

lm_rfe = lm_rfe.fit(X_train_scaled[selected_columns], y_train)

y_rfe=lm_rfe.predict(X_train_scaled[selected_columns])

trainRFE_rmse = np.sqrt(metrics.mean_squared_error(y_train, y_rfe))

print('Training Root Mean Squared Error:' , trainRFE_rmse)

y_pred_rfe = lm_rfe.predict(X_test_scaled[selected_columns])

testRFE_rmse = np.sqrt(metrics.mean_squared_error(y_test, y_pred_rfe))

print('Testing Root Mean Squared Error:' , testRFE_rmse)


print("Train RMSE", int(trainRFE_rmse), "Test RMSE: ", int(testRFE_rmse))

Training Root Mean Squared Error: 0.1721665513896679
Testing Root Mean Squared Error: 0.18094078197433625
Train RMSE 0 Test RMSE:  0


In [23]:
#Calculate sum of coefficients for this model

In [25]:
## training the model
from sklearn.linear_model import Lasso

lasso = Lasso(alpha=0.01, normalize=False)

lasso.fit(X_train_scaled,y_train)

y_train_pred_lasso = lasso.predict(X_train_scaled)
y_pred_lasso = lasso.predict(X_test_scaled)

train_rmse_lasso = metrics.mean_absolute_error(y_train, y_train_pred_lasso)
test_rmse_lasso = np.sqrt(metrics.mean_squared_error(y_test, y_pred_lasso))
print('Training Error: '+ str(train_rmse_lasso) )
print('Testing Error: '+ str(test_rmse_lasso) )



Training Error: 0.1570219176728787
Testing Error: 0.2124900450050747


In [26]:
from sklearn.linear_model import SGDRegressor


In [27]:
lasso = SGDRegressor(penalty='l1', alpha=0.01, early_stopping=True, verbose=1)

lasso.fit(X_train_scaled,y_train)




-- Epoch 1
Norm: 121446079527.76, NNZs: 221, Bias: -3644150769.197832, T: 14577, Avg. loss: 8878306384561901142016.000000
Total training time: 0.05 seconds.
-- Epoch 2
Norm: 142442617351.42, NNZs: 221, Bias: 483083307.407479, T: 29154, Avg. loss: 7717969029409954332672.000000
Total training time: 0.08 seconds.
-- Epoch 3
Norm: 67737371382.54, NNZs: 221, Bias: 1235269695.888133, T: 43731, Avg. loss: 6504499853399560290304.000000
Total training time: 0.13 seconds.
-- Epoch 4
Norm: 77772101419.64, NNZs: 221, Bias: 267444760.586261, T: 58308, Avg. loss: 3830581982302479319040.000000
Total training time: 0.17 seconds.
-- Epoch 5
Norm: 83910864909.54, NNZs: 221, Bias: -1426852446.347853, T: 72885, Avg. loss: 2818217575577080037376.000000
Total training time: 0.22 seconds.
-- Epoch 6
Norm: 86078807323.17, NNZs: 221, Bias: 1742099175.044543, T: 87462, Avg. loss: 2737209804175282536448.000000
Total training time: 0.31 seconds.
-- Epoch 7
Norm: 70043387688.41, NNZs: 221, Bias: -1375677931.861110

SGDRegressor(alpha=0.01, average=False, early_stopping=True, epsilon=0.1,
             eta0=0.01, fit_intercept=True, l1_ratio=0.15,
             learning_rate='invscaling', loss='squared_loss', max_iter=1000,
             n_iter_no_change=5, penalty='l1', power_t=0.25, random_state=None,
             shuffle=True, tol=0.001, validation_fraction=0.1, verbose=1,
             warm_start=False)

In [28]:
y_train_pred_lasso = lasso.predict(X_train_scaled)
y_pred_lasso = lasso.predict(X_test_scaled)

train_rmse_lasso = metrics.mean_absolute_error(y_train, y_train_pred_lasso)
test_rmse_lasso = np.sqrt(metrics.mean_squared_error(y_test, y_pred_lasso))
print('Training Error: '+ str(train_rmse_lasso) )
print('Testing Error: '+ str(test_rmse_lasso) )

Training Error: 8452581604.625785
Testing Error: 33680563738.496895


In [None]:
#Calculate sum of coefficients for this model

In [None]:
ridge = SGDRegressor(penalty='l2', alpha=0.01, early_stopping=True, verbose=1)

ridge.fit(X_train_scaled,y_train)



In [None]:
y_train_pred_ridge = ridge.predict(X_train_scaled)
y_pred_ridge = ridge.predict(X_test_scaled)

train_rmse_ridge = metrics.mean_absolute_error(y_train, y_train_pred_ridge)
test_rmse_ridge = np.sqrt(metrics.mean_squared_error(y_test, y_pred_ridge))
print('Training Error: '+ str(train_rmse_ridge) )
print('Testing Error: '+ str(test_rmse_ridge) )

In [None]:
#Calculate sum of coefficients for this model

In [None]:
lasso_coef01 = pd.DataFrame(data=lasso.coef_).T
lasso_coef01.columns = X_train_scaled.columns
lasso_coef01 = lasso_coef01.T.sort_values(by=0).T
#lasso_coef01.plot(kind='bar', title='Modal Coefficients', legend=False, figsize=(16,8))
lasso_coef01.T

We want to pick our best model, but this is more complicated than just choosing between linear regression, Lasso, or Ridge. We now have to also consider the different models that we get from different alpha values for Ridge and Lasso.


How do we determine the best model that will not overfit to the training data? 

___

## Cross Validation

Cross-validation is a statistical method used to protect against overfitting a predictive model, particularly in a case where the amount of data may be limited. In cross-validation, you make a fixed number of folds (or partitions) of the data, run the analysis on each fold, and then average the overall error estimate.

### Steps for K-fold cross-validation



1. Split the dataset into K **equal** partitions (or "folds").
2. Use fold 1 as the **testing set** and the union of the other folds as the **training set**.
3. Calculate **testing accuracy**.
4. Repeat steps 2 and 3 K times, using a **different fold** as the testing set each time.
5. Use the **average testing accuracy** as the estimate of out-of-sample accuracy.

Diagram of **10-fold cross-validation:**

<img src="https://miro.medium.com/max/1354/1*qPMFLEbvc8QQf38Cf77wQg.png">

In [None]:
# simulate splitting a dataset of 25 observations into 5 folds
from sklearn.model_selection import KFold
kf = KFold(n_splits=5, shuffle=False).split(range(25))

In [None]:
# print the contents of each training and testing set
print('{} {:^61} {}'.format('Iteration', 'Training set observations', 'Testing set observations'))
for iteration, data in enumerate(kf, start=1):
    print('{:^9} {} {:^25}'.format(iteration, data[0], str(data[1])))

- Dataset contains **25 observations** (numbered 0 through 24)
- 5-fold cross-validation, thus it runs for **5 iterations**
- For each iteration, every observation is either in the training set or the testing set, **but not both**
- Every observation is in the testing set **exactly once**

### Comparing cross-validation to train/test split



Advantages of **cross-validation:**

- More accurate estimate of out-of-sample accuracy
- More "efficient" use of data (every observation is used for both training and testing)

Advantages of **train/test split:**

- Runs K times faster than K-fold cross-validation
- Simpler to examine the detailed results of the testing process

### Cross-validation recommendations



1. K can be any number, but **K=10** is generally recommended
2. For classification problems, **stratified sampling** is recommended for creating the folds
    - Each response class should be represented with equal proportions in each of the K folds
    - scikit-learn's `cross_val_score` function does this by default

In [29]:
from sklearn.linear_model import LassoCV, RidgeCV

In [30]:
lassoCV_model = LassoCV(cv=5, random_state=0, verbose=1)
lassoCV_model.fit(X_train_scaled, y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    3.2s finished
  positive)


LassoCV(alphas=None, copy_X=True, cv=5, eps=0.001, fit_intercept=True,
        max_iter=1000, n_alphas=100, n_jobs=None, normalize=False,
        positive=False, precompute='auto', random_state=0, selection='cyclic',
        tol=0.0001, verbose=1)

In [31]:
lassoCV_model.alpha_

0.0003732575008002979

In [32]:
RidgeCV_model = RidgeCV(cv=5)
RidgeCV_model.fit(X_train_scaled, y_train)

RidgeCV(alphas=array([ 0.1,  1. , 10. ]), cv=5, fit_intercept=True,
        gcv_mode=None, normalize=False, scoring=None, store_cv_values=False)

In [33]:
RidgeCV_model.alpha_

10.0

Now that we have used cross validation to help us determine the best **alpha** for Ridge and Lasso, we can then use those fitted models to compare on our test set.  

## Improvements to cross-validation

**Repeated cross-validation**

- Repeat cross-validation multiple times (with **different random splits** of the data) and average the results
- More reliable estimate of out-of-sample performance by **reducing the variance** associated with a single trial of cross-validation

**Creating a hold-out set**

- "Hold out" a portion of the data **before** beginning the model building process
- Locate the best model using cross-validation on the remaining data, and test it **using the hold-out set**
- More reliable estimate of out-of-sample performance since hold-out set is **truly out-of-sample**

**Feature engineering and selection within cross-validation iterations**

- Normally, feature engineering and selection occurs **before** cross-validation
- Instead, perform all feature engineering and selection **within each cross-validation iteration**
- More reliable estimate of out-of-sample performance since it **better mimics** the application of the model to out-of-sample data


## Resources


- scikit-learn documentation: [Cross-validation](http://scikit-learn.org/stable/modules/cross_validation.html), [Model evaluation](http://scikit-learn.org/stable/modules/model_evaluation.html)
- scikit-learn issue on GitHub: [MSE is negative when returned by cross_val_score](https://github.com/scikit-learn/scikit-learn/issues/2439)
- Section 5.1 of [An Introduction to Statistical Learning](http://www-bcf.usc.edu/~gareth/ISL/) (11 pages) and related videos: [K-fold and leave-one-out cross-validation](https://www.youtube.com/watch?v=nZAM5OXrktY&list=PL5-da3qGB5IA6E6ZNXu7dp89_uv8yocmf) (14 minutes), [Cross-validation the right and wrong ways](https://www.youtube.com/watch?v=S06JpVoNaA0&list=PL5-da3qGB5IA6E6ZNXu7dp89_uv8yocmf) (10 minutes)
- Scott Fortmann-Roe: [Accurately Measuring Model Prediction Error](http://scott.fortmann-roe.com/docs/MeasuringError.html)
- Machine Learning Mastery: [An Introduction to Feature Selection](http://machinelearningmastery.com/an-introduction-to-feature-selection/)
- Harvard CS109: [Cross-Validation: The Right and Wrong Way](https://github.com/cs109/content/blob/master/lec_10_cross_val.ipynb)
- Journal of Cheminformatics: [Cross-validation pitfalls when selecting and assessing regression and classification models](http://www.jcheminf.com/content/pdf/1758-2946-6-10.pdf)