# Cross-Validation with Ridge and Lasso

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import statsmodels.api as sm
from statsmodels.formula.api import ols
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 200)
#SK LEARN
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import train_test_split
import sklearn.metrics as metrics
from sklearn.metrics import mean_squared_error


We will euse the Kings County housing dataset in this notebook

In [2]:
df = pd.read_csv('https://raw.githubusercontent.com/learn-co-curriculum/dsc-mod-2-project-v2-1/master/kc_house_data.csv',index_col=0)

## Data Cleaning and Prep

In [3]:
df['date'] = pd.to_datetime(df['date'])

df['yr_sold']=  df['date'].map(lambda x: x.year)

df['yrs_old'] =  df['yr_built'].map(lambda x: 2016-x)

df['yr_since_reno'] =  df['yr_renovated'].map(lambda x: 2016-x if x > 0 else np.nan)

df['yrs_since_update'] = df.apply(lambda x: min(x['yrs_old'], x['yr_since_reno']), axis=1)

df['bedrooms']=df['bedrooms'].map(lambda x: x if x < 10 else 10)

df.replace('?', 0, inplace=True)

df['sqft_basement'] = pd.to_numeric(df['sqft_basement'])

df.fillna(0, inplace=True)

In [4]:
zip_df = pd.get_dummies(df['zipcode'], drop_first=True)


In [5]:
target= df.price

features = df.drop(columns=['date', 'price', 'lat', 'long', 'yr_built', 'yr_renovated', 'yr_since_reno', 'zipcode'])

In [6]:
features.head()

Unnamed: 0_level_0,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,sqft_living15,sqft_lot15,yr_sold,yrs_old,yrs_since_update
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
7129300520,3,1.0,1180,5650,1.0,0.0,0.0,3,7,1180,0.0,1340,5650,2014,61,61.0
6414100192,3,2.25,2570,7242,2.0,0.0,0.0,3,7,2170,400.0,1690,7639,2014,65,25.0
5631500400,2,1.0,770,10000,1.0,0.0,0.0,3,6,770,0.0,2720,8062,2015,83,83.0
2487200875,4,3.0,1960,5000,1.0,0.0,0.0,5,7,1050,910.0,1360,5000,2014,51,51.0
1954400510,3,2.0,1680,8080,1.0,0.0,0.0,3,8,1680,0.0,1800,7503,2015,29,29.0


In [7]:
polynomial_features_2= PolynomialFeatures(degree=2, include_bias=False)
features_poly = polynomial_features_2.fit_transform(features)
poly_columns = polynomial_features_2.get_feature_names(features.columns)

In [8]:
features_poly = pd.DataFrame(features_poly, columns=poly_columns)

In [9]:
zip_df.reset_index(drop=True, inplace=True)

In [10]:
features_poly = pd.merge(features_poly, zip_df, left_index=True, right_index=True)

In [11]:
features_poly.head()

Unnamed: 0,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,sqft_living15,sqft_lot15,yr_sold,yrs_old,yrs_since_update,bedrooms^2,bedrooms bathrooms,bedrooms sqft_living,bedrooms sqft_lot,bedrooms floors,bedrooms waterfront,bedrooms view,bedrooms condition,bedrooms grade,bedrooms sqft_above,bedrooms sqft_basement,bedrooms sqft_living15,bedrooms sqft_lot15,bedrooms yr_sold,bedrooms yrs_old,bedrooms yrs_since_update,bathrooms^2,bathrooms sqft_living,bathrooms sqft_lot,bathrooms floors,bathrooms waterfront,bathrooms view,bathrooms condition,bathrooms grade,bathrooms sqft_above,bathrooms sqft_basement,bathrooms sqft_living15,bathrooms sqft_lot15,bathrooms yr_sold,bathrooms yrs_old,bathrooms yrs_since_update,sqft_living^2,sqft_living sqft_lot,sqft_living floors,...,98032,98033,98034,98038,98039,98040,98042,98045,98052,98053,98055,98056,98058,98059,98065,98070,98072,98074,98075,98077,98092,98102,98103,98105,98106,98107,98108,98109,98112,98115,98116,98117,98118,98119,98122,98125,98126,98133,98136,98144,98146,98148,98155,98166,98168,98177,98178,98188,98198,98199
0,3.0,1.0,1180.0,5650.0,1.0,0.0,0.0,3.0,7.0,1180.0,0.0,1340.0,5650.0,2014.0,61.0,61.0,9.0,3.0,3540.0,16950.0,3.0,0.0,0.0,9.0,21.0,3540.0,0.0,4020.0,16950.0,6042.0,183.0,183.0,1.0,1180.0,5650.0,1.0,0.0,0.0,3.0,7.0,1180.0,0.0,1340.0,5650.0,2014.0,61.0,61.0,1392400.0,6667000.0,1180.0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
1,3.0,2.25,2570.0,7242.0,2.0,0.0,0.0,3.0,7.0,2170.0,400.0,1690.0,7639.0,2014.0,65.0,25.0,9.0,6.75,7710.0,21726.0,6.0,0.0,0.0,9.0,21.0,6510.0,1200.0,5070.0,22917.0,6042.0,195.0,75.0,5.0625,5782.5,16294.5,4.5,0.0,0.0,6.75,15.75,4882.5,900.0,3802.5,17187.75,4531.5,146.25,56.25,6604900.0,18611940.0,5140.0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,2.0,1.0,770.0,10000.0,1.0,0.0,0.0,3.0,6.0,770.0,0.0,2720.0,8062.0,2015.0,83.0,83.0,4.0,2.0,1540.0,20000.0,2.0,0.0,0.0,6.0,12.0,1540.0,0.0,5440.0,16124.0,4030.0,166.0,166.0,1.0,770.0,10000.0,1.0,0.0,0.0,3.0,6.0,770.0,0.0,2720.0,8062.0,2015.0,83.0,83.0,592900.0,7700000.0,770.0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,4.0,3.0,1960.0,5000.0,1.0,0.0,0.0,5.0,7.0,1050.0,910.0,1360.0,5000.0,2014.0,51.0,51.0,16.0,12.0,7840.0,20000.0,4.0,0.0,0.0,20.0,28.0,4200.0,3640.0,5440.0,20000.0,8056.0,204.0,204.0,9.0,5880.0,15000.0,3.0,0.0,0.0,15.0,21.0,3150.0,2730.0,4080.0,15000.0,6042.0,153.0,153.0,3841600.0,9800000.0,1960.0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
4,3.0,2.0,1680.0,8080.0,1.0,0.0,0.0,3.0,8.0,1680.0,0.0,1800.0,7503.0,2015.0,29.0,29.0,9.0,6.0,5040.0,24240.0,3.0,0.0,0.0,9.0,24.0,5040.0,0.0,5400.0,22509.0,6045.0,87.0,87.0,4.0,3360.0,16160.0,2.0,0.0,0.0,6.0,16.0,3360.0,0.0,3600.0,15006.0,4030.0,58.0,58.0,2822400.0,13574400.0,1680.0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [12]:
features_poly.shape

(21597, 221)

In [13]:
#call train_test_split on the data and capture the results
X_train, X_test, y_train, y_test = train_test_split(features_poly, target, random_state=22,test_size=0.25)


In [14]:
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled =pd.DataFrame(data=scaler.transform(X_train), columns=features_poly.columns)
X_test_scaled =pd.DataFrame(data=scaler.transform(X_test), columns=features_poly.columns)

In [15]:
X_train_scaled.head()

Unnamed: 0,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,sqft_living15,sqft_lot15,yr_sold,yrs_old,yrs_since_update,bedrooms^2,bedrooms bathrooms,bedrooms sqft_living,bedrooms sqft_lot,bedrooms floors,bedrooms waterfront,bedrooms view,bedrooms condition,bedrooms grade,bedrooms sqft_above,bedrooms sqft_basement,bedrooms sqft_living15,bedrooms sqft_lot15,bedrooms yr_sold,bedrooms yrs_old,bedrooms yrs_since_update,bathrooms^2,bathrooms sqft_living,bathrooms sqft_lot,bathrooms floors,bathrooms waterfront,bathrooms view,bathrooms condition,bathrooms grade,bathrooms sqft_above,bathrooms sqft_basement,bathrooms sqft_living15,bathrooms sqft_lot15,bathrooms yr_sold,bathrooms yrs_old,bathrooms yrs_since_update,sqft_living^2,sqft_living sqft_lot,sqft_living floors,...,98032,98033,98034,98038,98039,98040,98042,98045,98052,98053,98055,98056,98058,98059,98065,98070,98072,98074,98075,98077,98092,98102,98103,98105,98106,98107,98108,98109,98112,98115,98116,98117,98118,98119,98122,98125,98126,98133,98136,98144,98146,98148,98155,98166,98168,98177,98178,98188,98198,98199
0,0.698568,0.176627,-0.312662,-0.04565,-0.913867,-0.078021,-0.303802,-0.628011,-0.563268,-0.686584,0.654619,-0.356763,-0.139512,1.446652,-0.270957,-0.208109,0.57281,0.361423,-0.066559,0.003965,-0.457905,-0.073901,-0.29044,0.128589,0.195405,-0.36102,0.647588,0.007401,-0.082655,0.700065,0.005577,0.070817,0.000623,-0.219357,-0.043495,-0.538131,-0.072187,-0.278164,-0.141553,-0.127092,-0.422112,0.457068,-0.196322,-0.110202,0.17762,-0.00576,0.077292,-0.361541,-0.107329,-0.674778,...,-0.076811,-0.142645,-0.160346,-0.168078,-0.048494,-0.115712,-0.161154,-0.098296,-0.166323,-0.13714,-0.108658,-0.135967,6.870615,-0.151198,-0.118956,-0.075167,-0.11266,-0.141515,-0.133832,-0.094712,-0.127227,-0.072202,-0.171351,-0.102068,-0.123157,-0.112941,-0.094046,-0.069113,-0.111814,-0.169047,-0.123157,-0.165734,-0.153531,-0.094046,-0.118689,-0.139228,-0.126976,-0.154581,-0.112379,-0.131179,-0.118153,-0.051593,-0.147307,-0.107782,-0.1135,-0.108075,-0.107782,-0.076811,-0.112379,-0.123673
1,0.698568,0.502201,0.245442,0.002568,0.930263,-0.078021,-0.303802,0.912485,-0.563268,0.623003,-0.647824,-0.239606,-0.095542,-0.691251,0.170889,0.240206,0.57281,0.600686,0.351931,0.057481,1.164248,-0.073901,-0.29044,1.161826,0.195405,0.67497,-0.578532,0.095976,-0.032463,0.697861,0.498992,0.577371,0.322876,0.205261,0.031315,0.781819,-0.072187,-0.278164,1.031659,0.086992,0.444244,-0.537788,0.020671,-0.042172,0.501693,0.71102,0.848808,0.027284,-0.021193,0.600008,...,-0.076811,-0.142645,-0.160346,-0.168078,-0.048494,-0.115712,6.20523,-0.098296,-0.166323,-0.13714,-0.108658,-0.135967,-0.145547,-0.151198,-0.118956,-0.075167,-0.11266,-0.141515,-0.133832,-0.094712,-0.127227,-0.072202,-0.171351,-0.102068,-0.123157,-0.112941,-0.094046,-0.069113,-0.111814,-0.169047,-0.123157,-0.165734,-0.153531,-0.094046,-0.118689,-0.139228,-0.126976,-0.154581,-0.112379,-0.131179,-0.118153,-0.051593,-0.147307,-0.107782,-0.1135,-0.108075,-0.107782,-0.076811,-0.112379,-0.123673
2,1.808398,1.478923,0.98958,-0.193725,1.852328,-0.078021,-0.303802,-0.628011,1.146006,1.447558,-0.647824,-0.459276,-0.207244,1.446652,-0.916731,-1.173709,1.919739,2.096077,1.521244,-0.11263,2.98917,-0.073901,-0.29044,0.903517,2.041759,2.041901,-0.578532,0.392149,-0.083723,1.810273,-0.54477,-0.932552,1.493161,1.175287,-0.103571,2.281763,-0.072187,-0.278164,0.941412,1.524415,1.578723,-0.537788,0.320531,-0.069877,1.480377,-0.430678,-0.920598,0.696558,-0.126264,1.89294,...,-0.076811,-0.142645,-0.160346,-0.168078,-0.048494,-0.115712,-0.161154,-0.098296,-0.166323,-0.13714,-0.108658,-0.135967,-0.145547,-0.151198,-0.118956,-0.075167,-0.11266,-0.141515,-0.133832,-0.094712,-0.127227,-0.072202,-0.171351,-0.102068,-0.123157,-0.112941,-0.094046,-0.069113,-0.111814,5.915516,-0.123157,-0.165734,-0.153531,-0.094046,-0.118689,-0.139228,-0.126976,-0.154581,-0.112379,-0.131179,-0.118153,-0.051593,-0.147307,-0.107782,-0.1135,-0.108075,-0.107782,-0.076811,-0.112379,-0.123673
3,-0.411261,-1.451244,-1.067744,0.054619,-0.913867,-0.078021,-0.303802,-0.628011,-0.563268,-0.832094,-0.647824,-0.6643,0.225118,-0.691251,-0.236968,-0.173623,-0.474802,-1.074152,-0.85841,-0.001406,-0.863443,-0.073901,-0.29044,-0.64634,-0.564858,-0.739924,-0.578532,-0.648609,0.13394,-0.411796,-0.31704,-0.260392,-1.10182,-0.941021,-0.155357,-1.138109,-0.072187,-0.278164,-1.495258,-1.197513,-0.896373,-0.537788,-1.011327,-0.129178,-1.451473,-0.782631,-0.758903,-0.733254,-0.14208,-0.987804,...,-0.076811,-0.142645,-0.160346,-0.168078,-0.048494,-0.115712,-0.161154,-0.098296,-0.166323,-0.13714,-0.108658,-0.135967,-0.145547,-0.151198,-0.118956,-0.075167,-0.11266,-0.141515,-0.133832,-0.094712,-0.127227,-0.072202,-0.171351,-0.102068,-0.123157,-0.112941,-0.094046,-0.069113,-0.111814,-0.169047,-0.123157,-0.165734,-0.153531,-0.094046,-0.118689,-0.139228,-0.126976,-0.154581,-0.112379,-0.131179,-0.118153,-0.051593,-0.147307,-0.107782,-0.1135,-0.108075,-0.107782,-0.076811,-0.112379,-0.123673
4,-0.411261,-1.451244,-0.980198,-0.162072,-0.913867,-0.078021,-0.303802,-0.628011,-0.563268,-0.735088,-0.647824,-0.356763,-0.211734,1.446652,0.680711,0.757492,-0.474802,-1.074152,-0.809175,-0.181779,-0.863443,-0.073901,-0.29044,-0.64634,-0.564858,-0.682369,-0.578532,-0.474226,-0.240066,-0.410144,0.451548,0.528664,-1.10182,-0.9213,-0.236174,-1.138109,-0.072187,-0.278164,-1.495258,-1.197513,-0.873309,-0.537788,-0.939565,-0.282643,-1.450827,-0.319084,-0.259958,-0.699253,-0.211841,-0.951511,...,-0.076811,-0.142645,-0.160346,-0.168078,-0.048494,-0.115712,-0.161154,-0.098296,-0.166323,-0.13714,-0.108658,-0.135967,-0.145547,-0.151198,-0.118956,-0.075167,-0.11266,-0.141515,-0.133832,-0.094712,-0.127227,-0.072202,-0.171351,-0.102068,-0.123157,-0.112941,-0.094046,-0.069113,-0.111814,-0.169047,-0.123157,-0.165734,-0.153531,-0.094046,-0.118689,7.182455,-0.126976,-0.154581,-0.112379,-0.131179,-0.118153,-0.051593,-0.147307,-0.107782,-0.1135,-0.108075,-0.107782,-0.076811,-0.112379,-0.123673


### Fit A linear regression model

In [16]:
#instantiate a linear regression object
lm = LinearRegression()

#fit the linear regression to the data
lm = lm.fit(X_train_scaled, y_train)


print(lm.intercept_)
print(lm.coef_)

539494.636908079
[ 1.13793508e+07 -7.41928182e+05  6.22339391e+07 -2.29188888e+06
 -8.24961903e+06 -1.53669862e+07 -4.03607873e+06  9.19128116e+06
 -2.08747356e+07 -7.11163789e+07 -4.90075231e+07 -7.32850472e+05
 -6.15071294e+06 -1.08388902e+04 -2.59300526e+07  6.61301427e+05
  1.96501248e+04 -6.31979763e+03 -6.49225001e+04  3.57506428e+04
  4.43281688e+04 -8.89753009e+03 -2.06618551e+03 -1.09557563e+02
  3.28850009e+04 -9.33092207e+04 -2.63402540e+04  9.05429802e+03
 -2.01521186e+04 -1.13843338e+07 -3.32848757e+04  3.23352446e+04
 -6.70011907e+04  1.73212408e+05 -1.24505074e+04 -2.58125792e+04
  2.62001921e+04 -1.84026037e+04 -1.84196791e+04  4.54463961e+04
 -1.03645133e+04 -1.88194686e+04  1.16023638e+04 -2.15942307e+03
  7.67545773e+05 -3.98545880e+04  1.86555253e+04 -2.27441325e+05
 -6.65397042e+04 -2.79136762e+05  4.01344018e+04 -9.18158499e+04
 -2.11668860e+04  1.85589373e+05  2.56384648e+05  2.64245812e+04
  1.52543267e+05  2.72548762e+04 -6.21852461e+07  2.21372204e+05
 -1.9203

In [17]:
y_train_pred = lm.predict(X_train_scaled)
y_pred = lm.predict(X_test_scaled)

In [18]:
train_rmse = np.sqrt(metrics.mean_squared_error(y_train, y_train_pred))
test_rmse = np.sqrt(metrics.mean_squared_error(y_test, y_pred))


print('Training RMSE:', int(train_rmse) )
print('Test RMSE:',  int(test_rmse))


Training RMSE: 128139
Test RMSE: 138409


### Use Recursive Feature Elimiation to select certain features

In [19]:
from sklearn.feature_selection import RFECV
ols = LinearRegression()

In [20]:
#recursive wrapper method

# Create recursive feature eliminator that scores features by mean squared errors
selector = RFECV(estimator=ols, step=2, cv=5, scoring='neg_mean_squared_error', verbose =2, n_jobs=-1)

# Fit recursive feature eliminator 
selector.fit(X_train_scaled, y_train)



Fitting estimator with 221 features.
Fitting estimator with 219 features.
Fitting estimator with 217 features.
Fitting estimator with 215 features.
Fitting estimator with 213 features.
Fitting estimator with 211 features.
Fitting estimator with 209 features.


RFECV(cv=5,
      estimator=LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
                                 normalize=False),
      min_features_to_select=1, n_jobs=-1, scoring='neg_mean_squared_error',
      step=2, verbose=2)

In [21]:
selected_columns = X_train_scaled.columns[selector.support_]
removed_columns = X_train_scaled.columns[~selector.support_]

In [22]:
len(selected_columns)

207

### Refit linear model with only selected features

In [23]:
lm_rfe = LinearRegression()

lm_rfe = lm_rfe.fit(X_train_scaled[selected_columns], y_train)

y_rfe=lm_rfe.predict(X_train_scaled[selected_columns])

trainRFE_rmse = np.sqrt(metrics.mean_squared_error(y_train, y_rfe))


y_pred_rfe = lm_rfe.predict(X_test_scaled[selected_columns])

testRFE_rmse = np.sqrt(metrics.mean_squared_error(y_test, y_pred_rfe))



print("Train RMSE", int(trainRFE_rmse), "Test RMSE: ", int(testRFE_rmse))

Train RMSE 128163 Test RMSE:  138408


### Train a Lasso model with a alpha of 0.01

In [24]:
## training the model
from sklearn.linear_model import Lasso

lasso = Lasso(alpha=0.01, normalize=False)

lasso.fit(X_train_scaled,y_train)

y_train_pred_lasso = lasso.predict(X_train_scaled)
y_pred_lasso = lasso.predict(X_test_scaled)

train_rmse_lasso = metrics.mean_absolute_error(y_train, y_train_pred_lasso)
test_rmse_lasso = np.sqrt(metrics.mean_squared_error(y_test, y_pred_lasso))


print("Train RMSE", int(train_rmse_lasso), "Test RMSE: ", int(test_rmse_lasso))

Train RMSE 77996 Test RMSE:  137318


  positive)


In [25]:
lasso.coef_

array([ 4.32082628e+04,  1.01202725e+05,  2.03469429e+04,  2.06904903e+04,
       -3.62491076e+04,  1.09931341e+05,  4.65238323e+04,  1.21071154e+04,
        2.90312748e+04,  2.71232653e+04,  2.25019020e+04, -1.89764276e+04,
        4.22777795e+03,  1.15142977e+04, -5.13655007e+04, -2.37858769e+04,
        1.94224927e+04, -4.08695445e+03, -1.32707245e+05,  3.36485256e+04,
        4.27377765e+04, -1.01683415e+04, -3.82441830e+03, -8.88787770e+02,
        3.52295281e+04, -3.29382209e+04, -2.73570358e+02,  7.47990564e+03,
       -1.87978518e+04, -4.98633886e+04, -2.99598096e+04,  3.07463887e+04,
       -7.01974768e+04,  1.28819447e+05, -9.18955435e+03, -2.78727002e+04,
        3.42141036e+04, -2.12264924e+04, -1.67855031e+04,  6.83266096e+04,
        2.86902876e+04, -3.69042896e+03,  2.03982488e+03, -1.57011622e+03,
       -8.65544888e+04, -3.51369204e+04,  1.42418802e+04,  5.10556373e+04,
       -3.72026310e+04, -7.33850807e+04, -1.28329707e+04,  7.21160247e+04,
        2.12605570e+04,  

### Fit a Ridge Model 

In [26]:
## training the model
from sklearn.linear_model import Ridge

In [27]:
ridge = Ridge(alpha=0.5, normalize=False)

ridge.fit(X_train_scaled,y_train)

y_train_pred_ridge = ridge.predict(X_train_scaled)
y_pred_ridge = ridge.predict(X_test_scaled)

train_rmse_ridge = metrics.mean_absolute_error(y_train, y_train_pred_ridge)
test_rmse_ridge = np.sqrt(metrics.mean_squared_error(y_test, y_pred_ridge))


print("Train RMSE", int(train_rmse_ridge), "Test RMSE: ", int(test_rmse_ridge))

Train RMSE 77920 Test RMSE:  137170


We want to pick our best model, but this is more complicated than just choosing between linear regression, Lasso, or Ridge. We now have to also consider the different models that we get from different alpha values for Ridge and Lasso.


How do we determine the best model that will not overfit to the training data? 

___


<img src = "./resources/train_test_valid.png">

## Cross Validation

Cross-validation is a statistical method used to protect against overfitting a predictive model, particularly in a case where the amount of data may be limited. In cross-validation, you make a fixed number of folds (or partitions) of the data, run the analysis on each fold, and then average the overall error estimate.

### Steps for K-fold cross-validation



1. Split the dataset into K **equal** partitions (or "folds").
2. Use fold 1 as the **testing set** and the union of the other folds as the **training set**.
3. Calculate **testing accuracy**.
4. Repeat steps 2 and 3 K times, using a **different fold** as the testing set each time.
5. Use the **average testing accuracy** as the estimate of out-of-sample accuracy.

Diagram of **10-fold cross-validation:**

<img src="https://miro.medium.com/max/1354/1*qPMFLEbvc8QQf38Cf77wQg.png">

In [28]:
# simulate splitting a dataset of 25 observations into 5 folds
from sklearn.model_selection import KFold
kf = KFold(n_splits=5, shuffle=False).split(range(25))

In [29]:
# print the contents of each training and testing set
print('{} {:^61} {}'.format('Iteration', 'Training set observations', 'Validation set observations'))
for iteration, data in enumerate(kf, start=1):
    print('{:^9} {} {:^25}'.format(iteration, data[0], str(data[1])))

Iteration                   Training set observations                   Validation set observations
    1     [ 5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24]        [0 1 2 3 4]       
    2     [ 0  1  2  3  4 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24]        [5 6 7 8 9]       
    3     [ 0  1  2  3  4  5  6  7  8  9 15 16 17 18 19 20 21 22 23 24]     [10 11 12 13 14]     
    4     [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 20 21 22 23 24]     [15 16 17 18 19]     
    5     [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19]     [20 21 22 23 24]     


- Dataset contains **25 observations** (numbered 0 through 24)
- 5-fold cross-validation, thus it runs for **5 iterations**
- For each iteration, every observation is either in the training set or the testing set, **but not both**
- Every observation is in the testing set **exactly once**

### Comparing cross-validation to train/test split



Advantages of **cross-validation:**

- More accurate estimate of out-of-sample accuracy
- More "efficient" use of data (every observation is used for both training and testing)

Advantages of **train/test split:**

- Runs K times faster than K-fold cross-validation
- Simpler to examine the detailed results of the testing process

### Cross-validation recommendations



1. K can be any number, but **K=10** is generally recommended
2. For classification problems, **stratified sampling** is recommended for creating the folds
    - Each response class should be represented with equal proportions in each of the K folds
    - scikit-learn's `cross_val_score` function does this by default

### Determinging the right Alphas 

In [30]:
from sklearn.linear_model import LassoCV, RidgeCV

#### LassoCV model

In [31]:
lassoCV_model = LassoCV(alphas = [50,100, 150, 200,250],cv=5, random_state=0, verbose=1, n_jobs=-1, max_iter=50000)
lassoCV_model.fit(X_train_scaled, y_train)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
.....................[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    1.6s remaining:    2.4s
  tol, rng, random, positive)
  tol, rng, random, positive)
.[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    2.8s finished


LassoCV(alphas=[50, 100, 150, 200, 250], copy_X=True, cv=5, eps=0.001,
        fit_intercept=True, max_iter=50000, n_alphas=100, n_jobs=-1,
        normalize=False, positive=False, precompute='auto', random_state=0,
        selection='cyclic', tol=0.0001, verbose=1)

In [32]:
lassoCV_model.alpha_

100

In [33]:
lassoCV_model.alphas_

array([250, 200, 150, 100,  50])

In [34]:

y_train_pred_lassocv = lassoCV_model.predict(X_train_scaled)
y_pred_lassocv = lassoCV_model.predict(X_test_scaled)

train_rmse_lassocv = metrics.mean_absolute_error(y_train, y_train_pred_lassocv)
test_rmse_lassocv = np.sqrt(metrics.mean_squared_error(y_test, y_pred_lassocv))


print("Train RMSE", int(train_rmse_lassocv), "Test RMSE: ", int(test_rmse_lassocv))

Train RMSE 78089 Test RMSE:  136804


In [35]:
lassoCV_model.coef_

array([ 0.00000000e+00, -0.00000000e+00, -6.85261560e+04,  6.09560649e+03,
       -0.00000000e+00,  0.00000000e+00, -2.31331094e+04, -1.27528291e+04,
       -0.00000000e+00, -0.00000000e+00, -2.86734854e+04, -0.00000000e+00,
        2.26722974e+04,  3.13629124e+03, -1.27242338e+05, -1.81029659e+04,
        1.07670434e+04, -0.00000000e+00, -1.22657895e+05,  2.17626876e+04,
        2.94504509e+04, -3.31945974e+03, -8.29100269e+03,  1.73686545e+03,
        2.63129167e+04, -1.21679219e+04,  0.00000000e+00,  0.00000000e+00,
       -7.08262063e+03,  9.49739105e+02, -0.00000000e+00,  2.39017591e+03,
       -5.26064862e+04,  6.73502567e+04, -0.00000000e+00, -1.65958804e+04,
        3.16621933e+04, -1.49116951e+04, -0.00000000e+00,  4.21732541e+04,
        5.21912423e+04,  5.98152846e+03,  1.01174878e+04, -4.05792846e+03,
       -0.00000000e+00, -1.98275715e+04, -0.00000000e+00,  0.00000000e+00,
       -6.75036035e+04, -0.00000000e+00,  1.74434653e+03, -1.77984059e+04,
        1.91059724e+04,  

#### RidgeCV model

In [36]:
RidgeCV_model = RidgeCV(alphas = [0.01,0.05,0.1,0.05,1,5,10,50,100, 200,250,300], cv=5 )
RidgeCV_model.fit(X_train_scaled, y_train)

RidgeCV(alphas=array([1.0e-02, 5.0e-02, 1.0e-01, 5.0e-02, 1.0e+00, 5.0e+00, 1.0e+01,
       5.0e+01, 1.0e+02, 2.0e+02, 2.5e+02, 3.0e+02]),
        cv=5, fit_intercept=True, gcv_mode=None, normalize=False, scoring=None,
        store_cv_values=False)

In [37]:
RidgeCV_model.alpha_

10.0

Now that we have used cross validation to help us determine the best **alpha** for Ridge and Lasso, we can then use those fitted models to compare on our test set.  

In [38]:

y_train_pred_ridgecv = RidgeCV_model.predict(X_train_scaled)
y_pred_ridgecv = RidgeCV_model.predict(X_test_scaled)

train_rmse_ridgecv = metrics.mean_absolute_error(y_train, y_train_pred_ridgecv)
test_rmse_ridgecv = np.sqrt(metrics.mean_squared_error(y_test, y_pred_ridgecv))


print("Train RMSE", int(train_rmse_ridgecv), "Test RMSE: ", int(test_rmse_ridgecv))

Train RMSE 77898 Test RMSE:  136918


## Improvements to cross-validation

**Repeated cross-validation**

- Repeat cross-validation multiple times (with **different random splits** of the data) and average the results
- More reliable estimate of out-of-sample performance by **reducing the variance** associated with a single trial of cross-validation

**Creating a hold-out set**

- "Hold out" a portion of the data **before** beginning the model building process
- Locate the best model using cross-validation on the remaining data, and test it **using the hold-out set**
- More reliable estimate of out-of-sample performance since hold-out set is **truly out-of-sample**

**Feature engineering and selection within cross-validation iterations**

- Normally, feature engineering and selection occurs **before** cross-validation
- Instead, perform all feature engineering and selection **within each cross-validation iteration**
- More reliable estimate of out-of-sample performance since it **better mimics** the application of the model to out-of-sample data


## Resources


- scikit-learn documentation: [Cross-validation](http://scikit-learn.org/stable/modules/cross_validation.html), [Model evaluation](http://scikit-learn.org/stable/modules/model_evaluation.html)
- scikit-learn issue on GitHub: [MSE is negative when returned by cross_val_score](https://github.com/scikit-learn/scikit-learn/issues/2439)
- Section 5.1 of [An Introduction to Statistical Learning](http://www-bcf.usc.edu/~gareth/ISL/) (11 pages) and related videos: [K-fold and leave-one-out cross-validation](https://www.youtube.com/watch?v=nZAM5OXrktY&list=PL5-da3qGB5IA6E6ZNXu7dp89_uv8yocmf) (14 minutes), [Cross-validation the right and wrong ways](https://www.youtube.com/watch?v=S06JpVoNaA0&list=PL5-da3qGB5IA6E6ZNXu7dp89_uv8yocmf) (10 minutes)
- Scott Fortmann-Roe: [Accurately Measuring Model Prediction Error](http://scott.fortmann-roe.com/docs/MeasuringError.html)
- Machine Learning Mastery: [An Introduction to Feature Selection](http://machinelearningmastery.com/an-introduction-to-feature-selection/)
- Harvard CS109: [Cross-Validation: The Right and Wrong Way](https://github.com/cs109/content/blob/master/lec_10_cross_val.ipynb)
- Journal of Cheminformatics: [Cross-validation pitfalls when selecting and assessing regression and classification models](http://www.jcheminf.com/content/pdf/1758-2946-6-10.pdf)