# Basics of cross validation
Importing the required libraries 

In [1]:
import numpy as np
import statsmodels.api as sm
from ISLP import load_data
from ISLP.models import (ModelSpec as MS,
                         summarize,
                         poly)
from sklearn.model_selection import train_test_split

We also import some additional packages required for this lab:

In [2]:
from functools import partial
from sklearn.model_selection import \
     (cross_validate,
      KFold,
      ShuffleSplit)
from sklearn.base import clone
from ISLP.models import sklearn_sm

# Validation Set Approach
Let's repeat the mpg/horsepower example 

In [10]:
# Load the Auto data set and print it for a quick look.
Auto = load_data('Auto')
print(Auto)

# Hold out 196 rows as the validation set; random_state=0 pins the shuffle
# so the same train/validation split is produced on every run.
Auto_train, Auto_valid = train_test_split(
    Auto, test_size=196, random_state=0
)

      mpg  cylinders  displacement  horsepower  weight  acceleration  year  \
0    18.0          8         307.0         130    3504          12.0    70   
1    15.0          8         350.0         165    3693          11.5    70   
2    18.0          8         318.0         150    3436          11.0    70   
3    16.0          8         304.0         150    3433          12.0    70   
4    17.0          8         302.0         140    3449          10.5    70   
..    ...        ...           ...         ...     ...           ...   ...   
387  27.0          4         140.0          86    2790          15.6    82   
388  44.0          4          97.0          52    2130          24.6    82   
389  32.0          4         135.0          84    2295          11.6    82   
390  28.0          4         120.0          79    2625          18.6    82   
391  31.0          4         119.0          82    2720          19.4    82   

     origin                       name  
0         1  chevrolet

Let's define a function to evaluate the MSE for different models

In [4]:
def evalMSE(terms, response, train, test):
    """Fit OLS on `train` and return the mean squared error on `test`.

    Parameters
    ----------
    terms : model terms passed to `MS` to build the design matrix.
    response : name of the response column in `train` and `test`.
    train : data used to fit the model specification and the regression.
    test : data on which predictions are evaluated.

    Returns
    -------
    Mean of the squared differences between `test[response]` and the
    fitted model's predictions for `test`.
    """
    spec = MS(terms)

    # The specification is fitted on the training data only and then
    # re-applied unchanged to the test data.
    design_train = spec.fit_transform(train)
    design_test = spec.transform(test)

    fitted = sm.OLS(train[response], design_train).fit()
    residuals = test[response] - fitted.predict(design_test)

    return np.mean(residuals**2)

In [5]:
# Five slots: one validation-set MSE for each polynomial degree 1 through 5.
MSE = np.zeros(5)
for degree in range(1, 6):
    idx = degree - 1  # degree d is stored at position d - 1
    MSE[idx] = evalMSE(
        [poly('horsepower', degree)], 'mpg', Auto_train, Auto_valid
    )
MSE

array([23.61661707, 18.76303135, 18.79694163, 18.77852784, 18.44907863])

# Performing LOOCV

In [6]:
# OLS estimator whose model specification lists only the 'horsepower' term.
# It is not used in the rest of this cell; it is kept because code outside
# this view may rely on it.
hp_model = sklearn_sm(sm.OLS,
                      MS(['horsepower']))

# The response is 'mpg'. No design matrix is built here: X is constructed
# from horsepower alone, once per polynomial degree, inside the loop below.
Y = Auto['mpg']
H = np.array(Auto['horsepower'])
M = sklearn_sm(sm.OLS)

# One cross-validated error per polynomial degree 1 through 5.
cv_error = np.zeros(5)

for i, d in enumerate(range(1,6)):
    # Columns are H**0, H**1, ..., H**d; the H**0 column is all ones.
    X = np.power.outer(H, np.arange(d+1))
    print(X.shape)
    # cv equals the number of rows, so every fold holds out one observation.
    M_CV = cross_validate(M,
                          X,
                          Y,
                          cv=Auto.shape[0])
    cv_error[i] = np.mean(M_CV['test_score'])
cv_error


(392, 2)
(392, 3)
(392, 4)
(392, 5)
(392, 6)


array([24.23151352, 19.24821312, 19.33498406, 19.42443029, 19.03320648])

Note that the `cv` argument is set to the number of samples, so each fold holds out exactly one observation: this is leave-one-out cross-validation (LOOCV).

# Performing K-Fold CV

In [7]:
# A single 10-fold splitter, shuffled with a fixed seed, is shared by every
# degree so that all models are scored on the same splits.
cv = KFold(n_splits=10, shuffle=True, random_state=0)

# One cross-validated error per polynomial degree 1 through 5.
cv_error = np.zeros(5)
for d in range(1, 6):
    i = d - 1  # degree d is stored at position d - 1
    # Columns are H**0, H**1, ..., H**d.
    X = np.power.outer(H, np.arange(d + 1))
    print(X.shape[1])
    M_CV = cross_validate(M, X, Y, cv=cv)
    cv_error[i] = M_CV['test_score'].mean()
cv_error

2
3
4
5
6


array([24.20766449, 19.18533142, 19.27626666, 19.47848403, 19.13720065])