## Cross Validation 
- k-fold implememtation
- cross validation using sklearn
- using boston housing dataset

In [1]:
# previous steps
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.datasets import load_boston

In [2]:
boston = load_boston()
boston_features = pd.DataFrame(boston.data, columns = boston.feature_names)
b = boston_features['B']
logdis = np.log(boston_features["DIS"])
loglstat = np.log(boston_features["LSTAT"])

# minmax scaling
boston_features["B"] = (b-min(b))/(max(b)-min(b))
boston_features["DIS"] = (logdis-min(logdis))/(max(logdis)-min(logdis))

#standardization
boston_features["LSTAT"] = (loglstat-np.mean(loglstat))/np.sqrt(np.var(loglstat))

In [3]:
X = boston_features[['CHAS', 'RM', 'DIS', 'B', 'LSTAT']]
y = pd.DataFrame(boston.target,columns = ['target'])
type(X)

pandas.core.frame.DataFrame

In [16]:
X.head()

Unnamed: 0,CHAS,RM,DIS,B,LSTAT
0,0.0,6.575,0.542096,1.0,-1.27526
1,0.0,6.421,0.623954,1.0,-0.263711
2,0.0,7.185,0.623954,0.989737,-1.627858
3,0.0,6.998,0.707895,0.994276,-2.153192
4,0.0,7.147,0.707895,1.0,-1.162114


### train test split

In [4]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
print(len(X_train), len(X_test), len(y_train), len(y_test))

404 102 404 102


In [5]:
from sklearn.linear_model import LinearRegression
linreg = LinearRegression()
linreg.fit(X_train, y_train)
y_hat_test = linreg.predict(X_test)

In [8]:
from sklearn.metrics import mean_squared_error
test_residuals = y_hat_test - y_test

test_mse = mean_squared_error(y_hat_test, y_test)
test_mse

13.9176761883513

### cross-validation

In [12]:
def cross_validation(data, k):
    
    data = pd.DataFrame(data)
    size = len(data)
    fold_size = size // k
    left = size % k
    
    start = 0
    folds = []
    
    for fold_n in range(1, k + 1):
        if fold_n <= left:
            fold = data.iloc[start: start + fold_size + 1]
            start += fold_size + 1
            folds.append(fold)
        else:
            fold = data.iloc[start: start + fold_size]
            folds.append(fold)
            start += fold_size
            
    return folds

In [13]:
boston_data = pd.concat([X.reset_index(drop=True), y], axis=1)
boston_folds = cross_validation(boston_data, 5)

In [14]:
boston_folds

[     CHAS     RM       DIS         B     LSTAT  target
 0     0.0  6.575  0.542096  1.000000 -1.275260    24.0
 1     0.0  6.421  0.623954  1.000000 -0.263711    21.6
 2     0.0  7.185  0.623954  0.989737 -1.627858    34.7
 3     0.0  6.998  0.707895  0.994276 -2.153192    33.4
 4     0.0  7.147  0.707895  1.000000 -1.162114    36.2
 5     0.0  6.430  0.707895  0.992990 -1.200048    28.7
 6     0.0  6.012  0.671500  0.996722  0.248456    22.9
 7     0.0  6.172  0.700059  1.000000  0.968416    27.1
 8     0.0  5.631  0.709276  0.974104  1.712312    16.5
 9     0.0  6.004  0.743201  0.974305  0.779802    18.9
 10    0.0  6.377  0.727217  0.988956  1.077829    15.0
 11    0.0  6.009  0.719175  1.000000  0.357391    18.9
 12    0.0  5.889  0.663113  0.983862  0.638571    21.7
 13    0.0  5.949  0.601338  1.000000 -0.432353    20.4
 14    0.0  6.096  0.578763  0.957436 -0.071152    18.2
 15    0.0  5.834  0.582214  0.996772 -0.390531    19.9
 16    0.0  5.935  0.582214  0.974658 -0.811149 

In [18]:
train_err = []
test_err = []
k = 5

for n in range(k):
    
    train = pd.concat([fold for i, fold in enumerate(boston_folds) if i != n])
    test = boston_folds[n]
    
    linreg = LinearRegression()
    linreg.fit(train[X.columns], train[y.columns])
    
    y_hat_train = linreg.predict(train[X.columns])
    y_hat_test = linreg.predict(test[X.columns])
    
    train_residuals = y_hat_train - train[y.columns]
    test_residuals = y_hat_test - test[y.columns]
    
    train_err.append(np.mean(train_residuals.astype(float)**2))
    test_err.append(np.mean(test_residuals.astype(float)**2))
    
print(train_err)
print(test_err)

[target    24.195577
dtype: float64, target    23.032087
dtype: float64, target    19.745073
dtype: float64, target    15.317101
dtype: float64, target    22.329973
dtype: float64]
[target    13.405145
dtype: float64, target    17.444017
dtype: float64, target    37.032711
dtype: float64, target    58.279544
dtype: float64, target    26.097989
dtype: float64]


### cross validation - sklearn

In [19]:
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score

cv_5_res = cross_val_score(linreg, X, y, cv=5, scoring='neg_mean_squared_error')

In [20]:
cv_5_res

array([-13.40514492, -17.4440168 , -37.03271139, -58.27954385,
       -26.09798876])