# Coding cross validation yourself

Now it is up to you to demonstrate your knowledge on the absenteeism dataset.

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd


# Fix NumPy's global random seed so any NumPy-based randomness is reproducible.
np.random.seed(40)

# The file is semicolon-separated; its first column is used as the row index.
data = pd.read_csv("absent.csv", sep=';', index_col=0)

# Separate the regression target from the explanatory variables.
y = data['Absenteeism time in hours']
X = data.drop('Absenteeism time in hours', axis=1)

# head is a method: it has to be called, otherwise the bound-method repr is
# printed instead of the first rows of the table.
print(data.head())

## Applying cross validation

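Apply cross-validation to this regression: combine a normalizer and the regression model in a single pipeline so that all variables are normalized, and calculate the mean absolute error, mean squared error, and R-squared value for every fold (return only the test-set scores; note that scikit-learn reports the two error metrics as negated values):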

In [None]:
from sklearn.linear_model import LinearRegression

# NOTE: despite its name this is a regression model. The name is kept because
# apply_cv() reads it as a module-level global.
classifier = LinearRegression()


def apply_cv(X, y, nFold):
    """Cross-validate a Normalizer + ``classifier`` pipeline on (X, y).

    Parameters
    ----------
    X : feature table passed unchanged to ``cross_validate``.
    y : target values passed unchanged to ``cross_validate``.
    nFold : value forwarded as ``cv`` (the validation cell passes the number
        of folds as an int).

    Returns
    -------
    The object returned by ``cross_validate``. Test-set scores are stored
    under ``'test_neg_mean_absolute_error'``, ``'test_neg_mean_squared_error'``
    and ``'test_r2'``; train scores are not computed. The fitted estimators
    are requested as well (``return_estimator=True``).
    """
    # Import the required code
    from sklearn.model_selection import cross_validate
    from sklearn.pipeline import make_pipeline
    from sklearn.preprocessing import Normalizer

    # Putting the normalizer inside the pipeline means it is applied as part
    # of every fold's fit/predict instead of once on the full data set.
    # NOTE(review): per the scikit-learn docs, Normalizer rescales each sample
    # (row) to unit norm rather than each column; it is kept because the
    # expected values in the validation cell were produced with it.
    pipeline = make_pipeline(Normalizer(), classifier)

    # The error metrics are the negated variants, so larger is always better.
    metrics = ['neg_mean_absolute_error', 'neg_mean_squared_error', 'r2']

    # Use metrics and pipeline for cross validation
    return cross_validate(
        pipeline,
        X,
        y,
        scoring=metrics,
        cv=nFold,
        return_train_score=False,
        return_estimator=True,
    )

Validate your result:

In [None]:
# Run each cross-validation once and reuse the result for all three metric
# checks; the original refitted every fold configuration three times.
results_10_fold = apply_cv(X, y, 10)
results_5_fold = apply_cv(X, y, 5)

# --- 10 folds ---
assert np.allclose(results_10_fold['test_neg_mean_absolute_error'],[-5.44169706, -4.88533949, -6.03776489, -6.53316679, -6.97119581,
       -5.6121378 , -6.20582349, -5.11806841, -5.58925382, -9.20383222])

assert np.allclose(results_10_fold['test_neg_mean_squared_error'],[ -57.35091985, -105.79371931,  -93.71342154, -120.49204514,
       -209.8988054 , -179.36848107, -180.39558093,  -90.82555524,
       -260.11583086, -309.20461226])

assert np.allclose(results_10_fold['test_r2'],[-0.11923377,  0.12709059,  0.09897902,  0.05605541,  0.1482223 ,
        0.07964105,  0.02008328,  0.14330421,  0.13328541,  0.06357028])

# --- 5 folds ---
assert np.allclose(results_5_fold['test_neg_mean_absolute_error'],[-5.15813595, -6.30815503, -6.4512004 , -5.66281764, -7.53386985])

assert np.allclose(results_5_fold['test_neg_mean_squared_error'],[ -82.17300336, -112.00147052, -196.11580591, -135.58694837,
       -299.31974383])

assert np.allclose(results_5_fold['test_r2'],[0.05194651, 0.03335425, 0.11121968, 0.06534224, 0.052034  ])

## Cross validation with prediction

Calculate the root mean squared error of the cross-validated predictions, i.e. every observation is predicted by the model at the moment that observation was part of the test set:

In [None]:
from sklearn.linear_model import LinearRegression

# NOTE: despite its name this is a regression model. The name is kept because
# calculate_rmse() reads it as a module-level global.
classifier = LinearRegression()


def calculate_rmse(X, y, nFold):
    """Return the RMSE of cross-validated predictions for (X, y).

    A Normalizer + ``classifier`` pipeline is evaluated with
    ``cross_val_predict``, which yields one prediction per observation, made
    while that observation was in the test set. The root mean squared error
    is then computed once over all of these predictions.

    Parameters
    ----------
    X : feature table passed unchanged to ``cross_val_predict``.
    y : target values, also used as the ground truth for the error.
    nFold : value forwarded as ``cv`` (the validation cell passes ints,
        including ``len(X)``).

    Returns
    -------
    float
        Square root of the mean squared error between ``y`` and the
        cross-validated predictions.
    """
    # Import the necessary code
    from math import sqrt

    from sklearn.metrics import mean_squared_error
    from sklearn.model_selection import cross_val_predict
    from sklearn.pipeline import make_pipeline
    from sklearn.preprocessing import Normalizer

    # Putting the normalizer inside the pipeline means it is applied as part
    # of every fold's fit/predict instead of once on the full data set.
    pipeline = make_pipeline(Normalizer(), classifier)

    # Make cross-validated predictions
    predictions = cross_val_predict(pipeline, X, y, cv=nFold)

    # Calculate the RMSE over all predictions at once (not a per-fold average).
    return sqrt(mean_squared_error(y, predictions))

Validate your result:

In [9]:
# (number of folds, expected RMSE); the last entry uses one fold per observation.
expected_rmse_per_setting = [
    (10, 12.677377376954103),
    (5, 12.846765912010982),
    (len(X), 12.679949293998812),
]

# Check the settings in the listed order; the first mismatch raises.
for fold_count, expected_rmse in expected_rmse_per_setting:
    assert np.allclose(calculate_rmse(X, y, fold_count), expected_rmse)