# Lab 5.1 - Crossvalidation 


In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.metrics import mean_squared_error, mean_absolute_error, make_scorer
from sklearn.model_selection import KFold, LeaveOneOut

# Let pandas display up to 500 columns before truncating wide DataFrames
pd.set_option('display.max_columns', 500)

# IPython magic: render matplotlib figures inline, below the cell that draws them
%matplotlib inline

# 1. Loading a data set and building a model

In [2]:
# Load the reduced footballer data set from disk and preview the first rows
model_data = pd.read_csv(filepath_or_buffer='footballer_reduced.csv')
model_data.head(5)

Unnamed: 0,age,height_cm,weight_kg,work_rate_att,overall
0,20,175,70,Medium,58
1,29,183,80,High,65
2,35,183,78,High,67
3,24,178,72,Medium,69
4,23,173,73,Medium,70


In [3]:
# Count how many players fall into each attacking work-rate category
model_data['work_rate_att'].value_counts()

Medium    266
High       71
Low        23
Name: work_rate_att, dtype: int64

In [4]:
# One-hot encode the categorical column(s); drop_first=True drops the first
# level of each category so it acts as the baseline
model_data = pd.get_dummies(data=model_data, drop_first=True)
model_data.head(5)

Unnamed: 0,age,height_cm,weight_kg,overall,work_rate_att_Low,work_rate_att_Medium
0,20,175,70,58,0,1
1,29,183,80,65,0,0
2,35,183,78,67,0,0
3,24,178,72,69,0,1
4,23,173,73,70,0,1


In [None]:
# Even though we are not using seaborn we can style the plots with it
sns.set_style("darkgrid")

# Scatter plot of a predictor (age) against the target (overall rating),
# drawn with the pandas plotting API; the returned Axes is kept so that
# the title and axis labels can be set explicitly.
ax = model_data.plot.scatter(x="age", y="overall")
ax.set(title="Overall rating vs. age", xlabel="age", ylabel="overall")

plt.show()

In [5]:
# Separate the target (overall rating) from the predictor columns
y = model_data['overall']
X = model_data.drop(columns='overall')

## 2. Simple train-test split

In [7]:
# Hold out 20% of the rows as the test set; the remaining 80% is used for
# training and validation. Fixing random_state keeps the split identical
# every time the cell is run, so different models see the same rows.
Xtrain, Xtest, ytrain, ytest = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=0,
)
print(Xtrain.shape, Xtest.shape)

(288, 5) (72, 5)


In [8]:
# Fit a linear regression on the training split and report the mean squared
# error on both the training split and the held-out test split
linmodel = LinearRegression().fit(Xtrain, ytrain)
trainloss = mean_squared_error(ytrain, linmodel.predict(Xtrain))
print(f"Training loss: {trainloss:.3f}")
testloss = mean_squared_error(ytest, linmodel.predict(Xtest))
# This value is computed on Xtest/ytest, so it is labelled as the test loss
print(f"Test loss: {testloss:.3f}")

Training loss: 33.230
Training loss:40.952


## 3. Crossvalidation: define the partitions
For more details, see: https://scikit-learn.org/stable/modules/cross_validation.html#cross-validation

In [9]:
# Illustrate K-fold partitioning on 20 toy indices: with 5 splits and no
# shuffling, each consecutive chunk is used once as the test set
x = np.arange(20)
kf = KFold(n_splits=5)
for train, test in kf.split(x):
    fold_indices = (train, test)
    print("Train set: %s, Test set: %s" % fold_indices)


Train set: [ 4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19], Test set: [0 1 2 3]
Train set: [ 0  1  2  3  8  9 10 11 12 13 14 15 16 17 18 19], Test set: [4 5 6 7]
Train set: [ 0  1  2  3  4  5  6  7 12 13 14 15 16 17 18 19], Test set: [ 8  9 10 11]
Train set: [ 0  1  2  3  4  5  6  7  8  9 10 11 16 17 18 19], Test set: [12 13 14 15]
Train set: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15], Test set: [16 17 18 19]


In [11]:
# Define Kfold crossvalidation with random shuffling.
# KFold is already imported in the imports cell at the top of the notebook.
# random_state pins the shuffle so the folds are the same on every run
# (without it, each execution of this cell prints a different partition).
x = np.arange(20)
kf = KFold(n_splits=5, shuffle=True, random_state=0)
for train, test in kf.split(x):
    print("Train set: %s, Test set: %s" % (train, test))


Train set: [ 1  2  4  6  7  8  9 10 11 12 13 14 15 16 17 18], Test set: [ 0  3  5 19]
Train set: [ 0  1  3  4  5  6  7  8 10 11 12 13 14 15 16 19], Test set: [ 2  9 17 18]
Train set: [ 0  1  2  3  4  5  7  8  9 11 12 15 16 17 18 19], Test set: [ 6 10 13 14]
Train set: [ 0  1  2  3  4  5  6  8  9 10 11 13 14 17 18 19], Test set: [ 7 12 15 16]
Train set: [ 0  2  3  5  6  7  9 10 12 13 14 15 16 17 18 19], Test set: [ 1  4  8 11]


In [12]:
# Leave-one-out partitioning on 20 toy indices: every sample is held out
# once as a single-element test set while the others form the training set
x = np.arange(20)
loo = LeaveOneOut()
for train, test in loo.split(x):
    split_pair = (train, test)
    print("%s %s" % split_pair)

[ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19] [0]
[ 0  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19] [1]
[ 0  1  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19] [2]
[ 0  1  2  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19] [3]
[ 0  1  2  3  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19] [4]
[ 0  1  2  3  4  6  7  8  9 10 11 12 13 14 15 16 17 18 19] [5]
[ 0  1  2  3  4  5  7  8  9 10 11 12 13 14 15 16 17 18 19] [6]
[ 0  1  2  3  4  5  6  8  9 10 11 12 13 14 15 16 17 18 19] [7]
[ 0  1  2  3  4  5  6  7  9 10 11 12 13 14 15 16 17 18 19] [8]
[ 0  1  2  3  4  5  6  7  8 10 11 12 13 14 15 16 17 18 19] [9]
[ 0  1  2  3  4  5  6  7  8  9 11 12 13 14 15 16 17 18 19] [10]
[ 0  1  2  3  4  5  6  7  8  9 10 12 13 14 15 16 17 18 19] [11]
[ 0  1  2  3  4  5  6  7  8  9 10 11 13 14 15 16 17 18 19] [12]
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 14 15 16 17 18 19] [13]
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 15 16 17 18 19] [14]
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 16 1

## 4. Run the crossvalidation

In [17]:
# Estimate the loss of a linear regression with 5-fold CV on the training split
kf = KFold(n_splits=5, shuffle=False)
sc = make_scorer(mean_squared_error)
cv_scores = cross_val_score(
    estimator=LinearRegression(),
    X=Xtrain,
    y=ytrain,
    scoring=sc,
    cv=kf,
)

# Limit printed array values to 3 decimals
np.set_printoptions(precision=3)
print('List of CV loss:', cv_scores)

# The mean across folds is the value we care about (use it to compare models)
mean_loss, loss_spread = cv_scores.mean(), cv_scores.std()
print("Average CV loss: %.3f +/- %.3f" % (mean_loss, loss_spread))

List of CV loss: [30.107 39.038 33.24  33.382 37.925]
Average CV loss: 34.738 +/- 3.292


In [18]:
# Leave-one-out CV: one fold per sample, scored with the mean squared error
kf = LeaveOneOut()
sc = make_scorer(mean_squared_error)

# Run on the full data set (X, y) rather than only the training split
cv_scores = cross_val_score(
    estimator=LinearRegression(),
    X=X,
    y=y,
    scoring=sc,
    cv=kf,
)

# Limit printed array values to 3 decimals
np.set_printoptions(precision=3)
print('List of CV loss:', cv_scores)
mean_loss, loss_spread = cv_scores.mean(), cv_scores.std()
print("Average CV loss: %.3f +/- %.3f" % (mean_loss, loss_spread))

List of CV loss: [2.236e-02 7.182e-02 5.698e+01 1.561e+01 7.926e-01 1.685e+01 8.049e+01
 8.902e+00 1.262e+02 1.778e+01 2.153e+01 1.872e+00 5.026e+00 5.899e+01
 8.673e+00 1.296e+02 1.498e+02 4.350e+01 9.340e+01 4.862e+00 2.338e-02
 6.188e-02 1.154e+01 1.109e+02 1.354e+01 1.514e+01 3.699e+00 3.693e+00
 1.045e+01 1.804e+01 1.515e+01 2.456e+00 1.590e+02 2.307e+00 4.325e+00
 1.023e+01 4.496e+00 1.152e+01 1.441e+01 1.099e+00 4.679e+01 1.303e+01
 3.004e+00 5.049e+01 6.056e+01 2.315e+00 2.736e+01 3.905e+01 1.538e+01
 2.627e+01 2.827e+01 7.206e+01 2.344e+01 7.301e+01 5.108e+00 1.491e+01
 5.331e+00 1.402e+00 5.762e+00 1.252e+02 1.342e+00 3.574e+01 8.423e+00
 1.435e+02 3.010e+01 6.935e+01 7.783e+01 1.413e-02 8.333e+01 1.507e+01
 3.817e+01 8.960e+01 1.651e+01 1.859e-01 3.360e+00 9.474e+00 8.218e+01
 1.149e+02 1.305e+00 6.639e+01 8.430e+01 3.496e+01 1.332e+00 3.470e+01
 3.929e+00 5.054e+01 2.245e+01 3.333e+00 5.685e-01 7.017e+01 7.423e+00
 1.102e+02 6.007e+01 2.168e+02 6.198e+01 1.336e+02 7.076e+00