In [21]:
# Increase Jupyter display cell-width
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:75% !important; }</style>"))

### Testing by Sampl Split

In [3]:
import pandas as pd
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
boston = load_boston()
dataset = pd.DataFrame(boston.data, columns=boston.feature_names)
dataset['target'] = boston.target
observations = len(dataset)
variables = dataset.columns[:-1]
X = dataset.iloc[:, :-1]
y = dataset['target'].values

In [None]:
# Split into train & test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=101)
print("Train dataset sample size: %i" % len(X_train))
print("Test dataset sample size: %i" % len(X_test))

#### Sometimes, reserving an out-of-sample (comprising what is not in-sample—that is, used as a sample for learning from training activity data) is not enough, because we may have to tune some parameters or make specific choices and we want to test the alternatives without having to use the test data. The solution is to reserve another part of our data for validation purposes, which implies checking what parameters could be optimal for our model. We can achieve that using train_test_split in two steps:

In [6]:
X_train, X_out_sample, y_train, y_out_sample = train_test_split(X, y, test_size=0.40, random_state=101)
X_validation, X_test, y_validation, y_test = train_test_split(X_out_sample, y_out_sample, test_size=0.50, random_state=101)
print("Train dataset sample size: %i" % len(X_train))
print("Validation dataset sample size: %i" % len(X_validation))
print("Test dataset sample size: %i" % len(X_test))

Train dataset sample size: 303
Validation dataset sample size: 101
Test dataset sample size: 102


In [5]:
X_train, X_out_sample, y_train, y_out_sample = train_test_split(X, y, test_size=0.40, random_state=101)
X_validation, X_test, y_validation, y_test = train_test_split(X_out_sample, y_out_sample, test_size=0.50, random_state=101)
print ("Train dataset sample size: %i" % len(X_train))
print ("Validation dataset sample size: %i" % len(X_validation))
print ("Test dataset sample size: %i" % len(X_test))

Train dataset sample size: 303
Validation dataset sample size: 101
Test dataset sample size: 102


### Cross Validation

In [13]:
from sklearn.model_selection import cross_val_score, KFold, StratifiedKFold
from sklearn.metrics import make_scorer
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
import numpy as np

def RMSE(y_true, y_pred):
    return np.sum((y_true - y_pred)**2)
lm = LinearRegression()
cv_iterator = KFold(n_splits=10, shuffle=True, random_state=101)
edges = np.histogram(y, bins=5)[1]
binning = np.digitize(y, edges)
stratified_cv_iterator = StratifiedKFold(n_splits=10, shuffle=True, random_state=101)

In [14]:
second_order = PolynomialFeatures(degree=2, interaction_only=False)
third_order = PolynomialFeatures(degree=3, interaction_only=True)

In [18]:
over_param_X = second_order.fit_transform(X)
extra_over_param_X = third_order.fit_transform(X)
cv_score = cross_val_score(lm, over_param_X, y, cv=cv_iterator, scoring='neg_mean_squared_error', n_jobs=1)

In [19]:
print(cv_score)

[-11.67358614 -22.84201585  -9.19318785 -19.77458934 -11.68472904
  -9.15302457 -12.97982142 -22.18260725 -35.93064654 -13.75241156]


#### The mean squared error is negative because of the internals of the function, which can only maximize, whereas our cost metric has to be minimized; this is why it has become negative

In [20]:
print ('Cv score: mean %0.3f std %0.3f' % (np.mean(np.abs(cv_score)), np.std(cv_score)))

Cv score: mean 16.917 std 7.955


In [23]:
cv_score = cross_val_score(lm, over_param_X, y, cv=stratified_cv_iterator, scoring='neg_mean_squared_error', n_jobs=1)
print ('Cv score: mean %0.3f std %0.3f' % (np.mean(np.abs(cv_score)), np.std(cv_score)))

ValueError: Supported target types are: ('binary', 'multiclass'). Got 'continuous' instead.