# Regression example
Boston house-prices dataset.

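More information about the dataset: https://scikit-learn.org/stable/datasets/index.html#boston-dataset (note: `load_boston` was deprecated in scikit-learn 1.0 and removed in 1.2, so the loader used below, and this section of the documentation, only exist in older releases).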

In [97]:
# Import packages
import numpy as np
import sklearn.datasets
import sklearn.linear_model
import sklearn.metrics
import sklearn.preprocessing

# Reproducible results: seed NumPy's global random number generator.
np.random.seed(42)

# Load dataset
# NOTE: `load_boston` was deprecated in scikit-learn 1.0 and removed in 1.2,
# so this cell needs an older scikit-learn release to run.
data = sklearn.datasets.load_boston()

# Fractions of the samples used for training and validation; the test split
# receives whatever is left over.
TRAIN_FRACTION = 0.75
VAL_FRACTION = 0.2

n_samples = data.data.shape[0]
# `int(x + 0.5)` rounds the (non-negative) fractional count half up.
n_train = int(n_samples * TRAIN_FRACTION + 0.5)
n_val = int(n_samples * VAL_FRACTION + 0.5)
# Derive the test size as the remainder instead of rounding a third fraction
# independently, so the three sizes always add up to the dataset size and
# `n_test` always agrees with the slice taken below.
n_test = n_samples - n_train - n_val

# The split is sequential: rows are taken in the order the dataset provides
# them, without shuffling.
X = data.data[:n_train, :]
y = data.target[:n_train]
X_val = data.data[n_train:n_train + n_val, :]
y_val = data.target[n_train:n_train + n_val]
# Note! Do not touch the test data until the very end!
X_test = data.data[n_train + n_val:, :]
y_test = data.target[n_train + n_val:]

# Every sample must land in exactly one of the three splits.
assert X.shape[0] + X_val.shape[0] + X_test.shape[0] == n_samples
assert X_test.shape[0] == n_test

print(f"Training set size X  : {X.shape}")
print(f"Training set size y  : {y.shape}")
print(f"Validation set size X: {X_val.shape}")
print(f"Validation set size y: {y_val.shape}")
print(f"Test set size X      : {X_test.shape}")
print(f"Test set size y      : {y_test.shape}")
print(f"Feature names        : {data.feature_names}")

Training set size X  : (380, 13)
Training set size y  : (380,)
Validation set size X: (101, 13)
Validation set size y: (101,)
Test set size X      : (25, 13)
Test set size y      : (25,)
Feature names        : ['CRIM' 'ZN' 'INDUS' 'CHAS' 'NOX' 'RM' 'AGE' 'DIS' 'RAD' 'TAX' 'PTRATIO'
 'B' 'LSTAT']


In [0]:
# Task1: Preprocess the data
#  - Try without preprocessing, try with different kinds.
#  - Evaluate on the validation data
#
# The scaler is fitted on the training split only; the very same fitted
# scaler is then applied to the training, validation and test splits.
standard_scaler = sklearn.preprocessing.StandardScaler().fit(X)
X_, X_val_, X_test_ = (
    standard_scaler.transform(split) for split in (X, X_val, X_test)
)

In [0]:
# Fit baseline model
# Linear regression (with an intercept term) trained on the preprocessed
# training split. `fit` returns the estimator itself, so chaining it onto the
# constructor binds the fitted model directly and the cell echoes nothing,
# without overwriting IPython's `_` last-output variable.
model_baseline = sklearn.linear_model.LinearRegression(fit_intercept=True).fit(X_, y)

In [111]:
# Evaluate baseline model
# Predict on the preprocessed training and validation splits and report the
# mean squared error of each against the corresponding targets.
yhat = model_baseline.predict(X_)
yhat_val = model_baseline.predict(X_val_)
# Keyword arguments make explicit which array is the ground truth.
mse = sklearn.metrics.mean_squared_error(y_true=y, y_pred=yhat)
mse_val = sklearn.metrics.mean_squared_error(y_true=y_val, y_pred=yhat_val)
print(f"Training data error  : {mse}")
print(f"Validation data error: {mse_val}")

Training data error  : 21.198414282847672
Validation data error: 70.14026982089234


In [0]:
# Task 2: Find a better model
#  - Try different regression methods
#  - Evaluate them on the validation data
#  - Beat the baseline model and select the best one you can find

# Placeholder: `model` is a plain string here, and a string has no `fit`
# method, so the call below raises AttributeError until this line is replaced
# with an actual estimator object.
model = "... add your code here!"

# Train on the preprocessed training split. Assigning the return value to `_`
# keeps the cell from echoing it as output.
_ = model.fit(X_, y)

In [113]:
# Evaluate better model
# Same report as for the baseline: mean squared error on the preprocessed
# training and validation splits. This rebinds `yhat`, `yhat_val`, `mse` and
# `mse_val`, replacing the values computed for the baseline model.
yhat = model.predict(X_)
yhat_val = model.predict(X_val_)
# Keyword arguments make explicit which array is the ground truth.
mse = sklearn.metrics.mean_squared_error(y_true=y, y_pred=yhat)
mse_val = sklearn.metrics.mean_squared_error(y_true=y_val, y_pred=yhat_val)
print(f"Training data error  : {mse}")
print(f"Validation data error: {mse_val}")

Training data error  : 9.574639243945395
Validation data error: 36.25837009608003


In [0]:
# Task 3: Determine the importance of the input variables
# ... your code here

In [115]:
# Evaluate the final model on the test data.
# This is only ever done once, and as the last thing we do.
# Training another model after this, based on the performance on the test data
# leads to biased results.
#
# The training and validation errors are recomputed here so that all three
# numbers are reported side by side for the same model.
yhat = model.predict(X_)
yhat_val = model.predict(X_val_)
yhat_test = model.predict(X_test_)
# Keyword arguments make explicit which array is the ground truth.
mse = sklearn.metrics.mean_squared_error(y_true=y, y_pred=yhat)
mse_val = sklearn.metrics.mean_squared_error(y_true=y_val, y_pred=yhat_val)
mse_test = sklearn.metrics.mean_squared_error(y_true=y_test, y_pred=yhat_test)
print(f"Training data error  : {mse}")
print(f"Validation data error: {mse_val}")
print(f"Test data error      : {mse_test}")

Training data error  : 9.574639243945395
Validation data error: 36.25837009608003
Test data error      : 24.12472489333958
