# Regression example
This example uses the California house-prices dataset. The task is to predict the median house value of California districts.

More information about the dataset:
 * https://scikit-learn.org/stable/modules/generated/sklearn.datasets.fetch_california_housing.html#sklearn.datasets.fetch_california_housing
 * https://scikit-learn.org/stable/datasets/real_world.html#california-housing-dataset
 * http://lib.stat.cmu.edu/datasets/


In [None]:
# Import packages
import numpy as np
import sklearn.datasets
import sklearn.linear_model
import sklearn.metrics
import sklearn.preprocessing

# Make the results reproducible. The global NumPy seed drives the shuffle
# below as well as any NumPy-based randomness used later in the notebook.
np.random.seed(42)

# Load dataset
data = sklearn.datasets.fetch_california_housing()

# Split into training, validation, and test data sets
n_samples = data.data.shape[0]
n_train = int(n_samples * 0.75 + 0.5)  # Train on 75 %
n_val = int(n_samples * 0.15 + 0.5)  # Validate on 15 %
# Test on the remainder (about 10 %), so the three sizes always sum to n_samples
n_test = n_samples - n_train - n_val

# Shuffle before splitting. The rows come in the order of the original data
# file, which is not guaranteed to be random; taking contiguous slices could
# therefore give the three sets systematically different distributions.
shuffled_idx = np.random.permutation(n_samples)
train_idx = shuffled_idx[:n_train]
val_idx = shuffled_idx[n_train:n_train + n_val]
test_idx = shuffled_idx[n_train + n_val:]

X = data.data[train_idx, :]
y = data.target[train_idx]
X_val = data.data[val_idx, :]
y_val = data.target[val_idx]
# Note! Do not use (at all!) the test data until the very end!
X_test = data.data[test_idx, :]
y_test = data.target[test_idx]

print(f"Training set size X  : {X.shape}")
print(f"Training set size y  : {y.shape}")
print(f"Validation set size X: {X_val.shape}")
print(f"Validation set size y: {y_val.shape}")
print(f"Test set size X      : {X_test.shape}")
print(f"Test set size y      : {y_test.shape}")
print(f"Feature names        : {data.feature_names}")

In [None]:
# Task 1: Preprocess the data
#  - Try without preprocessing, try with different kinds.
#  - Evaluate and compare models on the validation data.
#
# The scaler is fitted on the training data only. The transformation learned
# from the training data is then applied unchanged to all three data sets, so
# no information from the validation or test data enters the preprocessing.
standard_scaler = sklearn.preprocessing.StandardScaler().fit(X)
X_, X_val_, X_test_ = (
    standard_scaler.transform(subset) for subset in (X, X_val, X_test)
)

In [None]:
# Fit baseline model: a plain linear regression with an intercept term,
# trained on the preprocessed training data. Later models are compared to it.
model_baseline = sklearn.linear_model.LinearRegression(fit_intercept=True)
# Assign the return value to `_` so the notebook cell prints no output
_ = model_baseline.fit(X=X_, y=y)

In [None]:
# Evaluate baseline model
# Compute the mean squared error of the baseline predictions on the training
# data and on the validation data, and print both with three decimals.
# Note: this cell needs `sklearn.metrics` to be imported explicitly; importing
# other sklearn submodules does not guarantee that it is available.
yhat = model_baseline.predict(X_)
yhat_val = model_baseline.predict(X_val_)
mse = sklearn.metrics.mean_squared_error(y, yhat)
mse_val = sklearn.metrics.mean_squared_error(y_val, yhat_val)
print(f"Training data mean squared error  : {mse:.3f}")
print(f"Validation data mean squared error: {mse_val:.3f}")

In [None]:
# Task 2: Find a better model
#  - Try different regression methods
#  - Evaluate them on the validation data
#  - Beat the baseline model and select the best one you can find
#  - You can look here for potential models to use:
#    https://scikit-learn.org/stable/modules/classes.html#module-sklearn.linear_model

# TODO: ... add your own regression model code here!
# A real estimator is used as the starting point (instead of a placeholder
# string, which has no `fit` method) so that the notebook runs from top to
# bottom before the task is solved. RidgeCV is ridge regression whose
# regularisation strength is picked from `alphas` by built-in cross-validation.
model = sklearn.linear_model.RidgeCV(alphas=np.logspace(-3, 3, 13))

# Note that we fit on the preprocessed data in X_
_ = model.fit(X_, y)

In [None]:
# Evaluate better model
# Compute the mean squared error of the model from Task 2 on the training data
# and on the validation data, and print both with three decimals. Compare the
# validation error with the one printed for the baseline model.
# Note: this cell needs `sklearn.metrics` to be imported explicitly; importing
# other sklearn submodules does not guarantee that it is available.
yhat = model.predict(X_)
yhat_val = model.predict(X_val_)
mse = sklearn.metrics.mean_squared_error(y, yhat)
mse_val = sklearn.metrics.mean_squared_error(y_val, yhat_val)
print(f"Training data mean squared error  : {mse:.3f}")
print(f"Validation data mean squared error: {mse_val:.3f}")

In [None]:
# Task 3: Determine the importance of the input variables
# ... your code here

In [None]:
# Evaluate the final model on the test data.
# This is only ever done once, and as the last thing we do.
# Training another model after this, based on the performance on the test data
# leads to biased results!
#
# The training and validation errors are recomputed so that all three mean
# squared errors are printed side by side for the same model.
# Note: this cell needs `sklearn.metrics` to be imported explicitly; importing
# other sklearn submodules does not guarantee that it is available.
yhat = model.predict(X_)
yhat_val = model.predict(X_val_)
yhat_test = model.predict(X_test_)
mse = sklearn.metrics.mean_squared_error(y, yhat)
mse_val = sklearn.metrics.mean_squared_error(y_val, yhat_val)
mse_test = sklearn.metrics.mean_squared_error(y_test, yhat_test)
print(f"Training data mean squared error  : {mse:.3f}")
print(f"Validation data mean squared error: {mse_val:.3f}")
print(f"Test data mean squared error      : {mse_test:.3f}")