In [1]:
from sklearn import datasets
import numpy as np

In [3]:
# load dataset
X, y = datasets.fetch_california_housing(return_X_y=True)
print(X.shape)
print(y.shape)

(20640, 8)
(20640,)


In [None]:
# create virtual features for nonlinear fitting
X_virtual = [np.power(X[:,0], 2).reshape([-1,1]),
             np.power(X[:,7], 2).reshape([-1,1]),
             np.power(X[:,10], 3).reshape([-1,1]),
             np.power(X[:,10], 2).reshape([-1,1]),
            ]
X_virtual = np.hstack(X_virtual)
X = np.hstack((X, X_virtual))
interc = np.ones((X.shape[0], 1))
X = np.hstack((interc, X))
print(X.shape)

(506, 18)


In [4]:
# split training and testing dataset
train_ratio = 0.8
cutoff = int(X.shape[0] * train_ratio)
X_tr = X[:cutoff, :]
y_tr = y[:cutoff]
X_te = X[cutoff:,:]
y_te = y[cutoff:]
print('Train/Test: %d/%d' %(X_tr.shape[0], X_te.shape[0]))

Train/Test: 16512/4128


In [5]:
# linear regression using the normal equation
def pseudo_inverse(A):
    # Calculate the pseudo_inverse of A
    pinv = np.matmul(np.linalg.inv(np.matmul(A.T, A)), A.T)
    return pinv 

In [6]:
# evaluation functions
def MSE(prediction,reference):
    # Calculate the mean square error between the prediction and reference vectors
    mse = 0.5 * np.mean(np.square(prediction - reference))
    return mse 

def MAE(prediction, reference):
    # Calculate the mean absolute error between the prediction and reference vectors
    mae = np.mean(np.abs(prediction - reference))
    return mae 

In [7]:
# fit the polynomial on the training set
beta = np.matmul(pseudo_inverse(X_tr), y_tr)
#print(np.linalg.inv(np.matmul(X_tr.T,X_tr)))
print(beta)

[ 5.09834582e-01  1.70096092e-02 -1.88621589e-01  9.13093027e-01
  1.30481356e-05 -1.17780627e-02 -7.00228953e-02 -1.74897255e-02]


In [8]:
# make prediction on the testing set
pred = np.matmul(X_te, beta)
mse = MSE(pred, y_te)
mae = MAE(pred, y_te)
print(mse)
print(mae)

0.34396222790312886
0.5902960701092475


In [13]:
# regularized linear regression 
def regularized_pseudo_inverse(A, theta):
    # Calculate the regularized pseudo_inverse of A
    ### Your code here ###
    pinv = np.matmul(np.linalg.inv(np.matmul(A.T, A) + theta * np.eye(len(A[0, :]))), A.T)
    return pinv 

In [27]:
# fit the polynomial, regularized by theta
theta = 0.5
beta_regularized = np.matmul(regularized_pseudo_inverse(X_tr, theta), y_tr) ### Your code here ###  

In [28]:
# make prediction on the testing set
pred_2 = np.matmul(X_te, beta_regularized)### Your code here ###
mse = MSE(pred_2, y_te)
mae = MAE(pred_2, y_te)
print(mse)
print(mae)

0.34395913925193344
0.5902999745762753
