In [4]:
from sklearn import datasets
import numpy as np

In [5]:
# load dataset
X, y = datasets.fetch_california_housing(return_X_y=True)
print(X.shape)
print(y.shape)

(20640, 8)
(20640,)


In [6]:
# create virtual features 
X_virtual = [np.power(X[:,0], 2).reshape([-1,1]),
             np.power(X[:,4], 2).reshape([-1,1]),
             np.power(X[:,5], 3).reshape([-1,1]),
             np.power(X[:,6], 2).reshape([-1,1]),
            ]
X_virtual = np.hstack(X_virtual)
X = np.hstack((X, X_virtual))
interc = np.ones((X.shape[0], 1))
X = np.hstack((interc, X))
print(X.shape)

(20640, 13)


In [7]:
# split training and testing dataset
train_ratio = 0.8
cutoff = int(X.shape[0] * train_ratio)
X_tr = X[:cutoff, :]
y_tr = y[:cutoff]
X_te = X[cutoff:,:]
y_te = y[cutoff:]
print('Train/Test: %d/%d' %(X_tr.shape[0], X_te.shape[0]))

Train/Test: 16512/4128


In [8]:
# linear regression using the normal equation
def pseudo_inverse(A):
    # Calculate the pseudo_inverse of A
    pinv = np.matmul(np.linalg.inv(np.matmul(A.T, A)), A.T)
    return pinv 

In [9]:
# evaluation functions
def MSE(prediction,reference):
    # Calculate the mean square error between the prediction and reference vectors
    mse = 0.5 * np.mean(np.square(prediction - reference))
    return mse 

def MAE(prediction, reference):
    # Calculate the mean absolute error between the prediction and reference vectors
    mae = np.mean(np.abs(prediction - reference))
    return mae 

In [10]:
# fit the polynomial on the training set
beta = np.matmul(pseudo_inverse(X_tr), y_tr)
#print(np.linalg.inv(np.matmul(X_tr.T,X_tr)))
print(beta)

[-2.46041530e+01  5.70776190e-01  1.07857470e-02 -1.35704230e-01
  7.74429534e-01  5.68996772e-06 -5.19585763e-02 -1.00669354e+00
 -4.15626171e-01 -1.05797041e-02  1.91324111e-10  1.42019113e-07
  8.28267916e-03]


In [11]:
# make prediction on the testing set
pred = np.matmul(X_te, beta)
mse = MSE(pred, y_te)
mae = MAE(pred, y_te)
print(mse)
print(mae)

5.688900430502425
0.5560411511341454


In [12]:
# regularized linear regression 
def regularized_pseudo_inverse(A, theta):
    # Calculate the regularized pseudo_inverse of A
    dim = A.shape[1]
    pinv = np.matmul(np.linalg.inv(np.matmul(A.T, A) + theta * np.identity(dim)), A.T)
    return pinv 

In [13]:
theta = 0.5
beta_regularized = np.matmul(regularized_pseudo_inverse(X_tr, theta), y_tr)

In [14]:
# make prediction on the testing set
pred_2 = np.matmul(X_te, beta_regularized)
mse = MSE(pred_2, y_te)
mae = MAE(pred_2, y_te)
print(mse)
print(mae)

5.463482120840019
0.5498164056842985
