In [6]:
import numpy as np # import numpy for matrix operations

In [7]:
### this function load data from .dat file
def load_dat(filename):
    """Load a whitespace-separated table of numbers from a text file.

    Every non-blank line is one sample: its whitespace-separated tokens are
    parsed as floats and form one row of the result. Blank lines (for
    instance a trailing empty line at the end of the file) are skipped
    instead of aborting the load.

    Parameters
    ----------
    filename : str or path-like
        Path of the file to read.

    Returns
    -------
    numpy.ndarray
        Float array of shape (num_samples, dim). A file that contains no
        data lines yields an empty array of shape (0, 0).

    Raises
    ------
    ValueError
        If a token cannot be parsed as a float, or if a line does not have
        the same number of values as the first data line.
    """
    rows = []
    with open(filename, 'r') as fin:
        for line_no, line in enumerate(fin, start=1):
            tokens = line.split()  # split() without arguments also strips the line
            if not tokens:
                continue  # blank line: nothing to parse
            row = [float(tok) for tok in tokens]
            # the first data line fixes the expected number of columns
            if rows and len(row) != len(rows[0]):
                raise ValueError(
                    '%s: line %d has %d values, expected %d'
                    % (filename, line_no, len(row), len(rows[0]))
                )
            rows.append(row)
    if not rows:
        return np.zeros((0, 0))
    return np.array(rows, dtype=float)

In [8]:
### load data
# read the input matrix X and the target values Y from their .dat files
X = load_dat("ex2x.dat")  # two columns in this exercise, i.e. two input variables
Y = load_dat("ex2y.dat")  # a single output column
# basic statistics: rows of X are samples, columns of X are variables
num_samples, dim = X.shape
print('X (%d x %d)' % (num_samples, dim))
print('Y (%d)' % num_samples)

X (47 x 2)
Y (47)


In [None]:
### add intercept term to all samples in X
# Appending a column of ones lets the model learn the intercept as an ordinary
# coefficient. The column is appended LAST, so the intercept is the last entry
# of the learned beta (not beta_0).
if X.shape[1] == dim:  # guard: re-running this cell must not append a second column of ones
    X = np.concatenate((X, np.ones((num_samples, 1))), axis=1)
# store Y explicitly as a (num_samples x 1) column; the original note says this
# still works without it here but not for exercise 3, so it is good practice
Y = Y.reshape([-1, 1])
# print the actual shapes rather than the ones we expect
print('X (%d x %d)' % X.shape)
print('Y (%d x %d)' % Y.shape)

# GOOD PRACTICE: ALWAYS CHECK MATRIX SHAPES BEFORE MULTIPLYING THEM
assert X.shape[0] == Y.shape[0], 'X and Y must have the same number of samples'

X (47 x 3)
Y (47 x 1)


In [13]:
### main functions of multivariate linear regression
def pseudo_inverse(A):
    """Return the Moore-Penrose pseudo-inverse of the matrix A.

    For a matrix with full column rank this equals the normal-equation form
    (A^T A)^{-1} A^T. The result is computed with ``np.linalg.pinv``
    (SVD based) rather than by explicitly inverting A^T A: forming and
    inverting A^T A squares the condition number of the problem and raises
    ``LinAlgError`` when A is rank deficient, whereas ``pinv`` stays
    well-defined in that case.

    Parameters
    ----------
    A : array_like, shape (m, n)
        Input matrix.

    Returns
    -------
    numpy.ndarray, shape (n, m)
        The pseudo-inverse of A.
    """
    return np.linalg.pinv(A)
    
def sse(prediction, reference):
    """Return the sum of squared errors between prediction and reference.

    If the two inputs have different shapes that differ only by length-1
    axes (for example (n,) against (n, 1)), both are squeezed first.
    Without this, the subtraction would broadcast to an (n, n) matrix and
    silently return a wrong value. Any other shape combination is left to
    NumPy's normal broadcasting rules, so a scalar prediction against a
    vector still works and incompatible shapes still raise.

    Parameters
    ----------
    prediction : array_like
        Predicted values.
    reference : array_like
        True values.

    Returns
    -------
    numpy scalar
        Sum over all elements of (prediction - reference) ** 2.
    """
    prediction = np.asarray(prediction)
    reference = np.asarray(reference)
    if prediction.shape != reference.shape:
        flat_prediction = np.squeeze(prediction)
        flat_reference = np.squeeze(reference)
        # only align when dropping length-1 axes makes the shapes agree
        if flat_prediction.shape == flat_reference.shape:
            prediction, reference = flat_prediction, flat_reference
    return np.sum(np.square(prediction - reference))

In [None]:
### estimate beta
# least-squares estimate: multiply the pseudo-inverse of X by the targets Y
beta = pseudo_inverse(X).dot(Y)
# show the estimated (learned) parameters
print(beta)

(3, 47)
[[  139.21067402]
 [-8738.01911233]
 [89597.9095428 ]]


In [None]:
### evaluate the model
# predicted scores of the fitted model for every sample in X
prediction = X.dot(beta)
# sum of squared errors between the predictions and the true targets
error = sse(prediction, Y)
print('Sum of square error: %f' % error)

Sum of square error: 192068324756.665863


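The error is very large in absolute terms, but that mostly reflects the data itself (check the orders of magnitude of the values in the input files): an SSE of about 1.9e11 over 47 samples corresponds to a root-mean-square error of about 6.4e4 per sample.
One should always inspect the data before doing ML with it!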

In [18]:
### Extra step
# build synthetic scores that are an exact linear function of the columns of X
# NB nomenclature: feature = variable = dimension
Ys = 3 * X[:, 0] + 2 * X[:, 1] + 0.5 * X[:, 2]
# fit the same linear model, this time with X and the synthetic Ys as inputs
beta_2 = pseudo_inverse(X).dot(Ys)
print('beta_2: ', beta_2)
# predicted scores of the refitted model
prediction_2 = X.dot(beta_2)
# sum of squared errors against the synthetic targets
error_2 = sse(prediction_2, Ys)
print('Sum of square error: %f' % error_2)

beta_2:  [3.  2.  0.5]
Sum of square error: 0.000000


There are two definitions of "multivariate linear regression". Under the first one (several input variables, a single output variable), what we did is fine.
For most people, however, "multivariate" actually means predicting multiple output variables: Y then has multiple columns, with separate beta_i,j coefficients for each output variable, so we get a beta matrix instead of a beta vector, just as Y becomes a matrix instead of a vector.