In [12]:
"""
Do not change the input and output format.
If our script cannot run your code or the format is improper, your code will not be graded.

The only functions you need to implement in this template are linear_regression_noreg, regularized_linear_regression,
tune_lambda, and test_error.
"""

import numpy as np
import pandas as pd

###### Q4.1 ######
def linear_regression_noreg(X, y):
    """
    Compute the ordinary least-squares weight parameter given X and y.

    The weights satisfy the normal equations (X^T X) w = X^T y, i.e.
    w = (X^T X)^-1 X^T y. The system is solved directly instead of forming
    the inverse explicitly, which is cheaper and numerically more accurate.

    Inputs:
    - X: A numpy array of shape (num_samples, D) containing feature.
    - y: A numpy array of shape (num_samples, ) containing label
    Returns:
    - w: a numpy array of shape (D, )
    """
    gram = X.T @ X
    moment = X.T @ y
    try:
        w = np.linalg.solve(gram, moment)
    except np.linalg.LinAlgError:
        # X^T X is exactly singular (e.g. duplicated feature columns). Fall back
        # to the minimum-norm solution of the normal equations instead of crashing.
        w = np.linalg.lstsq(gram, moment, rcond=None)[0]
    return w

###### Q4.2 ######
def regularized_linear_regression(X, y, lambd):
    """
    Compute the ridge-regularized weight parameter given X, y and lambda.

    The weights satisfy (X^T X + lambd * I) w = X^T y, i.e.
    w = (X^T X + lambd * I)^-1 X^T y. Every coordinate of w is penalized,
    including the one that multiplies the first column of X. The system is
    solved directly instead of forming the inverse explicitly.

    Inputs:
    - X: A numpy array of shape (num_samples, D) containing feature.
    - y: A numpy array of shape (num_samples, ) containing label
    - lambd: a float number containing regularization strength
    Returns:
    - w: a numpy array of shape (D, )
    """
    regularized_gram = X.T @ X + lambd * np.eye(X.shape[1])
    moment = X.T @ y
    try:
        w = np.linalg.solve(regularized_gram, moment)
    except np.linalg.LinAlgError:
        # Only reachable when the regularized matrix is exactly singular
        # (e.g. lambd == 0 with duplicated feature columns). Return the
        # minimum-norm solution of the same system instead of crashing.
        w = np.linalg.lstsq(regularized_gram, moment, rcond=None)[0]
    return w

###### Q4.3 ######
def tune_lambda(Xtrain, ytrain, Xval, yval, lambds):
    """
    Find the best lambda value.

    Each candidate lambda is used to fit a regularized model on the training
    split; the candidate with the lowest mean squared error on the validation
    split wins. Ties are resolved in favour of the earliest candidate.

    Inputs:
    - Xtrain: A numpy array of shape (num_training_samples, D) containing training feature.
    - ytrain: A numpy array of shape (num_training_samples, ) containing training label
    - Xval: A numpy array of shape (num_val_samples, D) containing validation feature.
    - yval: A numpy array of shape (num_val_samples, ) containing validation label
    - lambds: a list (or any iterable) of lambdas
    Returns:
    - bestlambda: the best lambda you find in lambds
    Raises:
    - ValueError: if lambds contains no candidates.
    """
    bestlambda = None
    best_err = None
    for lambd in lambds:
        w = regularized_linear_regression(Xtrain, ytrain, lambd)
        # The validation set is the same for every candidate, so the mean
        # squared error ranks candidates the same way the summed error would.
        err = test_error(w, Xval, yval)
        # Strict comparison keeps the first candidate on ties.
        if best_err is None or err < best_err:
            best_err = err
            bestlambda = lambd
    if best_err is None:
        raise ValueError("lambds must contain at least one candidate value")
    return bestlambda

###### Q4.4 ######
def test_error(w, X, y):
    """
      Compute the mean squre error on test set given X, y, and model parameter w.
      Inputs:
      - X: A numpy array of shape (num_samples, D) containing test feature.
      - y: A numpy array of shape (num_samples, ) containing test label
      - w: a numpy array of shape (D, )
      Returns:
      - err: the mean square error
    """
    #####################################################
    ## err = ||X*w -y||_2^2 / N
    err = (np.linalg.norm(X @ w - y,2))**2 / y.shape[0]
    #####################################################
    return err


"""
NO MODIFICATIONS below this line.
You should only write your code in the above functions.
"""

def data_processing():
    """
    Load 'winequality-white.csv' and split it into train/validation/test sets.

    The rows are shuffled with a fixed seed and split roughly 80% / 10% / rest.
    The last column of the file is used as the label; all other columns are
    used as features, with a column of ones prepended to each feature matrix.

    Returns:
    - Xtrain, ytrain, Xval, yval, Xtest, ytest: numpy arrays, in that order.
    """
    # The file is ';'-separated; .values turns the DataFrame into a numpy array.
    white = pd.read_csv('winequality-white.csv', low_memory=False, sep=';').values

    [N, d] = white.shape

    # Fixed seed so the shuffle (and therefore the split) is reproducible.
    np.random.seed(3)
    # prepare data
    ridx = np.random.permutation(N)
    ntr = int(np.round(N * 0.8))
    nval = int(np.round(N * 0.1))
    # Whatever is left after the training and validation rows goes to test.
    ntest = N - ntr - nval

    # splitting training, validation, and test

    # Prepend a column of ones to the features (every column except the last).
    Xtrain = np.hstack([np.ones([ntr, 1]), white[ridx[0:ntr], 0:-1]])

    # The last column is the label.
    ytrain = white[ridx[0:ntr], -1]
  
    Xval = np.hstack([np.ones([nval, 1]), white[ridx[ntr:ntr + nval], 0:-1]])
    yval = white[ridx[ntr:ntr + nval], -1]

    Xtest = np.hstack([np.ones([ntest, 1]), white[ridx[ntr + nval:], 0:-1]])
    ytest = white[ridx[ntr + nval:], -1]
    return Xtrain, ytrain, Xval, yval, Xtest, ytest


def main():
    """
    Run the whole exercise end to end and print the results.

    Loads the data splits, fits the unregularized and regularized models,
    selects lambda on the validation split, and reports the test MSE of the
    model refit with the selected lambda.
    """
    # Print arrays with three significant digits.
    np.set_printoptions(precision=3)
    Xtrain, ytrain, Xval, yval, Xtest, ytest = data_processing()
    # =========================Q3.1 linear_regression=================================
    w = linear_regression_noreg(Xtrain, ytrain)
    print("======== Question 3.1 Linear Regression ========")
    print("dimensionality of the model parameter is ", len(w), ".", sep="")
    print("model parameter is ", np.array_str(w))
  
    # =========================Q3.2 regularized linear_regression=====================
    # Fixed regularization strength used only for this demonstration.
    lambd = 5.0
    wl = regularized_linear_regression(Xtrain, ytrain, lambd)
    print("\n")
    print("======== Question 3.2 Regularized Linear Regression ========")
    print("dimensionality of the model parameter is ", len(wl), sep="")
    print("lambda = ", lambd, ", model parameter is ", np.array_str(wl), sep="")

    # =========================Q3.3 tuning lambda======================
    # Candidate strengths: 0 plus powers of ten from 10^-4 to 10^2.
    lambds = [0, 10 ** -4, 10 ** -3, 10 ** -2, 10 ** -1, 1, 10, 10 ** 2]
    bestlambd = tune_lambda(Xtrain, ytrain, Xval, yval, lambds)
    print("\n")
    print("======== Question 3.3 tuning lambdas ========")
    print("tuning lambda, the best lambda =  ", bestlambd, sep="")

    # =========================Q3.4 report mse on test ======================
    # Refit on the training split with the selected lambda, then score on test.
    wbest = regularized_linear_regression(Xtrain, ytrain, bestlambd)
    mse = test_error(wbest, Xtest, ytest)
    print("\n")
    print("======== Question 3.4 report MSE ========")
    print("MSE on test is %.3f" % mse)
    
# Run the pipeline only when this module is the entry point, not when it is imported.
if __name__ == "__main__":
    main()

dimensionality of the model parameter is 12.
model parameter is  [ 2.166e+02  1.145e-01 -1.824e+00 -1.065e-02  1.037e-01  1.546e-01
  3.416e-03  2.347e-04 -2.173e+02  8.348e-01  7.366e-01  1.153e-01]


dimensionality of the model parameter is 12
lambda = 5.0, model parameter is [ 6.324e-01 -2.854e-02 -1.641e+00 -3.577e-02  2.803e-02 -1.362e-01
  5.259e-03 -8.421e-04  5.888e-01  2.887e-01  4.162e-01  3.768e-01]


tuning lambda, the best lambda =  0.001


MSE on test is 0.512


In [3]:
# Load the train / validation / test splits into the kernel for interactive inspection.
Xtrain, ytrain, Xval, yval, Xtest, ytest = data_processing()

In [4]:
# Display the training feature matrix (the bare last expression is the cell output).
Xtrain

array([[ 1.   ,  4.9  ,  0.345, ...,  3.24 ,  0.4  , 10.1  ],
       [ 1.   ,  8.3  ,  0.17 , ...,  3.12 ,  0.58 ,  9.4  ],
       [ 1.   ,  7.5  ,  0.21 , ...,  3.51 ,  0.47 , 10.7  ],
       ...,
       [ 1.   ,  9.7  ,  0.14 , ...,  2.98 ,  0.62 ,  9.5  ],
       [ 1.   ,  5.8  ,  0.32 , ...,  3.42 ,  0.42 , 11.8  ],
       [ 1.   ,  8.6  ,  0.2  , ...,  3.11 ,  0.49 , 11.4  ]])

In [5]:
# Display the training labels.
ytrain

array([5., 6., 6., ..., 5., 7., 7.])

In [9]:
# Display the first training row in full; its leading 1.0 is the prepended column of ones.
Xtrain[0,:]

array([1.0000e+00, 4.9000e+00, 3.4500e-01, 3.4000e-01, 1.0000e+00,
       6.8000e-02, 3.2000e+01, 1.4300e+02, 9.9138e-01, 3.2400e+00,
       4.0000e-01, 1.0100e+01])

In [10]:
# Confirm the container type of the feature matrix.
type(Xtrain)

numpy.ndarray

In [11]:
# Display the transpose of the training feature matrix (rows become columns).
Xtrain.T

array([[ 1.   ,  1.   ,  1.   , ...,  1.   ,  1.   ,  1.   ],
       [ 4.9  ,  8.3  ,  7.5  , ...,  9.7  ,  5.8  ,  8.6  ],
       [ 0.345,  0.17 ,  0.21 , ...,  0.14 ,  0.32 ,  0.2  ],
       ...,
       [ 3.24 ,  3.12 ,  3.51 , ...,  2.98 ,  3.42 ,  3.11 ],
       [ 0.4  ,  0.58 ,  0.47 , ...,  0.62 ,  0.42 ,  0.49 ],
       [10.1  ,  9.4  , 10.7  , ...,  9.5  , 11.8  , 11.4  ]])

In [15]:
# Closed-form least-squares check on the training split: w* = (X^T X)^-1 X^T y.
np.linalg.inv(Xtrain.T @ Xtrain) @ Xtrain.T @ ytrain

array([ 2.16557631e+02,  1.14470297e-01, -1.82449448e+00, -1.06525609e-02,
        1.03719441e-01,  1.54637453e-01,  3.41633947e-03,  2.34694682e-04,
       -2.17305280e+02,  8.34826139e-01,  7.36600071e-01,  1.15290882e-01])

In [17]:
# Number of columns in the training feature matrix (size of the weight vector).
Xtrain.shape[1]

12

In [19]:
# Build the ridge term lambd * I for an example strength to check its shape and values.
lambd =2
lambd*np.eye(Xtrain.shape[1])

array([[2., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 2., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 2., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 2., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 2., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 2., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 2., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 2., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 2., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 2., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 2., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 2.]])

In [26]:
# NOTE(review): `a` is not used by the expression below; presumably left over from
# checking the norm on a small vector. Confirm it is unused elsewhere before removing.
a=np.array([3,4])
# Squared Euclidean norm of the training labels.
(np.linalg.norm(ytrain,2))**2

138784.99999999997

In [34]:
# Candidate regularization strengths: 0 followed by the powers of ten 10^-4 .. 10^2.
lambds = [0] + [10 ** exponent for exponent in range(-4, 3)]
print('loop start')
# The first candidate is printed on its own; the loop covers the remaining ones.
print(lambds[0])
for lambd in lambds[1:]:
    print(lambd)

loop start
0
0.0001
0.001
0.01
0.1
1
10
100


In [59]:
# Re-run the whole pipeline with whatever function definitions are currently in the kernel.
# NOTE(review): the output recorded for this cell differs from the earlier run of main()
# (best lambda and test MSE); restart the kernel and run all cells to confirm which is current.
if __name__ == "__main__":
    main()

dimensionality of the model parameter is 12.
model parameter is  [ 2.166e+02  1.145e-01 -1.824e+00 -1.065e-02  1.037e-01  1.546e-01
  3.416e-03  2.347e-04 -2.173e+02  8.348e-01  7.366e-01  1.153e-01]


dimensionality of the model parameter is 12
lambda = 5.0, model parameter is [ 6.324e-01 -2.854e-02 -1.641e+00 -3.577e-02  2.803e-02 -1.362e-01
  5.259e-03 -8.421e-04  5.888e-01  2.887e-01  4.162e-01  3.768e-01]


tuning lambda, the best lambda =  0.1


MSE on test is 254.453
