In [1]:
import numpy as np
import scipy
from scipy.optimize import minimize
from scipy.linalg import cho_factor, cho_solve

In [2]:
def params_unwrap(param_vec, shapes, sizes):
    """Helper routine for minimize_list"""
    args = []
    pos = 0
    for i in range(len(shapes)):
        sz = sizes[i]
        args.append(param_vec[pos:pos+sz].reshape(shapes[i]))
        pos += sz
    return args


def params_wrap(param_list):
    """Helper routine for minimize_list"""
    param_list = [np.array(x) for x in param_list]
    shapes = [x.shape for x in param_list]
    sizes = [x.size for x in param_list]
    param_vec = np.zeros(sum(sizes))
    pos = 0
    for param in param_list:
        sz = param.size
        param_vec[pos:pos+sz] = param.ravel()
        pos += sz
    unwrap = lambda pvec: params_unwrap(pvec, shapes, sizes)
    return param_vec, unwrap


def minimize_list(cost, init_list, args):
    """Optimize a list of arrays (wrapper of scipy.optimize.minimize)

    The input function "cost" should take a list of parameters,
    followed by any extra arguments:
        cost(init_list, *args)
    should return the cost of the initial condition, and a list in the same
    format as init_list giving gradients of the cost wrt the parameters.

    The options to the optimizer have been hard-coded. You may wish
    to change disp to True to get more diagnostics. You may want to
    decrease maxiter while debugging. Although please report all results
    in Q2-5 using maxiter=500.
    """
    opt = {'maxiter': 500, 'disp': False}
    init, unwrap = params_wrap(init_list)
    def wrap_cost(vec, *args):
        E, params_bar = cost(unwrap(vec), *args)
        vec_bar, _ = params_wrap(params_bar)
        return E, vec_bar
    res = minimize(wrap_cost, init, args, 'L-BFGS-B', jac=True, options=opt)
    return unwrap(res.x)


def linreg_cost(params, X, yy, alpha):
    """Regularized least squares cost function and gradients

    Can be optimized with minimize_list -- see fit_linreg_gradopt for a
    demonstration.

    Inputs:
    params: tuple (ww, bb): weights ww (D,), bias bb scalar
         X: N,D design matrix of input features
        yy: N,  real-valued targets
     alpha: regularization constant

    Outputs: (E, [ww_bar, bb_bar]), cost and gradients
    """
    # Unpack parameters from list
    ww, bb = params

    # forward computation of error
    ff = np.dot(X, ww) + bb
    res = ff - yy
    E = np.dot(res, res) + alpha*np.dot(ww, ww)

    # reverse computation of gradients
    ff_bar = 2*res
    bb_bar = np.sum(ff_bar)
    ww_bar = np.dot(X.T, ff_bar) + 2*alpha*ww

    return E, [ww_bar, bb_bar]


def fit_linreg_gradopt(X, yy, alpha):
    """
    fit a regularized linear regression model with gradient opt

         ww, bb = fit_linreg_gradopt(X, yy, alpha)

     Find weights and bias by using a gradient-based optimizer
     (minimize_list) to improve the regularized least squares cost:

       np.sum(((np.dot(X,ww) + bb) - yy)**2) + alpha*np.dot(ww,ww)

     Inputs:
             X N,D design matrix of input features
            yy N,  real-valued targets
         alpha     scalar regularization constant

     Outputs:
            ww D,  fitted weights
            bb     scalar fitted bias
    """
    D = X.shape[1]
    args = (X, yy, alpha)
    init = (np.zeros(D), np.array(0))
    ww, bb = minimize_list(linreg_cost, init, args)
    return ww, bb


def logreg_cost(params, X, yy, alpha):
    """Regularized logistic regression cost function and gradients

    Can be optimized with minimize_list -- see fit_linreg_gradopt for a
    demonstration of fitting a similar function.

    Inputs:
    params: tuple (ww, bb): weights ww (D,), bias bb scalar
         X: N,D design matrix of input features
        yy: N,  real-valued targets
     alpha: regularization constant

    Outputs: (E, [ww_bar, bb_bar]), cost and gradients
    """
    # Unpack parameters from list
    ww, bb = params

    # Force targets to be +/- 1
    yy = 2*(yy==1) - 1

    # forward computation of error
    aa = yy*(np.dot(X, ww) + bb)
    sigma = 1/(1 + np.exp(-aa))
    E = -np.sum(np.log(sigma)) + alpha*np.dot(ww, ww)

    # reverse computation of gradients
    aa_bar = sigma - 1
    bb_bar = np.dot(aa_bar, yy)
    ww_bar = np.dot(X.T, yy*aa_bar) + 2*alpha*ww

    return E, (ww_bar, bb_bar)


def nn_cost(params, X, yy=None, alpha=None):
    """NN_COST simple neural network cost function and gradients, or predictions

           E, params_bar = nn_cost([ww, bb, V, bk], X, yy, alpha)
                    pred = nn_cost([ww, bb, V, bk], X)

     Cost function E can be minimized with minimize_list

     Inputs:
             params (ww, bb, V, bk), where:
                    --------------------------------
                        ww K,  hidden-output weights
                        bb     scalar output bias
                        V  K,D hidden-input weights
                        bk K,  hidden biases
                    --------------------------------
                  X N,D input design matrix
                 yy N,  regression targets
              alpha     scalar regularization for weights

     Outputs:
                     E  sum of squares error
            params_bar  gradients wrt params, same format as params
     OR
               pred N,  predictions if only params and X are given as inputs
    """
    # Unpack parameters from list
    ww, bb, V, bk = params

    # Forwards computation of cost
    A = np.dot(X, V.T) + bk[None,:] # N,K
    P = 1 / (1 + np.exp(-A)) # N,K
    F = np.dot(P, ww) + bb # N,
    if yy is None:
        # user wants prediction rather than training signal:
        return F
    res = F - yy # N,
    E = np.dot(res, res) + alpha*(np.sum(V*V) + np.dot(ww,ww)) # 1x1

    # Reverse computation of gradients
    F_bar = 2*res # N,
    ww_bar = np.dot(P.T, F_bar) + 2*alpha*ww # K,
    bb_bar = np.sum(F_bar) # scalar
    P_bar = np.dot(F_bar[:,None], ww[None,:]) # N,K
    A_bar = P_bar * P * (1 - P) # N,K
    V_bar = np.dot(A_bar.T, X) + 2*alpha*V # K,D
    bk_bar = np.sum(A_bar, 0)

    return E, (ww_bar, bb_bar, V_bar, bk_bar)


def rbf_fn(X1, X2):
    """Helper routine for gp_post_par"""
    return np.exp((np.dot(X1,(2*X2.T))-np.sum(X1*X1,1)[:,None]) - np.sum(X2*X2,1)[None,:])


def gauss_kernel_fn(X1, X2, ell, sigma_f):
    """Helper routine for gp_post_par"""
    return sigma_f**2 * rbf_fn(X1/(np.sqrt(2)*ell), X2/(np.sqrt(2)*ell))


def gp_post_par(X_rest, X_obs, yy, sigma_y=0.05, ell=5.0, sigma_f=0.1):
    """GP_POST_PAR means and covariances of a posterior Gaussian process

         rest_cond_mu, rest_cond_cov = gp_post_par(X_rest, X_obs, yy)
         rest_cond_mu, rest_cond_cov = gp_post_par(X_rest, X_obs, yy, sigma_y, ell, sigma_f)

     Calculate the means and covariances at all test locations of the posterior Gaussian
     process conditioned on the observations yy at observed locations X_obs.

     Inputs:
                 X_rest GP test locations
                  X_obs locations of observations
                     yy observed values
                sigma_y observation noise standard deviation
                    ell kernel function length scale
                sigma_f kernel function standard deviation

     Outputs:
           rest_cond_mu mean at each location in X_rest
          rest_cond_cov covariance matrix between function values at all test locations
    """
    X_rest = X_rest[:, None]
    X_obs = X_obs[:, None]
    K_rest = gauss_kernel_fn(X_rest, X_rest, ell, sigma_f)
    K_rest_obs = gauss_kernel_fn(X_rest, X_obs, ell, sigma_f)
    K_obs = gauss_kernel_fn(X_obs, X_obs, ell, sigma_f)
    M = K_obs + sigma_y**2 * np.eye(yy.size)
    M_cho, M_low = cho_factor(M)
    rest_cond_mu = np.dot(K_rest_obs, cho_solve((M_cho, M_low), yy))
    rest_cond_cov = K_rest - np.dot(K_rest_obs, cho_solve((M_cho, M_low), K_rest_obs.T))

    return rest_cond_mu, rest_cond_cov


In [3]:
data = np.load('./ct_data.npz')
X_train = data['X_train']; X_val = data['X_val']; X_test = data['X_test']
y_train = data['y_train']; y_val = data['y_val']; y_test = data['y_test']

# 1

## a)

In [92]:
print(f'The mean of the training positions in y_train is {np.mean(y_train):.4f} and the standard error of the mean is {np.std(y_train)/np.sqrt(len(y_train)):.4f}.')
print(f'The mean of the 5,785 positions in the y_val is {np.mean(y_val):.4f} and the standard error of the mean is  {np.std(y_val)/np.sqrt(len(y_val)):.4f}.')
print(f'The mean of the first 5,785 entries in the y_train is {np.mean(y_train[0:5785,]):.4f} and the standard error of the mean is  {np.std(y_train[0:5785,])/np.sqrt(5785):.4f}.')

The mean of the training positions in y_train is -0.0000 and the standard error of the mean is 0.0050.
The mean of the 5,785 positions in the y_val is -0.2160 and the standard error of the mean is  0.0129.
The mean of the first 5,785 entries in the y_train is -0.4425 and the standard error of the mean is  0.0119.


Explain how your results demonstrate that these standard error bars do not reliably indicate what the average of locations in future CT slice data will be. Why are standard error bars misleading here?

Ni hao

## b)

In [4]:
# stage one: constant columns
index1 = np.where(np.var(X_train, axis=0) == 0)[0]
print(f'Column indexes of constants: {index1}.')

# stage two: identical columns
index2 = np.delete(np.arange(0, X_train.shape[1]), np.sort(np.unique(X_train, axis=1, return_index=True)[1]))
print(f'Column indexes of dupulicates: {index2}.')

# indeices for removal
index_remove = np.unique(np.concatenate((index1, index2), axis=0))
print(f'Column indexes for removal: {index_remove}.')

# removal
X_train = np.delete(X_train, index_remove, axis=1)
X_val = np.delete(X_val, index_remove, axis=1)
X_test = np.delete(X_test, index_remove, axis=1)


Column indexes of constants: [ 59  69 179 189 351].
Column indexes of dupulicates: [ 69  78  79 179 188 189 199 287 351 359].
Column indexes for removal: [ 59  69  78  79 179 188 189 199 287 351 359].


Report which columns of the X_... arrays you remove at each of the two stages. Report these as 0-based indexes. (For the second stage, you might report indexes in the original array, or after you did the first stage. It doesn’t matter, as long as your code is clear and correct.)

Answer here!

# 2

## a)

In [5]:
def fit_linreg(X, yy, alpha):
    n, d = X.shape[0], X.shape[1]
    y_tilde = np.concatenate((yy, np.zeros(d)))
    Phi = np.concatenate((X, np.ones((n, 1))), axis=1)
    diag = np.diag(np.append(np.sqrt(alpha) * np.ones((d)), [0]))
    Phi_tilde = np.concatenate((Phi, diag))
    y_tilde = np.concatenate((yy, np.zeros((d+1))))
    w_tilde = np.linalg.lstsq(Phi_tilde, y_tilde, rcond=None)[0]
    ww, bb = w_tilde[0:-1], w_tilde[-1]
    return ww, bb

In [26]:
# fit X_train and y_train with alpha = 30
w_ls, b_ls = fit_linreg(X=X_train, yy=y_train, alpha=30)
para_ls = np.concatenate((w_ls, b_ls), axis=None)
pred_ls = np.concatenate((X_train, np.ones((len(X_train), 1))), axis=1) @ para_ls

In [27]:
# fit_linreg_gradopt(X, yy, 30)
w_gd, b_gd = fit_linreg_gradopt(X=X_train, yy=y_train, alpha=30)
para_gd = np.concatenate((w_ls, b_ls), axis=None)
pred_gd = np.concatenate((X_train, np.ones((len(X_train), 1))), axis=1) @ para_gd

In [28]:
# Report RMSE = sqrt(||y - f(x;w,b)||^2 / N)
RMSE_ls = np.sqrt(np.mean((y_train - pred_ls) ** 2))
RMSE_gd = np.sqrt(np.mean((y_train - pred_gd) ** 2))
print(f'The RMSE using least square method is {RMSE_ls:.4f}.')
print(f'The RMSE using gradient descent method is {RMSE_gd:.4f}.')

The RMSE using least square method is 0.3568.
The RMSE using gradient descent method is 0.3568.


# 3

In [6]:
def fit_logreg_gradopt(X, yy, alpha):
    D = X.shape[1]
    args = (X, yy, alpha)
    init = (np.zeros(D), np.array(0))
    ww, bb = minimize_list(logreg_cost, init, args)
    return ww, bb

def logreg_prob(X, ww, bb):
    return 1 / (1 + np.exp(- (X @ ww + bb)))

In [42]:
K = 20 # number of thresholded classification problems to fit
mx = np.max(y_train); mn = np.min(y_train); hh = (mx-mn)/(K+1)
thresholds = np.linspace(mn+hh, mx-hh, num=K, endpoint=True)
prob_train = np.zeros((len(X_train), K))
prob_val = np.zeros((len(X_val), K))
para_w = np.zeros((X_train.shape[1], K))
para_b = np.zeros(K)

for kk in range(K):
    labels = (y_train > thresholds[kk]).astype(int)
    w_logr, b_logr = fit_logreg_gradopt(X=X_train, yy=labels, alpha=30)
    para_w[:,kk] = w_logr
    para_b[kk] = b_logr
    prob_train[:,kk] = logreg_prob(X=X_train, ww=w_logr, bb=b_logr)
    prob_val[:,kk] = logreg_prob(X=X_val, ww=w_logr, bb=b_logr)

In [43]:
# fit regularised linear regression based on the results of logistics regression
# least square
w_ls_log, b_ls_log = fit_linreg(X=prob_train, yy=y_train, alpha=30)
para_ls_log = np.concatenate((w_ls_log, b_ls_log), axis=None)
pred_ls_log = np.concatenate((prob_train, np.ones((len(prob_train), 1))), axis=1) @ para_ls_log

In [44]:
# gradient descent
w_gd_log, b_gd_log = fit_linreg_gradopt(X=prob_train, yy=y_train, alpha=30)
para_gd_log = np.concatenate((w_gd_log, b_gd_log), axis=None)
pred_gd_log = np.concatenate((prob_train, np.ones((len(prob_train), 1))), axis=1) @ para_gd_log

In [45]:
# Report RMSE = sqrt(||y - f(x;w,b)||^2 / N)
# This is for train set
RMSE_ls_log_train = np.sqrt(np.mean((y_train - pred_ls_log) ** 2))
RMSE_gd_log_train = np.sqrt(np.mean((y_train - pred_gd_log) ** 2))

print('We are using the TRAIN set!')
print(f'The RMSE using least square method is {RMSE_ls_log_train:.4f}.')
print(f'The RMSE using gradient descent method is {RMSE_gd_log_train:.4f}.')

We are using the TRAIN set!
The RMSE using least square method is 0.1544.
The RMSE using gradient descent method is 0.1544.


In [46]:
# Report RMSE = sqrt(||y - f(x;w,b)||^2 / N)
# This is for val set
pred_ls_log_val = np.concatenate((prob_val, np.ones((len(prob_val), 1))), axis=1) @ para_ls_log
pred_gd_log_val = np.concatenate((prob_val, np.ones((len(prob_val), 1))), axis=1) @ para_gd_log

RMSE_ls_log_val = np.sqrt(np.mean((y_val - pred_ls_log_val) ** 2))
RMSE_gd_log_val = np.sqrt(np.mean((y_val - pred_gd_log_val) ** 2))

print('We are using the VAL set!')
print(f'The RMSE using least square method is {RMSE_ls_log_val:.4f}.')
print(f'The RMSE using gradient descent method is {RMSE_gd_log_val:.4f}.')

We are using the VAL set!
The RMSE using least square method is 0.2542.
The RMSE using gradient descent method is 0.2542.


# 4

In [47]:
# minimise nn cost using minimize_list with random intialisation from lecture note

# random initialisation
d, k = X_train.shape[1], 20
ww = 0.1*np.random.randn(k)/np.sqrt(k)
bb = 0.1*np.random.randn()/np.sqrt(k)
V = 0.1*np.random.randn(k, d)/np.sqrt(k)
bk = 0.1*np.random.randn(k)/np.sqrt(k)
nn_init = (ww, bb, V, bk)

# X_train, y_train and alpha
nn_alpha = 30
nn_args = (X_train, y_train, nn_alpha)

params_random = minimize_list(nn_cost, nn_init, nn_args)

pred_nn_random_train = nn_cost(params_random, X=X_train)
RMSE_nn_random_train = np.sqrt(np.mean((y_train - pred_nn_random_train) ** 2))

print('We are using the TRAIN set!')
print(f'The RMSE of nn with random initialisaiton is {RMSE_nn_random_train:.4f}.')

We are using the TRAIN set!
The RMSE of nn with random initialisaiton is 0.1403.


In [48]:
# minimise nn cost using minimize_list with Q3 intialisation

# Q3 initialisation
nn_init_q3 = (w_ls_log, b_ls_log, para_w.T, para_b)

# X_train, y_train and alpha
nn_alpha = 30
nn_args = (X_train, y_train, nn_alpha)

params_q3 = minimize_list(nn_cost, nn_init_q3, nn_args)

pred_nn_q3_train = nn_cost(params_q3, X=X_train)
RMSE_nn_q3_train = np.sqrt(np.mean((y_train - pred_nn_q3_train) ** 2))

print('We are using the TRAIN set!')
print(f'The RMSE of nn with Q3 initialisaiton is {RMSE_nn_q3_train:.4f}.')

We are using the TRAIN set!
The RMSE of nn with Q3 initialisaiton is 0.1396.


# 5

In [49]:
# bayesian optimisation

In [7]:
def train_nn_reg(XX, XX_val, yy, yy_val, params, alpha, report_train=False, report_val=True):
    args = (XX_val, yy_val, alpha)
    params_bar = minimize_list(nn_cost, params, args)
    pred = nn_cost(params_bar, X=XX)
    RMSE = np.sqrt(np.mean((yy - pred) ** 2))
    pred_val = nn_cost(params_bar, X=XX_val)
    RMSE_val = np.sqrt(np.mean((yy_val - pred_val) ** 2))
    if report_train:
        print(f'The RMSE_train of nn with Q3 initialisaiton is {RMSE:.4f} while alpha is {alpha}.')
    if report_val:
        print(f'The RMSE_val of nn with Q3 initialisaiton is {RMSE_val:.4f} while alpha is {alpha}.')
    return RMSE_val

In [105]:
# initialation
alpha_set = np.arange(0, 50, 0.02)
params_q3 = (w_ls_log, b_ls_log, para_w.T, para_b)
points_num = 3
RMSE_alpha_base = train_nn_reg(XX=X_train, XX_val=X_val, yy=y_train, yy_val=y_val, params=params_q3, alpha=30)

for i in range(5):
    pick = np.random.randint(0, len(alpha_set), points_num)
    alpha_train = alpha_set[pick]
    alpha_acquisition = np.delete(alpha_set, pick)
    RMSE_val = np.zeros(points_num)
    for j in range(points_num):
        RMSE_val[j] = train_nn_reg(XX=X_train, XX_val=X_val, yy=y_train, yy_val=y_val, params=params_q3, alpha=alpha_train[j])
    y = np.log(RMSE_alpha_base) - np.log(RMSE_val)
    mu, cov = gp_post_par(X_rest=alpha_acquisition, X_obs=alpha_train, yy=y)
    sigma = np.sqrt(np.diag(cov))
    PI = scipy.stats.norm.cdf((mu - y.max()) / sigma)
    print(f'Iteration {i+1}, the maximal PI is {PI.max()} with alpha equals to {alpha_acquisition[np.argmax(PI)]}.')
    alpha_set = np.append(alpha_set, alpha_acquisition[np.argmax(PI)])

The RMSE_val of nn with Q3 initialisaiton is 0.1767 while alpha is 30.
The RMSE_val of nn with Q3 initialisaiton is 0.1058 while alpha is 8.76.
The RMSE_val of nn with Q3 initialisaiton is 0.1044 while alpha is 8.46.
The RMSE_val of nn with Q3 initialisaiton is 0.1252 while alpha is 13.280000000000001.
Iteration 1, the maximal PI is 0.03131035687909651 with alpha equals to 8.68.
The RMSE_val of nn with Q3 initialisaiton is 0.1373 while alpha is 16.72.
The RMSE_val of nn with Q3 initialisaiton is 0.1840 while alpha is 33.06.
The RMSE_val of nn with Q3 initialisaiton is 0.1453 while alpha is 19.14.
Iteration 2, the maximal PI is 0.13225510069064217 with alpha equals to 15.66.
The RMSE_val of nn with Q3 initialisaiton is 0.1535 while alpha is 21.92.
The RMSE_val of nn with Q3 initialisaiton is 0.2049 while alpha is 42.88.
The RMSE_val of nn with Q3 initialisaiton is 0.0816 while alpha is 4.24.
Iteration 3, the maximal PI is 0.00027779653371804564 with alpha equals to 4.68.
The RMSE_val of

In [106]:
train_nn_reg(XX=X_train, XX_val=X_val, yy=y_train, yy_val=y_val, 
             params=params_q3, alpha=alpha_acquisition[np.argmax(PI)], report_train=True, report_val=True)

The RMSE_val of nn with Q3 initialisaiton is 0.4034 while alpha is 5.92.
The RMSE_val of nn with Q3 initialisaiton is 0.0911 while alpha is 5.92.


0.09107310843522953

In [107]:
# Q3 initialisation
nn_init_q3 = (w_ls_log, b_ls_log, para_w.T, para_b)

# X_train, y_train and alpha
nn_alpha = alpha_acquisition[np.argmax(PI)]
nn_args = (X_train, y_train, nn_alpha)

params_q3 = minimize_list(nn_cost, nn_init_q3, nn_args)

pred_nn_q5_train = nn_cost(params_q3, X=X_train)
RMSE_nn_q5_train = np.sqrt(np.mean((y_train - pred_nn_q5_train) ** 2))

print('We are using the TRAIN set!')
print(f'The RMSE of nn with Q3 initialisaiton is {RMSE_nn_q5_train:.4f}.')

We are using the TRAIN set!
The RMSE of nn with Q3 initialisaiton is 0.0781.


In [None]:
pred_nn_q5_val = nn_cost(params_q3, X=X_val)
RMSE_nn_q5_val = np.sqrt(np.mean((y_val - pred_nn_q5_val) ** 2))

print('We are using the VAL set!')
print(f'The RMSE of nn with Q3 initialisaiton is {RMSE_nn_q5_val:.4f}.')

# 6

In [None]:
# apply PCA then perform the regression and nn

In [8]:
# notice that column 271 is actually constant only one data set with a different value
X_train_no271 = np.delete(X_train, 271, axis=1)
X_val_no271 = np.delete(X_val, 271, axis=1)
print(X_train_no271.shape[1])
print(f'It should be {X_train.shape[1]-1}.')

# standardisation
X_train_std = (X_train_no271 - X_train_no271.mean(axis=0)) / X_train_no271.std(axis=0)
X_val_std = (X_val_no271 - X_val_no271.mean(axis=0)) / X_val_no271.std(axis=0)

# PCA calculation
X_train_std_cov = np.cov(X_train_std, ddof=1, rowvar=False)
X_train_std_cov_eigval, X_train_std_cov_eigvec = np.linalg.eig(X_train_std_cov)
PC_order = np.argsort(X_train_std_cov_eigval)[::-1]
X_train_std_cov_eigval = X_train_std_cov_eigval[PC_order]
X_train_std_cov_eigvec = X_train_std_cov_eigvec[:,PC_order]
explained_variance = X_train_std_cov_eigval / np.sum(X_train_std_cov_eigval)
accum_explained_variance = np.add.accumulate(explained_variance)

# retain 95% or try other of information
pc = sum(accum_explained_variance <= 1) + 1
print(pc)

# PCA
X_train_pca = np.matmul(X_train_std, X_train_std_cov_eigvec[:,:pc])
X_val_pca = np.matmul(X_val_std, X_train_std_cov_eigvec[:,:pc])

372
It should be 372.
373


In [9]:
K = 20 # number of thresholded classification problems to fit
mx = np.max(y_train); mn = np.min(y_train); hh = (mx-mn)/(K+1)
thresholds = np.linspace(mn+hh, mx-hh, num=K, endpoint=True)
prob_train_pca = np.zeros((len(X_train_pca), K))
prob_val_pca = np.zeros((len(X_val_pca), K))
para_w_pca = np.zeros((X_train_pca.shape[1], K))
para_b_pca = np.zeros(K)

for kk in range(K):
    labels = (y_train > thresholds[kk]).astype(int)
    w_logr_pca, b_logr_pca = fit_logreg_gradopt(X=X_train_pca, yy=labels, alpha=30)
    para_w_pca[:,kk] = w_logr_pca
    para_b_pca[kk] = b_logr_pca
    prob_train_pca[:,kk] = logreg_prob(X=X_train_pca, ww=w_logr_pca, bb=b_logr_pca)
    prob_val_pca[:,kk] = logreg_prob(X=X_val_pca, ww=w_logr_pca, bb=b_logr_pca)

w_ls_log_pca, b_ls_log_pca = fit_linreg(X=prob_train_pca, yy=y_train, alpha=30)
para_ls_log_pca = np.concatenate((w_ls_log_pca, b_ls_log_pca), axis=None)
pred_ls_log_pca = np.concatenate((prob_train_pca, np.ones((len(prob_train_pca), 1))), axis=1) @ para_ls_log_pca
RMSE_ls_log_train_pca = np.sqrt(np.mean((y_train - pred_ls_log_pca) ** 2))
pred_ls_log_val_pca = np.concatenate((prob_val_pca, np.ones((len(prob_val_pca), 1))), axis=1) @ para_ls_log_pca
RMSE_ls_log_val_pca = np.sqrt(np.mean((y_val - pred_ls_log_val_pca) ** 2))

print('We are using the TRAIN set!')
print(f'The RMSE using least square method is {RMSE_ls_log_train_pca:.4f}.')
print('We are using the VAL set!')
print(f'The RMSE using least square method is {RMSE_ls_log_val_pca:.4f}.')

We are using the TRAIN set!
The RMSE using least square method is 0.0903.
We are using the VAL set!
The RMSE using least square method is 0.2331.


In [10]:
# minimise nn cost using minimize_list with Q3 intialisation

# Q3 initialisation
nn_init_q6_pca = (w_ls_log_pca, b_ls_log_pca, para_w_pca.T, para_b_pca)

# X_train, y_train and alpha
nn_alpha = 30
nn_args_pca = (X_train_pca, y_train, nn_alpha)

params_q6_pca = minimize_list(nn_cost, nn_init_q6_pca, nn_args_pca)

pred_nn_q6_train_pca = nn_cost(params_q6_pca, X=X_train_pca)
RMSE_nn_q6_train_pca = np.sqrt(np.mean((y_train - pred_nn_q6_train_pca) ** 2))

print('We are using the TRAIN set!')
print(f'The RMSE of nn with Q3 initialisaiton is {RMSE_nn_q6_train_pca:.4f}.')

We are using the TRAIN set!
The RMSE of nn with Q3 initialisaiton is 0.0870.


Some change made for below chunk. (Please verify)

In [11]:
# initialation
alpha_set = np.arange(0, 50, 0.02)
nn_init_q6_pca = (w_ls_log_pca, b_ls_log_pca, para_w_pca.T, para_b_pca)
points_num = 3
RMSE_alpha_base_pca = train_nn_reg(XX=X_train_pca, XX_val=X_val_pca, yy=y_train, yy_val=y_val, params=nn_init_q6_pca, alpha=30)

for i in range(5):
    pick = np.random.randint(0, len(alpha_set), points_num)
    alpha_train = alpha_set[pick]
    alpha_acquisition = np.delete(alpha_set, pick)
    RMSE_val = np.zeros(points_num)
    for j in range(points_num):
        RMSE_val[j] = train_nn_reg(XX=X_train_pca, XX_val=X_val_pca, yy=y_train, yy_val=y_val, params=nn_init_q6_pca, alpha=alpha_train[j])
    y = np.log(RMSE_alpha_base_pca) - np.log(RMSE_val)
    mu, cov = gp_post_par(X_rest=alpha_acquisition, X_obs=alpha_train, yy=y)
    sigma = np.sqrt(np.diag(cov))
    PI = scipy.stats.norm.cdf((mu - y.max()) / sigma)
    print(f'Iteration {i+1}, the maximal PI is {PI.max()} with alpha equals to {alpha_acquisition[np.argmax(PI)]}.')
    RMSE_alpha_base_pca = train_nn_reg(XX=X_train_pca, XX_val=X_val_pca, yy=y_train, yy_val=y_val, params=nn_init_q6_pca, alpha=alpha_acquisition[np.argmax(PI)])
    alpha_set = np.append(alpha_set, alpha_acquisition[np.argmax(PI)])

The RMSE_val of nn with Q3 initialisaiton is 0.0921 while alpha is 30.
The RMSE_val of nn with Q3 initialisaiton is 0.0751 while alpha is 17.76.
The RMSE_val of nn with Q3 initialisaiton is 0.0817 while alpha is 21.5.
The RMSE_val of nn with Q3 initialisaiton is 0.1022 while alpha is 38.7.
Iteration 1, the maximal PI is 0.1789962163309276 with alpha equals to 16.54.
The RMSE_val of nn with Q3 initialisaiton is 0.1060 while alpha is 42.28.
The RMSE_val of nn with Q3 initialisaiton is 0.0652 while alpha is 12.72.
The RMSE_val of nn with Q3 initialisaiton is 0.0483 while alpha is 7.08.
Iteration 2, the maximal PI is 0.0026004882388388716 with alpha equals to 7.12.
The RMSE_val of nn with Q3 initialisaiton is 0.0863 while alpha is 25.32.
The RMSE_val of nn with Q3 initialisaiton is 0.1047 while alpha is 40.92.
The RMSE_val of nn with Q3 initialisaiton is 0.0893 while alpha is 27.68.
Iteration 3, the maximal PI is 0.38293668369629297 with alpha equals to 22.96.
The RMSE_val of nn with Q3 in

In [12]:
train_nn_reg(XX=X_train_pca, XX_val=X_val_pca, yy=y_train, yy_val=y_val, 
             params=nn_init_q6_pca, alpha=alpha_acquisition[np.argmax(PI)], report_train=True, report_val=True)

The RMSE_train of nn with Q3 initialisaiton is 0.4288 while alpha is 4.78.
The RMSE_val of nn with Q3 initialisaiton is 0.0431 while alpha is 4.78.


0.04309593005138647

In [15]:
# X_train, y_train and alpha
nn_alpha = alpha_acquisition[np.argmax(PI)]
nn_alpha = 0
nn_args_pca = (X_train_pca, y_train, nn_alpha)

params_q6_pca = minimize_list(nn_cost, nn_init_q6_pca, nn_args_pca)

pred_nn_q6_train_pca = nn_cost(params_q6_pca, X=X_train_pca)
RMSE_nn_q6_train_pca = np.sqrt(np.mean((y_train - pred_nn_q6_train_pca) ** 2))

print('We are using the TRAIN set!')
print(f'The RMSE of nn with Q3 initialisaiton is {RMSE_nn_q6_train_pca:.4f}.')

We are using the TRAIN set!
The RMSE of nn with Q3 initialisaiton is 0.0500.
