In [1]:
import numpy as np
import matplotlib.pyplot as plt
import scipy
from scipy.optimize import minimize
from scipy.linalg import cho_factor, cho_solve

In [2]:
def params_unwrap(param_vec, shapes, sizes):
    """Helper routine for minimize_list"""
    args = []
    pos = 0
    for i in range(len(shapes)):
        sz = sizes[i]
        args.append(param_vec[pos:pos+sz].reshape(shapes[i]))
        pos += sz
    return args


def params_wrap(param_list):
    """Helper routine for minimize_list"""
    param_list = [np.array(x) for x in param_list]
    shapes = [x.shape for x in param_list]
    sizes = [x.size for x in param_list]
    param_vec = np.zeros(sum(sizes))
    pos = 0
    for param in param_list:
        sz = param.size
        param_vec[pos:pos+sz] = param.ravel()
        pos += sz
    unwrap = lambda pvec: params_unwrap(pvec, shapes, sizes)
    return param_vec, unwrap


def minimize_list(cost, init_list, args):
    """Optimize a list of arrays (wrapper of scipy.optimize.minimize)

    The input function "cost" should take a list of parameters,
    followed by any extra arguments:
        cost(init_list, *args)
    should return the cost of the initial condition, and a list in the same
    format as init_list giving gradients of the cost wrt the parameters.

    The options to the optimizer have been hard-coded. You may wish
    to change disp to True to get more diagnostics. You may want to
    decrease maxiter while debugging. Although please report all results
    in Q2-5 using maxiter=500.
    """
    opt = {'maxiter': 500, 'disp': False}
    init, unwrap = params_wrap(init_list)
    def wrap_cost(vec, *args):
        E, params_bar = cost(unwrap(vec), *args)
        vec_bar, _ = params_wrap(params_bar)
        return E, vec_bar
    res = minimize(wrap_cost, init, args, 'L-BFGS-B', jac=True, options=opt)
    return unwrap(res.x)


def linreg_cost(params, X, yy, alpha):
    """Regularized least squares cost function and gradients

    Can be optimized with minimize_list -- see fit_linreg_gradopt for a
    demonstration.

    Inputs:
    params: tuple (ww, bb): weights ww (D,), bias bb scalar
         X: N,D design matrix of input features
        yy: N,  real-valued targets
     alpha: regularization constant

    Outputs: (E, [ww_bar, bb_bar]), cost and gradients
    """
    # Unpack parameters from list
    ww, bb = params

    # forward computation of error
    ff = np.dot(X, ww) + bb
    res = ff - yy
    E = np.dot(res, res) + alpha*np.dot(ww, ww)

    # reverse computation of gradients
    ff_bar = 2*res
    bb_bar = np.sum(ff_bar)
    ww_bar = np.dot(X.T, ff_bar) + 2*alpha*ww

    return E, [ww_bar, bb_bar]


def fit_linreg_gradopt(X, yy, alpha):
    """
    fit a regularized linear regression model with gradient opt

         ww, bb = fit_linreg_gradopt(X, yy, alpha)

     Find weights and bias by using a gradient-based optimizer
     (minimize_list) to improve the regularized least squares cost:

       np.sum(((np.dot(X,ww) + bb) - yy)**2) + alpha*np.dot(ww,ww)

     Inputs:
             X N,D design matrix of input features
            yy N,  real-valued targets
         alpha     scalar regularization constant

     Outputs:
            ww D,  fitted weights
            bb     scalar fitted bias
    """
    D = X.shape[1]
    args = (X, yy, alpha)
    init = (np.zeros(D), np.array(0))
    ww, bb = minimize_list(linreg_cost, init, args)
    return ww, bb


def logreg_cost(params, X, yy, alpha):
    """Regularized logistic regression cost function and gradients

    Can be optimized with minimize_list -- see fit_linreg_gradopt for a
    demonstration of fitting a similar function.

    Inputs:
    params: tuple (ww, bb): weights ww (D,), bias bb scalar
         X: N,D design matrix of input features
        yy: N,  real-valued targets
     alpha: regularization constant

    Outputs: (E, [ww_bar, bb_bar]), cost and gradients
    """
    # Unpack parameters from list
    ww, bb = params

    # Force targets to be +/- 1
    yy = 2*(yy==1) - 1

    # forward computation of error
    aa = yy*(np.dot(X, ww) + bb)
    sigma = 1/(1 + np.exp(-aa))
    E = -np.sum(np.log(sigma)) + alpha*np.dot(ww, ww)

    # reverse computation of gradients
    aa_bar = sigma - 1
    bb_bar = np.dot(aa_bar, yy)
    ww_bar = np.dot(X.T, yy*aa_bar) + 2*alpha*ww

    return E, (ww_bar, bb_bar)


def nn_cost(params, X, yy=None, alpha=None):
    """NN_COST simple neural network cost function and gradients, or predictions

           E, params_bar = nn_cost([ww, bb, V, bk], X, yy, alpha)
                    pred = nn_cost([ww, bb, V, bk], X)

     Cost function E can be minimized with minimize_list

     Inputs:
             params (ww, bb, V, bk), where:
                    --------------------------------
                        ww K,  hidden-output weights
                        bb     scalar output bias
                        V  K,D hidden-input weights
                        bk K,  hidden biases
                    --------------------------------
                  X N,D input design matrix
                 yy N,  regression targets
              alpha     scalar regularization for weights

     Outputs:
                     E  sum of squares error
            params_bar  gradients wrt params, same format as params
     OR
               pred N,  predictions if only params and X are given as inputs
    """
    # Unpack parameters from list
    ww, bb, V, bk = params

    # Forwards computation of cost
    A = np.dot(X, V.T) + bk[None,:] # N,K
    P = 1 / (1 + np.exp(-A)) # N,K
    F = np.dot(P, ww) + bb # N,
    if yy is None:
        # user wants prediction rather than training signal:
        return F
    res = F - yy # N,
    E = np.dot(res, res) + alpha*(np.sum(V*V) + np.dot(ww,ww)) # 1x1

    # Reverse computation of gradients
    F_bar = 2*res # N,
    ww_bar = np.dot(P.T, F_bar) + 2*alpha*ww # K,
    bb_bar = np.sum(F_bar) # scalar
    P_bar = np.dot(F_bar[:,None], ww[None,:]) # N,K
    A_bar = P_bar * P * (1 - P) # N,K
    V_bar = np.dot(A_bar.T, X) + 2*alpha*V # K,D
    bk_bar = np.sum(A_bar, 0)

    return E, (ww_bar, bb_bar, V_bar, bk_bar)


def rbf_fn(X1, X2):
    """Helper routine for gp_post_par"""
    return np.exp((np.dot(X1,(2*X2.T))-np.sum(X1*X1,1)[:,None]) - np.sum(X2*X2,1)[None,:])


def gauss_kernel_fn(X1, X2, ell, sigma_f):
    """Helper routine for gp_post_par"""
    return sigma_f**2 * rbf_fn(X1/(np.sqrt(2)*ell), X2/(np.sqrt(2)*ell))


def gp_post_par(X_rest, X_obs, yy, sigma_y=0.05, ell=5.0, sigma_f=0.1):
    """GP_POST_PAR means and covariances of a posterior Gaussian process

         rest_cond_mu, rest_cond_cov = gp_post_par(X_rest, X_obs, yy)
         rest_cond_mu, rest_cond_cov = gp_post_par(X_rest, X_obs, yy, sigma_y, ell, sigma_f)

     Calculate the means and covariances at all test locations of the posterior Gaussian
     process conditioned on the observations yy at observed locations X_obs.

     Inputs:
                 X_rest GP test locations
                  X_obs locations of observations
                     yy observed values
                sigma_y observation noise standard deviation
                    ell kernel function length scale
                sigma_f kernel function standard deviation

     Outputs:
           rest_cond_mu mean at each location in X_rest
          rest_cond_cov covariance matrix between function values at all test locations
    """
    X_rest = X_rest[:, None]
    X_obs = X_obs[:, None]
    K_rest = gauss_kernel_fn(X_rest, X_rest, ell, sigma_f)
    K_rest_obs = gauss_kernel_fn(X_rest, X_obs, ell, sigma_f)
    K_obs = gauss_kernel_fn(X_obs, X_obs, ell, sigma_f)
    M = K_obs + sigma_y**2 * np.eye(yy.size)
    M_cho, M_low = cho_factor(M)
    rest_cond_mu = np.dot(K_rest_obs, cho_solve((M_cho, M_low), yy))
    rest_cond_cov = K_rest - np.dot(K_rest_obs, cho_solve((M_cho, M_low), K_rest_obs.T))

    return rest_cond_mu, rest_cond_cov


In [3]:
data = np.load('./ct_data.npz')
X_train = data['X_train']; X_val = data['X_val']; X_test = data['X_test']
y_train = data['y_train']; y_val = data['y_val']; y_test = data['y_test']

In [4]:
# stage one: constant columns
index1 = np.where(np.var(X_train, axis=0) == 0)[0]
# print(f'Column indexes of constants: {index1}.')

# stage two: identical columns
index2 = np.delete(np.arange(0, X_train.shape[1]), np.sort(np.unique(X_train, axis=1, return_index=True)[1]))
# print(f'Column indexes of dupulicates: {index2}.')

# indeices for removal
index_remove = np.unique(np.concatenate((index1, index2), axis=0))
# print(f'Column indexes for removal: {index_remove}.')

# removal
X_train = np.delete(X_train, index_remove, axis=1)
X_val = np.delete(X_val, index_remove, axis=1)
X_test = np.delete(X_test, index_remove, axis=1)

X_train = np.delete(X_train, 271, axis=1)
X_val = np.delete(X_val, 271, axis=1)
X_test = np.delete(X_test, 27, axis=1)

Note that in this test, the data would delete column 271 due to some investigation.

In [5]:
def fit_linreg(X, yy, alpha):
    n, d = X.shape[0], X.shape[1]
    y_tilde = np.concatenate((yy, np.zeros(d)))
    Phi = np.concatenate((X, np.ones((n, 1))), axis=1)
    diag = np.diag(np.append(np.sqrt(alpha) * np.ones((d)), [0]))
    Phi_tilde = np.concatenate((Phi, diag))
    y_tilde = np.concatenate((yy, np.zeros((d+1))))
    w_tilde = np.linalg.lstsq(Phi_tilde, y_tilde, rcond=None)[0]
    ww, bb = w_tilde[0:-1], w_tilde[-1]
    return ww, bb

In [6]:
def fit_logreg_gradopt(X, yy, alpha):
    D = X.shape[1]
    args = (X, yy, alpha)
    init = (np.zeros(D), np.array(0))
    ww, bb = minimize_list(logreg_cost, init, args)
    return ww, bb

def logreg_prob(X, ww, bb):
    return 1 / (1 + np.exp(- (X @ ww + bb)))

In [7]:
def train_nn_reg(XX, XX_val, yy, yy_val, params, alpha, report_train=False, report_val=True):
    args = (XX_val, yy_val, alpha)
    params_bar = minimize_list(nn_cost, params, args)
    pred = nn_cost(params_bar, X=XX)
    RMSE = np.sqrt(np.mean((yy - pred) ** 2))
    pred_val = nn_cost(params_bar, X=XX_val)
    RMSE_val = np.sqrt(np.mean((yy_val - pred_val) ** 2))
    if report_train:
        print(f'The RMSE_train is {RMSE:.4f} while alpha is {alpha}.')
    if report_val:
        print(f'The RMSE_val is {RMSE_val:.4f} while alpha is {alpha}.')
    return RMSE_val

We first use standardised data to repeat the process.

In [8]:
# standardisation
X_train_std = (X_train - X_train.mean(axis=0)) / X_train.std(axis=0)
X_val_std = (X_val - X_train.mean(axis=0)) / X_train.std(axis=0)

In [9]:
K = 20 # number of thresholded classification problems to fit
mx = np.max(y_train); mn = np.min(y_train); hh = (mx-mn)/(K+1)
thresholds = np.linspace(mn+hh, mx-hh, num=K, endpoint=True)
prob_train_std = np.zeros((len(X_train_std), K))
prob_val_std = np.zeros((len(X_val_std), K))
logr_w_std = np.zeros((X_train_std.shape[1], K))
logr_b_std = np.zeros(K)

for kk in range(K):
    labels = (y_train > thresholds[kk]).astype(int)
    w_logr_std, b_logr_std = fit_logreg_gradopt(X=X_train_std, yy=labels, alpha=30)
    logr_w_std[:,kk] = w_logr_std
    logr_b_std[kk] = b_logr_std
    prob_train_std[:,kk] = logreg_prob(X=X_train_std, ww=w_logr_std, bb=b_logr_std)
    prob_val_std[:,kk] = logreg_prob(X=X_val_std, ww=w_logr_std, bb=b_logr_std)

lr_w_std, lr_b_std = fit_linreg(X=prob_train_std, yy=y_train, alpha=30)
lr_para_std = np.concatenate((lr_w_std, lr_b_std), axis=None)
pred_train_std = np.concatenate((prob_train_std, np.ones((len(prob_train_std), 1))), axis=1) @ lr_para_std
RMSE_train_std = np.sqrt(np.mean((y_train - pred_train_std) ** 2))
pred_val_std = np.concatenate((prob_val_std, np.ones((len(prob_val_std), 1))), axis=1) @ lr_para_std
RMSE_val_std = np.sqrt(np.mean((y_val - pred_val_std) ** 2))

print(f'The RMSE of TRAIN set is {RMSE_train_std:.4f}.')
print(f'The RMSE of VAL set is {RMSE_val_std:.4f}.')

The RMSE of TRAIN set is 0.0903.
The RMSE of VAL set is 0.2445.


In [10]:
# params form above
nn_init_std = (lr_w_std, lr_b_std, logr_w_std.T, logr_b_std)

# X_train_std, y_train and alpha
nn_alpha = 30
nn_args_std = (X_train_std, y_train, nn_alpha)

# nn params
nn_params_std = minimize_list(nn_cost, nn_init_std, nn_args_std)

pred_nn_train_std = nn_cost(nn_params_std, X=X_train_std)
RMSE_nn_train_std = np.sqrt(np.mean((y_train - pred_nn_train_std) ** 2))

print(f'The RMSE of nn using TRAIN set is {RMSE_nn_train_std:.4f}.')

The RMSE of nn using TRAIN set is 0.0874.


In [11]:
# initialation
alpha_set = np.arange(0, 50, 0.02)
points_num = 3
RMSE_alpha_base_std = train_nn_reg(XX=X_train_std, XX_val=X_val_std, yy=y_train, yy_val=y_val, params=nn_init_std, alpha=30)

for i in range(5):
    pick = np.random.randint(0, len(alpha_set), points_num)
    alpha_train = alpha_set[pick]
    alpha_acquisition = np.delete(alpha_set, pick)
    RMSE_val = np.zeros(points_num)
    for j in range(points_num):
        RMSE_val[j] = train_nn_reg(XX=X_train_std, XX_val=X_val_std, yy=y_train, yy_val=y_val, params=nn_init_std, alpha=alpha_train[j])
    y = np.log(RMSE_alpha_base_std) - np.log(RMSE_val)
    mu, cov = gp_post_par(X_rest=alpha_acquisition, X_obs=alpha_train, yy=y)
    sigma = np.sqrt(np.diag(cov))
    PI = scipy.stats.norm.cdf((mu - y.max()) / sigma)
    print(f'Iteration {i+1}, the maximal PI is {PI.max()} with alpha equals to {alpha_acquisition[np.argmax(PI)]}.')
    RMSE_alpha_base_std = train_nn_reg(XX=X_train_std, XX_val=X_val_std, yy=y_train, yy_val=y_val, params=nn_init_std, alpha=alpha_acquisition[np.argmax(PI)])
    alpha_set = np.append(alpha_set, alpha_acquisition[np.argmax(PI)])

The RMSE_val is 0.0923 while alpha is 30.
The RMSE_val is 0.0777 while alpha is 20.400000000000002.
The RMSE_val is 0.0766 while alpha is 19.02.
The RMSE_val is 0.0876 while alpha is 24.84.
Iteration 1, the maximal PI is 0.24948331698853637 with alpha equals to 18.0.
The RMSE_val is 0.0753 while alpha is 18.0.
The RMSE_val is 0.0355 while alpha is 3.08.
The RMSE_val is 0.0765 while alpha is 18.86.
The RMSE_val is 0.1004 while alpha is 38.480000000000004.
Iteration 2, the maximal PI is 0.00038244600012522344 with alpha equals to 2.7800000000000002.
The RMSE_val is 0.0359 while alpha is 2.7800000000000002.
The RMSE_val is 0.0314 while alpha is 2.2.
The RMSE_val is 0.0889 while alpha is 26.18.
The RMSE_val is 0.0387 while alpha is 3.9.
Iteration 3, the maximal PI is 0.1739506542324482 with alpha equals to 0.0.
The RMSE_val is 0.0204 while alpha is 0.0.
The RMSE_val is 0.0693 while alpha is 15.18.
The RMSE_val is 0.0751 while alpha is 18.56.
The RMSE_val is 0.1035 while alpha is 41.46.
Ite

In [12]:
nn_alpha_opt = alpha_acquisition[np.argmax(PI)]
train_nn_reg(XX=X_train_std, XX_val=X_val_std, yy=y_train, yy_val=y_val, 
             params=nn_init_std, alpha=nn_alpha_opt, report_train=True, report_val=True)

The RMSE_train is 0.2111 while alpha is 0.0.
The RMSE_val is 0.0204 while alpha is 0.0.


0.020414185082332455

In [15]:
nn_args_std_opta = (X_train_std, y_train, nn_alpha_opt)

nn_params_std_opta = minimize_list(nn_cost, nn_init_std, nn_args_std_opta)

pred_nn_train_std_opta = nn_cost(nn_params_std_opta, X=X_train_std)
RMSE_nn_train_std_opta = np.sqrt(np.mean((y_train - pred_nn_train_std_opta) ** 2))

pred_nn_val_std_opta = nn_cost(nn_params_std_opta, X=X_val_std)
RMSE_nn_val_std_opta = np.sqrt(np.mean((y_val - pred_nn_val_std_opta) ** 2))

print(f'The RMSE of nn using TRAIN set is {RMSE_nn_train_std_opta:.4f}.')
print(f'The RMSE of nn using VAL set is {RMSE_nn_val_std_opta:.4f}.')

The RMSE of nn using TRAIN set is 0.0503.
The RMSE of nn using VAL set is 0.2611.
