# AM207 Final Project

Paper: *Learned Uncertainty-Aware (LUNA) Bases for Bayesian Regression using Multi-Headed Auxiliary Networks*

207Notes:
 -  reproduce figure 6 (rows are random restarts, see main paper) with NLM 2 hidden layers 50-50
 -  new code: plot the priors
 -  verify in the paper how many iterations were run for this figure, and verify the data-generating process
 - 
 -  

In [4]:
from autograd import numpy as np
from autograd import grad
from autograd.misc.optimizers import adam, sgd
from autograd import scipy as sp
import autograd.numpy.random as npr
import pandas as pd
import numpy
import matplotlib.pyplot as plt
import sys
import time

# our libraries
import utils
from nlm import NLM
from feed_forward import Feedforward
import bayes_helpers as bh

### Generate Data

In [2]:
# Size handed to the project's data generator.
number_of_points = 50

# utils.generate_data is unpacked into exactly three arrays (train inputs,
# train targets, test inputs). Each one is reshaped to a single row,
# i.e. shape (1, n_values), before being bound to its name.
x_train, y_train, x_test = (
    values.reshape((1, -1))
    for values in utils.generate_data(number_of_points)
)




#### Define NN Hyperparameters

In [3]:
# --- activation -------------------------------------------------------------
# ReLU, written as an element-wise maximum of x against a zero array of the
# same shape.
activation_fn_type = 'relu'
activation_fn = lambda x: np.maximum(np.zeros(x.shape), x)

# --- network design choices -------------------------------------------------
width, hidden_layers = 50, 2
input_dim, output_dim = 1, 1

# Everything the network constructors receive, bundled in one dict.
architecture = dict(
    width=width,
    hidden_layers=hidden_layers,
    input_dim=input_dim,
    output_dim=output_dim,
    activation_fn_type='relu',
    activation_fn_params='rate=1',
    activation_fn=activation_fn,
)

# --- reproducibility --------------------------------------------------------
# A seeded RandomState so repeated runs of the experiments are replicable.
rand_state = 0
random = np.random.RandomState(rand_state)

# --- gradient-descent design choices ----------------------------------------
params = dict(
    step_size=1e-3,
    max_iteration=5000,
    random_restarts=1,
    optimizer='adam',
)


### Run Vanilla Neural Network

In [None]:
# Build the plain feed-forward network from the shared architecture dict and
# the seeded random state.
nn = Feedforward(architecture, random=random)

# Fit on the training data and time only the fit call (wall-clock seconds,
# rounded to 3 decimals). Per the original note, the fit minimizes MSE.
t0 = time.time()
nn.fit(x_train, y_train, params)
nn_time = np.round(time.time() - t0, 3)
print(f"{nn_time} Seconds")

# Evaluate the fitted weights on the test inputs.
y_test_pred = nn.forward(nn.weights, x_test)

# Plot the training points (black scatter) and the learned function (red line).
plt.scatter(
    x_train.flatten(),
    y_train.flatten(),
    color='black',
    label='data',
)
plt.plot(
    x_test.flatten(),
    y_test_pred.flatten(),
    color='red',
    label='learned neural network function',
)
plt.legend(loc='best')
plt.show()

# Disabled alternative kept from the original cell (presumably a one-call
# wrapper for the steps above -- confirm in utils):
#utils.run_toy_nn(Feedforward,architecture,params,random,x_train,y_train,x_test)

 Iteration 2900 lower bound 9.89253584966715; gradient mag: 15.41839892632538633

### NLM Demo

In [None]:
# Smoke-test configuration for the NLM. The three scalars are passed
# positionally to the constructor; their names suggest prior variance,
# observation-noise variance and a regularization weight -- confirm in nlm.py.
prior_var = 0.1
y_var = 1.0
regularization_param_nlm = 8.37

# The NLM gets its own RandomState seeded with 0, separate from the
# module-level `random` object.
test_nlm = NLM(
    prior_var,
    y_var,
    regularization_param_nlm,
    architecture,
    random_state=np.random.RandomState(0),
)

In [None]:
# Optimizer settings for the NLM run. This rebinds the module-level `params`,
# so any cell executed afterwards that reads `params` sees these values
# (500 iterations) rather than the earlier ones.
params = dict(
    step_size=1e-3,
    max_iteration=500,
    random_restarts=1,
    optimizer='adam',
)

# Train and report the wall-clock duration of the train call only,
# rounded to 3 decimals.
t0 = time.time()
test_nlm.train(x_train, y_train, params)
nlm_time = np.round(time.time() - t0, 3)
print(f"{nlm_time} Seconds")

In [None]:
# Query the NLM object on the test inputs. predict() returns two values,
# unpacked here as `posterior_predictives` and `posterior_predictive_samples`;
# only the samples are used by the plotting cell that follows.
posterior_predictives, posterior_predictive_samples = test_nlm.predict(x_test)

In [None]:
# Visualize the posterior predictive samples against the training data.
# x_test is flattened to 1-D before being passed in. The final string is
# presumably the plot title -- confirm in bayes_helpers.
bh.viz_pp_samples(x_train, y_train,x_test.flatten(),posterior_predictive_samples,"NLM test")

Michael: scratch code for debugging finite differences

In [None]:
class dumb_NN():
    '''
    Scratch wrapper around a network object, used to debug the
    finite-difference gradient estimate and the squared cosine-similarity
    score between the network's output (auxiliary) functions.

    Attributes:
    - D_in  = number of input dimensions (rows of x)
    - D_out = number of outputs / auxiliary functions
    - ff    = any object exposing forward(W, x)
    '''

    def __init__(self,D_in,D_out,ff):
        self.D_in = D_in
        self.D_out = D_out
        self.ff = ff

    def default_finite_diff(self, W,x):
        '''
        Estimates the derivative of every output of self.ff.forward with
        respect to every input dimension using a forward difference with a
        random step per observation.

        Inputs:
        - W = weights, passed through unchanged to self.ff.forward
        - x = inputs; x.shape[0] is # of dimensions,
              x.shape[1] is # of observations

        Returns:
        - out = 3d array:
                (in dimension) x (out dimension (# of aux functions)) x (# observations)

        Notes:
        - The steps are drawn from the global np.random state, so the result
          is stochastic unless the caller seeds it.
        - A drawn step that is very close to zero makes the quotient
          numerically unreliable for that observation.
        - NOTE(review): `out` is filled by in-place assignment; if `np` is
          autograd's numpy this is presumably not differentiable -- confirm
          before using it inside a training objective.
        '''

        #create one epsilon for each observation
        eps = np.random.normal(0,0.1,size=x.shape[1])

        #one slab of `out` per input dimension (row of x)
        out = np.zeros((self.D_in, self.D_out, x.shape[1]))

        #evaluate function at x (loop-invariant, so computed once)
        f_ex = self.ff.forward(W, x)

        #perturb one input dimension at a time
        for i in range(x.shape[0]):

            delta = np.zeros(x.shape)
            delta[i,:] = eps
            f_eps = self.ff.forward(W,x+delta)
            # value wise division, different epsilon for each column
            out[i,:,:] = (f_eps - f_ex)/eps

        return out

    def similarity_score(self, W, x):
        '''
        Calculates total sum of squared cosine similarity between all pairwise
        combinations of aux functions

        Inputs:
        - W = weights, passed through to default_finite_diff
              (original note gives dim=(1, width H, input dimension D_in) --
              TODO confirm against the network class)
        - x = inputs, (# dimensions) x (# observations)

        Returns:
        - score = total cosine similarity squared across all pairs of functions [scalar]

        '''

        D_out = self.D_out
        score = 0
        #derivs of all the aux funcs: in dim x out dim x # obs
        holy_grail = self.default_finite_diff(W, x)
        #visit every unordered pair (i, j) with i < j exactly once
        for i in range(D_out):
            grad_i = holy_grail[:,i,:]
            for j in range(i + 1, D_out):
                grad_j = holy_grail[:,j,:]
                score += self.cos_sim_sq(grad_i, grad_j)
        return score

    def cos_sim_sq(self,grad_i, grad_j):
        '''
        Squared cosine similarity between two gradient arrays.

        Inputs:
        - grad_i, grad_j = 2d arrays, (in dimension) x (# observations)

        Returns:
        - (g_i . g_j)^2 / ((g_i . g_i) * (g_j . g_j)) [scalar], where g_i and
          g_j are row 0 of grad_i and grad_j

        Notes:
        - Only the [0][0] entry of the matrix products is returned, so with
          more than one input dimension the remaining rows do not contribute.
        - A gradient that is identically zero gives a zero denominator.
        '''
        #the numerator must be squared to match the squared norms below
        numer = np.dot(grad_i, grad_j.T)**2
        denom = (np.dot(grad_i,grad_i.T)*np.dot(grad_j,grad_j.T))
        return (numer/denom)[0][0]


In [None]:
    # Re-create and re-fit the feed-forward network for the finite-difference
    # debugging below; this rebinds `nn` and `y_test_pred`.
    # NOTE(review): every line of this cell is indented one level although no
    # enclosing block is visible -- looks like a paste leftover; confirm and
    # dedent if this is ever run as a plain script.
    # NOTE(review): `params` is whatever was bound last; if the cells run top
    # to bottom that is the NLM setting with max_iteration 500.
    nn = Feedforward(architecture, random=random)

    #fit my neural network to minimize MSE on the given data
    nn.fit(x_train, y_train, params)

    #predict on the test x-values
    y_test_pred = nn.forward(nn.weights, x_test)

In [None]:
# Wrap the fitted network in dumb_NN and compute the finite-difference
# derivative estimate at the training inputs; per default_finite_diff's
# docstring the result is (in dimension) x (out dimension) x (# observations).
out = dumb_NN(input_dim,output_dim,nn).default_finite_diff(nn.weights,x_train)