In [102]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math as math
import torch

Branin Function:
\begin{align*}
f&:([-5,10]\times [0,15]) \rightarrow \mathbb{R}\\
f(x)&=a(x_2 - bx_1^2 + cx_1 - r)^2 + s(1-t)cos(x_1) + s
\end{align*}

Recommended values:
\begin{align*}
a &= 1\\
b &= \frac{5.1}{4\pi^2}\\
c &= \frac{5}{\pi}\\
r &= 6\\
s &= 10\\
t &= \frac{1}{8\pi}
\end{align*}

In [103]:
class Branin():
    '''
    Takes in an n x 2 input matrix where each row is an observation of dimension 2.
    Outputs n x 1 output matrix where each output has dimension 1.
    '''
    
    def __init__(self, noise_var=0):
        self.range = np.array([[-5,10],
                             [0,15]])
        self.param = {
            'a':1,
            'b':5.1/(4*math.pi**2),
            'c':5/math.pi,
            'r':6,
            's':10,
            't':1/(8*math.pi)
        }
        
        self.noise_var = noise_var
        self.input_dim = 2
        self.output_dim = 1

    def scale_domain(self,x):
        # Scaling the domain
        x_copy = np.copy(x)
        if len(x_copy.shape) == 1:
            x_copy = x_copy.reshape((1, x_copy.shape[0]))
        for i in range(len(self.range)):
            x_copy[:, i] = x_copy[:, i] * (self.range[i, 1] - self.range[i, 0]) / 2 + (
                        self.range[i, 1] + self.range[i, 0]) / 2
        return x_copy

    def __evaluate_single(self, x):
        a = self.param['a']
        b = self.param['b']
        c = self.param['c']
        r = self.param['r']
        s = self.param['s']
        t = self.param['t']
        
        f = a*(x[1] - b*x[1]**2 + c*x[0] - r)**2 + s*(1-t)*math.cos(x[1]) + s
        
        return f
    
    def evaluate_torch(self, x):
        a = self.param['a']
        b = self.param['b']
        c = self.param['c']
        r = self.param['r']
        s = self.param['s']
        t = self.param['t']
        
        f = a*(x[:,1] - b*x[:,1]**2 + c*x[:,0] - r)**2 + s*(1-t)*torch.cos(x[:,1]) + s
                
        return f
    
    def evaluate_true(self, x):
        x = x.reshape(-1,self.input_dim)
        
        return np.apply_along_axis(self.__evaluate_single, axis = 1, arr = x)

    def evaluate(self, x):
        true_values = self.evaluate_true(x).reshape(x.shape[0],self.output_dim)
        noise = np.random.normal(0, self.noise_var, size = (x.shape[0],self.output_dim))
        
        return true_values + noise

In [104]:
torch.concat([torch.tensor([]),torch.tensor([5]),torch.tensor([10])])

tensor([ 5., 10.])

# Draw sample inputs

# Train simple MLP to twist inputs into desired space
In this case, we're twisting into (a) a same-dimensional space, (b) a lower dimensional space, and (c) a higher dimensional space, just for context in results.

In [105]:
n = 1000
x1_sample = np.random.uniform(low = -5, high = 10, size = n)
x2_sample = np.random.uniform(low = 0, high = 15, size = n)

In [106]:
b = Branin()
X_sample = np.array([x1_sample,x2_sample]).reshape(-1,2)
y_sample = b.evaluate(X_sample)

In [107]:
max_y_sample = torch.tensor(max(y_sample))
min_y_sample = torch.tensor(min(y_sample))

print(f"Max:{max(y_sample)}, Min:{min(y_sample)}")
print("Use for calibrating the actual y-values")

Max:[452.87617429], Min:[0.41490044]
Use for calibrating the actual y-values


In [108]:
torch.tensor(max(y_sample))

tensor([452.8762], dtype=torch.float64)

In [109]:
x1 = torch.rand(n) * 2 - 1
x2 = torch.rand(n) * 2 - 1
X = torch.concat([x1.reshape(-1,1), x2.reshape(-1,1)], dim = 1)
y = b.evaluate_torch(X)

In [110]:
X = X.float()
y = y.float()

In [111]:
print(f"X shape:{X.shape}. y shape: {y.shape}")

X shape:torch.Size([1000, 2]). y shape: torch.Size([1000])


In [112]:
import os
import torch
from torch import nn
from torch.utils.data import DataLoader
from torchvision import datasets, transforms

In [113]:
class SimpleNetwork(nn.Module):
    def __init__(self, input_dim, output_dim, fn):
        super(SimpleNetwork, self).__init__()
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(input_dim, input_dim*2),
            nn.ReLU(),
            nn.Linear(input_dim*2, output_dim)
        )
        self.fn = fn

    def forward(self, x):
        x = self.linear_relu_stack(x)
        x = self.fn.evaluate_torch(x)
        return x

In [114]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f'Using {device} device')

Using cpu device


In [115]:
model = SimpleNetwork(input_dim = 2, output_dim = 2, fn = b)
optimizer = torch.optim.SGD(model.parameters(), lr=1e-5, momentum=0.9)
criterion = torch.nn.MSELoss()
b = Branin()

In [116]:
print(f"Min, max y:{min(y):.2f}/{max(y):.2f}")
print("Min, max y-hat")

Min, max y:28.91/90.01
Min, max y-hat


In [117]:
print(f"Layer 2 weights: {model.linear_relu_stack[2].weight}")

Layer 2 weights: Parameter containing:
tensor([[ 0.4437, -0.2735,  0.2166,  0.3359],
        [-0.1188,  0.1601, -0.0956, -0.2172]], requires_grad=True)


In [118]:
for i in range(100):
    y_hat = model(X)
    loss = criterion(y_hat,y)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    train_accuracy = torch.mean((y - y_hat)**2)
#     print(f"Layer 1 weights: {model.linear_relu_stack[0].weight}")
#     print(f"Layer 2 weights: {model.linear_relu_stack[2].weight}")
    if i%10 == 0:
        print(f"Min/max y-hat vs y is:{min(y_hat)/min(y):.2f}, {max(y_hat)/max(y):.2f}")
        print(f"MSE: {train_accuracy}")

Min/max y-hat vs y is:1.77, 0.62
MSE: 211.41958618164062
Min/max y-hat vs y is:1.82, 0.62
MSE: 197.3109893798828
Min/max y-hat vs y is:1.87, 0.62
MSE: 176.33509826660156
Min/max y-hat vs y is:1.82, 0.63
MSE: 157.04989624023438
Min/max y-hat vs y is:1.72, 0.64
MSE: 137.8560333251953
Min/max y-hat vs y is:1.61, 0.65
MSE: 116.9983139038086
Min/max y-hat vs y is:1.49, 0.66
MSE: 94.42373657226562
Min/max y-hat vs y is:1.37, 0.68
MSE: 71.62969970703125
Min/max y-hat vs y is:1.26, 0.71
MSE: 50.808319091796875
Min/max y-hat vs y is:1.16, 0.74
MSE: 33.94025421142578


In [119]:
model.linear_relu_stack(X)

tensor([[-0.5688,  0.1340],
        [-0.2371,  0.7739],
        [-0.5512,  0.2264],
        ...,
        [-0.3238,  0.6306],
        [-0.0657,  1.0571],
        [-0.5856,  0.1372]], grad_fn=<AddmmBackward0>)

# Bayes Opt

# Problems
1. Requires black box model to be differentiable
2. Can't actually get $y$ once the function requires more than 2 inputs.

# Next Steps
1. Try learning the best input by shifting everything towards the minimum.