In [3]:
import torch
import numpy as np
import os
import matplotlib.pyplot as plt
from copy import deepcopy
from experiment_with_t import *
from tqdm import tqdm

In [2]:
torch.random.manual_seed(42)

samples = []
bdry_samples = []
n = 1000
for i in range(n):
    samples.append(sample(K))
    bdry_samples.append(sample(K, t=1.))


best_loss = float('inf')
for model_path in tqdm(os.listdir('models')):
    PATH = f'models/{model_path}'
    model = model_path.split(',')
    if model[-1][-3:] != '.pt':
        continue
    else:
        hl_size, reg, lr, batch_size, iters = model
        hl_size = int(hl_size)
        reg = float(reg)
        lr = float(lr)
        batch_size = int(batch_size)
        iters = int(iters[:-3])

        net = BigNet(K, hl_size)
        net.load_state_dict(torch.load(PATH))

        loss = 0.
        bdry_loss = 0.
        for state, bdry_state in zip(samples, bdry_samples):
            t, s, q = state
            _, bdry_s, bdry_q = bdry_state
            loss += hjb_term(net, t, s, q, lam, p, K)
            bdry_loss += hjb_bdry(net, s, q, p)
        
        total_loss = loss + bdry_loss
        if total_loss < best_loss:
            best_dict = {
                'net':net,
                'hl_size':hl_size,
                'reg':reg,
                'lr':lr,
                'batch_size':batch_size,
                'iters':iters,
                'loss':loss/n,
                'bdry_loss':bdry_loss/n,
                'total_loss':total_loss/n
            }

100%|██████████| 335/335 [12:25<00:00,  2.23s/it]


In [3]:
best_dict
# Best results: {'net': BigNet(
#    (fc1): Linear(in_features=9, out_features=60, bias=True)
#    (fc2): Linear(in_features=60, out_features=60, bias=True)
#    (fc3): Linear(in_features=60, out_features=1, bias=True)
#  ),
#  'hl_size': 60,
#  'reg': 0.1,
#  'lr': 0.3,
#  'batch_size': 5,
#  'iters': 100000,
#  'loss': tensor(1.1946, grad_fn=<DivBackward0>),
#  'bdry_loss': tensor([0.], grad_fn=<DivBackward0>),
#  'total_loss': tensor([1.1946], grad_fn=<DivBackward0>)}

{'net': BigNet(
   (fc1): Linear(in_features=9, out_features=60, bias=True)
   (fc2): Linear(in_features=60, out_features=60, bias=True)
   (fc3): Linear(in_features=60, out_features=1, bias=True)
 ),
 'hl_size': 60,
 'reg': 0.1,
 'lr': 0.3,
 'batch_size': 5,
 'iters': 100000,
 'loss': tensor(1.1951, grad_fn=<DivBackward0>),
 'bdry_loss': tensor([0.], grad_fn=<DivBackward0>),
 'total_loss': tensor([1.1951], grad_fn=<DivBackward0>)}

In [4]:
hl_size = 60
reg = 0.1
lr = 0.3
batch_size = 5
iters = 100000
PATH = f'models/{hl_size},{reg},{lr},{batch_size},{iters}.pt'
net = BigNet(K, hl_size)
net.load_state_dict(torch.load(PATH))

<All keys matched successfully>

In [5]:
# net = best_dict['net']
n = 100
reps = 10
# rwds = run_policy(net, n, reps, K)

In [6]:
mus = torch.rand(K)
print(mus)
rwds2 = run_policy_fixed_mu(net, n, reps, mus)

tensor([0.6253, 0.3445, 0.9178, 0.7326])


100%|██████████| 10/10 [00:02<00:00,  4.86it/s]


In [7]:
rwds2

tensor([0.2200, 0.4400, 0.5000, 0.3000, 0.1800, 0.5200, 0.3800, 0.3400, 0.2400,
        0.3200])

In [8]:
torch.mean(2 * mus - 1)

tensor(0.3101)

In [9]:
t, s, q = sample(K)
policy(net, t, s, q, K)

tensor([0.2493, 0.2547, 0.2568, 0.2392], grad_fn=<SoftmaxBackward>)

In [10]:
torch.mean(rwds2)

tensor(0.3440)

In [None]:
torch.random.manual_seed(42)

samples = []
bdry_samples = []
n = 1000
for i in range(n):
    samples.append(sample(K))
    bdry_samples.append(sample(K, t=1.))


best_loss = float('inf')
for model_path in tqdm(os.listdir('models')):
    PATH = f'models/{model_path}'
    model = model_path.split(',')
    if model[-1][-3:] != '.pt':
        continue
    else:
        hl_size, reg, lr, batch_size, iters = model
        hl_size = int(hl_size)
        reg = float(reg)
        lr = float(lr)
        batch_size = int(batch_size)
        iters = int(iters[:-3])

        if iters > 100000:
            print(iters)
            net = BigNet(K, hl_size)
            net.load_state_dict(torch.load(PATH))

            loss = 0.
            bdry_loss = 0.
            for state, bdry_state in zip(samples, bdry_samples):
                t, s, q = state
                _, bdry_s, bdry_q = bdry_state
                loss += hjb_term(net, t, s, q, lam, p, K)
                bdry_loss += hjb_bdry(net, s, q, p)
            
            total_loss = loss + bdry_loss
            if total_loss < best_loss:
                best_dict2 = {
                    'net':net,
                    'hl_size':hl_size,
                    'reg':reg,
                    'lr':lr,
                    'batch_size':batch_size,
                    'iters':iters,
                    'loss':loss/n,
                    'bdry_loss':bdry_loss/n,
                    'total_loss':total_loss/n
                }

print(best_dict2)

In [13]:
best_dict2

{'net': BigNet(
   (fc1): Linear(in_features=9, out_features=60, bias=True)
   (fc2): Linear(in_features=60, out_features=60, bias=True)
   (fc3): Linear(in_features=60, out_features=1, bias=True)
 ),
 'hl_size': 60,
 'reg': 0.03,
 'lr': 0.3,
 'batch_size': 5,
 'iters': 1000000,
 'loss': tensor(1.1903, grad_fn=<DivBackward0>),
 'bdry_loss': tensor([0.], grad_fn=<DivBackward0>),
 'total_loss': tensor([1.1903], grad_fn=<DivBackward0>)}

In [None]:
# {'net': BigNet(
#    (fc1): Linear(in_features=9, out_features=60, bias=True)
#    (fc2): Linear(in_features=60, out_features=60, bias=True)
#    (fc3): Linear(in_features=60, out_features=1, bias=True)
#  ),
#  'hl_size': 60,
#  'reg': 0.03,
#  'lr': 0.3,
#  'batch_size': 5,
#  'iters': 1000000,
#  'loss': tensor(1.1903, grad_fn=<DivBackward0>),
#  'bdry_loss': tensor([0.], grad_fn=<DivBackward0>),
#  'total_loss': tensor([1.1903], grad_fn=<DivBackward0>)}

In [14]:
net = best_dict2['net']
n = 100
reps = 10
# rwds = run_policy(net, n, reps, K)
mus = torch.rand(K)
print(mus)
rwds2 = run_policy_fixed_mu(net, n, reps, mus)

tensor([0.4141, 0.8813, 0.7742, 0.6505])


100%|██████████| 10/10 [00:02<00:00,  4.39it/s]


In [15]:
print(rwds2)

tensor([0.5000, 0.4000, 0.4600, 0.3200, 0.4200, 0.4400, 0.2600, 0.2800, 0.3800,
        0.5000])


In [16]:
print(2 * mus - 1)

tensor([-0.1718,  0.7626,  0.5484,  0.3009])


In [23]:
for i in range(10):
    x = torch.rand(1)
    if x < 0.5:
        print(x)
    else:
        print(f'big! {x}')

tensor([0.3233])
big! tensor([0.9842])
big! tensor([0.7160])
big! tensor([0.9071])
big! tensor([0.5577])
tensor([0.1629])
big! tensor([0.5832])
big! tensor([0.6821])
big! tensor([0.6227])
tensor([0.4694])


In [24]:
torch.bernoulli(mus)

tensor([0., 1., 1., 1.])

In [25]:
qs = torch.randn(4) + 10

In [1]:
import torch
import numpy as np
import os
import matplotlib.pyplot as plt
from copy import deepcopy
from experiment_with_t import *
from tqdm import tqdm

In [2]:
net = BigNet(K, hl_size=60)
num_samples = 100
train_lbfgs(net, num_samples, 500, 0.03, lam, p, K)

  0%|          | 1/500 [00:07<1:05:56,  7.93s/it]

Loss on iter 0: tensor([0.0004], grad_fn=<DivBackward0>)


 20%|██        | 101/500 [11:17<24:08,  3.63s/it]

Loss on iter 100: tensor([3.3486e-06], grad_fn=<DivBackward0>)


 40%|████      | 201/500 [11:48<01:49,  2.74it/s]

Loss on iter 200: tensor([3.3368e-06], grad_fn=<DivBackward0>)


 60%|██████    | 301/500 [12:16<01:13,  2.70it/s]

Loss on iter 300: tensor([3.3368e-06], grad_fn=<DivBackward0>)


 80%|████████  | 401/500 [12:45<00:36,  2.71it/s]

Loss on iter 400: tensor([3.3368e-06], grad_fn=<DivBackward0>)


100%|██████████| 500/500 [13:13<00:00,  1.59s/it]


BigNet(
  (fc1): Linear(in_features=9, out_features=60, bias=True)
  (fc2): Linear(in_features=60, out_features=60, bias=True)
  (fc3): Linear(in_features=60, out_features=1, bias=True)
)

In [5]:
t, s, q = sample(K)
policy(net, t, s, q, K)

tensor([0.3065, 0.2471, 0.1774, 0.2690], grad_fn=<SoftmaxBackward>)

In [6]:
ns = [100, 1000, 10000]
expts = 10
reps = 10

greedy_rwds = np.zeros((len(ns), expts, reps))
ucb_rwds = np.zeros((len(ns), expts, reps))
our_rwds = np.zeros((len(ns), expts, reps))
random_rwds = np.zeros((len(ns), expts))


for i, n in enumerate(ns):
    print(f'Experiments for n={n}')
    for j in tqdm(range(expts)):
        # print(f'Start experiment {j + 1} (n = {n})')
        mus = torch.rand(K)
        random_rwds[i, j] = 2 * np.mean(mus.numpy()) - 1

        # print('Run greedy')
        greedy_rwds[i, j, :] = eps_greedy(n, reps, mus, 1.).numpy()
        
        # print('Run UCB')
        ucb_rwds[i, j, :] = ucb(n, reps, mus).numpy()

        # print('Run ours')
        our_rwds[i, j, :] = run_policy_fixed_mu(net, n, reps, mus).numpy()

print(f'Random: {np.mean(random_rwds)}')
print(f'epsilon-Greedy: {np.mean(greedy_rwds)}')
print(f'UCB: {np.mean(ucb_rwds)}')
print(f'Ours: {np.mean(our_rwds)}')

Experiments for n=100


100%|██████████| 10/10 [00:14<00:00,  1.45s/it]


Experiments for n=1000


100%|██████████| 10/10 [02:20<00:00, 14.07s/it]


Experiments for n=10000


100%|██████████| 10/10 [23:16<00:00, 139.69s/it]

Random: 0.062489598989486694
epsilon-Greedy: 2.432475286970536
UCB: 2.5187605460484823
Ours: 0.0451511046787103





In [8]:
ns = [100, 1000]
expts = 10
reps = 10

greedy_rwds_beta = np.zeros((len(ns), expts, reps))
ucb_rwds_beta = np.zeros((len(ns), expts, reps))
our_rwds_beta = np.zeros((len(ns), expts, reps))
random_rwds_beta = np.zeros((len(ns), expts))


for i, n in enumerate(ns):
    print(f'Experiments for n={n}')
    for j in tqdm(range(expts)):
        # print(f'Start experiment {j + 1} (n = {n})')
        mus = torch.rand(K)
        random_rwds_beta[i, j] = 2 * np.mean(mus.numpy()) - 1

        # print('Run greedy')
        greedy_rwds_beta[i, j, :] = eps_greedy(n, reps, mus, 1.).numpy()
        
        # print('Run UCB')
        ucb_rwds_beta[i, j, :] = ucb(n, reps, mus).numpy()

        # print('Run ours')
        our_rwds_beta[i, j, :] = run_policy_fixed_mu(net, n, reps, mus).numpy()

print(f'Random: {np.mean(random_rwds_beta)}')
print(f'epsilon-Greedy: {np.mean(greedy_rwds_beta)}')
print(f'UCB: {np.mean(ucb_rwds_beta)}')
print(f'Ours: {np.mean(our_rwds_beta)}')

Experiments for n=100


100%|██████████| 10/10 [00:15<00:00,  1.58s/it]


Experiments for n=1000


100%|██████████| 10/10 [02:42<00:00, 16.26s/it]

Random: 0.09856199324131013
epsilon-Greedy: 2.4173818510770797
UCB: 2.6026742637529967
Ours: 0.11017007758840919





In [4]:
class PINN(nn.Module):
    def __init__(self, K):
        super().__init__()
        self.fc1 = nn.Linear(2 * K + 1, 100)
        self.fc2 = nn.Linear(100, 100)
        self.fc3 = nn.Linear(100, 100)
        self.fc4 = nn.Linear(100, 100)
        self.fc5 = nn.Linear(100, 1)

    def forward(self, x):
        x = torch.tanh(self.fc1(x))
        x = torch.tanh(self.fc2(x))
        x = torch.tanh(self.fc3(x))
        x = torch.tanh(self.fc4(x))
        x = self.fc5(x)
        return x

net = PINN(K)
num_samples = 2000
trained_net = train_lbfgs(net, num_samples, 500, 0.03, lam, p, K)

  0%|          | 0/500 [00:00<?, ?it/s]


TypeError: softmax() received an invalid combination of arguments - got (Tensor), but expected one of:
 * (Tensor input, int dim, torch.dtype dtype)
 * (Tensor input, name dim, *, torch.dtype dtype)


In [4]:
torch.save(trained_net.state_dict(), 'lbfgs_trained.pt')

In [6]:
ns = [100, 1000]
expts = 10
reps = 10

greedy_rwds_beta = np.zeros((len(ns), expts, reps))
ucb_rwds_beta = np.zeros((len(ns), expts, reps))
our_rwds_beta = np.zeros((len(ns), expts, reps))
random_rwds_beta = np.zeros((len(ns), expts))


for i, n in enumerate(ns):
    print(f'Experiments for n={n}')
    for j in tqdm(range(expts)):
        # print(f'Start experiment {j + 1} (n = {n})')
        mus = torch.rand(K)
        random_rwds_beta[i, j] = 2 * np.mean(mus.numpy()) - 1

        # print('Run greedy')
        greedy_rwds_beta[i, j, :] = eps_greedy(n, reps, mus, 1.).numpy()
        
        # print('Run UCB')
        ucb_rwds_beta[i, j, :] = ucb(n, reps, mus).numpy()

        # print('Run ours')
        our_rwds_beta[i, j, :] = run_policy_fixed_mu(trained_net, n, reps, mus).numpy()

print(f'Random: {np.mean(random_rwds_beta)}')
print(f'epsilon-Greedy: {np.mean(greedy_rwds_beta)}')
print(f'UCB: {np.mean(ucb_rwds_beta)}')
print(f'Ours: {np.mean(our_rwds_beta)}')

Experiments for n=100


100%|██████████| 10/10 [00:34<00:00,  3.45s/it]


Experiments for n=1000


100%|██████████| 10/10 [06:37<00:00, 39.74s/it]

Random: -0.015937143564224245
epsilon-Greedy: 2.0141468712780624
UCB: 2.297837796732783
Ours: -0.00244005830027163



