In [1]:
import sys
sys.path.append('../')
from gymenv_v2 import make_multiple_env
import numpy as np

In [2]:
easy_config = {
    "load_dir"        : '../instances/train_10_n60_m60',
    "idx_list"        : list(range(10)),
    "timelimit"       : 20,
    "reward_type"     : 'obj'
}
env = make_multiple_env(**easy_config) 

loading training instances, dir ../instances/train_10_n60_m60 idx 0
loading training instances, dir ../instances/train_10_n60_m60 idx 1
loading training instances, dir ../instances/train_10_n60_m60 idx 2
loading training instances, dir ../instances/train_10_n60_m60 idx 3
loading training instances, dir ../instances/train_10_n60_m60 idx 4
loading training instances, dir ../instances/train_10_n60_m60 idx 5
loading training instances, dir ../instances/train_10_n60_m60 idx 6
loading training instances, dir ../instances/train_10_n60_m60 idx 7
loading training instances, dir ../instances/train_10_n60_m60 idx 8
loading training instances, dir ../instances/train_10_n60_m60 idx 9


In [3]:
import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions.multivariate_normal import MultivariateNormal
from torch.nn.utils.rnn import pad_sequence

use_cuda = torch.cuda.is_available()
print("Is CUDA available? %s.", use_cuda)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

Is CUDA available? %s. False


In [4]:
class policyNet(nn.Module):
    def __init__(self, size_dim, hidden_size, output_size):
        super(policyNet, self).__init__()
        
        self.hidden_size = hidden_size
        
        self.embedding = nn.Embedding(size_dim, size_dim)
        self.fc1 = nn.Linear(size_dim-1, size_dim)
        
        self.gru = nn.GRU(size_dim, hidden_size, batch_first=True)
        
        self.hidden_combine = nn.Linear(hidden_size*2, hidden_size)
        
        self.out = nn.Linear(hidden_size, output_size)
        
        
#         self.fc1 = nn.Linear(size_dim, hidden)
    def forward(self, s, hidden):
        A, b, c, E, d = self._preproc(s)
        Ab = torch.hstack((A, b.unsqueeze(1)))
        Ed = torch.hstack((E, d.unsqueeze(1)))
        
        c = self.embedding(c)
        
        Ab = Ab @ c.T
        
        Ab = F.relu(self.fc1(Ab))
        h, h_hidden = self.gru(Ab.unsqueeze(0), hidden.unsqueeze(0))
        
        g, g_hidden = self.gru(Ed.unsqueeze(0), hidden.unsqueeze(0))
        
        h = self.out(h.squeeze(0))
        g = self.out(g.squeeze(0))
        
        hidden = torch.cat((h_hidden.squeeze(0), g_hidden.squeeze(0)), 1)
        
        hidden = self.hidden_combine(F.relu(hidden))
        
        S = torch.mean(h @ g.T, 0)
        
        
        return F.log_softmax(S, dim=-1), hidden
        
        
    def initHidden(self):
        return torch.zeros(1, self.hidden_size, device=device)
        
        
        return torch.nn.functional.softmax((h @ g.T).mean(0), dim=-1)
    def _preproc(self, s):
        min1 = min(s[0].min(), s[-2].min())
        max1 = max(s[0].max(), s[-2].max())
        min2 = min(s[1].min(), s[-1].min())
        max2 = max(s[1].max(), s[-1].max())

        A = torch.FloatTensor((s[0] - min1) / (max1 - min1))
        E = torch.FloatTensor((s[-2] - min1) / (max1 - min1))
        b = torch.FloatTensor((s[1] - min2) / (max2 - min2))
        d = torch.FloatTensor((s[-1] - min2) / (max2 - min2))
        return [A, b, torch.LongTensor(s[2]), E, d]

In [5]:
def discounted_rewards(r, gamma):
    """ take 1D float array of rewards and compute discounted reward """
    discounted_r = np.zeros_like(r)
    running_sum = 0
    for i in reversed(range(0,len(r))):
        discounted_r[i] = running_sum * gamma + r[i]
        running_sum = discounted_r[i]
    return torch.FloatTensor(discounted_r)

In [6]:
s = env.reset()
a = np.random.randint(0, s[-1].size, 1)
s, r, d, _ = env.step(list(a))
a = np.random.randint(0, s[-1].size, 1)
s, r, d, _ = env.step(list(a))

Academic license - for non-commercial use only - expires 2021-06-11
Using license file /Users/syeehyn/gurobi.lic


In [7]:
import wandb

run=wandb.init(project="finalproject", entity="ieor-4575", tags=["training-easy"])

[34m[1mwandb[0m: Currently logged in as: [33mieor-4575[0m (use `wandb login --relogin` to force relogin)
[34m[1mwandb[0m: wandb version 0.10.26 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


In [8]:
%%wandb
env = make_multiple_env(**easy_config)
N = 60

alpha = 1e-1
beta = 1e-3

numtrajs = 16
iterations = 100
gamma = .8


# criterion = torch.nn.SmoothL1Loss(size_average=False)



policy = policyNet(N+1, 128, 64)

# value_function = valueNet(N+1, 128)

hidden = policy.initHidden()


# value_optimizer = torch.optim.Adam(value_function.parameters(), lr=beta)
# value_scheduler = torch.optim.lr_scheduler.ExponentialLR(value_optimizer, gamma=0.99)

policy_optimizer = torch.optim.Adam(policy.parameters(), lr=alpha)
policy_scheduler = torch.optim.lr_scheduler.ExponentialLR(policy_optimizer, gamma=0.99)




rrecord = []
for ite in range(iterations):
    
    OBS = []
    ACTS = []
    VALS = []
    
    for num in range(numtrajs):
        obss = []
        acts = []
        rews = []


        s = env.reset()
        d = False
        repisode = 0
        while not d:

            with torch.no_grad():
                prob, _ = policy(s, hidden)
                prob /= prob.sum()

            a = np.random.choice(s[-1].size, p = prob.numpy(), size=1)

            obss.append(s)

            s, r, d, _ = env.step(list(a))


            acts.append(a)
            rews.append(r)

            repisode += r

        rrecord.append(np.sum(rews))

        v_hat = discounted_rewards(rews, gamma)
        
        OBS.append(obss)
        ACTS.append(acts)
        VALS.append(v_hat)
    
    
#     criterion = []
    
#     for obss, v_hat in zip(OBS, VALS):
#         for obs, v in zip(obss, v_hat):
#             v_ = value_function(obs)
#             loss = torch.square(v_ - v)
#             value_optimizer.zero_grad()
#             loss.backward()
#             value_optimizer.step()
#             value_scheduler.step()
#             criterion.append(loss.item())
#     print(f'value approx loss: {np.mean(criterion)}')
    
#     ADS = []
#     with torch.no_grad():
#         for obss, v_hat in zip(OBS, VALS):
#             ads = []
#             for obs, v in zip(obss, v_hat):
#                 v_ = value_function(obs)
#                 ads.append(v - v_)
#             ADS.append(torch.FloatTensor(ads))
    
    
    criterion = []
    for obss, acts, v_hat in zip(OBS, ACTS, VALS):
        for obs, act, v in zip(obss, acts, v_hat):
            prob, hidden = policy(obs, hidden)
            prob_selected = prob[act]
            hidden = hidden.detach()
            loss = - v * prob_selected

            policy_optimizer.zero_grad()
            loss.backward()
            policy_optimizer.step()
            policy_scheduler.step()
            criterion.append(loss.item())
        
    print(f'loss: {np.mean(criterion)}')
    
#     fixedWindow=100
#     movingAverage=0
#     if len(rrecord) >= fixedWindow:
#         movingAverage=np.mean(rrecord[len(rrecord)-fixedWindow:len(rrecord)-1])
        
    #wandb logging
    wandb.log({ "Training reward" : rrecord[-1]})
        

loading training instances, dir ../instances/train_10_n60_m60 idx 0
loading training instances, dir ../instances/train_10_n60_m60 idx 1
loading training instances, dir ../instances/train_10_n60_m60 idx 2
loading training instances, dir ../instances/train_10_n60_m60 idx 3
loading training instances, dir ../instances/train_10_n60_m60 idx 4
loading training instances, dir ../instances/train_10_n60_m60 idx 5
loading training instances, dir ../instances/train_10_n60_m60 idx 6
loading training instances, dir ../instances/train_10_n60_m60 idx 7
loading training instances, dir ../instances/train_10_n60_m60 idx 8
loading training instances, dir ../instances/train_10_n60_m60 idx 9
loss: 0.1395948070493489
loss: 1.739350798894358
loss: 21.607819751520584
loss: 21.359074540130678
loss: 21.65079563631043
loss: 22.803762855244077
loss: 21.13154758461169
loss: 21.522787187973154
loss: 23.8216470384461
loss: 19.397333398714544
loss: 19.796255027863662
loss: 18.547151192848105
loss: 23.506641956910606


KeyboardInterrupt: 