In [309]:
import sys
sys.path.append('./')
sys.path.append('../')
from gymenv_v2 import make_multiple_env
import numpy as np


import torch
from torch import nn
import torch.nn.functional as F

In [3]:
easy_config = {
    "load_dir"        : '../instances/train_10_n60_m60',
    "idx_list"        : list(range(10)),
    "timelimit"       : 10,
    "reward_type"     : 'obj'
}
env = make_multiple_env(**easy_config) 

loading training instances, dir ../instances/train_10_n60_m60 idx 0
loading training instances, dir ../instances/train_10_n60_m60 idx 1
loading training instances, dir ../instances/train_10_n60_m60 idx 2
loading training instances, dir ../instances/train_10_n60_m60 idx 3
loading training instances, dir ../instances/train_10_n60_m60 idx 4
loading training instances, dir ../instances/train_10_n60_m60 idx 5
loading training instances, dir ../instances/train_10_n60_m60 idx 6
loading training instances, dir ../instances/train_10_n60_m60 idx 7
loading training instances, dir ../instances/train_10_n60_m60 idx 8
loading training instances, dir ../instances/train_10_n60_m60 idx 9


In [537]:
def discounted_rewards(r, gamma):
    """ take 1D float array of rewards and compute discounted reward """
    discounted_r = np.zeros_like(r)
    running_sum = 0
    for i in reversed(range(0,len(r))):
        discounted_r[i] = running_sum * gamma + r[i]
        running_sum = discounted_r[i]
    return discounted_r

In [485]:
class ActorCritic(torch.nn.Module):
    def __init__(self, num_inputs, hidden_dim):
        super(ActorCritic, self).__init__()
        self.batchNormMatrix = nn.BatchNorm1d(num_features = num_inputs)
        
        self.core = nn.Sequential(
                                nn.Conv1d(1, 16, 3),
                                nn.ELU(),
                                nn.Conv1d(16, 16, 3),
                                nn.ELU(),
                                nn.Conv1d(16, 1, 3),
                                nn.ELU(),
                                )
        
        self.lstm = nn.LSTM(num_inputs+1 - 2 * 3, hidden_dim)
        
        self.critic_out = nn.Linear(hidden_dim, 1)
    
    def forward(self, inputs):
        
        s, X_h, Y_h = inputs
          
            
        a, b, c, d, e = self._preproc(s)
        a, d = [self.batchNormMatrix(x) for x in [a, d]]
        b, e =[(x - x.min()) / (x.max() - x.min()) for x in [b, e]]
        X, Y = [torch.cat((x, y.unsqueeze(1)), 1) for x, y in zip([a, d], [b, e])]
        
        
        X, Y = [x.unsqueeze(1) for x in [X, Y]]
        
        X, Y = [self.core(x) for x in [X, Y]]
        
        
        H, G = [self.lstm(x, h) for x, h in zip([X, Y], [X_h, Y_h])]
        
        
        X, Y = [x[0].squeeze(1) for x in [H, G]]
        
        
        
        X_h, Y_h = [x[1] for x in [H, G]]
        
        
        S = X @ Y.T
        
        
        J = torch.cat((X, Y), 0)
        
        J = self.critic_out(J)
        
        
        return S.mean(0), X_h, Y_h, J.mean()
        
    
    def _preproc(self, s):
        return [torch.FloatTensor(item) for item in s]

In [659]:
def train():
    obss = []
    rewds = []
    acts = []
    entropies = []
    values = []
    log_probs = []

    s, d, repisode = env.reset(), False, 0

    X_h = torch.zeros(1, 1, 32), torch.zeros(1, 1, 32)
    Y_h = torch.zeros(1, 1, 32), torch.zeros(1, 1, 32)


    while not d:
        logits, X_h, Y_h, value = model((s, X_h, Y_h))

        prob = F.softmax(logits, dim=-1)
        log_prob = F.log_softmax(logits, dim=-1)
        entropy = (-(log_prob * prob)).sum()
        entropies.append(entropy)

        a = prob.multinomial(num_samples=1).detach()
        log_prob = log_prob[a].sum()

        a = a.numpy()

        obss.append(s)
        s, r, d, _ = env.step(list(a))

        repisode += r
        values.append(value)
        rewds.append(r)
        acts.append(a)
        log_probs.append(log_prob)

    discounted_rewds = discounted_rewards(rewds, gamma)



    value_loss = 0
    policy_loss = 0
    gae = 0
    for i in range(len(rewds)):

        advantages = discounted_rewds[i] - values[i]
        value_loss += .5 * advantages.pow(2)


        delta_t = discounted_rewds[i]

        gae = gae * gamma * gae_lambda + delta_t

        policy_loss -= log_probs[i] * gae - entropy_coef * entropies[i]

    optimizer.zero_grad()
    (policy_loss + value_loss_coef * value_loss).backward()

    print(f'value loss: {value_loss}')
    print(f'policy loss: {policy_loss}')
    print(f'training rewards {repisode}')

In [586]:
# from pathos.multiprocessing import ProcessingPool as Pool

# THREAD = 16


In [587]:
model = ActorCritic(N, hidden_dim)

# results = multiprocess_cut(envs)


In [588]:
s = env.reset()

a = np.random.randint(0, s[-1].size, 1)   
s, r, d, _ = env.step(list(a))
s, r, d, _ = env.step(list(a))
s, r, d, _ = env.step(list(a))

In [589]:
X_h = torch.zeros(1, 1, 32), torch.zeros(1, 1, 32)
Y_h = torch.zeros(1, 1, 32), torch.zeros(1, 1, 32)
logits, X_h, Y_h, value = model((s, X_h, Y_h))

In [590]:
prob = F.softmax(logits, dim=-1)
log_prob = F.log_softmax(logits, dim=-1)

In [591]:
optimizer

Adam (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    eps: 1e-08
    lr: 0.01
    weight_decay: 0
)

In [660]:
gae_lambda = 1
entropy_coef = .01
value_loss_coef = .5

N = 60
gamma = .99
hidden_dim = 32

num_iteration = 20

alpha = 1e-2
optimizer = torch.optim.Adam(model.parameters(), lr=alpha)


env = make_multiple_env(**easy_config)
model = ActorCritic(N, hidden_dim)

for it in range(num_iteration):
    print(f'iter {it}')
    train()

loading training instances, dir ../instances/train_10_n60_m60 idx 0
loading training instances, dir ../instances/train_10_n60_m60 idx 1
loading training instances, dir ../instances/train_10_n60_m60 idx 2
loading training instances, dir ../instances/train_10_n60_m60 idx 3
loading training instances, dir ../instances/train_10_n60_m60 idx 4
loading training instances, dir ../instances/train_10_n60_m60 idx 5
loading training instances, dir ../instances/train_10_n60_m60 idx 6
loading training instances, dir ../instances/train_10_n60_m60 idx 7
loading training instances, dir ../instances/train_10_n60_m60 idx 8
loading training instances, dir ../instances/train_10_n60_m60 idx 9
iter 0
value loss: 0.016421224921941757
policy loss: 9.630837440490723
training rewards 0.08013209927730713
iter 1
value loss: 0.007653901819139719
policy loss: 5.1265106201171875
training rewards 0.043967092729417345
iter 2
value loss: 0.014788379892706871
policy loss: 8.923904418945312
training rewards 0.058907299724

In [543]:
from my_optim import SharedAdam
optimizer = SharedAdam(model.parameters(), lr=args.lr)
optimizer.share_memory()

NameError: name 'args' is not defined