In [2]:
import sys
sys.path.append('../')
from gymenv_v2 import make_multiple_env
import numpy as np


import torch
from torch import nn
import torch.nn.functional as F
import torch.multiprocessing as mp

In [3]:
torch.__version__

'1.8.0'

In [4]:
torch.distributed.is_available()

True

In [5]:
import gym
import numpy as np

import torch
import torch.distributed.rpc as rpc
import torch.optim as optim
from torch.distributed.rpc import RRef, rpc_async, remote
from torch.distributions import Categorical


In [6]:
easy_config = {
    "load_dir"        : '../instances/train_10_n60_m60',
    "idx_list"        : list(range(10)),
    "timelimit"       : 10,
    "reward_type"     : 'obj'
}
def discounted_rewards(r, gamma):
    """ take 1D float array of rewards and compute discounted reward """
    discounted_r = np.zeros_like(r)
    running_sum = 0
    for i in reversed(range(0,len(r))):
        discounted_r[i] = running_sum * gamma + r[i]
        running_sum = discounted_r[i]
    return discounted_r


class policyNet(torch.nn.Module):
    def __init__(self, num_inputs, hidden_dim):
        super(policyNet, self).__init__()
        self.batchNormMatrix = nn.BatchNorm1d(num_features = num_inputs)
        
        self.MLP = nn.Sequential(
                            nn.Linear(num_inputs+1, hidden_dim),
                            nn.ReLU(),
                            nn.Linear(hidden_dim, 32)
                            )
        self.value_out = nn.Sequential(
                                nn.Linear(32, hidden_dim),
                                nn.ReLU(),
                                nn.Linear(hidden_dim, 1)
                            )
        
        
    
    def forward(self, s):
        a, b, _, d, e = self._preproc(s)
        a, d = [self.batchNormMatrix(x) for x in [a, d]]
        b, e =[(x - x.min()) / (x.max() - x.min()) for x in [b, e]]
        X, Y = [torch.cat((x, y.unsqueeze(1)), 1) for x, y in zip([a, d], [b, e])]
        
        H, G = [self.MLP(x) for x in [X, Y]]
        
        
        S = H @ G.T
        
        return S.mean(0)
    
    
    def _preproc(self, s):
        return [torch.FloatTensor(item) for item in s]

def train(model, env, train_config):
    gamma = train_config['gamma']
    
    s, d = env.reset(), False
    
    rewds = []
    actions = []
    log_probs = []
    estimated_values = []
    ep_r = 0
    while not d:
        logit, estimated = model(s)
        prob = F.softmax(logit, dim=-1)
        log_prob = F.log_softmax(logit, dim=-1)
        a = prob.multinomial(num_samples=1).detach()
        
        s, r, d, _ = env.step(list(a))
        ep_r += r
        rewds.append(r)
        actions.append(a.item())
        log_probs.append(log_prob[a].sum())
        estimated_values.append(estimated.detach())
    
    discounted_rewds = discounted_rewards(rewds, gamma)
    
    return rewds
#     value_loss = 0
#     policy_loss = 0
#     for i in range(len(rewds)):
#         adl = discounted_rewds[i] - estimated_values[i]
#         value_loss += np.square(adl)/2
#         policy_loss -= adl * log_probs[i]
    
#     optimizer.zero_grad()
#     (policy_loss + value_loss).backward()
    
#     torch.nn.utils.clip_grad_norm_(model.parameters())
    
#     optimizer.step()
    
    
#     return ep_r

In [101]:
def train(model, rank, observer):
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    env = observer.env 

    s, d = env.reset(), False
    rewds = []
    actions = []
    log_probs = []
    estimated_values = []
    ep_r = 0
    while not d:
        logit = model(s)
        prob = F.softmax(logit, dim=-1)
        log_prob = F.log_softmax(logit, dim=-1)
        a = prob.multinomial(num_samples=1).detach()
        
        s, r, d, _ = env.step(list(a))
        ep_r += r
        rewds.append(r)
        actions.append(a.item())
        log_probs.append(log_prob[a].sum())
        # estimated_values.append(estimated.detach())
    
    discounted_rewds = discounted_rewards(rewds, gamma)
    
    value_loss = 0
    policy_loss = 0
    for i in range(len(rewds)):
        adl = discounted_rewds[i]# - estimated_values[i]
#         value_loss += np.square(adl)/2
        policy_loss -= adl * log_probs[i]
    
    optimizer.zero_grad()
    (policy_loss).backward()
    
    optimizer.step()
    Q.put(ep_r)

In [102]:
from multiprocessing import Process, Queue

In [103]:
class Observer(object):
    def __init__(self):
        
        self.env = make_multiple_env(**easy_config) 
        self.rewards = 0

In [104]:
observers = [Observer() for _ in range(8)]

loading training instances, dir ../instances/train_10_n60_m60 idx 0
loading training instances, dir ../instances/train_10_n60_m60 idx 1
loading training instances, dir ../instances/train_10_n60_m60 idx 2
loading training instances, dir ../instances/train_10_n60_m60 idx 3
loading training instances, dir ../instances/train_10_n60_m60 idx 4
loading training instances, dir ../instances/train_10_n60_m60 idx 5
loading training instances, dir ../instances/train_10_n60_m60 idx 6
loading training instances, dir ../instances/train_10_n60_m60 idx 7
loading training instances, dir ../instances/train_10_n60_m60 idx 8
loading training instances, dir ../instances/train_10_n60_m60 idx 9
loading training instances, dir ../instances/train_10_n60_m60 idx 0
loading training instances, dir ../instances/train_10_n60_m60 idx 1
loading training instances, dir ../instances/train_10_n60_m60 idx 2
loading training instances, dir ../instances/train_10_n60_m60 idx 3
loading training instances, dir ../instances/tra

In [113]:
learning_rate = 1e-2
gamma = .99

model = policyNet(60, 32)
model.share_memory() 



processes = []

Q = Queue()
for _ in range(5):
    for rank, observer in zip(range(8), observers):
        p = mp.Process(target=train, args=(model, rank, observer))
        p.start()
        processes.append(p)

for p in processes:
    p.join()

In [114]:
def dump_queue(queue):
    """
    Empties all pending items in a queue and returns them in a list.
    """
    result = []

    for i in iter(queue.get, 'STOP'):
        print(i)
        result.append(i)
        del i
    time.sleep(.1)
    return result

In [115]:
dump_queue(Q)

0.7058461509232075
0.7058461509232075
0.7058461509232075
0.7058461509232075
0.7058461509232075
0.7058461509232075
0.7058461509232075
0.7058461509232075
0.7058461509232075
0.7058461509232075
0.7058461509232075
0.7058461509232075
0.7058461509232075
0.7058461509232075
0.7058461509232075
0.7058461509232075
0.7058461509232075
0.7058461509232075
0.7058461509232075
0.7058461509232075
0.7058461509232075
0.7058461509232075
0.7058461509232075
0.7058461509232075
0.7058461509232075
0.7058461509232075
0.7058461509232075
0.7058461509232075
0.7058461509232075
0.7058461509232075
0.7058461509232075
0.7058461509232075
0.7058461509232075
0.7058461509232075
0.7058461509232075
0.7058461509232075
0.7058461509232075
0.7058461509232075
0.7058461509232075
0.7058461509232075


KeyboardInterrupt: 

In [83]:
del x

NameError: name 'x' is not defined

In [143]:
from my_optim import SharedAdam


In [144]:
train_config = {
        'gamma' : .99
    }

num_processes = 16

shared_model = policyNet(60, 64)
shared_model.share_memory()


optimizer = SharedAdam(model.parameters(), lr=1e-2)
optimizer.share_memory()

In [150]:
processes = []
for rank, env in zip(range(num_processes), envs):
    p = mp.Process(target=train, args=(shared_model, env, train_config))
    p.start()
    processes.append(p)
for p in processes:
    p.join()

In [155]:
torch.__version__

In [154]:
import torch