In [1]:
import sys

sys.path.append('../')

from gymenv_v2 import make_multiple_env
import numpy as np

In [2]:
# s = env.reset()
# a = np.random.randint(0, s[-1].size, 1)
# s, r, d, _ = env.step(list(a))
# s, r, d, _ = env.step(list(a))
# s, r, d, _ = env.step(list(a))
# s, r, d, _ = env.step(list(a))

In [3]:
import torch
from torch import nn
import torch.nn.functional as F
class Policy(nn.Module):
    """Policy network that outputs a probability for each row of the 4th state item.

    The state is a sequence of five array-likes ``(a, b, c, d, e)``:

    * ``a`` and ``d`` are 2-D with ``num_inputs`` columns,
    * ``b`` and ``e`` are 1-D with one entry per row of ``a`` / ``d``,
    * ``c`` is converted to a tensor but otherwise unused.

    NOTE(review): presumably ``(a, b)`` are the current constraints and
    ``(d, e)`` the candidate cuts produced by the environment - confirm
    against ``gymenv_v2``.

    Args:
        num_inputs: number of columns in the ``a`` / ``d`` matrices.
    """

    def __init__(self, num_inputs):
        super(Policy, self).__init__()
        # Shared normalisation for both matrices (a and d).
        self.batchNormMatrix = nn.BatchNorm1d(num_features = num_inputs)

        # +1 because the 1-D vector (b or e) is appended as an extra column.
        self.rnn = nn.RNN(num_inputs+1, 64, 5)

        # Applied to the RNN output with the middle (size-1) axis acting as
        # the single input channel of the first convolution.
        self.core = nn.Sequential(
                                nn.Conv1d(1, 64, 3),
                                nn.Dropout(.5),
                                nn.ReLU(),
                                nn.Conv1d(64, 128, 3),
                                nn.Dropout(.5),
                                nn.Conv1d(128, 64, 3),
                                nn.Dropout(.5),
                                nn.Conv1d(64, 32, 3),
                                nn.Dropout(.5),
                                nn.ReLU(),
                                nn.Conv1d(32, 1, 3),
                                nn.Dropout(.5),
                                nn.ReLU(),
                                )

        # Kept for backward compatibility; nothing in this class reads them.
        self.saved_log_probs = []
        self.rewards = []

    @staticmethod
    def _min_max_scale(x):
        """Rescale ``x`` to [0, 1]; a constant ``x`` maps to zeros, not NaN."""
        low, high = x.min(), x.max()
        # Clamp the range so a constant vector does not cause a 0/0 division.
        span = torch.clamp(high - low, min=torch.finfo(x.dtype).eps)
        return (x - low) / span

    def forward(self, s, hidden=None):
        """Compute action probabilities for the given state.

        Args:
            s: the five-item state described in the class docstring.
            hidden: optional pair ``(h_x, h_y)`` of RNN hidden states, each of
                shape ``(num_layers, 1, hidden_size)``. When ``None``, both
                are drawn from a standard normal distribution.

        Returns:
            ``(probs, (h_x, h_y))`` where ``probs`` is a 1-D softmax with one
            entry per row of ``d`` and ``(h_x, h_y)`` are the updated hidden
            states to pass to the next call.
        """
        a, b, _, d, e = self._preproc(s)
        a, d = [self.batchNormMatrix(x) for x in [a, d]]
        b, e = [self._min_max_scale(x) for x in [b, e]]
        # Append the scaled vector as the last column of its matrix.
        X, Y = [torch.cat((x, y.unsqueeze(1)), 1) for x, y in zip([a, d], [b, e])]

        # (rows, features) -> (rows, 1, features): the RNN sees the rows as a
        # sequence with batch size 1.
        X, Y = [x.unsqueeze(1) for x in [X, Y]]

        if hidden is None:
            state_shape = (self.rnn.num_layers, 1, self.rnn.hidden_size)
            hidden = (torch.randn(*state_shape), torch.randn(*state_shape))
        X, X_h = self.rnn(X, hidden[0])
        Y, Y_h = self.rnn(Y, hidden[1])

        X, Y = [F.relu(x) for x in [X, Y]]

        H, G = [self.core(x) for x in [X, Y]]

        # Drop the single output channel: (rows, 1, L) -> (rows, L).
        H, G = [x.squeeze(1) for x in [H, G]]

        # Pairwise similarity between every row of a and every row of d.
        S = H @ G.T

        # Average over the rows of a to get one score per row of d.
        action_scores = S.mean(0)

        return F.softmax(action_scores, dim=-1), (X_h, Y_h)

    def _preproc(self, s):
        """Convert every item of the state into a float32 tensor."""
        return [torch.FloatTensor(item) for item in s]

In [4]:
# Smoke test: build a small environment, take a few steps with one randomly
# chosen action, and instantiate the policy.
env_config = {
    "load_dir": '../instances/train_10_n60_m60',
    "idx_list": list(range(10)),
    "timelimit": 50,
    "reward_type": 'obj',
}
env = make_multiple_env(**env_config)

s = env.reset()
# One random index bounded by the size of the last state item; the same
# action is replayed on every step below.
a = np.random.randint(0, s[-1].size, 1)
# NOTE(review): the `d` (done) flag is ignored while stepping - confirm the
# environment tolerates stepping after an episode has ended.
for _ in range(4):
    s, r, d, _ = env.step(list(a))

model = Policy(60)

In [5]:
import sys
sys.path.append("..")
from gymenv_v2 import make_multiple_env
import torch
import numpy as np
from torch.distributions import Categorical
from torch.optim.lr_scheduler import StepLR

class Observer(object):
    """Owns an environment and rolls out episodes on behalf of an agent."""

    def __init__(self, env_config):
        """Create the environment from ``env_config`` (keyword arguments for
        ``make_multiple_env``)."""
        self.env = make_multiple_env(**env_config)

    def run_episode(self, agent):
        """Run one episode, querying ``agent`` for every action.

        Args:
            agent: object exposing ``select_action(state, hidden)`` returning
                ``(action, hidden)`` and ``report_reward(reward, done)``.

        Returns:
            The sum of the rewards collected during the episode.
        """
        state, ep_reward, done = self.env.reset(), 0, False
        # The recurrent state starts empty and is threaded through every step.
        hidden = None
        while not done:
            # send the state to the agent to get an action
            action, hidden = agent.select_action(state, hidden)

            # apply the action to the environment, and get the reward
            state, reward, done, _ = self.env.step(action)
            # report the reward to the agent for training purpose
            agent.report_reward(reward, done)
            ep_reward += reward
        return ep_reward

class Agent(object):
    """Policy-gradient agent that trains ``model`` from buffered episodes.

    Rewards are buffered in a flat list; a NaN entry is appended after the
    last reward of every finished episode and serves as the episode separator.

    Args:
        training_config: mapping with the keys ``'lr'``, ``'gamma'`` and
            ``'entropy_coef'``.
        observer: object exposing ``run_episode(agent)``.
        model: ``torch.nn.Module`` called as ``model(state, hidden)`` and
            returning ``(probs, hidden)``.
    """

    def __init__(self, training_config,observer, model):
        learning_rate = training_config['lr']
        gamma = training_config['gamma']

        self.entropy_coef = training_config['entropy_coef']
        self.observer = observer
        self.rewards = []
        self.gamma = gamma
        self.policy = model
        self.optimizer = torch.optim.Adam(self.policy.parameters(), lr=learning_rate)
        # The learning rate is multiplied by 0.1 every 5 calls to finish_episode.
        self.scheduler = StepLR(self.optimizer, step_size=5, gamma=0.1)
        self.eps = np.finfo(np.float32).eps.item()
        self.save_log_probs = []
        self.save_probs = []

    def select_action(self, state, hidden):
        """Sample an action from the policy and remember its (log-)probability.

        Returns:
            ``(action, hidden)``: the sampled action as a Python int and the
            hidden state returned by the policy.
        """
        probs, hidden = self.policy(state, hidden)
        m = Categorical(probs)
        action = m.sample()
        self.save_log_probs.append(m.log_prob(action))
        self.save_probs.append(m.probs[action])
        return action.item(), hidden

    def report_reward(self, reward, d):
        """Buffer ``reward``; when ``d`` is true also append the NaN separator."""
        self.rewards.append(reward)
        if d:
            # np.nan (not np.NaN, which NumPy 2.0 removed) marks episode end.
            self.rewards.append(np.nan)

    def run_episode(self):
        """Ask the observer to roll out one episode with this agent."""
        self.observer.run_episode(self)

    @staticmethod
    def _split_episodes(flat_rewards):
        """Split the NaN-separated buffer into ``(completed, trailing)``.

        ``completed`` is a list of per-episode reward lists; ``trailing``
        holds rewards reported after the last separator (an unfinished
        episode), or an empty list.
        """
        completed, current = [], []
        for reward in flat_rewards:
            if np.isnan(reward):
                completed.append(current)
                current = []
            else:
                current.append(reward)
        return completed, current

    def _discounted_returns(self, rewards):
        """Return the discounted return-to-go for each step of one episode."""
        running, returns = 0, []
        for reward in reversed(rewards):
            running = reward + self.gamma * running
            returns.append(running)
        returns.reverse()
        return returns

    def finish_episode(self):
        """Run one optimisation step on everything buffered so far.

        Returns:
            The smallest total (undiscounted) reward among the completed
            episodes in the buffer.

        Raises:
            ValueError: if no completed episode has been reported yet.
        """
        completed, trailing = self._split_episodes(self.rewards)
        if not completed:
            raise ValueError(
                'finish_episode() requires at least one completed episode')
        reward = min(sum(episode) for episode in completed)

        log_probs, probs = self.save_log_probs, self.save_probs
        self.rewards = []
        self.save_log_probs = []
        self.save_probs = []

        # Discount each episode on its own so that the return of one episode
        # does not leak into the last steps of the previous one.
        returns = []
        for episode in completed + ([trailing] if trailing else []):
            returns.extend(self._discounted_returns(episode))
        returns = torch.tensor(returns, dtype=torch.float32)
        # The sample std of a single value is NaN, which would poison the
        # parameters; only normalise when there is more than one return.
        if returns.numel() > 1:
            returns = (returns - returns.mean()) / (returns.std() + self.eps)

        # NOTE(review): the regulariser uses only the sampled action's
        # p * log(p), not the entropy of the whole distribution - confirm
        # this is intended.
        policy_loss = [
            -log_prob * R + self.entropy_coef * (log_prob * prob)
            for log_prob, prob, R in zip(log_probs, probs, returns)
        ]
        self.optimizer.zero_grad()

        policy_loss = torch.stack(policy_loss).sum()
        policy_loss.backward()
        self.optimizer.step()
        self.scheduler.step()
        return reward

In [6]:
# Hyper-parameters read by Agent ('lr', 'gamma', 'entropy_coef').
# NOTE(review): 'num_revisit' is not read by any code visible in this
# notebook - presumably it was meant to drive the inner episode loop below.
training_config = {
    'lr': 1e-3,
    'gamma': .95,
    'num_revisit': 1,
    'entropy_coef': 1,
}

# Keyword arguments forwarded to make_multiple_env by Observer.
# NOTE(review): idx_list covers indices 0..98 while the directory name
# suggests 100 instances - confirm this is intentional.
env_config = {
    "load_dir": '../instances/train_100_n60_m60',
    "idx_list": list(range(99)),
    "timelimit": 50,
    "reward_type": 'obj',
}

model = Policy(60)
observer = Observer(env_config)
agent = Agent(training_config, observer, model)

# Each iteration collects episodes, then performs one policy update.
for iteration in range(10):
    for _ in range(1):
        agent.run_episode()
    reward = agent.finish_episode()
    print(f'iter: {iteration}, training reward: {reward}')

loading training instances, dir ../instances/train_100_n60_m60 idx 0
loading training instances, dir ../instances/train_100_n60_m60 idx 1
loading training instances, dir ../instances/train_100_n60_m60 idx 2
loading training instances, dir ../instances/train_100_n60_m60 idx 3
loading training instances, dir ../instances/train_100_n60_m60 idx 4
loading training instances, dir ../instances/train_100_n60_m60 idx 5
loading training instances, dir ../instances/train_100_n60_m60 idx 6
loading training instances, dir ../instances/train_100_n60_m60 idx 7
loading training instances, dir ../instances/train_100_n60_m60 idx 8
loading training instances, dir ../instances/train_100_n60_m60 idx 9
loading training instances, dir ../instances/train_100_n60_m60 idx 10
loading training instances, dir ../instances/train_100_n60_m60 idx 11
loading training instances, dir ../instances/train_100_n60_m60 idx 12
loading training instances, dir ../instances/train_100_n60_m60 idx 13
loading training instances, di

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/Users/syeehyn/opt/anaconda3/envs/ieor4575/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 3343, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-6-6907b7c82dc9>", line 23, in <module>
    reward = agent.finish_episode()
KeyboardInterrupt

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/Users/syeehyn/opt/anaconda3/envs/ieor4575/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 2044, in showtraceback
    stb = value._render_traceback_()
AttributeError: 'KeyboardInterrupt' object has no attribute '_render_traceback_'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/Users/syeehyn/opt/anaconda3/envs/ieor4575/lib/python3.6/site-packages/IPython/core/ultratb.py", line 1169, in get_records
    return _fixed_getinnerframes(etb, number_of_lines_of_

TypeError: object of type 'NoneType' has no len()