In [1]:
import sys

sys.path.append('../')

from gymenv_v2 import make_multiple_env
import numpy as np

In [12]:
import torch
from torch import nn
import torch.nn.functional as F
class Policy(nn.Module):
    """Convolutional policy that turns a state into a probability vector.

    The state ``s`` is unpacked as five array-likes ``(a, b, _, d, e)``; the
    third entry is converted but otherwise ignored.  ``a`` and ``d`` are 2-D
    with ``num_inputs`` columns, ``b`` and ``e`` are 1-D with one entry per row
    of ``a`` / ``d`` respectively.
    NOTE(review): presumably (a, b) are the current constraints and (d, e) the
    candidate cuts produced by the environment -- confirm against gymenv_v2.

    Args:
        num_inputs: number of columns of ``a`` and ``d`` (feature count of the
            shared ``BatchNorm1d`` layer).
    """

    def __init__(self, num_inputs):
        super(Policy, self).__init__()
        # One batch-norm layer shared by both matrices of the state.
        self.batchNormMatrix = nn.BatchNorm1d(num_features=num_inputs)

        # Row-wise 1-D convolutional encoder: every row is treated as a
        # single-channel sequence; each kernel-3 convolution shortens it by 2.
        self.core = nn.Sequential(
            nn.Conv1d(1, 32, 3),
            nn.Dropout(.5),
            nn.ReLU(),
            nn.Conv1d(32, 64, 3),
            nn.Dropout(.5),
            nn.ReLU(),
            nn.Conv1d(64, 32, 3),
            nn.Dropout(.5),
            nn.ReLU(),
            nn.Conv1d(32, 1, 3),
            nn.Dropout(.5),
            nn.ReLU(),
        )

        # Not used inside this class; kept so existing callers that read or
        # write these attributes keep working.
        self.saved_log_probs = []
        self.rewards = []

    def forward(self, s):
        """Return a 1-D tensor of probabilities, one per row of ``d``.

        Dropout layers are active in training mode, so repeated calls on the
        same state give different results unless ``eval()`` has been called.
        """
        a, b, _, d, e = self._preproc(s)

        # Normalise the matrices column-wise and the vectors into [0, 1].
        a, d = [self.batchNormMatrix(x) for x in (a, d)]
        b, e = [self._minmax(x) for x in (b, e)]

        # Append each vector as an extra column of its matrix, then add the
        # channel dimension expected by Conv1d: (rows, 1, num_inputs + 1).
        X, Y = [torch.cat((m, v.unsqueeze(1)), 1).unsqueeze(1)
                for m, v in ((a, b), (d, e))]

        # Encode every row and drop the channel dimension again.
        H, G = [self.core(x).squeeze(1) for x in (X, Y)]

        # Pairwise similarity (rows of a) x (rows of d), averaged over the
        # rows of a to obtain one score per row of d.
        S = H @ G.T
        action_scores = S.mean(0)

        return F.softmax(action_scores, dim=-1)

    @staticmethod
    def _minmax(x, eps=1e-8):
        """Rescale ``x`` to [0, 1]; a constant ``x`` maps to zeros instead of NaN."""
        lo = x.min()
        # Clamping the range avoids 0/0 when every entry of x is identical.
        return (x - lo) / (x.max() - lo).clamp_min(eps)

    def _preproc(self, s):
        """Convert every component of the state to a float32 tensor."""
        return [torch.as_tensor(item, dtype=torch.float32) for item in s]

In [13]:
# Settings forwarded as keyword arguments to make_multiple_env.
env_config = {
    "load_dir": '../instances/train_100_n60_m60',
    "idx_list": list(range(99)),
    "timelimit": 50,
    "reward_type": 'obj',
}
env = make_multiple_env(**env_config)

# Smoke test: draw one random index below the size of the last component of
# the initial state and apply that same action four times in a row.
s = env.reset()
a = np.random.randint(low=0, high=s[-1].size, size=1)
for _step in range(4):
    s, r, d, _ = env.step(list(a))

loading training instances, dir ../instances/train_100_n60_m60 idx 0
loading training instances, dir ../instances/train_100_n60_m60 idx 1
loading training instances, dir ../instances/train_100_n60_m60 idx 2
loading training instances, dir ../instances/train_100_n60_m60 idx 3
loading training instances, dir ../instances/train_100_n60_m60 idx 4
loading training instances, dir ../instances/train_100_n60_m60 idx 5
loading training instances, dir ../instances/train_100_n60_m60 idx 6
loading training instances, dir ../instances/train_100_n60_m60 idx 7
loading training instances, dir ../instances/train_100_n60_m60 idx 8
loading training instances, dir ../instances/train_100_n60_m60 idx 9
loading training instances, dir ../instances/train_100_n60_m60 idx 10
loading training instances, dir ../instances/train_100_n60_m60 idx 11
loading training instances, dir ../instances/train_100_n60_m60 idx 12
loading training instances, dir ../instances/train_100_n60_m60 idx 13
loading training instances, di

In [14]:
# Build the policy for states whose matrices have 60 columns
# (presumably the "n60" of the instance directory name -- TODO confirm).
model = Policy(60)

In [15]:
# Forward pass on the most recent state `s`; as the last expression of the
# cell, the resulting probability vector is displayed below.
model(s)

tensor([0.0361, 0.0151, 0.0136, 0.0171, 0.0143, 0.0149, 0.0165, 0.0167, 0.0163,
        0.0168, 0.0150, 0.0155, 0.0160, 0.0145, 0.0152, 0.0147, 0.0140, 0.0144,
        0.0155, 0.0144, 0.0156, 0.0157, 0.0160, 0.0166, 0.0149, 0.0144, 0.0161,
        0.0169, 0.0136, 0.0154, 0.0139, 0.0141, 0.0147, 0.0136, 0.0189, 0.0138,
        0.0138, 0.0150, 0.0159, 0.0151, 0.0172, 0.0150, 0.0132, 0.0143, 0.0148,
        0.0170, 0.0186, 0.0146, 0.0150, 0.0190, 0.0156, 0.0140, 0.0143, 0.0168,
        0.0155, 0.0153, 0.0172, 0.0193, 0.0227, 0.0137, 0.0159, 0.0155, 0.0148],
       grad_fn=<SoftmaxBackward>)

In [16]:
import sys
sys.path.append("..")
from gymenv_v2 import make_multiple_env
import torch
import numpy as np
from torch.distributions import Categorical
from torch.optim.lr_scheduler import StepLR
class Observer(object):
    """Owns an environment and rolls out episodes on behalf of an agent.

    Args:
        env_config: keyword arguments forwarded to ``make_multiple_env``.
    """

    def __init__(self, env_config):
        self.env = make_multiple_env(**env_config)

    def run_episode(self, agent):
        """Play one episode, reporting every transition to ``agent``.

        ``agent`` must provide ``select_action(state)`` and
        ``report_reward(reward, done)``.

        Returns:
            The undiscounted sum of the rewards collected during the episode.
        """
        state, ep_reward, d = self.env.reset(), 0, False
        while not d:
            # send the state to the agent to get an action
            action = agent.select_action(state)

            # apply the action to the environment, and get the reward
            state, reward, d, _ = self.env.step(action)

            # report the reward to the agent for training purpose
            agent.report_reward(reward, d)
            ep_reward += reward
        return ep_reward

class Agent(object):
    """REINFORCE-style agent: samples actions from a policy and updates it.

    Args:
        training_config: mapping providing ``'lr'`` (Adam learning rate) and
            ``'gamma'`` (discount factor).
        observer: object exposing ``run_episode(agent)``; it drives the rollout
            and calls back into ``select_action`` / ``report_reward``.
        model: ``torch.nn.Module`` mapping a state to action probabilities.
    """

    def __init__(self, training_config, observer, model):
        learning_rate = training_config['lr']
        gamma = training_config['gamma']

        self.observer = observer
        # Flat reward buffer; a NaN entry marks the end of each episode.
        self.rewards = []
        self.gamma = gamma
        self.policy = model
        self.optimizer = torch.optim.Adam(self.policy.parameters(), lr=learning_rate)
        # The learning rate is divided by 10 every 5 calls to finish_episode().
        self.scheduler = StepLR(self.optimizer, step_size=5, gamma=0.1)
        self.eps = np.finfo(np.float32).eps.item()
        # Log-probabilities of the sampled actions, aligned with the rewards.
        self.save_log_probs = []

    def select_action(self, state):
        """Sample an action for ``state`` and remember its log-probability."""
        probs = self.policy(state)
        m = Categorical(probs)
        action = m.sample()
        self.save_log_probs.append(m.log_prob(action))
        return action.item()

    def report_reward(self, reward, d):
        """Record ``reward``; when ``d`` is true also close the current episode."""
        self.rewards.append(reward)
        if d:
            # np.nan (np.NaN no longer exists in NumPy 2.x) is the separator.
            self.rewards.append(np.nan)

    def run_episode(self):
        """Ask the observer to roll out one episode with this agent."""
        self.observer.run_episode(self)

    def _split_episodes(self):
        """Split the reward buffer into (completed episodes, trailing partial one)."""
        completed, current = [], []
        for reward in self.rewards:
            if np.isnan(reward):
                completed.append(current)
                current = []
            else:
                current.append(reward)
        return completed, current

    def _discounted_returns(self, episode):
        """Discounted return for every step of a single episode."""
        running, out = 0.0, []
        for r in reversed(episode):
            running = r + self.gamma * running
            out.append(running)
        out.reverse()
        return out

    def finish_episode(self):
        """Run one policy-gradient update on everything recorded so far.

        Returns:
            The smallest undiscounted reward sum among the completed episodes
            (or the sum of the unfinished one if no episode was completed).

        Raises:
            ValueError: if no reward has been recorded since the last update.
        """
        completed, partial = self._split_episodes()
        # An unfinished trailing episode still contributes to the update.
        episodes = completed + ([partial] if partial else [])
        if not episodes:
            raise ValueError("finish_episode() called before any reward was reported")
        reward = min(sum(seq) for seq in (completed or episodes))

        log_probs = self.save_log_probs
        self.rewards = []
        self.save_log_probs = []

        # Returns are discounted per episode so that the rewards of a later
        # episode never leak into the returns of an earlier one.
        returns = []
        for episode in episodes:
            returns.extend(self._discounted_returns(episode))
        returns = torch.tensor(returns, dtype=torch.float32)
        # std() of a single element is NaN and would corrupt the parameters.
        if returns.numel() > 1:
            returns = (returns - returns.mean()) / (returns.std() + self.eps)

        policy_loss = [-log_prob * R for log_prob, R in zip(log_probs, returns)]

        self.optimizer.zero_grad()
        policy_loss = torch.stack(policy_loss).sum()
        policy_loss.backward()
        self.optimizer.step()
        self.scheduler.step()
        return reward

In [17]:
# 'lr' and 'gamma' are read by Agent; 'num_revisit' is the number of
# rollouts collected before each policy update in the loop below.
training_config = {'lr': 1e-2, 'gamma': .95, 'num_revisit': 2}

# Settings forwarded as keyword arguments to make_multiple_env by Observer.
env_config = {
    "load_dir": '../instances/train_10_n60_m60',
    "idx_list": list(range(10)),
    "timelimit": 50,
    "reward_type": 'obj',
}

model = Policy(60)
observer = Observer(env_config)
agent = Agent(training_config, observer, model)

for iteration in range(10):
    # Gather the rollouts first, then apply a single update over all of them.
    for _ in range(training_config['num_revisit']):
        agent.run_episode()
    reward = agent.finish_episode()
    print(f'iter: {iteration}, training reward: {reward}')

loading training instances, dir ../instances/train_10_n60_m60 idx 0
loading training instances, dir ../instances/train_10_n60_m60 idx 1
loading training instances, dir ../instances/train_10_n60_m60 idx 2
loading training instances, dir ../instances/train_10_n60_m60 idx 3
loading training instances, dir ../instances/train_10_n60_m60 idx 4
loading training instances, dir ../instances/train_10_n60_m60 idx 5
loading training instances, dir ../instances/train_10_n60_m60 idx 6
loading training instances, dir ../instances/train_10_n60_m60 idx 7
loading training instances, dir ../instances/train_10_n60_m60 idx 8
loading training instances, dir ../instances/train_10_n60_m60 idx 9
iter: 0, training reward: 0.055794868655539176
iter: 1, training reward: 0.06733496795322935
iter: 2, training reward: 0.12685385922804926
iter: 3, training reward: 0.16396497146479305
iter: 4, training reward: 0.047297267150497646
iter: 5, training reward: 0.13360274243837011
iter: 6, training reward: 0.11544965239136

KeyboardInterrupt: 