In [1]:
import sys

sys.path.append('../')

from gymenv_v2 import make_multiple_env
import numpy as np

In [2]:
import torch
from torch import nn
import torch.nn.functional as F
import random

In [3]:
# Settings forwarded verbatim (as keyword arguments) to make_multiple_env.
env_config = dict(
    load_dir='../instances/train_10_n60_m60',
    idx_list=list(range(10)),
    timelimit=50,
    reward_type='obj',
)
env = make_multiple_env(**env_config)

# Smoke test: reset once, draw a single random index in [0, s[-1].size),
# then submit that same action four times in a row.
s = env.reset()
a = np.random.randint(0, s[-1].size, 1)
for _step in range(4):
    s, r, d, _ = env.step(list(a))

loading training instances, dir ../instances/train_10_n60_m60 idx 0
loading training instances, dir ../instances/train_10_n60_m60 idx 1
loading training instances, dir ../instances/train_10_n60_m60 idx 2
loading training instances, dir ../instances/train_10_n60_m60 idx 3
loading training instances, dir ../instances/train_10_n60_m60 idx 4
loading training instances, dir ../instances/train_10_n60_m60 idx 5
loading training instances, dir ../instances/train_10_n60_m60 idx 6
loading training instances, dir ../instances/train_10_n60_m60 idx 7
loading training instances, dir ../instances/train_10_n60_m60 idx 8
loading training instances, dir ../instances/train_10_n60_m60 idx 9
Academic license - for non-commercial use only - expires 2021-06-11
Using license file /Users/syeehyn/gurobi.lic


In [4]:
class Encoder(nn.Module):
    """LSTM encoder applied independently to two row-sets taken from a state.

    The state ``s`` is a 5-item sequence. Items 0 and 3 are treated as 2-D
    matrices with ``num_inputs`` columns (they go through
    ``BatchNorm1d(num_inputs)``); items 1 and 4 are treated as 1-D vectors,
    min-max scaled and appended to the matching matrix as one extra column.
    Item 2 is converted but otherwise ignored.

    Args:
        num_inputs: number of columns of the two matrices.
        hidden_size: LSTM hidden size.
        num_layers: number of stacked LSTM layers.
        dropout: dropout passed to ``nn.LSTM``.
    """

    def __init__(self, num_inputs, hidden_size, num_layers, dropout=0):
        super(Encoder, self).__init__()
        self.batchNormMatrix = nn.BatchNorm1d(num_features = num_inputs)
        # +1 input feature: the scaled vector appended as an extra column.
        self.lstm = nn.LSTM(
                    num_inputs+1,
                    hidden_size,
                    num_layers,
                    bidirectional=False,
                    dropout = dropout
                    )

    def forward(self, s):
        """Encode both row-sets of ``s``.

        Returns:
            ``((X, X_h, X_c), (Y, Y_h, Y_c))`` where ``X``/``Y`` are the LSTM
            outputs for the (item 0, item 1) and (item 3, item 4) pairs and
            the ``*_h`` / ``*_c`` tensors are the final hidden / cell states.
        """
        a, b, _, d, e = self._preproc(s)
        a, d = [self.batchNormMatrix(x) for x in [a, d]]
        b, e = [self._min_max_scale(x) for x in [b, e]]
        X, Y = [torch.cat((x, y.unsqueeze(1)), 1) for x, y in zip([a, d], [b, e])]
        # Insert a singleton axis at dim 1: rows become the LSTM's first
        # (sequence) dimension and the second (batch) dimension has size 1.
        X, Y = [x.unsqueeze(1) for x in [X, Y]]

        X, (X_h, X_c) = self.lstm(X)
        Y, (Y_h, Y_c) = self.lstm(Y)

        return (X, X_h, X_c), (Y, Y_h, Y_c)

    @staticmethod
    def _min_max_scale(x, eps=1e-8):
        """Scale ``x`` to [0, 1]; a constant vector maps to zeros, not NaN."""
        lo, hi = x.min(), x.max()
        # Clamp the range so a zero spread cannot produce a 0/0 division.
        return (x - lo) / torch.clamp(hi - lo, min=eps)

    def _preproc(self, s):
        """Convert every item of the state to a float32 tensor."""
        return [torch.FloatTensor(item) for item in s]

In [5]:
class Attention(nn.Module):
    """Attention scoring with three selectable score functions.

    Args:
        method: one of ``'dot'``, ``'general'`` or ``'concat'``.
        hidden_size: feature size of the tensors being scored.

    Raises:
        ValueError: if ``method`` is not one of the supported names.
    """

    def __init__(self, method, hidden_size):
        super(Attention, self).__init__()
        self.method = method
        if self.method not in ['dot', 'general', 'concat']:
            raise ValueError(self.method, "is not an appropriate attention method.")
        self.hidden_size = hidden_size
        if self.method == 'general':
            self.attn = nn.Linear(self.hidden_size, hidden_size)
        elif self.method == 'concat':
            self.attn = nn.Linear(self.hidden_size * 2, hidden_size)
            # torch.FloatTensor(n) / torch.empty(n) return uninitialised
            # memory, which can contain NaN/inf; initialise the scoring vector
            # explicitly instead.
            bound = hidden_size ** -0.5
            self.v = nn.Parameter(torch.empty(hidden_size))
            nn.init.uniform_(self.v, -bound, bound)

    def dot_score(self, hidden, encoder_output):
        """Elementwise product of the two tensors, summed over dim 2."""
        return torch.sum(torch.mul(hidden, encoder_output), dim=2)

    def general_score(self, hidden, encoder_output):
        """Dot score after a learned linear map of ``encoder_output``."""
        energy = self.attn(encoder_output)
        return torch.sum(torch.mul(hidden, energy), dim=2)

    def concat_score(self, hidden, encoder_output):
        """Score from ``tanh(Linear([hidden; encoder_output]))`` weighted by ``v``."""
        energy = self.attn(torch.cat((hidden.expand(encoder_output.size(0), -1, -1), encoder_output), 2)).tanh()
        return torch.sum(torch.mul(self.v, energy), dim=2)

    def forward(self, hidden, encoder_outputs):
        """Return softmax-normalised scores with a singleton axis inserted at dim 1.

        NOTE(review): the softmax is over dim=1 of the 2-D score tensor. If
        callers pass tensors shaped (N, 1, hidden) the scores are (N, 1) and
        this normalises a size-1 axis, giving all-ones weights -- confirm the
        intended axis.
        """
        # Calculate the attention weights (energies) based on the given method
        if self.method == 'general':
            attn_energies = self.general_score(hidden, encoder_outputs)
        elif self.method == 'concat':
            attn_energies = self.concat_score(hidden, encoder_outputs)
        elif self.method == 'dot':
            attn_energies = self.dot_score(hidden, encoder_outputs)

        # Return the softmax normalized probability scores (with added dimension)
        return F.softmax(attn_energies, dim=1).unsqueeze(1)

In [6]:
class Decoder(nn.Module):
    """LSTM decoder that turns a state plus encoder outputs into action probabilities.

    The state is preprocessed the same way for both row-sets: items 0 and 3
    (2-D, ``num_inputs`` columns) are batch-normalised, items 1 and 4 (1-D)
    are min-max scaled and appended as an extra column; item 2 is ignored.

    Args:
        num_inputs: number of columns of the two matrices in the state.
        hidden_size: LSTM hidden size.
        num_layers: number of stacked LSTM layers.
        attention: callable ``attention(decoder_out, encoder_out)`` returning
            weights that are batch-multiplied with the encoder output.
        dropout: dropout passed to ``nn.LSTM``.
    """

    def __init__(self, num_inputs, hidden_size, num_layers, attention, dropout=0):
        super(Decoder, self).__init__()
        self.batchNormMatrix = nn.BatchNorm1d(num_features = num_inputs)
        # +1 input feature: the scaled vector appended as an extra column.
        self.lstm = nn.LSTM(
                    num_inputs+1,
                    hidden_size,
                    num_layers,
                    bidirectional=False,
                    dropout = dropout
                    )
        self.attention = attention
        # Maps [decoder output ; attention context] back to hidden_size.
        self.cat = nn.Linear(hidden_size *2, hidden_size)

    def forward(self, s, X_hidden, Y_hidden ,X_, Y_):
        """Score the rows of the second row-set against the first.

        Args:
            s: 5-item state (see class docstring).
            X_hidden, Y_hidden: ``(h0, c0)`` initial LSTM states for each row-set.
            X_, Y_: 3-tuples whose first item is the encoder output for the
                matching row-set; the other two items are not used here.

        Returns:
            ``(probs, (X_h, X_c), (Y_h, Y_c))`` where ``probs`` is a softmax
            over one score per row of the second row-set, followed by the
            final LSTM states of both passes.
        """
        a, b, _, d, e = self._preproc(s)
        a, d = [self.batchNormMatrix(x) for x in [a, d]]
        b, e = [self._min_max_scale(x) for x in [b, e]]
        X, Y = [torch.cat((x, y.unsqueeze(1)), 1) for x, y in zip([a, d], [b, e])]
        X, Y = [x.unsqueeze(1) for x in [X, Y]]

        X, (X_h, X_c) = self.lstm(X, X_hidden)
        Y, (Y_h, Y_c) = self.lstm(Y, Y_hidden)

        # Only the encoder outputs are needed; the encoder states are discarded.
        X_enc, _, _ = X_
        Y_enc, _, _ = Y_

        X_attn_weights = self.attention(X, X_enc)
        Y_attn_weights = self.attention(Y, Y_enc)

        X, Y = X.squeeze(1), Y.squeeze(1)
        X_context = X_attn_weights.bmm(X_enc).squeeze(1)
        Y_context = Y_attn_weights.bmm(Y_enc).squeeze(1)

        X_cat_output = F.relu(self.cat(torch.cat((X, X_context), 1)))
        Y_cat_output = F.relu(self.cat(torch.cat((Y, Y_context), 1)))

        # Pairwise similarity between rows of the two sets, averaged over the
        # first set so that one score remains per row of the second set.
        S = X_cat_output @ Y_cat_output.T
        action_scores = S.mean(0)

        return F.softmax(action_scores, dim=-1), (X_h, X_c), (Y_h, Y_c)

    @staticmethod
    def _min_max_scale(x, eps=1e-8):
        """Scale ``x`` to [0, 1]; a constant vector maps to zeros, not NaN."""
        lo, hi = x.min(), x.max()
        # Clamp the range so a zero spread cannot produce a 0/0 division.
        return (x - lo) / torch.clamp(hi - lo, min=eps)

    def _preproc(self, s):
        """Convert every item of the state to a float32 tensor."""
        return [torch.FloatTensor(item) for item in s]

In [7]:
class Policy(nn.Module):
    """Encoder/attention/decoder policy with epsilon-random exploration.

    Args:
        num_inputs: number of columns of the state matrices.
        hidden_size: LSTM hidden size shared by encoder and decoder.
        num_layers: number of stacked LSTM layers.
        method: attention score function (``'dot'``, ``'general'``, ``'concat'``).
        epsilon: probability of replacing the decoder's distribution with a
            random one on a given forward pass.
        epsilon_decay: stored for callers that decay ``epsilon``; not applied
            inside this class.
        dropout: dropout passed to both LSTMs.
    """

    def __init__(self, num_inputs, hidden_size, num_layers, method, 
                 epsilon=.8, epsilon_decay = .8, dropout=0):
        super(Policy, self).__init__()
        self.encoder = Encoder(
                        num_inputs, 
                        hidden_size, 
                        num_layers, 
                        dropout
                        )
        attention = Attention(
                        method,
                        hidden_size
                        )

        self.decoder = Decoder(
                        num_inputs, 
                        hidden_size, 
                        num_layers, 
                        attention, 
                        dropout
                        )
        self.num_layers = num_layers
        self.hidden_size = hidden_size

        self.epsilon = epsilon
        self.epsilon_decay = epsilon_decay

    def _zero_state(self):
        """Return a fresh all-zero ``(h0, c0)`` pair for a batch of size 1."""
        return (torch.zeros(self.num_layers, 1, self.hidden_size),
                torch.zeros(self.num_layers, 1, self.hidden_size))

    def forward(self, s, hidden=None):
        """Compute action probabilities for state ``s``.

        Args:
            s: environment state, passed unchanged to encoder and decoder.
            hidden: optional ``(X_hidden, Y_hidden)`` pair of decoder LSTM
                states; zero states are used when it is falsy.

        Returns:
            ``(prob, ((X_h, X_c), (Y_h, Y_c)))``: the action distribution and
            the decoder's new LSTM states.
        """
        if hidden:
            X_hidden, Y_hidden = hidden
        else:
            X_hidden, Y_hidden = self._zero_state(), self._zero_state()

        X, Y = self.encoder(s)

        prob, (X_h, X_c), (Y_h, Y_c) = self.decoder(s, X_hidden, Y_hidden, X, Y)

        # Exploration: with probability epsilon use a random normalised vector.
        # NOTE(review): this tensor is created by torch.rand and is not
        # connected to the autograd graph, so log-probs taken from it carry no
        # gradient back to the network -- confirm this is intended.
        if random.random() < self.epsilon:
            prob = torch.rand(prob.shape)
            prob /= prob.sum()

        return prob, ((X_h, X_c), (Y_h, Y_c))

In [8]:
import sys
sys.path.append("..")
from gymenv_v2 import make_multiple_env
import torch
import numpy as np
from torch.distributions import Categorical
from torch.optim.lr_scheduler import StepLR

class Observer(object):
    """Owns an environment and rolls out one episode at a time for an agent."""

    def __init__(self, env_config):
        # env_config is forwarded as keyword arguments to make_multiple_env.
        self.env = make_multiple_env(**env_config)

    def run_episode(self, agent):
        """Step the environment until it reports done, feeding the agent.

        On every step the agent chooses an action from the current observation
        and the recurrent state it returned previously (``None`` at first);
        the policy's ``epsilon`` is multiplied by ``epsilon_decay`` and the
        reward and done flag are reported back to the agent.
        """
        observation = self.env.reset()
        recurrent_state = None
        done = False
        while not done:
            # Ask the agent for an action given the latest observation.
            action, recurrent_state = agent.select_action(observation, recurrent_state)

            # Advance the environment by one step.
            observation, reward, done, _info = self.env.step(action)

            # Decay exploration once per environment step, then hand the
            # reward to the agent for training.
            policy = agent.policy
            policy.epsilon = policy.epsilon * policy.epsilon_decay
            agent.report_reward(reward, done)

class Agent(object):
    """Policy-gradient agent that batches one or more episodes per update.

    Rewards are buffered in ``self.rewards`` with a NaN sentinel appended
    after the last reward of each episode, so several episodes can be
    collected before ``finish_episode`` performs a single optimiser step.

    Args:
        training_config: dict providing ``'lr'``, ``'gamma'`` and
            ``'entropy_coef'``.
        observer: object whose ``run_episode(agent)`` drives one rollout.
        model: policy module called as ``model(state, hidden)`` and returning
            ``(probs, hidden)``.
    """

    def __init__(self, training_config,observer, model):
        learning_rate = training_config['lr']
        gamma = training_config['gamma']
        self.entropy_coef = training_config['entropy_coef']
        self.observer = observer
        self.rewards = []
        self.gamma = gamma
        self.policy = model
        self.optimizer = torch.optim.Adam(self.policy.parameters(), lr=learning_rate)
#         self.scheduler = StepLR(self.optimizer, step_size=4, gamma=0.1)
        # Small constant that keeps the return normalisation from dividing by 0.
        self.eps = np.finfo(np.float32).eps.item()
        self.save_log_probs = []
        self.save_probs = []

    def select_action(self, state, hidden):
        """Sample an action from the policy and record its (log-)probability.

        Returns:
            ``(action, hidden)``: the sampled action index as a Python int and
            the recurrent state returned by the policy.
        """
        probs, hidden = self.policy(state, hidden)
        try:
            m = Categorical(probs)
        except ValueError:
            # The policy produced an invalid distribution (e.g. NaN); fall
            # back to a random one with the same shape so the rollout can
            # continue.
            probs = torch.rand(probs.shape)
            probs /= probs.sum()
            m = Categorical(probs)
        action = m.sample()
        self.save_log_probs.append(m.log_prob(action))
        self.save_probs.append(m.probs[action])
        return action.item(), hidden

    def report_reward(self, reward, d):
        """Buffer ``reward``; when ``d`` is true also append the NaN episode sentinel."""
        self.rewards.append(reward)
        if d:
            # np.nan rather than np.NaN: the latter alias was removed in NumPy 2.0.
            self.rewards.append(np.nan)

    def run_episode(self):
        """Collect one episode through the observer."""
        self.observer.run_episode(self)

    def _split_episodes(self):
        """Split the reward buffer at NaN sentinels into (completed, trailing)."""
        completed, current = [], []
        for reward in self.rewards:
            if np.isnan(reward):
                completed.append(current)
                current = []
            else:
                current.append(reward)
        return completed, current

    def _discounted_returns(self, episodes):
        """Discounted returns per step, restarting the running sum at every episode."""
        returns = []
        for episode in episodes:
            running, episode_returns = 0, []
            for r in reversed(episode):
                running = r + self.gamma * running
                episode_returns.append(running)
            returns.extend(reversed(episode_returns))
        return returns

    def finish_episode(self):
        """Run one policy update over everything buffered since the last call.

        Returns:
            The smallest undiscounted reward sum among the completed episodes.

        Raises:
            ValueError: if no episode has terminated since the last update.
        """
        log_probs, probs = self.save_log_probs.copy(), self.save_probs.copy()

        completed, trailing = self._split_episodes()
        if not completed:
            raise ValueError("finish_episode() called before any episode terminated")
        reward = min(sum(episode) for episode in completed)

        self.rewards = []
        self.save_log_probs = []
        self.save_probs = []

        # Returns must not leak across episode boundaries, so they are
        # accumulated per episode (unterminated trailing steps form their own
        # segment) and only then concatenated for normalisation.
        segments = completed + ([trailing] if trailing else [])
        returns = torch.tensor(self._discounted_returns(segments), dtype=torch.float32)
        if returns.numel() > 1:
            returns = (returns - returns.mean()) / (returns.std() + self.eps)
        else:
            # std() of a single element is NaN; only centre in that case.
            returns = returns - returns.mean()

        # NOTE(review): the extra term uses only the sampled action's
        # p * log p, not the full-distribution entropy, and its sign looks
        # like it penalises rather than rewards entropy -- confirm.
        policy_loss = [
            -log_prob * R - self.entropy_coef * (log_prob * prob)
            for log_prob, prob, R in zip(log_probs, probs, returns)
        ]
        if not policy_loss:
            return reward

        policy_loss = torch.stack(policy_loss).sum()
        # Every stored log-prob may be detached (e.g. all steps used a random
        # fallback distribution); backward() would raise in that case.
        if policy_loss.requires_grad:
            self.optimizer.zero_grad()
            policy_loss.backward()
            self.optimizer.step()
#         self.scheduler.step()
        return reward

In [9]:
# Stand-alone policy instance using dot-product attention.
model = Policy(num_inputs=60, hidden_size=128, num_layers=3, method='dot')

In [None]:
# Hyper-parameters read by Agent ('lr', 'gamma', 'entropy_coef') and by the
# loop below ('num_revisit' = episodes collected per policy update).
training_config = dict(
    lr=1e-3,
    gamma=.95,
    num_revisit=3,
    entropy_coef=1,
)
# Keyword arguments forwarded to make_multiple_env by the Observer.
env_config = dict(
    load_dir='../instances/train_10_n60_m60',
    idx_list=list(range(10)),
    timelimit=50,
    reward_type='obj',
)

# Rebinds `model` to a fresh policy: concat attention, 2 LSTM layers, dropout.
model = Policy(60, 64, 2, 'concat', dropout=.3)
observer = Observer(env_config)
agent = Agent(training_config, observer, model)

# Each iteration collects `num_revisit` episodes, then performs one update
# and prints the value returned by finish_episode().
for iteration in range(50):
    for _ in range(training_config['num_revisit']):
        agent.run_episode()
    reward = agent.finish_episode()
    print(f'iter: {iteration}, training reward: {reward}')

loading training instances, dir ../instances/train_10_n60_m60 idx 0
loading training instances, dir ../instances/train_10_n60_m60 idx 1
loading training instances, dir ../instances/train_10_n60_m60 idx 2
loading training instances, dir ../instances/train_10_n60_m60 idx 3
loading training instances, dir ../instances/train_10_n60_m60 idx 4
loading training instances, dir ../instances/train_10_n60_m60 idx 5
loading training instances, dir ../instances/train_10_n60_m60 idx 6
loading training instances, dir ../instances/train_10_n60_m60 idx 7
loading training instances, dir ../instances/train_10_n60_m60 idx 8
loading training instances, dir ../instances/train_10_n60_m60 idx 9
iter: 0, training reward: 0.07657868644378141
iter: 1, training reward: 0.05443873782223818
iter: 2, training reward: 0.10437326798955837
iter: 3, training reward: 0.09280565424796805
iter: 4, training reward: 0.047775817174624535
iter: 5, training reward: 0.1177266291279011
iter: 6, training reward: 0.0960423972612716