# REINFORCE actor-critic learning for the CVRP

## Importing the libraries

In [1]:
import os
import time
import random
import numpy as np
import matplotlib.pyplot as plt
import gym
import torch
import torch.nn as nn
import torch.nn.functional as F
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
from gym import wrappers
from torch.autograd import Variable
from torch.distributions import Categorical
from collections import deque

## Creating a CVRP environment

In [2]:
import gym
from gym import error, spaces, utils
from gym.utils import seeding
import numpy as np
import random
import copy
import math

class VRPEnv(gym.Env):
  def __init__(self):
    # customer count ('0' is depot) 
    self.customer_count = 11
    # the capacity of vehicles
    self.vehicle_capacity = 2
  
    self.action_space = spaces.Discrete(3)
    self.observation_space = spaces.Box(low=0,high=1, shape=(4,1), dtype=np.float64)
    self.VRP = np.array((self.customer_count,4))
    self._max_episode_steps = 1000
    self.viewer = None
    self.state = None
    self.steps_beyond_done = None
    self.route = []
    self.route.append(0)
    self.previous_action = 0
    self.hn_actor = torch.zeros([1,self.customer_count,128], dtype=torch.float32).to(device)
    self.hn_actor_target = torch.zeros([1,self.customer_count,128], dtype=torch.float32).to(device)
    

  def reset(self, seed=200):
    if seed == 200:
      seed = int(time.time())
    np.random.seed(seed)
    x_locations = (np.random.rand(self.customer_count)).reshape((self.customer_count,1))
    y_locations = (np.random.rand(self.customer_count)).reshape((self.customer_count,1))
    demand = (np.random.randint(1,9,self.customer_count).reshape((self.customer_count,1))).reshape((self.customer_count,1))/10 # Normalise to between 0.1 and 0.9
    capacity = np.repeat(self.vehicle_capacity,self.customer_count).reshape((self.customer_count,1))
    VRP = np.concatenate((np.concatenate((np.concatenate((x_locations,y_locations), axis=1),demand),axis=1),capacity),axis=1)
    self.VRP = VRP.reshape((self.customer_count,4))
    self.unserved_customers = []
    for i in range(1, self.customer_count):
      self.unserved_customers.append(i)
    self.routes = []
    self.route = []
    self.route.append(0)
    self.VRP[0,2] = 0 # Set the demand at thedepot to 0
    self.state = copy.deepcopy(self.VRP)
    self.previous_action = 0
    return self.state
  

  def step(self, action):
    # Calculate the reward as the negative euclidean distance
    reward = -((self.state[self.previous_action,0]-self.state[action,0])**2+(self.state[self.previous_action,1]-self.state[action,1])**2)**0.5 # - Euclidean distance between customers
    load = self.state[0,3]
    self.state[:,3] = max(0,(load-self.state[action,2])) # Update the vehicle load
    self.state[action, 2] = max(0,self.state[action,2]-load) # Update the demand at served customer
    done = False
    if action == 0: # Return to the depot
      self.route.append(action) # End route
      self.routes.append(self.route) # Add subroute to list of all routes
      self.route = [] # Initiate new subroute
      self.state[:,3] = self.vehicle_capacity # Refill the vehicle
    self.route.append(action) # Add action to the subroute
    if max(self.state[:,2]) > 0: # If there are unserved customers left
      done = False
    elif max(self.state[:,2]) == 0 and action == 0: # If there are no unserved customers left and we have returned to the depot
      done = True
      self.route.append(0)
    self.previous_action = action # Update the previous action
    return self.state, reward, done


## Let's test the environment step function

In [3]:
env = VRPEnv() # Create an instance of the environment
state = env.reset() # Reset the environment
action = 2 # Perform action with customer 2
print(state)
state, reward, done = env.step(action) # Perform the actual transition
print(state)

[[0.45621329 0.66205604 0.         2.        ]
 [0.2814365  0.16862356 0.5        2.        ]
 [0.0031236  0.04988002 0.2        2.        ]
 [0.48095493 0.37237519 0.2        2.        ]
 [0.27976012 0.9823364  0.1        2.        ]
 [0.2603242  0.33208958 0.1        2.        ]
 [0.99899722 0.13060665 0.4        2.        ]
 [0.49919138 0.14070669 0.8        2.        ]
 [0.23995861 0.32464986 0.7        2.        ]
 [0.64083404 0.37304389 0.1        2.        ]
 [0.42405637 0.39863593 0.2        2.        ]]
[[0.45621329 0.66205604 0.         1.8       ]
 [0.2814365  0.16862356 0.5        1.8       ]
 [0.0031236  0.04988002 0.         1.8       ]
 [0.48095493 0.37237519 0.2        1.8       ]
 [0.27976012 0.9823364  0.1        1.8       ]
 [0.2603242  0.33208958 0.1        1.8       ]
 [0.99899722 0.13060665 0.4        1.8       ]
 [0.49919138 0.14070669 0.8        1.8       ]
 [0.23995861 0.32464986 0.7        1.8       ]
 [0.64083404 0.37304389 0.1        1.8       ]
 [0.42405637

## Initialize the Experience Replay memory

In [4]:
class ReplayBuffer(object):

  def __init__(self, max_size=1e6):
    self.storage = []
    self.max_size = max_size
    self.ptr = 0

  def add(self, transition):
    if len(self.storage) == self.max_size:
      #self.storage[int(self.ptr)] = transition
      #self.ptr = (self.ptr + 1) % self.max_size
      self.storage.pop(0)
      self.storage.append(transition)
    else:
      self.storage.append(transition)

  def sample(self, batch_size):
    ind =  np.arange((len(self.storage)-(batch_size+1)),len(self.storage)-1,1) #np.random.randint(0, len(self.storage), size=batch_size)
    batch_states, batch_next_states, batch_actions, batch_rewards, batch_dones = [], [], [], [], []
    for i in ind: 
      state, next_state, action, reward, done = self.storage[i]
      batch_states.append(np.array(state, copy=False))
      batch_next_states.append(np.array(next_state, copy=False))
      batch_actions.append(np.array(action, copy=False))
      batch_rewards.append(np.array(reward, copy=False))
      batch_dones.append(np.array(done, copy=False))
    return np.array(batch_states), np.array(batch_next_states), np.array(batch_actions), np.array(batch_rewards).reshape(-1, 1), np.array(batch_dones).reshape(-1, 1)

## Build the neural network for the Actor and Actor-target models - contains the attention mechanism

In [5]:
class Actor(nn.Module):
  
  def __init__(self, state_dim=4, embed_size = 128):#, action_dim, max_action):
    super(Actor, self).__init__()
    self.embed = nn.Linear((state_dim), embed_size) # Encoding to higher dimensional space - can also be changed to convolutional layer as in the paper
    self.u_t = nn.RNN(embed_size,embed_size,1) # RNN Layer for the attention mechanism
    self.v_t_a = nn.Linear(embed_size,1) # Linear for getting u_t
    self.bar_u_t = nn.RNN(embed_size,embed_size,1) # RNN Layer for the context vector
    self.a_t = nn.Softmax(dim = 1) # Softmax layer for the attention mechanism
    self.v_t_u = nn.Linear(embed_size,1) # Linear for getting u_t
    self.final = nn.Softmax(dim = 1) # Softmax layer for the final output

    self.saved_log_probabilities = [] # Create and empty list for saving the log probabilities of the actions
    self.rewards = [] # Create an empty list for the rewards
    self.dones = [] # Create an empty list for checking whether the episode was completed

  def forward(self, x, hn = env.hn_actor):
    cond1 = (x[:,:,2]<x[:,:,3]).int() # Can we meet the demand
    cond2 = (x[:,:,2]>0).int() # Is there demand at the customer
    mask1 = torch.minimum(cond1,cond2) # Select those customers with demand, and whose demand we can meet
    if torch.sum(mask1[:,1:len(mask1[0])]) == 0: # If only the depot can be visited
      mask1[:,0] = 1
    mask1 = torch.reshape(mask1,(len(x),env.customer_count,1))
    x = self.embed(x)
    u, hn = self.u_t(x, hn)
    u = self.v_t_a(u)
    a = self.a_t(u) # Up to equation (4) now
    c = torch.randn(x.shape)
    c = torch.mul(x,a)
    c = torch.sum(c, 0)
    c = torch.reshape(c,(1,env.customer_count,128))
    u_bar, hu = self.bar_u_t(x,c)
    u_bar = self.v_t_u(u_bar)
    output = self.final(u_bar)
    output = torch.mul(output,mask1)
    #print("Before clamp ", output)
    #output = output.log().clamp(epsilon, 1 - epsilon)
    epsilon = 10 ** -44
    output = output.clamp(min=1e-4)
    #print("After clamp ",output)
    output = output.log()#.clamp(epsilon, 1 - epsilon)
    #print("After log ",output)
    output = self.final(output)
    return output

class Actor_Target(nn.Module):
  
  def __init__(self, state_dim=4, embed_size = 128):#, action_dim, max_action):
    super(Actor_Target, self).__init__()
    self.embed = nn.Linear((state_dim), embed_size) # Encoding to higher dimensional space - can also be changed to convolutional layer as in the paper
    self.u_t = nn.RNN(embed_size,embed_size,1) # RNN Layer for the attention mechanism
    self.v_t_a = nn.Linear(embed_size,1) # Linear for getting u_t
    self.bar_u_t = nn.RNN(embed_size,embed_size,1) # RNN Layer for the context vector
    self.a_t = nn.Softmax(dim = 1) # Softmax layer for the attention mechanism
    self.v_t_u = nn.Linear(embed_size,1) # Linear for getting u_t
    self.final = nn.Softmax(dim = 1) # Softmax layer for the final output

  def forward(self, x, hn = env.hn_actor_target):
    cond1 = (x[:,:,2]<x[:,:,3]).int() # Can we meet the demand
    cond2 = (x[:,:,2]>0).int() # Is there demand at the customer
    mask1 = torch.minimum(cond1,cond2) # Select those customers with demand, and whose demand we can meet
    if torch.sum(mask1[:,1:len(mask1[0])]) == 0: # If only the depot can be visited
      mask1[:,0] = 1
    mask1 = torch.reshape(mask1,(len(x),env.customer_count,1))
    x = self.embed(x)
    u, hn = self.u_t(x, hn)
    u = self.v_t_a(u)
    a = self.a_t(u) # Up to equation (4) now
    c = torch.randn(x.shape)
    c = torch.mul(x,a)
    c = torch.sum(c, 0)
    c = torch.reshape(c,(1,env.customer_count,128))
    u_bar, hu = self.bar_u_t(x,c)
    u_bar = self.v_t_u(u_bar)
    output = self.final(u_bar)
    output = torch.mul(output,mask1)
    output = self.final(torch.log(output))
    return output

## Build the neural network for the Critic and Critic-target model

In [6]:
class Critic(nn.Module):
  
  def __init__(self, state_dim=4, action_dim = env.customer_count, embed_size = 128):
    super(Critic, self).__init__()
    # Defining the first Critic neural network
    self.layer_1 = nn.Linear(state_dim, embed_size) # Perform the embedding
    self.layer_2 = nn.Linear(embed_size, embed_size) # Adding the single dense layer
    self.layer_3 = nn.Linear(embed_size, 1) # Adding the output layer

    self.values = [] # Create an empty list to store the predicted values


  def forward(self, x, u): # x is the state, u is the action
    # Forward-Propagation on the Critic Neural Network
    x1 = F.relu(self.layer_1(x))
    ws = torch.mul(x1,u)
    x2 = F.relu(self.layer_2(ws))
    x2 = self.layer_3(x2)
    x2 = torch.sum(x2,1)
    return x2


## Testing the actor and critic networks to confirm their output

In [7]:
env = VRPEnv()
env.reset()
state = env.reset()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
state = torch.Tensor(state.reshape(1,env.customer_count,4)).to(device)
actor = Actor().to(device)
actor_target = Actor_Target().to(device)
prediction = actor(state)
prediction_target = actor_target(state)
print(prediction)
critic = Critic().to(device)
q_value = critic(state,prediction)
print(q_value)

state, reward, done = env.step(5)
state = torch.Tensor(state.reshape(1,env.customer_count,4)).to(device)
prediction = actor(state)
print(torch.sum(prediction))

tensor([[[0.0001],
         [0.1003],
         [0.0984],
         [0.1002],
         [0.0963],
         [0.0989],
         [0.1035],
         [0.1017],
         [0.0998],
         [0.1008],
         [0.0998]]], device='cuda:0', grad_fn=<SoftmaxBackward>)
tensor([[0.1259]], device='cuda:0', grad_fn=<SumBackward1>)
tensor(1., device='cuda:0', grad_fn=<SumBackward0>)


## Training Process

In [8]:
# Selecting the device (CPU or GPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
torch.autograd.set_detect_anomaly(True)

# Building the Training Process into a class
class Actor_Critic(object):
  
  def __init__(self, state_dim):
    self.actor = Actor_Target(state_dim).to(device)
    self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), lr = 0.0001)
    self.critic = Critic(state_dim, action_dim).to(device)
    self.critic_optimizer = torch.optim.Adam(self.critic.parameters(), lr = 0.0001)
    self.max_action = max_action

  def select_action(self, state, state_dim):
    #state_tensor = torch.Tensor(state.reshape(1,env.customer_count,state_dim)).to(device)
    current_Q = actor(state)
    Q = torch.reshape(current_Q,(1,env.customer_count))
    m = Categorical(Q)
    action = m.sample()
    actor.saved_log_probabilities.append(m.log_prob(action))
    return action.item(), current_Q

  def select_target_action(self, state, state_dim):
    #state_tensor = torch.Tensor(state.reshape(1,env.customer_count,state_dim)).to(device)
    target_Q = actor(state)
    Q = target_Q.detach().cpu().numpy().reshape(env.customer_count)
    action = np.argmax(Q)
    return action, target_Q

  def train(self, iterations, batch_size=32, tau=0.005):
    for it in range(iterations):
      running_reward = 8
      
      # Ensuring that the variables have been cleared
      del actor.rewards[:]
      del actor.saved_log_probabilities[:]
      del critic.values[:]
      
      returns = [] # Create an empty list for the returns for the batch of episodes
      Steps = [] # Create an empty list for the number of steps required for each episode
      steps = 0
      for b in range(batch_size):
        # Working with the actor and the critic
        obs = env.reset(((it+1)*b))
        obs = torch.Tensor(obs.reshape(1,env.customer_count,state_dim)).to(device)
        InstantReward = 0
        done = False
        Steps.append(steps)
        steps = 0
        while not done:
          # Complete episode playing actions as selected by the actor
          action, current_Q = self.select_action(obs, state_dim)
          obs, reward, done = env.step(action)
          InstantReward += reward
          actor.rewards.append(reward)
          actor.dones.append(done)
          
          obs = torch.Tensor(obs.reshape(1,env.customer_count,state_dim)).to(device)
          
          # Predict the expected value of the action with the critic
          predicted_value = critic(obs, current_Q)
          critic.values.append(predicted_value)
          steps += 1
        running_reward = 0.05 * InstantReward + (1 - 0.05) * running_reward
        #print(env.routes)
      # Let's finish the batch by updating the return
      R = 0
      for i in range(len(actor.rewards)):
        if actor.dones[len(actor.rewards)-1-i] == True:
          R = 0
        R = actor.rewards[len(actor.rewards)-1-i] + discount*R
        returns.insert(0,R)
      
      returns = torch.tensor(returns)
      returns = (returns - torch.tensor(critic.values))

      actor_loss = []
      critic_loss = []
      # Let's update the models
      for log_prob, R in zip(actor.saved_log_probabilities, returns):
        actor_loss.append(-log_prob * R)
      self.actor_optimizer.zero_grad()
      actor_loss = torch.cat(actor_loss).mean()
      actor_loss.backward(retain_graph=True)
      self.actor_optimizer.step()
      del actor.rewards[:]
      del actor.saved_log_probabilities[:]

      for values, R in zip(critic.values, returns):
        critic_loss.append((R-values)**2)
      self.critic_optimizer.zero_grad()
      critic_loss = torch.cat(critic_loss).mean()
      critic_loss.backward(retain_graph=True)
      self.critic_optimizer.step()
      del critic.values[:]
      
      
      print("Batch {} completed, Last reward: {:.2f} Average reward: {:.2f}".format(it, InstantReward, running_reward))
  
  # Make a save method to save a trained model
  def save(self, filename, directory):
    torch.save(self.actor.state_dict(), '%s/%s_actor.pth' % (directory, filename))
    torch.save(self.critic.state_dict(), '%s/%s_actor_baseline.pth' % (directory, filename))
  
  # Making a load method to load a pre-trained model
  def load(self, filename, directory):
    self.actor.load_state_dict(torch.load('%s/%s_actor.pth' % (directory, filename)))
    self.critic.load_state_dict(torch.load('%s/%s_actor_baseline.pth' % (directory, filename)))


## Create a function that evaluates the policy by calculating its average reward over 10 episodes

In [9]:
def evaluate_policy(policy, eval_episodes=10):
  avg_reward = 0.
  for _ in range(eval_episodes):
    obs = env.reset()
    obs = torch.Tensor(obs.reshape(1,env.customer_count,state_dim)).to(device)
    done = False
    while not done:
      action, current_Q = policy.select_target_action(obs, state_dim)
      obs, reward, done = env.step(action)
      obs = torch.Tensor(obs.reshape(1,env.customer_count,state_dim)).to(device)
      avg_reward += reward
  avg_reward /= eval_episodes
  print ("---------------------------------------")
  print ("Average Reward over the Evaluation Step: %f" % (avg_reward))
  print ("---------------------------------------")
  return avg_reward

## Set the parameters

In [10]:
env_name = "CVRP" # Name of a environment (set it to any Continous environment you want)
seed = 0 # Random seed number
start_timesteps = 1e4 # Number of iterations/timesteps before which the model randomly chooses an action, and after which it starts to use the policy network
eval_freq = 5e3 # How often the evaluation step is performed (after how many timesteps)
max_timesteps = 1000 # Total number of iterations/timesteps
save_models = True # Boolean checker whether or not to save the pre-trained model
batch_size = 128 # Size of the batch
discount = 0.99 # Discount factor gamma, used in the calculation of the total discounted reward
tau = 0.001 # Target network update rate

## Create a file name for the two saved models: the Actor and Critic models

In [11]:
file_name = "%s_%s_%s" % ("Actor_Critic", env_name, str(seed))
print ("---------------------------------------")
print ("Settings: %s" % (file_name))
print ("---------------------------------------")

---------------------------------------
Settings: Actor_Critic_CVRP_0
---------------------------------------


## Create a folder to save the trained models

In [12]:
if not os.path.exists("./results"):
  os.makedirs("./results")
if save_models and not os.path.exists("./pytorch_models"):
  os.makedirs("./pytorch_models")

## Create an instance of the CVRP environment

In [13]:
env = VRPEnv()

## Set seeds and get the necessary information on the states and actions in the chosen environment

In [14]:
env.seed(seed)
torch.manual_seed(seed)
np.random.seed(seed)
state_dim = env.observation_space.shape[0]
action_dim = env.customer_count
max_action = 1

## Create the policy network

In [15]:
print(state_dim, action_dim, max_action)
policy = Actor_Critic(state_dim)

4 11 1


## Create the Experience Replay memory

In [16]:
replay_buffer = ReplayBuffer()

## Define a list where all the evaluation results over 10 episodes are stored

In [17]:
evaluations = [evaluate_policy(policy, eval_episodes=1)]
print(env.routes)

---------------------------------------
Average Reward over the Evaluation Step: -5.430767
---------------------------------------
[[0, 7, 5, 4, 6, 1, 0], [0, 10, 2, 9, 0], [0, 3, 8, 0]]


In [18]:
print(env.routes)
print(env.state)

[[0, 7, 5, 4, 6, 1, 0], [0, 10, 2, 9, 0], [0, 3, 8, 0]]
[[0.50808735 0.41159729 0.         2.        ]
 [0.00320192 0.94616417 0.         2.        ]
 [0.44781966 0.8304828  0.         2.        ]
 [0.2957106  0.99642091 0.         2.        ]
 [0.54220998 0.74679985 0.         2.        ]
 [0.46113452 0.37346651 0.         2.        ]
 [0.35870685 0.83994957 0.         2.        ]
 [0.74349391 0.34396168 0.         2.        ]
 [0.15516087 0.86054579 0.         2.        ]
 [0.07953033 0.69805703 0.         2.        ]
 [0.21270309 0.28743551 0.         2.        ]]


## Create a folder directory in which the final results will be saved

In [19]:
def mkdir(base, name):
    path = os.path.join(base, name)
    if not os.path.exists(path):
        os.makedirs(path)
    return path
work_dir = mkdir('exp', 'brs')
monitor_dir = mkdir(work_dir, 'monitor')
max_episode_steps = env._max_episode_steps
save_env_vid = False
if save_env_vid:
  env = wrappers.Monitor(env, monitor_dir, force = True)
  env.reset()

## Initialize the training process variables

In [20]:
total_timesteps = 0
timesteps_since_eval = 0
episode_num = 0
done = True
t0 = time.time()

## Training

In [None]:
# We start the main loop over 500,000 timesteps
env.reset()
total_timesteps = 0
obs = copy.deepcopy(env.reset())

policy.train(int(max_timesteps), batch_size, tau)

# Add the last policy evaluation to our list of evaluations and we save our model
evaluations.append(evaluate_policy(policy))
if save_models: policy.save("%s" % (file_name), directory="./pytorch_models")
np.save("./results/%s" % (file_name), evaluations)

Batch 0 completed, Last reward: -7.37 Average reward: -6.79
Batch 1 completed, Last reward: -6.71 Average reward: -6.82
Batch 2 completed, Last reward: -5.70 Average reward: -6.82
Batch 3 completed, Last reward: -6.52 Average reward: -6.53
Batch 4 completed, Last reward: -7.07 Average reward: -6.55
Batch 5 completed, Last reward: -7.01 Average reward: -6.74
Batch 6 completed, Last reward: -4.88 Average reward: -6.68
Batch 7 completed, Last reward: -5.24 Average reward: -6.71
Batch 8 completed, Last reward: -7.61 Average reward: -6.83
Batch 9 completed, Last reward: -6.70 Average reward: -6.70
Batch 10 completed, Last reward: -6.85 Average reward: -6.68
Batch 11 completed, Last reward: -8.05 Average reward: -6.89
Batch 12 completed, Last reward: -6.94 Average reward: -6.94
Batch 13 completed, Last reward: -7.41 Average reward: -6.37
Batch 14 completed, Last reward: -5.64 Average reward: -6.98
Batch 15 completed, Last reward: -7.74 Average reward: -6.56
Batch 16 completed, Last reward: -

Batch 134 completed, Last reward: -7.12 Average reward: -6.82
Batch 135 completed, Last reward: -5.41 Average reward: -6.83
Batch 136 completed, Last reward: -5.06 Average reward: -6.55
Batch 137 completed, Last reward: -8.91 Average reward: -6.94
Batch 138 completed, Last reward: -5.25 Average reward: -7.01
Batch 139 completed, Last reward: -6.82 Average reward: -6.66
Batch 140 completed, Last reward: -7.51 Average reward: -6.72
Batch 141 completed, Last reward: -4.83 Average reward: -6.66
Batch 142 completed, Last reward: -8.98 Average reward: -6.63
Batch 143 completed, Last reward: -7.07 Average reward: -6.61
Batch 144 completed, Last reward: -5.91 Average reward: -6.75
Batch 145 completed, Last reward: -5.62 Average reward: -6.53
Batch 146 completed, Last reward: -6.19 Average reward: -7.00
Batch 147 completed, Last reward: -6.42 Average reward: -6.50
Batch 148 completed, Last reward: -6.67 Average reward: -6.88
Batch 149 completed, Last reward: -7.50 Average reward: -6.72
Batch 15

Batch 267 completed, Last reward: -7.31 Average reward: -6.74
Batch 268 completed, Last reward: -6.42 Average reward: -6.65
Batch 269 completed, Last reward: -7.59 Average reward: -7.22
Batch 270 completed, Last reward: -7.36 Average reward: -6.85
Batch 271 completed, Last reward: -4.72 Average reward: -6.84
Batch 272 completed, Last reward: -5.46 Average reward: -6.59
Batch 273 completed, Last reward: -7.11 Average reward: -6.63
Batch 274 completed, Last reward: -6.05 Average reward: -6.70
Batch 275 completed, Last reward: -7.46 Average reward: -6.66
Batch 276 completed, Last reward: -6.52 Average reward: -6.93
Batch 277 completed, Last reward: -5.85 Average reward: -6.76
Batch 278 completed, Last reward: -5.42 Average reward: -6.79
Batch 279 completed, Last reward: -7.08 Average reward: -6.66
Batch 280 completed, Last reward: -7.55 Average reward: -6.55
Batch 281 completed, Last reward: -6.33 Average reward: -7.09
Batch 282 completed, Last reward: -6.26 Average reward: -6.64
Batch 28

Batch 400 completed, Last reward: -7.81 Average reward: -6.85
Batch 401 completed, Last reward: -7.16 Average reward: -6.52
Batch 402 completed, Last reward: -6.42 Average reward: -6.65
Batch 403 completed, Last reward: -6.09 Average reward: -6.64
Batch 404 completed, Last reward: -7.12 Average reward: -6.58
Batch 405 completed, Last reward: -6.13 Average reward: -6.61
Batch 406 completed, Last reward: -9.49 Average reward: -7.26
Batch 407 completed, Last reward: -4.63 Average reward: -6.64
Batch 408 completed, Last reward: -4.56 Average reward: -6.76
Batch 409 completed, Last reward: -7.73 Average reward: -6.53
Batch 410 completed, Last reward: -5.86 Average reward: -6.56
Batch 411 completed, Last reward: -5.37 Average reward: -6.73
Batch 412 completed, Last reward: -7.33 Average reward: -6.69
Batch 413 completed, Last reward: -7.84 Average reward: -6.79
Batch 414 completed, Last reward: -4.44 Average reward: -6.41
Batch 415 completed, Last reward: -7.57 Average reward: -6.89
Batch 41

In [None]:
print(env.routes)

## The inference policy function

In [None]:
def evaluate_final_policy(policy,random = 1, eval_episodes=1):
  avg_reward = 0.
  distance = 0.
  for k in range(eval_episodes):
    obs = env.reset(seed = random)
    obs = torch.Tensor(obs.reshape(1,env.customer_count,state_dim)).to(device)
    done = False
    while not done:
      action, current_Q = policy.select_target_action(obs, state_dim)
      obs, reward, done = env.step(action)
      obs = torch.Tensor(obs.reshape(1,env.customer_count,state_dim)).to(device)
      avg_reward += reward
    total_distance = 0
    for j in range(len(env.routes)):
      for i in range(len(env.routes[j])-1):
        total_distance = total_distance + ((env.VRP[env.routes[j][i],0]-env.VRP[env.routes[j][i+1],0])**2+(env.VRP[env.routes[j][i],1]-env.VRP[env.routes[j][i+1],1])**2)**0.5
    distance += total_distance
  distance /= eval_episodes
  print ("---------------------------------------")
  print ("Average Distance over the Evaluation Step: %f" % (distance))
  print ("---------------------------------------")
  return distance

## Let's see if we can test the implementation

In [None]:
results = np.zeros(100)
#policy.load(file_name, './pytorch_models/')
for i in range(100):
  evaluations = [evaluate_final_policy(policy, random = i)]
  results[i] = evaluations[0]
print(results)

In [None]:
print(results.mean())