# Actor-critic learning for the CVRP

## Importing the libraries

In [1]:
import os
import time
import random
import numpy as np
import matplotlib.pyplot as plt
import gym
import torch
import torch.nn as nn
import torch.nn.functional as F
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
from gym import wrappers
from torch.autograd import Variable
from collections import deque

## Creating a CVRP environment

In [2]:
import gym
from gym import error, spaces, utils
from gym.utils import seeding
import numpy as np
import random
import copy
import math

class VRPEnv(gym.Env):
  """Capacitated vehicle routing environment with a single vehicle.

  The state is a ``(customer_count, 4)`` float array with one row per node
  (row 0 is the depot) and the columns ``x``, ``y``, ``remaining demand`` and
  ``remaining vehicle load``; the load is repeated on every row. An action is
  the index of the node to visit next, and visiting node 0 refills the vehicle.
  """

  def __init__(self):
    # customer count ('0' is depot)
    self.customer_count = 11
    # the capacity of vehicles
    self.vehicle_capacity = 2

    # One discrete action per node, depot included.
    self.action_space = spaces.Discrete(self.customer_count)
    # NOTE(review): this shape does not describe the (customer_count, 4) state
    # returned by reset()/step(). It is left untouched because code elsewhere
    # in the file reads observation_space.shape[0] as the per-node feature count.
    self.observation_space = spaces.Box(low=0,high=1, shape=(4,1), dtype=np.float64)
    # Placeholder instance until reset() draws a real one.
    self.VRP = np.zeros((self.customer_count,4))
    self._max_episode_steps = 1000
    self.viewer = None
    self.state = None
    self.steps_beyond_done = None
    self.unserved_customers = []
    self.routes = []
    self.route = [0]
    self.previous_action = 0
    self.hn_actor = torch.zeros([1,self.customer_count,128], dtype=torch.float32).to(device)
    self.hn_actor_target = torch.zeros([1,self.customer_count,128], dtype=torch.float32).to(device)


  def reset(self, seed=200):
    """Draw a new random instance and return the initial state.

    Args:
      seed: Seed for NumPy's global RNG. ``200`` (the default) and ``None``
        are sentinels meaning "use a fresh, unpredictable seed".

    Returns:
      The ``(customer_count, 4)`` state array. This is the live ``self.state``
      object, which ``step`` mutates in place.
    """
    if seed is None or seed == 200:
      # A wall-clock seed with one-second resolution repeats the same instance
      # for every reset issued within that second, so draw from the OS instead.
      seed = random.SystemRandom().randrange(2 ** 32)
    np.random.seed(seed)
    n = self.customer_count
    # The order of the three draws is kept so an explicit seed reproduces the
    # same instance as before.
    x_locations = np.random.rand(n)
    y_locations = np.random.rand(n)
    # randint's upper bound is exclusive: demands are 0.1, 0.2, ..., 0.8.
    demand = np.random.randint(1, 9, n) / 10
    capacity = np.full(n, self.vehicle_capacity)
    self.VRP = np.column_stack((x_locations, y_locations, demand, capacity))
    self.unserved_customers = list(range(1, n))
    self.routes = []
    self.route = [0]
    self.VRP[0,2] = 0 # Set the demand at the depot to 0
    self.state = self.VRP.copy()
    self.previous_action = 0
    return self.state


  def step(self, action):
    """Move the vehicle to node ``action`` and serve it.

    Args:
      action: Index of the node to visit; 0 returns to the depot and refills
        the vehicle.

    Returns:
      ``(state, reward, done)`` where ``reward`` is the negative Euclidean
      distance travelled and ``done`` becomes True once every demand is zero
      and the vehicle has returned to the depot. ``state`` is the live
      ``self.state`` array, so callers that store it must copy it.
    """
    previous = self.previous_action
    dx = self.state[previous,0] - self.state[action,0]
    dy = self.state[previous,1] - self.state[action,1]
    reward = -(dx**2 + dy**2)**0.5 # - Euclidean distance between customers

    load = self.state[0,3]
    # Both updates use the load from before the visit, which allows partial service.
    self.state[:,3] = max(0,(load-self.state[action,2])) # Update the vehicle load
    self.state[action, 2] = max(0,self.state[action,2]-load) # Update the demand at served customer

    if action == 0: # Return to the depot
      self.route.append(action) # End route
      self.routes.append(self.route) # Add subroute to list of all routes
      self.route = [] # Initiate new subroute
      self.state[:,3] = self.vehicle_capacity # Refill the vehicle
    self.route.append(action) # Add action to the subroute

    # Demands never go below zero, so "no positive demand" means all are served.
    all_served = not (self.state[:,2] > 0).any()
    done = bool(all_served and action == 0)
    if done:
      self.route.append(0)
    self.previous_action = action # Update the previous action
    return self.state, reward, done


## Let's test the environment step function

In [3]:
# Smoke test: build an instance, visit customer 2 from the depot and print the
# state before and after. Columns of the printed array are x, y, remaining
# demand and remaining vehicle load.
env = VRPEnv() # Create an instance of the environment
state = env.reset() # Reset the environment (draws a new random instance)
action = 2 # Visit customer 2
print(state)
state, reward, done = env.step(action) # Perform the actual transition
print(state)

[[0.11437463 0.35433916 0.         2.        ]
 [0.90856365 0.30937283 0.3        2.        ]
 [0.59436509 0.75216188 0.7        2.        ]
 [0.50406788 0.0964249  0.8        2.        ]
 [0.90713434 0.04520073 0.6        2.        ]
 [0.51126682 0.32464299 0.2        2.        ]
 [0.85039854 0.7478735  0.3        2.        ]
 [0.35165363 0.10587075 0.1        2.        ]
 [0.28080822 0.34687137 0.1        2.        ]
 [0.35065381 0.80720308 0.1        2.        ]
 [0.68470168 0.90565026 0.2        2.        ]]
[[0.11437463 0.35433916 0.         1.3       ]
 [0.90856365 0.30937283 0.3        1.3       ]
 [0.59436509 0.75216188 0.         1.3       ]
 [0.50406788 0.0964249  0.8        1.3       ]
 [0.90713434 0.04520073 0.6        1.3       ]
 [0.51126682 0.32464299 0.2        1.3       ]
 [0.85039854 0.7478735  0.3        1.3       ]
 [0.35165363 0.10587075 0.1        1.3       ]
 [0.28080822 0.34687137 0.1        1.3       ]
 [0.35065381 0.80720308 0.1        1.3       ]
 [0.68470168 0.90565026 0.2        1.3       ]]

## Initialize the Experience Replay memory

In [4]:
class ReplayBuffer(object):
  """Bounded FIFO store of ``(state, next_state, action, reward, done)`` tuples."""

  def __init__(self, max_size=1e6):
    """Create an empty buffer holding at most ``max_size`` transitions.

    Args:
      max_size: Capacity; coerced to ``int`` because the default is the float 1e6.
    """
    self.max_size = int(max_size)
    # A deque with maxlen drops the oldest entry in O(1) once it is full.
    self.storage = deque(maxlen=self.max_size)
    self.ptr = 0

  def add(self, transition):
    """Append one transition, evicting the oldest one when the buffer is full."""
    self.storage.append(transition)

  def sample(self, batch_size):
    """Return the most recent transitions as stacked NumPy arrays.

    The window is deterministic: it covers up to ``batch_size`` transitions
    ending just before the newest one (the newest entry is excluded). It is
    clamped at the start of the buffer rather than wrapping around.

    Args:
      batch_size: Maximum number of transitions to return.

    Returns:
      ``(states, next_states, actions, rewards, dones)``; ``rewards`` and
      ``dones`` are reshaped to column vectors.

    Raises:
      ValueError: If the window is empty (fewer than two stored transitions
        or a non-positive ``batch_size``).
    """
    stop = len(self.storage) - 1
    start = max(0, stop - batch_size)
    if stop <= start:
      raise ValueError("not enough transitions in the buffer to sample from")
    batch_states, batch_next_states, batch_actions, batch_rewards, batch_dones = [], [], [], [], []
    for i in range(start, stop):
      state, next_state, action, reward, done = self.storage[i]
      # np.asarray avoids a copy when possible and, unlike
      # np.array(..., copy=False) under NumPy 2, still accepts Python scalars.
      batch_states.append(np.asarray(state))
      batch_next_states.append(np.asarray(next_state))
      batch_actions.append(np.asarray(action))
      batch_rewards.append(np.asarray(reward))
      batch_dones.append(np.asarray(done))
    return np.array(batch_states), np.array(batch_next_states), np.array(batch_actions), np.array(batch_rewards).reshape(-1, 1), np.array(batch_dones).reshape(-1, 1)

## Build the neural network for the Actor and Actor-target models - contains the attention mechanism

In [5]:
class Actor(nn.Module):
  """Attention-based policy that scores every node of a routing state.

  Input is a float tensor whose last dimension holds the node features; feature
  2 is read as the remaining demand and feature 3 as the remaining vehicle load.
  The output has one score per node, with infeasible nodes forced to zero.
  """

  def __init__(self, state_dim=4, embed_size = 128):
    super(Actor, self).__init__()
    self.embed_size = embed_size
    self.embed = nn.Linear((state_dim), embed_size) # Encoding to higher dimensional space - can also be changed to convolutional layer as in the paper
    self.u_t = nn.RNN(embed_size,embed_size,1) # RNN Layer for the attention mechanism
    self.v_t_a = nn.Linear(embed_size,1) # Linear for getting u_t
    self.bar_u_t = nn.RNN(embed_size,embed_size,1) # RNN Layer for the context vector
    self.a_t = nn.Softmax(dim = 1) # Softmax layer for the attention mechanism
    self.v_t_u = nn.Linear(embed_size,1) # Linear for getting u_t
    self.final = nn.Softmax(dim = 1) # Softmax layer for the final output

  def forward(self, x, hn=None):
    """Score the nodes of ``x``.

    Args:
      x: Tensor of shape ``(batch, nodes, state_dim)``.
      hn: Optional initial hidden state for the attention RNN, shape
        ``(1, nodes, embed_size)``. ``None`` lets ``nn.RNN`` start from zeros.

    Returns:
      Tensor of shape ``(batch, nodes, 1)``: a softmax over the node dimension
      multiplied by the feasibility mask, so the scores of one state sum to at
      most 1.
    """
    demand = x[:,:,2]
    load = x[:,:,3]
    # Feasible nodes have positive demand that is strictly below the current load.
    mask = ((demand < load) & (demand > 0)).unsqueeze(-1)
    x = self.embed(x)
    # NOTE(review): the RNNs use the default batch_first=False, so dim 0 of x is
    # consumed as the time axis and the node dimension as the RNN batch.
    u, hn = self.u_t(x, hn)
    a = self.a_t(self.v_t_a(u)) # Up to equation (4) now
    # Context: attention-weighted embeddings summed over dim 0; it is used as
    # the initial hidden state of the second RNN.
    c = torch.sum(torch.mul(x,a), 0, keepdim=True)
    u_bar, _ = self.bar_u_t(x,c)
    output = self.final(self.v_t_u(u_bar))
    return torch.mul(output, mask.to(output.dtype))

class Actor_Target(nn.Module):
  """Target copy of the attention-based policy network.

  It has the same layers, in the same order, as the online actor so that a
  state dict can be copied between the two. Feature 2 of the input is read as
  the remaining demand and feature 3 as the remaining vehicle load.
  """

  def __init__(self, state_dim=4, embed_size = 128):
    super(Actor_Target, self).__init__()
    self.embed_size = embed_size
    self.embed = nn.Linear((state_dim), embed_size) # Encoding to higher dimensional space - can also be changed to convolutional layer as in the paper
    self.u_t = nn.RNN(embed_size,embed_size,1) # RNN Layer for the attention mechanism
    self.v_t_a = nn.Linear(embed_size,1) # Linear for getting u_t
    self.bar_u_t = nn.RNN(embed_size,embed_size,1) # RNN Layer for the context vector
    self.a_t = nn.Softmax(dim = 1) # Softmax layer for the attention mechanism
    self.v_t_u = nn.Linear(embed_size,1) # Linear for getting u_t
    self.final = nn.Softmax(dim = 1) # Softmax layer for the final output

  def forward(self, x, hn=None):
    """Score the nodes of ``x``.

    Args:
      x: Tensor of shape ``(batch, nodes, state_dim)``.
      hn: Optional initial hidden state for the attention RNN, shape
        ``(1, nodes, embed_size)``. ``None`` lets ``nn.RNN`` start from zeros.

    Returns:
      Tensor of shape ``(batch, nodes, 1)``: a softmax over the node dimension
      multiplied by the feasibility mask, so the scores of one state sum to at
      most 1.
    """
    demand = x[:,:,2]
    load = x[:,:,3]
    # Feasible nodes have positive demand that is strictly below the current load.
    mask = ((demand < load) & (demand > 0)).unsqueeze(-1)
    x = self.embed(x)
    # NOTE(review): the RNNs use the default batch_first=False, so dim 0 of x is
    # consumed as the time axis and the node dimension as the RNN batch.
    u, hn = self.u_t(x, hn)
    a = self.a_t(self.v_t_a(u)) # Up to equation (4) now
    # Context: attention-weighted embeddings summed over dim 0; it is used as
    # the initial hidden state of the second RNN.
    c = torch.sum(torch.mul(x,a), 0, keepdim=True)
    u_bar, _ = self.bar_u_t(x,c)
    output = self.final(self.v_t_u(u_bar))
    return torch.mul(output, mask.to(output.dtype))

## Build the neural network for the Critic and Critic-target model

In [6]:
class Critic(nn.Module):
  """State-action value network: embeds each node, gates it by the action score and sums."""

  def __init__(self, state_dim=4, action_dim=None, embed_size = 128):
    """Build the critic.

    Args:
      state_dim: Number of features per node.
      action_dim: Accepted for call compatibility; no layer depends on it, so
        the default no longer has to be read from a module-level environment.
      embed_size: Width of the embedding and hidden layer.
    """
    super(Critic, self).__init__()
    # Defining the first Critic neural network
    self.layer_1 = nn.Linear(state_dim, embed_size) # Perform the embedding
    self.layer_2 = nn.Linear(embed_size, embed_size) # Adding the single dense layer
    self.layer_3 = nn.Linear(embed_size, 1) # Adding the output layer


  def forward(self, x, u): # x is the state, u is the action
    """Return one value per batch element.

    Args:
      x: State tensor of shape ``(batch, nodes, state_dim)``.
      u: Action scores of shape ``(batch, nodes, 1)``; broadcast over the
        embedding dimension.

    Returns:
      Tensor of shape ``(batch, 1)``: the per-node outputs summed over nodes.
    """
    # Forward-Propagation on the Critic Neural Network
    embedded = F.relu(self.layer_1(x))
    weighted = torch.mul(embedded, u) # Scale every node embedding by its action score
    hidden = F.relu(self.layer_2(weighted))
    per_node = self.layer_3(hidden)
    return torch.sum(per_node, 1)


## Testing the actor and critic networks to confirm their output

In [7]:
# Smoke test: push one freshly generated state through untrained actor,
# actor-target and critic networks and print the actor scores and the Q-value.
env = VRPEnv()
env.reset()
state = env.reset()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Add a leading batch dimension of 1: (customer_count, 4) -> (1, customer_count, 4)
state = torch.Tensor(state.reshape(1,env.customer_count,4)).to(device)
actor = Actor().to(device)
actor_target = Actor_Target().to(device)
prediction = actor(state)
prediction_target = actor_target(state)
print(prediction)
critic = Critic().to(device)
# The critic takes the state together with the actor's per-node scores
q_value = critic(state,prediction)
print(q_value)

tensor([[[0.0000],
         [0.0941],
         [0.0902],
         [0.1007],
         [0.1012],
         [0.0910],
         [0.0877],
         [0.0926],
         [0.0887],
         [0.0828],
         [0.0840]]], device='cuda:0', grad_fn=<MulBackward0>)
tensor([[-0.4111]], device='cuda:0', grad_fn=<SumBackward1>)


## Training Process

In [8]:
# Selecting the device (CPU or GPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Anomaly detection makes autograd report the forward operation behind a failing
# backward pass. PyTorch documents it as a debugging aid that slows execution,
# so consider turning it off for long training runs.
torch.autograd.set_detect_anomaly(True)

# Building the Training Process into a class
class Actor_Critic(object):
  """Actor-critic trainer that owns the online and target actor/critic networks."""

  def __init__(self, state_dim, action_dim=None, max_action=1):
    """Build the four networks and their optimizers.

    Args:
      state_dim: Number of features per node.
      action_dim: Forwarded to ``Critic``; optional so that construction does
        not depend on a module-level variable.
      max_action: Stored on the instance; no method of this class reads it.
    """
    self.state_dim = state_dim
    self.actor = Actor(state_dim).to(device)
    self.actor_target = Actor_Target(state_dim).to(device)
    self.actor_target.load_state_dict(self.actor.state_dict())
    self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), lr = 0.0001)
    self.critic = Critic(state_dim, action_dim).to(device)
    self.critic_target = Critic(state_dim, action_dim).to(device)
    self.critic_target.load_state_dict(self.critic.state_dict())
    self.critic_optimizer = torch.optim.Adam(self.critic.parameters())
    self.max_action = max_action

  @staticmethod
  def _to_batch(state, state_dim):
    """Convert one ``(nodes, state_dim)`` state into a float32 batch of size 1."""
    return torch.as_tensor(np.asarray(state), dtype=torch.float32, device=device).reshape(1, -1, state_dim)

  @staticmethod
  def _soft_update(source, target, tau):
    """Polyak-average the parameters of ``source`` into ``target`` in place."""
    for param, target_param in zip(source.parameters(), target.parameters()):
      target_param.data.copy_(tau * param.data + (1 - tau) * target_param.data)

  def select_action(self, state, state_dim):
    """Pick the highest-scoring node according to this policy's own actor.

    Args:
      state: Array of shape ``(nodes, state_dim)``.
      state_dim: Number of features per node.

    Returns:
      ``(action, scores)``: the arg-max node index and the per-node scores as
      a 1-D NumPy array.
    """
    # Use self.actor, not a module-level network, so the trained weights are
    # the ones that drive greedy actions and evaluation.
    with torch.no_grad():
      scores = self.actor(self._to_batch(state, state_dim))
    current_Q = scores.cpu().numpy().reshape(-1)
    action = np.argmax(current_Q)
    return action, current_Q

  def select_target_action(self, state, state_dim=None):
    """Pick the highest-scoring node according to the target actor.

    Args:
      state: Array of shape ``(nodes, state_dim)``.
      state_dim: Number of features per node; defaults to the value given to
        the constructor.

    Returns:
      ``(action, scores)`` in the same format as ``select_action``.
    """
    if state_dim is None:
      state_dim = self.state_dim
    with torch.no_grad():
      scores = self.actor_target(self._to_batch(state, state_dim))
    target_Q = scores.cpu().numpy().reshape(-1)
    action = np.argmax(target_Q) # scores are a NumPy array here, so use NumPy's argmax
    return action, target_Q

  def train(self, replay_buffer, iterations, batch_size=32, discount=0.99, tau=0.005):
    """Run ``iterations`` critic and actor updates on batches from ``replay_buffer``.

    Args:
      replay_buffer: Object whose ``sample(batch_size)`` returns
        ``(states, next_states, actions, rewards, dones)`` arrays.
      iterations: Number of gradient steps.
      batch_size: Number of transitions requested per step.
      discount: Discount factor applied to the bootstrapped target.
      tau: Polyak averaging rate for the target networks.
    """
    for it in range(iterations):

      # Sample a batch of transitions (s, s', a, r) from the memory.
      # The stored actions are not used: the critic is trained on the actor's
      # current scores for the sampled states.
      batch_states, batch_next_states, _, batch_rewards, batch_dones = replay_buffer.sample(batch_size)
      state = torch.as_tensor(batch_states, dtype=torch.float32, device=device)
      next_state = torch.as_tensor(batch_next_states, dtype=torch.float32, device=device)
      reward = torch.as_tensor(batch_rewards, dtype=torch.float32, device=device)
      done = torch.as_tensor(batch_dones, dtype=torch.float32, device=device)

      # Bootstrapped target from the target networks; no gradient is needed
      with torch.no_grad():
        next_action = self.actor_target(next_state)
        target_Q = self.critic_target(next_state, next_action)
        target_Q = reward + (1 - done) * discount * target_Q

      # Determine the probabilities of the actions chosen by the actor
      action = self.actor(state)

      # Critic update. The action is detached because any actor gradient from
      # this loss would be zeroed before the actor step anyway; this also
      # removes the need for retain_graph.
      current_Q = self.critic(state, action.detach())
      critic_loss = F.mse_loss(current_Q, target_Q)
      self.critic_optimizer.zero_grad()
      critic_loss.backward()
      self.critic_optimizer.step()

      # Update our Actor model by performing gradient descent on the output of the Critic model
      actor_loss = -self.critic(state, action).mean()
      self.actor_optimizer.zero_grad()
      actor_loss.backward()
      self.actor_optimizer.step()

      # Update the weights of both target networks by polyak averaging
      self._soft_update(self.actor, self.actor_target, tau)
      self._soft_update(self.critic, self.critic_target, tau)

  # Make a save method to save a trained model
  def save(self, filename, directory):
    """Write the actor and critic weights to ``directory``."""
    torch.save(self.actor.state_dict(), '%s/%s_actor.pth' % (directory, filename))
    torch.save(self.critic.state_dict(), '%s/%s_critic.pth' % (directory, filename))

  # Making a load method to load a pre-trained model
  def load(self, filename, directory):
    """Load actor and critic weights from ``directory`` and resync the targets."""
    # map_location lets weights saved on a GPU be loaded on a CPU-only machine.
    self.actor.load_state_dict(torch.load('%s/%s_actor.pth' % (directory, filename), map_location=device))
    self.critic.load_state_dict(torch.load('%s/%s_critic.pth' % (directory, filename), map_location=device))
    # Only the online networks are saved, so start the targets from the loaded weights.
    self.actor_target.load_state_dict(self.actor.state_dict())
    self.critic_target.load_state_dict(self.critic.state_dict())


## Create a function that evaluates the policy by calculating its average reward over 10 episodes

In [9]:
def evaluate_policy(policy, eval_episodes=10):
  """Roll out ``policy`` greedily and report the mean episode reward.

  Each episode starts from ``env.reset()`` and runs until the environment
  signals ``done``. The rewards of all episodes are summed and divided by
  ``eval_episodes``; the result is printed between two separator lines and
  returned.
  """
  avg_reward = 0.
  for _episode in range(eval_episodes):
    obs = env.reset()
    # Every episode takes at least one step, so test `done` after stepping.
    while True:
      action, _scores = policy.select_action(obs, state_dim)
      obs, step_reward, done = env.step(action)
      avg_reward += step_reward
      if done:
        break
  avg_reward = avg_reward / eval_episodes
  print ("---------------------------------------")
  print ("Average Reward over the Evaluation Step: %f" % (avg_reward))
  print ("---------------------------------------")
  return avg_reward

## Set the parameters

In [10]:
env_name = "CVRP" # Name of the environment; only used to build the file name of the saved models and results
seed = 0 # Random seed number
start_timesteps = 1e4 # Intended length of the initial random-action phase. NOTE(review): not referenced by the code visible in this file - exploration in the training loop is driven by epsilon instead
eval_freq = 5e3 # How often the evaluation step is performed (after how many timesteps)
max_timesteps = 5e5 # Total number of iterations/timesteps
save_models = True # Whether or not to save the trained model
batch_size = 128 # Size of the batch
discount = 0.99 # Discount factor gamma, used in the calculation of the total discounted reward
tau = 0.001 # Target network update rate

## Create a file name for the two saved models: the Actor and Critic models

In [11]:
# Base name shared by the saved model files and the saved evaluation results
file_name = "%s_%s_%s" % ("Actor_Critic", env_name, str(seed))
print ("---------------------------------------")
print ("Settings: %s" % (file_name))
print ("---------------------------------------")

---------------------------------------
Settings: Actor_Critic_CVRP_0
---------------------------------------


## Create a folder to save the trained models

In [12]:
# exist_ok=True makes creation idempotent without a separate existence check,
# which could race with another process creating the same directory.
os.makedirs("./results", exist_ok=True)
if save_models:
  os.makedirs("./pytorch_models", exist_ok=True)

## Create an instance of the CVRP environment

In [13]:
# Fresh environment used for evaluation and training below; it replaces the
# instance created by the earlier test cells.
env = VRPEnv()

## Set seeds and get the necessary information on the states and actions in the chosen environment

In [14]:
# VRPEnv does not define seed() itself, so this resolves to the gym.Env base
# class - TODO confirm the installed gym version still provides it.
env.seed(seed)
torch.manual_seed(seed)
# NOTE: VRPEnv.reset() reseeds NumPy's global RNG on every call, so this seed
# only holds until the next reset.
np.random.seed(seed)
# observation_space is declared with shape (4, 1), so this is 4: the number of
# features per node, not the number of nodes.
state_dim = env.observation_space.shape[0]
action_dim = env.customer_count # One action per node, depot included
max_action = 1

## Create the policy network

In [15]:
# Show the dimensions, then build the trainer holding the actor/critic networks
print(state_dim, action_dim, max_action)
policy = Actor_Critic(state_dim)

4 11 1


## Create the Experience Replay memory

In [16]:
# Uses the default capacity (max_size=1e6 transitions)
replay_buffer = ReplayBuffer()

## Define a list in which the policy evaluation results are stored (the initial evaluation below runs a single episode)

In [17]:
# Baseline: evaluate the untrained policy on a single episode
evaluations = [evaluate_policy(policy, eval_episodes=1)]
# Completed sub-routes (depot ... depot) of the episode that was just evaluated
print(env.routes)

---------------------------------------
Average Reward over the Evaluation Step: -5.339425
---------------------------------------
[[0, 6, 9, 1, 10, 4, 7, 0], [0, 5, 2, 8, 0], [0, 3, 0]]


In [18]:
# Inspect the environment after the evaluation episode: the sub-routes driven
# and the final state (columns: x, y, remaining demand, vehicle load).
print(env.routes)
print(env.state)

[[0, 6, 9, 1, 10, 4, 7, 0], [0, 5, 2, 8, 0], [0, 3, 0]]
[[0.89425677 0.62202079 0.         2.        ]
 [0.31273994 0.02823563 0.         2.        ]
 [0.56317247 0.71834622 0.         2.        ]
 [0.15559268 0.93019907 0.         2.        ]
 [0.30514436 0.31988291 0.         2.        ]
 [0.15515951 0.74133423 0.         2.        ]
 [0.57963753 0.01344141 0.         2.        ]
 [0.88074163 0.72017397 0.         2.        ]
 [0.51136044 0.80973621 0.         2.        ]
 [0.33875451 0.02505394 0.         2.        ]
 [0.33504032 0.21694058 0.         2.        ]]


## Create the directories in which the final results will be saved

In [19]:
def mkdir(base, name):
    """Create the directory ``base/name`` if it is missing and return its path.

    Missing parent directories are created as well. Calling the function for a
    directory that already exists is a no-op.
    """
    path = os.path.join(base, name)
    # exist_ok=True avoids the race between checking for the path and creating it.
    os.makedirs(path, exist_ok=True)
    return path
# Create exp/brs and exp/brs/monitor
work_dir = mkdir('exp', 'brs')
monitor_dir = mkdir(work_dir, 'monitor')
max_episode_steps = env._max_episode_steps
# Optional recording: when enabled, the environment is wrapped in gym's Monitor
# (writing to monitor_dir) and reset once.
save_env_vid = False
if save_env_vid:
  env = wrappers.Monitor(env, monitor_dir, force = True)
  env.reset()

## Initialize the training process variables

In [20]:
total_timesteps = 0 # Environment steps taken so far
timesteps_since_eval = 0 # Steps since the last policy evaluation
episode_num = 0
done = True # Start as "done" so the first iteration of the training loop runs its episode-reset branch
t0 = time.time() # Wall-clock start time; not read by the code visible in this file

## Training

In [None]:
# We start the main loop; it runs until max_timesteps environment steps have been taken
env.reset()
total_timesteps = 0
obs = copy.deepcopy(env.reset())
maximum = -1000 # Best evaluation score so far; a new best triggers a model save
while total_timesteps < max_timesteps:
  
  # If the episode is done
  if done:
    
    # Once more than batch_size transitions have been collected, train after every
    # episode, with one gradient iteration per step of the episode that just ended
    if total_timesteps != 0 and total_timesteps > batch_size:
      # The printed epsilon is the raw value; it is clamped to 1 where it is used
      print("Total Timesteps: {} Episode Num: {} Reward: {} Epsilon: {}".format(total_timesteps, episode_num, episode_reward, epsilon))
      policy.train(replay_buffer, episode_timesteps, batch_size, discount, tau)

    # We evaluate the episode and we save the policy
    if timesteps_since_eval >= eval_freq:
      timesteps_since_eval %= eval_freq
      print("Total timesteps: {} Epsilon: {}".format(total_timesteps, min(1,epsilon)))
      evaluations.append(evaluate_policy(policy))
      # NOTE(review): this save is not gated by save_models, unlike the final save after the loop
      if evaluations[len(evaluations)-1] > maximum:
        policy.save(file_name, directory="./pytorch_models")
        maximum = evaluations[len(evaluations)-1]
      np.save("./results/%s" % (file_name), evaluations)
    
    # When the training step is done, we reset the state of the environment
    obs = copy.deepcopy(env.reset())
    
    # Set the Done to False
    done = False
    
    # Set rewards and episode timesteps to zero
    episode_reward = 0
    episode_timesteps = 0
    episode_num += 1
  
  # Work with epsilon-greedy. epsilon = 50000/(t+1) is >= 1 for roughly the first
  # 50,000 timesteps, so every action is random during that period.
  epsilon = 50000/(total_timesteps+1)
  # Reseeding every step makes the exploration draws a deterministic function of the timestep counter
  np.random.seed(total_timesteps)
  if np.random.rand() < min(1,epsilon):
    # Random feasible action: start from the depot, collect every customer with
    # positive demand strictly below the current load, and drop the depot again
    # if at least one customer qualifies.
    feasible = np.array([0])
    for i in range(1,env.customer_count):
      if obs[i,2] < obs[i,3] and obs[i,2] != 0:
        feasible = np.concatenate((feasible,np.array([i])))
    if len(feasible) > 1:
      feasible = np.delete(feasible,0)
    action = np.random.choice(feasible)
  else: # Choose greedy action
    action, current_Q = policy.select_action(obs, state_dim)
  
  # The agent performs the action in the environment, then reaches the next state and receives the reward
  new_obs, reward, done = env.step(action)

  # Store done as 0 when the episode is cut off at the step limit, so that case is not recorded as a terminal state
  done_bool = 0 if episode_timesteps + 1 == env._max_episode_steps else float(done)

  # Increase the total reward
  episode_reward += reward

  # Store the new transition into the Experience Replay memory (ReplayBuffer)
  replay_buffer.add((obs, copy.deepcopy(new_obs), action, reward, done_bool)) # Have to use deepcopy to prevent overwriting of historic next states

  # Update the state, the episode timestep, the total timesteps, and the timesteps since the evaluation of the policy
  obs = copy.deepcopy(new_obs)
  episode_timesteps += 1
  total_timesteps += 1
  timesteps_since_eval += 1

# Add the last policy evaluation to our list of evaluations and we save our model
evaluations.append(evaluate_policy(policy))
if save_models: policy.save("%s" % (file_name), directory="./pytorch_models")
np.save("./results/%s" % (file_name), evaluations)

Total Timesteps: 134 Episode Num: 11 Reward: -7.7166221027212 Epsilon: 373.13432835820896
Total Timesteps: 147 Episode Num: 12 Reward: -7.0453468193966575 Epsilon: 340.13605442176873
Total Timesteps: 159 Episode Num: 13 Reward: -6.9251317529338445 Epsilon: 314.4654088050315
Total Timesteps: 172 Episode Num: 14 Reward: -6.57280741089108 Epsilon: 290.69767441860466
Total Timesteps: 184 Episode Num: 15 Reward: -6.555108225007234 Epsilon: 271.7391304347826
Total Timesteps: 197 Episode Num: 16 Reward: -7.047276208222322 Epsilon: 253.80710659898477
Total Timesteps: 210 Episode Num: 17 Reward: -6.895482958275187 Epsilon: 238.0952380952381
Total Timesteps: 223 Episode Num: 18 Reward: -5.889824317403686 Epsilon: 224.2152466367713
Total Timesteps: 236 Episode Num: 19 Reward: -6.677225406454144 Epsilon: 211.864406779661
Total Timesteps: 249 Episode Num: 20 Reward: -7.183859911763287 Epsilon: 200.80321285140562
Total Timesteps: 263 Episode Num: 21 Reward: -6.627466414119959 Epsilon: 190.1140684410

Total Timesteps: 1292 Episode Num: 101 Reward: -7.375219285719137 Epsilon: 38.69969040247678
Total Timesteps: 1304 Episode Num: 102 Reward: -4.714605910170475 Epsilon: 38.34355828220859
Total Timesteps: 1317 Episode Num: 103 Reward: -6.848132720129744 Epsilon: 37.96507213363705
Total Timesteps: 1329 Episode Num: 104 Reward: -6.543914129808644 Epsilon: 37.62227238525207
Total Timesteps: 1342 Episode Num: 105 Reward: -7.122810609516416 Epsilon: 37.257824143070046
Total Timesteps: 1355 Episode Num: 106 Reward: -6.409329938507055 Epsilon: 36.90036900369004
Total Timesteps: 1368 Episode Num: 107 Reward: -5.638941662466966 Epsilon: 36.54970760233918
Total Timesteps: 1380 Episode Num: 108 Reward: -7.5297128916094085 Epsilon: 36.231884057971016
Total Timesteps: 1393 Episode Num: 109 Reward: -5.503562781436424 Epsilon: 35.89375448671931
Total Timesteps: 1406 Episode Num: 110 Reward: -7.352307218590518 Epsilon: 35.56187766714083
Total Timesteps: 1419 Episode Num: 111 Reward: -4.963487535888658 E

Total Timesteps: 2426 Episode Num: 189 Reward: -6.797776877397653 Epsilon: 20.610057708161584
Total Timesteps: 2439 Episode Num: 190 Reward: -8.53843176726857 Epsilon: 20.50020500205002
Total Timesteps: 2453 Episode Num: 191 Reward: -7.9576250536062405 Epsilon: 20.383204239706483
Total Timesteps: 2466 Episode Num: 192 Reward: -9.588808003237716 Epsilon: 20.275750202757504
Total Timesteps: 2479 Episode Num: 193 Reward: -7.29000876005507 Epsilon: 20.169423154497782
Total Timesteps: 2492 Episode Num: 194 Reward: -7.128708665938453 Epsilon: 20.064205457463885
Total Timesteps: 2505 Episode Num: 195 Reward: -6.907882711637171 Epsilon: 19.960079840319363
Total Timesteps: 2518 Episode Num: 196 Reward: -7.314185601454649 Epsilon: 19.857029388403493
Total Timesteps: 2531 Episode Num: 197 Reward: -6.767081224378841 Epsilon: 19.755037534571315
Total Timesteps: 2544 Episode Num: 198 Reward: -6.1324451375658535 Epsilon: 19.654088050314467
Total Timesteps: 2557 Episode Num: 199 Reward: -5.16141901424

Total Timesteps: 3564 Episode Num: 277 Reward: -7.174044320782159 Epsilon: 14.029180695847362
Total Timesteps: 3577 Episode Num: 278 Reward: -8.024884272757578 Epsilon: 13.97819401733296
Total Timesteps: 3590 Episode Num: 279 Reward: -6.652580954866882 Epsilon: 13.927576601671309
Total Timesteps: 3603 Episode Num: 280 Reward: -5.797428674827902 Epsilon: 13.877324451845684
Total Timesteps: 3616 Episode Num: 281 Reward: -6.914752280910663 Epsilon: 13.827433628318584
Total Timesteps: 3629 Episode Num: 282 Reward: -6.71876858104695 Epsilon: 13.777900248002204
Total Timesteps: 3643 Episode Num: 283 Reward: -8.41910557728666 Epsilon: 13.724951962668131
Total Timesteps: 3656 Episode Num: 284 Reward: -5.980656932648756 Epsilon: 13.676148796498905
Total Timesteps: 3669 Episode Num: 285 Reward: -7.221372485644578 Epsilon: 13.627691469065141
Total Timesteps: 3682 Episode Num: 286 Reward: -6.340578494974445 Epsilon: 13.579576317218903
Total Timesteps: 3695 Episode Num: 287 Reward: -5.4934579772830

Total Timesteps: 4705 Episode Num: 365 Reward: -6.171720214407562 Epsilon: 10.626992561105208
Total Timesteps: 4719 Episode Num: 366 Reward: -6.675785042837068 Epsilon: 10.595465140919686
Total Timesteps: 4732 Episode Num: 367 Reward: -5.557286485297405 Epsilon: 10.566356720202874
Total Timesteps: 4746 Episode Num: 368 Reward: -6.271716537999604 Epsilon: 10.53518752633797
Total Timesteps: 4759 Episode Num: 369 Reward: -5.792138446853599 Epsilon: 10.506408909434755
Total Timesteps: 4772 Episode Num: 370 Reward: -7.045041591105368 Epsilon: 10.477787091366304
Total Timesteps: 4785 Episode Num: 371 Reward: -7.839936730691921 Epsilon: 10.449320794148381
Total Timesteps: 4798 Episode Num: 372 Reward: -6.707652032969159 Epsilon: 10.421008753647353
Total Timesteps: 4811 Episode Num: 373 Reward: -8.205094222144142 Epsilon: 10.392849719393057
Total Timesteps: 4824 Episode Num: 374 Reward: -6.728769143084986 Epsilon: 10.364842454394694
Total Timesteps: 4838 Episode Num: 375 Reward: -7.20250330163

Total Timesteps: 5825 Episode Num: 452 Reward: -4.665002160006941 Epsilon: 8.583690987124463
Total Timesteps: 5838 Episode Num: 453 Reward: -6.210815841389306 Epsilon: 8.564576909900651
Total Timesteps: 5850 Episode Num: 454 Reward: -5.231564807123809 Epsilon: 8.547008547008547
Total Timesteps: 5863 Episode Num: 455 Reward: -6.521012943175128 Epsilon: 8.528057308545113
Total Timesteps: 5876 Episode Num: 456 Reward: -6.750015086672204 Epsilon: 8.509189925119129
Total Timesteps: 5889 Episode Num: 457 Reward: -5.784419856913712 Epsilon: 8.49040584139922
Total Timesteps: 5902 Episode Num: 458 Reward: -7.664573761243926 Epsilon: 8.471704506946798
Total Timesteps: 5915 Episode Num: 459 Reward: -6.470760084444697 Epsilon: 8.4530853761623
Total Timesteps: 5928 Episode Num: 460 Reward: -6.6581863012374765 Epsilon: 8.434547908232119
Total Timesteps: 5940 Episode Num: 461 Reward: -4.794339565622102 Epsilon: 8.417508417508417
Total Timesteps: 5953 Episode Num: 462 Reward: -5.388478126537354 Epsilo

Total Timesteps: 6976 Episode Num: 541 Reward: -7.0678773791228755 Epsilon: 7.16743119266055
Total Timesteps: 6988 Episode Num: 542 Reward: -6.670693534251346 Epsilon: 7.1551230681167715
Total Timesteps: 7001 Episode Num: 543 Reward: -5.3948397437858935 Epsilon: 7.14183688044565
Total Timesteps: 7014 Episode Num: 544 Reward: -6.226037774177382 Epsilon: 7.1285999429712
Total Timesteps: 7027 Episode Num: 545 Reward: -5.707143729186672 Epsilon: 7.115411982353779
Total Timesteps: 7040 Episode Num: 546 Reward: -5.43636717723538 Epsilon: 7.1022727272727275
Total Timesteps: 7053 Episode Num: 547 Reward: -7.463730261373983 Epsilon: 7.089181908407769
Total Timesteps: 7066 Episode Num: 548 Reward: -6.570261928428524 Epsilon: 7.076139258420605
Total Timesteps: 7078 Episode Num: 549 Reward: -6.569554685214745 Epsilon: 7.064142413111048
Total Timesteps: 7090 Episode Num: 550 Reward: -6.50323260922499 Epsilon: 7.052186177715091
Total Timesteps: 7103 Episode Num: 551 Reward: -7.388800457450253 Epsilo

Total Timesteps: 8113 Episode Num: 630 Reward: -6.420502971513211 Epsilon: 6.162948354492789
Total Timesteps: 8126 Episode Num: 631 Reward: -6.475201718772217 Epsilon: 6.153088850603003
Total Timesteps: 8138 Episode Num: 632 Reward: -5.95023925708127 Epsilon: 6.144015728680265
Total Timesteps: 8151 Episode Num: 633 Reward: -6.431223960361833 Epsilon: 6.13421666053245
Total Timesteps: 8164 Episode Num: 634 Reward: -7.55563863080256 Epsilon: 6.124448799608035
Total Timesteps: 8177 Episode Num: 635 Reward: -6.425854427660539 Epsilon: 6.114711997064938
Total Timesteps: 8190 Episode Num: 636 Reward: -4.47936134843815 Epsilon: 6.105006105006105
Total Timesteps: 8203 Episode Num: 637 Reward: -4.9333706429648405 Epsilon: 6.095330976472022
Total Timesteps: 8216 Episode Num: 638 Reward: -6.3203618002514 Epsilon: 6.085686465433301
Total Timesteps: 8229 Episode Num: 639 Reward: -5.141571743862517 Epsilon: 6.076072426783327
Total Timesteps: 8242 Episode Num: 640 Reward: -7.264792283446886 Epsilon: 

Total Timesteps: 9260 Episode Num: 719 Reward: -6.445864403751301 Epsilon: 5.399568034557236
Total Timesteps: 9273 Episode Num: 720 Reward: -6.716839318269545 Epsilon: 5.391998274560552
Total Timesteps: 9287 Episode Num: 721 Reward: -8.076039170685204 Epsilon: 5.383869925702595
Total Timesteps: 9300 Episode Num: 722 Reward: -5.390675403754378 Epsilon: 5.376344086021505
Total Timesteps: 9313 Episode Num: 723 Reward: -5.249477020081903 Epsilon: 5.368839256952647
Total Timesteps: 9326 Episode Num: 724 Reward: -7.77618115775501 Epsilon: 5.36135535063264
Total Timesteps: 9339 Episode Num: 725 Reward: -4.331429296703812 Epsilon: 5.353892279687333
Total Timesteps: 9352 Episode Num: 726 Reward: -6.1093354477476 Epsilon: 5.3464499572284
Total Timesteps: 9365 Episode Num: 727 Reward: -8.05703010547475 Epsilon: 5.339028296849973
Total Timesteps: 9377 Episode Num: 728 Reward: -6.187082461799649 Epsilon: 5.332195798229711
Total Timesteps: 9390 Episode Num: 729 Reward: -6.9502341148406765 Epsilon: 5

Total Timesteps: 10372 Episode Num: 806 Reward: -6.036841360204755 Epsilon: 4.820671037408407
Total Timesteps: 10385 Episode Num: 807 Reward: -8.23921803317122 Epsilon: 4.814636494944632
Total Timesteps: 10397 Episode Num: 808 Reward: -4.561236657219309 Epsilon: 4.809079542175628
Total Timesteps: 10410 Episode Num: 809 Reward: -7.121838697080079 Epsilon: 4.803073967339097
Total Timesteps: 10423 Episode Num: 810 Reward: -6.148898865854546 Epsilon: 4.797083373309028
Total Timesteps: 10437 Episode Num: 811 Reward: -8.662387490153085 Epsilon: 4.790648653827728
Total Timesteps: 10450 Episode Num: 812 Reward: -7.059550260951971 Epsilon: 4.784688995215311
Total Timesteps: 10463 Episode Num: 813 Reward: -6.986539107060628 Epsilon: 4.778744146038421
Total Timesteps: 10475 Episode Num: 814 Reward: -6.5212378027475015 Epsilon: 4.77326968973747
Total Timesteps: 10488 Episode Num: 815 Reward: -6.405605473673336 Epsilon: 4.767353165522502
Total Timesteps: 10501 Episode Num: 816 Reward: -7.9701209460

Total Timesteps: 11502 Episode Num: 894 Reward: -5.679110834866029 Epsilon: 4.347070074769605
Total Timesteps: 11515 Episode Num: 895 Reward: -7.892885566806699 Epsilon: 4.342162396873643
Total Timesteps: 11528 Episode Num: 896 Reward: -7.43954941414366 Epsilon: 4.337265787647467
Total Timesteps: 11541 Episode Num: 897 Reward: -6.815438737022008 Epsilon: 4.332380209687202
Total Timesteps: 11554 Episode Num: 898 Reward: -6.527785525938668 Epsilon: 4.327505625757314
Total Timesteps: 11567 Episode Num: 899 Reward: -7.967837385454068 Epsilon: 4.32264199878966
Total Timesteps: 11580 Episode Num: 900 Reward: -6.990093697165355 Epsilon: 4.317789291882556
Total Timesteps: 11593 Episode Num: 901 Reward: -6.8098699075657 Epsilon: 4.3129474682998366
Total Timesteps: 11605 Episode Num: 902 Reward: -5.824092122673298 Epsilon: 4.308487720809996
Total Timesteps: 11618 Episode Num: 903 Reward: -6.351772206101855 Epsilon: 4.30366672404889
Total Timesteps: 11631 Episode Num: 904 Reward: -7.8385089702125

Total Timesteps: 12631 Episode Num: 982 Reward: -5.605431451039165 Epsilon: 3.9585147652600745
Total Timesteps: 12644 Episode Num: 983 Reward: -6.122606916761946 Epsilon: 3.9544447959506486
Total Timesteps: 12657 Episode Num: 984 Reward: -6.485998690433135 Epsilon: 3.9503831871691553
Total Timesteps: 12671 Episode Num: 985 Reward: -7.141685721052445 Epsilon: 3.9460184673664274
Total Timesteps: 12685 Episode Num: 986 Reward: -8.564863810841988 Epsilon: 3.9416633819471816
Total Timesteps: 12698 Episode Num: 987 Reward: -7.0535314526151796 Epsilon: 3.9376279729091195
Total Timesteps: 12711 Episode Num: 988 Reward: -7.487194058011737 Epsilon: 3.93360081818897
Total Timesteps: 12724 Episode Num: 989 Reward: -6.9090822544074735 Epsilon: 3.9295818924866395
Total Timesteps: 12737 Episode Num: 990 Reward: -7.447945390489026 Epsilon: 3.925571170605323
Total Timesteps: 12750 Episode Num: 991 Reward: -6.019778082697929 Epsilon: 3.9215686274509802
Total Timesteps: 12763 Episode Num: 992 Reward: -5.

Total Timesteps: 13740 Episode Num: 1068 Reward: -8.044673575606518 Epsilon: 3.63901018922853
Total Timesteps: 13752 Episode Num: 1069 Reward: -6.978059959674705 Epsilon: 3.6358347876672483
Total Timesteps: 13765 Episode Num: 1070 Reward: -7.235237946536149 Epsilon: 3.6324010170722847
Total Timesteps: 13779 Episode Num: 1071 Reward: -5.73770240726524 Epsilon: 3.628710356339357
Total Timesteps: 13792 Episode Num: 1072 Reward: -5.710963606636497 Epsilon: 3.625290023201856
Total Timesteps: 13805 Episode Num: 1073 Reward: -8.3748974363956 Epsilon: 3.621876131836291
Total Timesteps: 13817 Episode Num: 1074 Reward: -6.729460360183948 Epsilon: 3.6187305493232973
Total Timesteps: 13831 Episode Num: 1075 Reward: -5.457718965647825 Epsilon: 3.615067601764153
Total Timesteps: 13844 Episode Num: 1076 Reward: -7.460625288420177 Epsilon: 3.61167292689974
Total Timesteps: 13858 Episode Num: 1077 Reward: -8.384007751383443 Epsilon: 3.6080242459229326
Total Timesteps: 13871 Episode Num: 1078 Reward: -6

Total Timesteps: 14849 Episode Num: 1154 Reward: -7.183610591208837 Epsilon: 3.367230116506162
Total Timesteps: 14862 Episode Num: 1155 Reward: -8.357520934874588 Epsilon: 3.364284753061499
Total Timesteps: 14874 Episode Num: 1156 Reward: -4.578186740214149 Epsilon: 3.3615705257496304
Total Timesteps: 14887 Episode Num: 1157 Reward: -6.516386053109795 Epsilon: 3.3586350507153893
Total Timesteps: 14901 Episode Num: 1158 Reward: -8.92225678129966 Epsilon: 3.355479498020267
Total Timesteps: 14914 Episode Num: 1159 Reward: -6.0450106557831855 Epsilon: 3.35255464664074
Total Timesteps: 14927 Episode Num: 1160 Reward: -6.268120000800009 Epsilon: 3.349634889797012
Total Timesteps: 14940 Episode Num: 1161 Reward: -6.135675002588255 Epsilon: 3.3467202141900936
Total Timesteps: 14952 Episode Num: 1162 Reward: -7.042899478806658 Epsilon: 3.3440342429106473
Total Timesteps: 14965 Episode Num: 1163 Reward: -6.616061844454602 Epsilon: 3.341129301703976
Total Timesteps: 14978 Episode Num: 1164 Reward

Total Timesteps: 15945 Episode Num: 1239 Reward: -5.942568154175953 Epsilon: 3.1357792411414236
Total Timesteps: 15959 Episode Num: 1240 Reward: -7.8186608574176075 Epsilon: 3.13302838523717
Total Timesteps: 15972 Episode Num: 1241 Reward: -6.985920454208902 Epsilon: 3.130478337089907
Total Timesteps: 15985 Episode Num: 1242 Reward: -6.187146992658892 Epsilon: 3.127932436659368
Total Timesteps: 15998 Episode Num: 1243 Reward: -7.494803243266945 Epsilon: 3.125390673834229
Total Timesteps: 16011 Episode Num: 1244 Reward: -7.438405098601567 Epsilon: 3.1228530385360065
Total Timesteps: 16025 Episode Num: 1245 Reward: -5.662065716065445 Epsilon: 3.1201248049921997
Total Timesteps: 16038 Episode Num: 1246 Reward: -7.719101456035102 Epsilon: 3.117595710188303
Total Timesteps: 16051 Episode Num: 1247 Reward: -6.43721013549987 Epsilon: 3.115070712105165
Total Timesteps: 16064 Episode Num: 1248 Reward: -5.637326441053695 Epsilon: 3.112549800796813
Total Timesteps: 16077 Episode Num: 1249 Reward:

Total Timesteps: 17055 Episode Num: 1325 Reward: -6.886600815882834 Epsilon: 2.9316915860451482
Total Timesteps: 17068 Episode Num: 1326 Reward: -8.057425327815004 Epsilon: 2.929458636044059
Total Timesteps: 17081 Episode Num: 1327 Reward: -5.526331623841249 Epsilon: 2.927229084948188
Total Timesteps: 17094 Episode Num: 1328 Reward: -7.501308722653899 Epsilon: 2.925002925002925
Total Timesteps: 17107 Episode Num: 1329 Reward: -7.526678416641367 Epsilon: 2.9227801484772313
Total Timesteps: 17120 Episode Num: 1330 Reward: -9.077730160028006 Epsilon: 2.9205607476635516
Total Timesteps: 17133 Episode Num: 1331 Reward: -7.168544079382941 Epsilon: 2.9183447148777213
Total Timesteps: 17147 Episode Num: 1332 Reward: -7.130273208062234 Epsilon: 2.915961975855835
Total Timesteps: 17160 Episode Num: 1333 Reward: -4.316648077846791 Epsilon: 2.913752913752914
Total Timesteps: 17174 Episode Num: 1334 Reward: -7.012610719772829 Epsilon: 2.9113776639105624
Total Timesteps: 17187 Episode Num: 1335 Rewa

Total Timesteps: 18165 Episode Num: 1411 Reward: -8.501059496971031 Epsilon: 2.7525461051472613
Total Timesteps: 18178 Episode Num: 1412 Reward: -5.102368587872753 Epsilon: 2.750577621300473
Total Timesteps: 18191 Episode Num: 1413 Reward: -7.5621909760356445 Epsilon: 2.748611950964763
Total Timesteps: 18204 Episode Num: 1414 Reward: -9.412158567397523 Epsilon: 2.7466490881125027
Total Timesteps: 18217 Episode Num: 1415 Reward: -5.006284650226449 Epsilon: 2.7446890267332713
Total Timesteps: 18229 Episode Num: 1416 Reward: -5.767594702555409 Epsilon: 2.742882220637446
Total Timesteps: 18242 Episode Num: 1417 Reward: -5.486633789735495 Epsilon: 2.74092752987611
Total Timesteps: 18255 Episode Num: 1418 Reward: -7.30993826056975 Epsilon: 2.7389756231169544
Total Timesteps: 18267 Episode Num: 1419 Reward: -4.6192577646194914 Epsilon: 2.737176328899108
Total Timesteps: 18280 Episode Num: 1420 Reward: -8.10980192061214 Epsilon: 2.735229759299781
Total Timesteps: 18294 Episode Num: 1421 Reward

Total Timesteps: 19279 Episode Num: 1497 Reward: -7.304743665273288 Epsilon: 2.593495513252762
Total Timesteps: 19292 Episode Num: 1498 Reward: -6.090474713753681 Epsilon: 2.591747874766743
Total Timesteps: 19305 Episode Num: 1499 Reward: -7.702339730484096 Epsilon: 2.59000259000259
Total Timesteps: 19318 Episode Num: 1500 Reward: -6.067478376350382 Epsilon: 2.5882596542085103
Total Timesteps: 19331 Episode Num: 1501 Reward: -6.4604886392351135 Epsilon: 2.586519062645492
Total Timesteps: 19344 Episode Num: 1502 Reward: -5.6775560512446885 Epsilon: 2.584780810587262
Total Timesteps: 19357 Episode Num: 1503 Reward: -7.39883329760988 Epsilon: 2.583044893320246
Total Timesteps: 19370 Episode Num: 1504 Reward: -8.544628683311089 Epsilon: 2.581311306143521
Total Timesteps: 19383 Episode Num: 1505 Reward: -5.726579597158014 Epsilon: 2.5795800443687766
Total Timesteps: 19396 Episode Num: 1506 Reward: -5.239040008340719 Epsilon: 2.5778511033202722
Total Timesteps: 19409 Episode Num: 1507 Reward

Total Timesteps: 20376 Episode Num: 1582 Reward: -10.0143478534223 Epsilon: 2.453867294856694
Total Timesteps: 20389 Episode Num: 1583 Reward: -6.707638267660639 Epsilon: 2.4523027122467997
Total Timesteps: 20401 Episode Num: 1584 Reward: -5.332101221044604 Epsilon: 2.4508602519484337
Total Timesteps: 20414 Episode Num: 1585 Reward: -8.458727376647204 Epsilon: 2.449299500342902
Total Timesteps: 20426 Episode Num: 1586 Reward: -6.353855188343778 Epsilon: 2.4478605698619407
Total Timesteps: 20439 Episode Num: 1587 Reward: -5.240420263219049 Epsilon: 2.446303635207202
Total Timesteps: 20451 Episode Num: 1588 Reward: -5.472119320097721 Epsilon: 2.4448682216028557
Total Timesteps: 20465 Episode Num: 1589 Reward: -6.35630167101737 Epsilon: 2.443195699975568
Total Timesteps: 20477 Episode Num: 1590 Reward: -4.98229456760997 Epsilon: 2.4417639302632224
Total Timesteps: 20491 Episode Num: 1591 Reward: -7.905315193450082 Epsilon: 2.4400956517495485
Total Timesteps: 20504 Episode Num: 1592 Reward

Total Timesteps: 21480 Episode Num: 1668 Reward: -7.677519332333139 Epsilon: 2.3277467411545625
Total Timesteps: 21492 Episode Num: 1669 Reward: -6.864879511037396 Epsilon: 2.3264470500651404
Total Timesteps: 21505 Episode Num: 1670 Reward: -7.560116578369609 Epsilon: 2.325040688212044
Total Timesteps: 21518 Episode Num: 1671 Reward: -7.824086672528109 Epsilon: 2.3236360256529416
Total Timesteps: 21531 Episode Num: 1672 Reward: -8.324482571594993 Epsilon: 2.3222330593098324
Total Timesteps: 21543 Episode Num: 1673 Reward: -5.596761042001733 Epsilon: 2.320939516316205
Total Timesteps: 21556 Episode Num: 1674 Reward: -6.58536847557622 Epsilon: 2.319539803303025
Total Timesteps: 21569 Episode Num: 1675 Reward: -5.417764594268063 Epsilon: 2.318141777551115
Total Timesteps: 21582 Episode Num: 1676 Reward: -6.553613443213861 Epsilon: 2.316745436011491
Total Timesteps: 21595 Episode Num: 1677 Reward: -6.159110683021405 Epsilon: 2.31535077564251
Total Timesteps: 21608 Episode Num: 1678 Reward:

Total Timesteps: 22584 Episode Num: 1754 Reward: -9.283062251498968 Epsilon: 2.2139567835635847
Total Timesteps: 22597 Episode Num: 1755 Reward: -6.063212357224482 Epsilon: 2.212683099526486
Total Timesteps: 22610 Episode Num: 1756 Reward: -6.71813009614022 Epsilon: 2.21141088014153
Total Timesteps: 22623 Episode Num: 1757 Reward: -7.034879738846928 Epsilon: 2.2101401228837907
Total Timesteps: 22636 Episode Num: 1758 Reward: -5.85813492519115 Epsilon: 2.2088708252341402
Total Timesteps: 22649 Episode Num: 1759 Reward: -7.658275289539429 Epsilon: 2.207602984679235
Total Timesteps: 22662 Episode Num: 1760 Reward: -7.688803828893437 Epsilon: 2.2063365987114993
Total Timesteps: 22675 Episode Num: 1761 Reward: -6.471989289458696 Epsilon: 2.205071664829107
Total Timesteps: 22688 Episode Num: 1762 Reward: -7.969226686560699 Epsilon: 2.203808180535966
Total Timesteps: 22701 Episode Num: 1763 Reward: -4.741948487367793 Epsilon: 2.202546143341703
Total Timesteps: 22714 Episode Num: 1764 Reward: 

Total Timesteps: 23690 Episode Num: 1840 Reward: -6.241150484363122 Epsilon: 2.110595187842972
Total Timesteps: 23703 Episode Num: 1841 Reward: -7.262855607930791 Epsilon: 2.1094376239294603
Total Timesteps: 23716 Episode Num: 1842 Reward: -8.765539714620166 Epsilon: 2.1082813290605498
Total Timesteps: 23729 Episode Num: 1843 Reward: -6.705288495859547 Epsilon: 2.107126301150491
Total Timesteps: 23742 Episode Num: 1844 Reward: -6.58170502584785 Epsilon: 2.105972538118103
Total Timesteps: 23755 Episode Num: 1845 Reward: -7.650752315081712 Epsilon: 2.1048200378867605
Total Timesteps: 23768 Episode Num: 1846 Reward: -7.370993573616817 Epsilon: 2.1036687983843825
Total Timesteps: 23781 Episode Num: 1847 Reward: -7.338781339641984 Epsilon: 2.102518817543417
Total Timesteps: 23794 Episode Num: 1848 Reward: -8.391974250905756 Epsilon: 2.101370093300832
Total Timesteps: 23806 Episode Num: 1849 Reward: -5.149935135598511 Epsilon: 2.100310846005209
Total Timesteps: 23819 Episode Num: 1850 Reward

Total Timesteps: 24795 Episode Num: 1926 Reward: -8.389151314307384 Epsilon: 2.0165355918531964
Total Timesteps: 24808 Episode Num: 1927 Reward: -5.52047424927266 Epsilon: 2.015478877781361
Total Timesteps: 24821 Episode Num: 1928 Reward: -5.968677787594331 Epsilon: 2.0144232706176224
Total Timesteps: 24834 Episode Num: 1929 Reward: -6.117027471830295 Epsilon: 2.013368768623661
Total Timesteps: 24848 Episode Num: 1930 Reward: -9.305287703917847 Epsilon: 2.012234385061172
Total Timesteps: 24861 Episode Num: 1931 Reward: -4.975226403800537 Epsilon: 2.0111821728812194
Total Timesteps: 24874 Episode Num: 1932 Reward: -5.624236112099238 Epsilon: 2.0101310605451475
Total Timesteps: 24887 Episode Num: 1933 Reward: -6.952014055689834 Epsilon: 2.009081046329409
Total Timesteps: 24900 Episode Num: 1934 Reward: -7.039407788156505 Epsilon: 2.0080321285140563
Total Timesteps: 24913 Episode Num: 1935 Reward: -8.066256857499958 Epsilon: 2.006984305382732
Total Timesteps: 24926 Episode Num: 1936 Rewar

Total Timesteps: 25880 Episode Num: 2010 Reward: -8.20085691732231 Epsilon: 1.9319938176197837
Total Timesteps: 25893 Episode Num: 2011 Reward: -5.324175355832827 Epsilon: 1.9310238288340478
Total Timesteps: 25906 Episode Num: 2012 Reward: -7.699776045537213 Epsilon: 1.930054813556705
Total Timesteps: 25919 Episode Num: 2013 Reward: -7.847548018171934 Epsilon: 1.929086770322929
Total Timesteps: 25932 Episode Num: 2014 Reward: -6.757741830583228 Epsilon: 1.9281196976708315
Total Timesteps: 25946 Episode Num: 2015 Reward: -6.400597133983929 Epsilon: 1.927079318584753
Total Timesteps: 25958 Episode Num: 2016 Reward: -7.323364926689314 Epsilon: 1.926188458278758
Total Timesteps: 25971 Episode Num: 2017 Reward: -6.519372010102011 Epsilon: 1.9252242886296254
Total Timesteps: 25984 Episode Num: 2018 Reward: -7.603593929911975 Epsilon: 1.9242610837438423
Total Timesteps: 25997 Episode Num: 2019 Reward: -7.207397878389577 Epsilon: 1.923298842174097
Total Timesteps: 26010 Episode Num: 2020 Rewar

Total Timesteps: 26988 Episode Num: 2096 Reward: -7.89836956542605 Epsilon: 1.8526752630798873
Total Timesteps: 27001 Episode Num: 2097 Reward: -4.97619171738875 Epsilon: 1.8517832672863968
Total Timesteps: 27014 Episode Num: 2098 Reward: -5.816635598314516 Epsilon: 1.8508921300066632
Total Timesteps: 27027 Episode Num: 2099 Reward: -5.6176907979558965 Epsilon: 1.85000185000185
Total Timesteps: 27040 Episode Num: 2100 Reward: -8.029357975057017 Epsilon: 1.849112426035503
Total Timesteps: 27054 Episode Num: 2101 Reward: -6.869245845864285 Epsilon: 1.8481555407703112
Total Timesteps: 27067 Episode Num: 2102 Reward: -5.052057733398357 Epsilon: 1.8472678907895224
Total Timesteps: 27080 Episode Num: 2103 Reward: -7.373839938519659 Epsilon: 1.846381093057607
Total Timesteps: 27093 Episode Num: 2104 Reward: -7.944815965905106 Epsilon: 1.8454951463477651
Total Timesteps: 27106 Episode Num: 2105 Reward: -6.673097098761206 Epsilon: 1.8446100494355493
Total Timesteps: 27119 Episode Num: 2106 Rewa

Total Timesteps: 28088 Episode Num: 2182 Reward: -7.106992222460693 Epsilon: 1.7801196240387354
Total Timesteps: 28101 Episode Num: 2183 Reward: -6.120539380525754 Epsilon: 1.7792961104587026
Total Timesteps: 28114 Episode Num: 2184 Reward: -8.681915094714705 Epsilon: 1.7784733584690902
Total Timesteps: 28127 Episode Num: 2185 Reward: -6.532591447143991 Epsilon: 1.7776513670139011
Total Timesteps: 28140 Episode Num: 2186 Reward: -6.268609450169584 Epsilon: 1.7768301350390903
Total Timesteps: 28153 Episode Num: 2187 Reward: -6.020820232406048 Epsilon: 1.7760096614925585
Total Timesteps: 28166 Episode Num: 2188 Reward: -7.171260987658688 Epsilon: 1.7751899453241498
Total Timesteps: 28179 Episode Num: 2189 Reward: -6.771161536753583 Epsilon: 1.7743709854856453
Total Timesteps: 28192 Episode Num: 2190 Reward: -8.262148037321738 Epsilon: 1.7735527809307605
Total Timesteps: 28204 Episode Num: 2191 Reward: -6.145588544477979 Epsilon: 1.772798184654659
Total Timesteps: 28216 Episode Num: 2192 

Total Timesteps: 29194 Episode Num: 2268 Reward: -10.849671469283178 Epsilon: 1.7126806878125642
Total Timesteps: 29207 Episode Num: 2269 Reward: -6.406527086922465 Epsilon: 1.711918375731845
Total Timesteps: 29220 Episode Num: 2270 Reward: -5.820145954409159 Epsilon: 1.7111567419575633
Total Timesteps: 29233 Episode Num: 2271 Reward: -6.083481273918079 Epsilon: 1.7103957855847842
Total Timesteps: 29246 Episode Num: 2272 Reward: -6.14076494266347 Epsilon: 1.7096355057101826
Total Timesteps: 29259 Episode Num: 2273 Reward: -7.109826563898032 Epsilon: 1.708875901432038
Total Timesteps: 29272 Episode Num: 2274 Reward: -5.374515092558527 Epsilon: 1.7081169718502323
Total Timesteps: 29285 Episode Num: 2275 Reward: -6.258967307150875 Epsilon: 1.7073587160662456
Total Timesteps: 29298 Episode Num: 2276 Reward: -7.444659967485539 Epsilon: 1.7066011331831525
Total Timesteps: 29311 Episode Num: 2277 Reward: -6.1332703518579 Epsilon: 1.705844222305619
Total Timesteps: 29324 Episode Num: 2278 Rewa

Total Timesteps: 30286 Episode Num: 2353 Reward: -6.980885164184412 Epsilon: 1.6509278214356469
Total Timesteps: 30299 Episode Num: 2354 Reward: -5.16351057393026 Epsilon: 1.6502194791907323
Total Timesteps: 30312 Episode Num: 2355 Reward: -9.483570463399086 Epsilon: 1.649511744523621
Total Timesteps: 30325 Episode Num: 2356 Reward: -6.686707608277308 Epsilon: 1.6488046166529267
Total Timesteps: 30338 Episode Num: 2357 Reward: -6.987972352355673 Epsilon: 1.6480980947986024
Total Timesteps: 30352 Episode Num: 2358 Reward: -7.369110042227155 Epsilon: 1.6473379019504482
Total Timesteps: 30365 Episode Num: 2359 Reward: -6.051314318476147 Epsilon: 1.6466326362588506
Total Timesteps: 30377 Episode Num: 2360 Reward: -6.962494378640616 Epsilon: 1.6459821575534122
Total Timesteps: 30390 Episode Num: 2361 Reward: -6.594656247817618 Epsilon: 1.6452780519907864
Total Timesteps: 30403 Episode Num: 2362 Reward: -6.719775577619707 Epsilon: 1.6445745485642864
Total Timesteps: 30416 Episode Num: 2363 R

Total Timesteps: 31394 Episode Num: 2439 Reward: -6.251384551125947 Epsilon: 1.5926610180289227
Total Timesteps: 31407 Episode Num: 2440 Reward: -8.30663197671366 Epsilon: 1.592001783041997
Total Timesteps: 31420 Episode Num: 2441 Reward: -5.522023376693896 Epsilon: 1.5913430935709738
Total Timesteps: 31433 Episode Num: 2442 Reward: -7.8863912531700135 Epsilon: 1.5906849489390131
Total Timesteps: 31447 Episode Num: 2443 Reward: -5.570329216395351 Epsilon: 1.5899767863389194
Total Timesteps: 31461 Episode Num: 2444 Reward: -6.706175224594593 Epsilon: 1.5892692539970121
Total Timesteps: 31474 Episode Num: 2445 Reward: -6.445735813857605 Epsilon: 1.5886128232827095
Total Timesteps: 31486 Episode Num: 2446 Reward: -5.830592436475705 Epsilon: 1.588007368354189
Total Timesteps: 31499 Episode Num: 2447 Reward: -7.096832885159415 Epsilon: 1.5873519794279183
Total Timesteps: 31512 Episode Num: 2448 Reward: -7.135034309367316 Epsilon: 1.5866971312515867
Total Timesteps: 31525 Episode Num: 2449 R

Total Timesteps: 32499 Episode Num: 2525 Reward: -6.720393357654601 Epsilon: 1.5385088771962214
Total Timesteps: 32512 Episode Num: 2526 Reward: -6.273094832552012 Epsilon: 1.5378937007874016
Total Timesteps: 32525 Episode Num: 2527 Reward: -5.564915205162756 Epsilon: 1.5372790161414296
Total Timesteps: 32538 Episode Num: 2528 Reward: -5.270960646724698 Epsilon: 1.5366648226688795
Total Timesteps: 32551 Episode Num: 2529 Reward: -6.484540724490954 Epsilon: 1.5360511197812663
Total Timesteps: 32564 Episode Num: 2530 Reward: -6.003045164436498 Epsilon: 1.5354379068910453
Total Timesteps: 32576 Episode Num: 2531 Reward: -6.816456405271202 Epsilon: 1.5348722986247545
Total Timesteps: 32589 Episode Num: 2532 Reward: -8.206873176037043 Epsilon: 1.5342600263892725
Total Timesteps: 32602 Episode Num: 2533 Reward: -7.131270491056155 Epsilon: 1.5336482424391142
Total Timesteps: 32614 Episode Num: 2534 Reward: -7.499131394140192 Epsilon: 1.533083951677194
Total Timesteps: 32627 Episode Num: 2535 

Total Timesteps: 33606 Episode Num: 2611 Reward: -8.0400205915864 Epsilon: 1.4878295542462656
Total Timesteps: 33619 Episode Num: 2612 Reward: -5.454706628504008 Epsilon: 1.487254231238288
Total Timesteps: 33632 Episode Num: 2613 Reward: -7.689107674803925 Epsilon: 1.4866793529971456
Total Timesteps: 33645 Episode Num: 2614 Reward: -7.0643568679241024 Epsilon: 1.486104919007282
Total Timesteps: 33657 Episode Num: 2615 Reward: -6.602225937571003 Epsilon: 1.4855750661080904
Total Timesteps: 33670 Episode Num: 2616 Reward: -8.415246356008044 Epsilon: 1.485001485001485
Total Timesteps: 33683 Episode Num: 2617 Reward: -7.081856541129237 Epsilon: 1.4844283466437076
Total Timesteps: 33696 Episode Num: 2618 Reward: -5.330980671225145 Epsilon: 1.4838556505223173
Total Timesteps: 33708 Episode Num: 2619 Reward: -5.710945731670344 Epsilon: 1.4833274000237333
Total Timesteps: 33721 Episode Num: 2620 Reward: -8.07588540059747 Epsilon: 1.4827555529195457
Total Timesteps: 33733 Episode Num: 2621 Rewa

Total Timesteps: 34699 Episode Num: 2697 Reward: -6.383504444397717 Epsilon: 1.4409637165336178
Total Timesteps: 34712 Episode Num: 2698 Reward: -5.807628655831785 Epsilon: 1.4404240608435124
Total Timesteps: 34725 Episode Num: 2699 Reward: -7.704132815397395 Epsilon: 1.4398848092152627
Total Timesteps: 34738 Episode Num: 2700 Reward: -9.56679856263401 Epsilon: 1.4393459611952328
Total Timesteps: 34751 Episode Num: 2701 Reward: -6.075423631870257 Epsilon: 1.4388075163304652
Total Timesteps: 34764 Episode Num: 2702 Reward: -5.665650331608259 Epsilon: 1.4382694741686803
Total Timesteps: 34777 Episode Num: 2703 Reward: -6.994563527761174 Epsilon: 1.437731834258274
Total Timesteps: 34790 Episode Num: 2704 Reward: -6.430639740926631 Epsilon: 1.4371945961483186
Total Timesteps: 34803 Episode Num: 2705 Reward: -6.694170749738893 Epsilon: 1.4366577593885586
Total Timesteps: 34815 Episode Num: 2706 Reward: -6.94070941826103 Epsilon: 1.4361625736033319
Total Timesteps: 34828 Episode Num: 2707 Re

Total Timesteps: 35780 Episode Num: 2781 Reward: -8.383604312347108 Epsilon: 1.3974287311347122
Total Timesteps: 35793 Episode Num: 2782 Reward: -6.707682488916799 Epsilon: 1.3969211857067023
Total Timesteps: 35806 Episode Num: 2783 Reward: -6.035060300381919 Epsilon: 1.3964140088253365
Total Timesteps: 35818 Episode Num: 2784 Reward: -7.395245388618771 Epsilon: 1.3959461723155955
Total Timesteps: 35831 Episode Num: 2785 Reward: -6.905975020512384 Epsilon: 1.3954397030504313
Total Timesteps: 35845 Episode Num: 2786 Reward: -6.821665519376867 Epsilon: 1.3948946854512485
Total Timesteps: 35858 Episode Num: 2787 Reward: -9.503956050274686 Epsilon: 1.394388978749512
Total Timesteps: 35871 Episode Num: 2788 Reward: -5.879006101021299 Epsilon: 1.3938836385938502
Total Timesteps: 35883 Episode Num: 2789 Reward: -6.952731254701402 Epsilon: 1.3934174957500767
Total Timesteps: 35896 Episode Num: 2790 Reward: -6.7329091635627405 Epsilon: 1.3929128593715177
Total Timesteps: 35909 Episode Num: 2791

Total Timesteps: 36887 Episode Num: 2867 Reward: -7.990480962123045 Epsilon: 1.3554910944235097
Total Timesteps: 36900 Episode Num: 2868 Reward: -5.627286277607436 Epsilon: 1.3550135501355014
Total Timesteps: 36913 Episode Num: 2869 Reward: -5.158511369624251 Epsilon: 1.3545363422100616
Total Timesteps: 36926 Episode Num: 2870 Reward: -5.671176459549128 Epsilon: 1.3540594702919353
Total Timesteps: 36938 Episode Num: 2871 Reward: -5.705814912563772 Epsilon: 1.3536195787535872
Total Timesteps: 36951 Episode Num: 2872 Reward: -6.53248183881525 Epsilon: 1.3531433520067115
Total Timesteps: 36963 Episode Num: 2873 Reward: -5.343087040442935 Epsilon: 1.3527040554067582
Total Timesteps: 36976 Episode Num: 2874 Reward: -7.203597406437822 Epsilon: 1.3522284725227174
Total Timesteps: 36989 Episode Num: 2875 Reward: -6.780630994548492 Epsilon: 1.3517532239314392
Total Timesteps: 37002 Episode Num: 2876 Reward: -7.489458463111841 Epsilon: 1.3512783092805793
Total Timesteps: 37016 Episode Num: 2877 

Total Timesteps: 38000 Episode Num: 2953 Reward: -5.150784407301246 Epsilon: 1.3157894736842106
Total Timesteps: 38013 Episode Num: 2954 Reward: -5.915507514426405 Epsilon: 1.3153394891221424
Total Timesteps: 38025 Episode Num: 2955 Reward: -4.535998201934415 Epsilon: 1.3149243918474687
Total Timesteps: 38039 Episode Num: 2956 Reward: -5.9598207264715155 Epsilon: 1.3144404427035412
Total Timesteps: 38051 Episode Num: 2957 Reward: -6.453462273669542 Epsilon: 1.3140259125909963
Total Timesteps: 38064 Episode Num: 2958 Reward: -7.506855584643847 Epsilon: 1.3135771332492645
Total Timesteps: 38077 Episode Num: 2959 Reward: -4.8275223360970445 Epsilon: 1.3131286603461407
Total Timesteps: 38090 Episode Num: 2960 Reward: -7.115951658439483 Epsilon: 1.3126804935678655
Total Timesteps: 38103 Episode Num: 2961 Reward: -7.306605668219104 Epsilon: 1.3122326326011076
Total Timesteps: 38116 Episode Num: 2962 Reward: -5.792437270287783 Epsilon: 1.3117850771329624
Total Timesteps: 38128 Episode Num: 29

Total Timesteps: 39111 Episode Num: 3039 Reward: -5.985382770398939 Epsilon: 1.278412722763417
Total Timesteps: 39123 Episode Num: 3040 Reward: -6.490398189085313 Epsilon: 1.2780206016920992
Total Timesteps: 39136 Episode Num: 3041 Reward: -4.950221589448996 Epsilon: 1.277596075224857
Total Timesteps: 39149 Episode Num: 3042 Reward: -7.413263668950503 Epsilon: 1.2771718306981021
Total Timesteps: 39162 Episode Num: 3043 Reward: -4.894485463989873 Epsilon: 1.2767478678310606
Total Timesteps: 39174 Episode Num: 3044 Reward: -3.638508705976733 Epsilon: 1.2763567672435798
Total Timesteps: 39186 Episode Num: 3045 Reward: -6.128423730205852 Epsilon: 1.2759659061909865
Total Timesteps: 39199 Episode Num: 3046 Reward: -7.962792860935746 Epsilon: 1.2755427434373325
Total Timesteps: 39212 Episode Num: 3047 Reward: -6.791900988635874 Epsilon: 1.2751198612669592
Total Timesteps: 39225 Episode Num: 3048 Reward: -7.521786481556555 Epsilon: 1.2746972594008923
Total Timesteps: 39238 Episode Num: 3049 R

Total Timesteps: 40183 Episode Num: 3123 Reward: -6.445534846384768 Epsilon: 1.2443072941293583
Total Timesteps: 40196 Episode Num: 3124 Reward: -6.599232454655161 Epsilon: 1.2439048661558365
Total Timesteps: 40208 Episode Num: 3125 Reward: -5.579708398463088 Epsilon: 1.243533625149224
Total Timesteps: 40220 Episode Num: 3126 Reward: -5.9915352964402615 Epsilon: 1.2431626056688214
Total Timesteps: 40233 Episode Num: 3127 Reward: -8.13137936537247 Epsilon: 1.2427609176546617
Total Timesteps: 40246 Episode Num: 3128 Reward: -4.276977238222373 Epsilon: 1.242359489141778
Total Timesteps: 40258 Episode Num: 3129 Reward: -5.261278614523829 Epsilon: 1.241989169854439
Total Timesteps: 40271 Episode Num: 3130 Reward: -7.721086911577589 Epsilon: 1.2415882396761937
Total Timesteps: 40284 Episode Num: 3131 Reward: -5.980190328309676 Epsilon: 1.2411875682653162
Total Timesteps: 40297 Episode Num: 3132 Reward: -4.868396502633236 Epsilon: 1.2407871553713676
Total Timesteps: 40310 Episode Num: 3133 Re

Total Timesteps: 41287 Episode Num: 3209 Reward: -7.5283107363057296 Epsilon: 1.2110349504686706
Total Timesteps: 41300 Episode Num: 3210 Reward: -6.920119390893094 Epsilon: 1.2106537530266344
Total Timesteps: 41312 Episode Num: 3211 Reward: -5.6359383714484235 Epsilon: 1.210302091402014
Total Timesteps: 41325 Episode Num: 3212 Reward: -7.415752678839669 Epsilon: 1.2099213551119177
Total Timesteps: 41338 Episode Num: 3213 Reward: -6.726926101682065 Epsilon: 1.209540858290193
Total Timesteps: 41352 Episode Num: 3214 Reward: -9.535818318769566 Epsilon: 1.2091313600309537
Total Timesteps: 41365 Episode Num: 3215 Reward: -6.694251067432836 Epsilon: 1.20875135984528
Total Timesteps: 41378 Episode Num: 3216 Reward: -6.562844744817393 Epsilon: 1.2083715984339505
Total Timesteps: 41392 Episode Num: 3217 Reward: -7.375380388396522 Epsilon: 1.2079628913799767
Total Timesteps: 41406 Episode Num: 3218 Reward: -6.018269742649554 Epsilon: 1.207554460706178
Total Timesteps: 41419 Episode Num: 3219 Re

Total Timesteps: 42395 Episode Num: 3295 Reward: -4.82886626717663 Epsilon: 1.1793843613633683
Total Timesteps: 42408 Episode Num: 3296 Reward: -8.049192914382811 Epsilon: 1.179022825881909
Total Timesteps: 42420 Episode Num: 3297 Reward: -9.284904140195486 Epsilon: 1.1786892975011787
Total Timesteps: 42434 Episode Num: 3298 Reward: -7.589737558093594 Epsilon: 1.1783004194749493
Total Timesteps: 42447 Episode Num: 3299 Reward: -7.722892847390032 Epsilon: 1.1779395481423893
Total Timesteps: 42460 Episode Num: 3300 Reward: -7.310600148626906 Epsilon: 1.1775788977861517
Total Timesteps: 42472 Episode Num: 3301 Reward: -6.631088479038863 Epsilon: 1.1772461857223582
Total Timesteps: 42484 Episode Num: 3302 Reward: -6.542778405922639 Epsilon: 1.176913661613784
Total Timesteps: 42497 Episode Num: 3303 Reward: -6.665720832649026 Epsilon: 1.1765536390804057
Total Timesteps: 42509 Episode Num: 3304 Reward: -5.971975809819116 Epsilon: 1.1762215060340164
Total Timesteps: 42522 Episode Num: 3305 Re

Total Timesteps: 43496 Episode Num: 3381 Reward: -7.96861490810643 Epsilon: 1.149530991355527
Total Timesteps: 43509 Episode Num: 3382 Reward: -5.352084525036361 Epsilon: 1.149187524420235
Total Timesteps: 43522 Episode Num: 3383 Reward: -7.145891324301363 Epsilon: 1.1488442626717523
Total Timesteps: 43535 Episode Num: 3384 Reward: -6.738479557405825 Epsilon: 1.1485012059262663
Total Timesteps: 43548 Episode Num: 3385 Reward: -7.029797187572473 Epsilon: 1.1481583540001836
Total Timesteps: 43561 Episode Num: 3386 Reward: -5.317385609929823 Epsilon: 1.1478157067101307
Total Timesteps: 43574 Episode Num: 3387 Reward: -5.999309535202976 Epsilon: 1.1474732638729517
Total Timesteps: 43586 Episode Num: 3388 Reward: -8.544812213077117 Epsilon: 1.147157344101317
Total Timesteps: 43598 Episode Num: 3389 Reward: -6.205137256798576 Epsilon: 1.1468415982384512
Total Timesteps: 43611 Episode Num: 3390 Reward: -6.668715438846906 Epsilon: 1.1464997363050606
Total Timesteps: 43624 Episode Num: 3391 Rew

Total Timesteps: 44606 Episode Num: 3467 Reward: -5.822993886245516 Epsilon: 1.1209254360399947
Total Timesteps: 44619 Episode Num: 3468 Reward: -6.635926738112115 Epsilon: 1.1205988480243843
Total Timesteps: 44631 Episode Num: 3469 Reward: -7.436359362240463 Epsilon: 1.1202975510295534
Total Timesteps: 44644 Episode Num: 3470 Reward: -6.3714959477947 Epsilon: 1.1199713287339843
Total Timesteps: 44657 Episode Num: 3471 Reward: -7.918390184011345 Epsilon: 1.11964529637011
Total Timesteps: 44671 Episode Num: 3472 Reward: -8.938578245397691 Epsilon: 1.1192943968122495
Total Timesteps: 44684 Episode Num: 3473 Reward: -7.043730495771152 Epsilon: 1.1189687583922656
Total Timesteps: 44697 Episode Num: 3474 Reward: -6.631861325238019 Epsilon: 1.1186433093943664
Total Timesteps: 44710 Episode Num: 3475 Reward: -6.722926863863289 Epsilon: 1.1183180496533214
Total Timesteps: 44723 Episode Num: 3476 Reward: -5.564748594690732 Epsilon: 1.117992979004092
Total Timesteps: 44736 Episode Num: 3477 Rewa

Total Timesteps: 45690 Episode Num: 3551 Reward: -5.481556339469736 Epsilon: 1.094331363536879
Total Timesteps: 45702 Episode Num: 3552 Reward: -6.7277311823125725 Epsilon: 1.094044024331539
Total Timesteps: 45715 Episode Num: 3553 Reward: -5.974617567290971 Epsilon: 1.0937329104232747
Total Timesteps: 45727 Episode Num: 3554 Reward: -5.428007630077687 Epsilon: 1.0934458853631335
Total Timesteps: 45740 Episode Num: 3555 Reward: -7.286873089467297 Epsilon: 1.0931351114997814
Total Timesteps: 45753 Episode Num: 3556 Reward: -7.177669162396352 Epsilon: 1.0928245142395034
Total Timesteps: 45767 Episode Num: 3557 Reward: -8.210330725712042 Epsilon: 1.0924902222125112
Total Timesteps: 45780 Episode Num: 3558 Reward: -6.998552414357504 Epsilon: 1.09217999126256
Total Timesteps: 45793 Episode Num: 3559 Reward: -5.992425149577839 Epsilon: 1.0918699364531697
Total Timesteps: 45806 Episode Num: 3560 Reward: -6.925955773183389 Epsilon: 1.0915600576343711
Total Timesteps: 45820 Episode Num: 3561 Re

Total Timesteps: 46800 Episode Num: 3637 Reward: -7.900322057779514 Epsilon: 1.0683760683760684
Total Timesteps: 46813 Episode Num: 3638 Reward: -8.104533974631545 Epsilon: 1.0680793796594963
Total Timesteps: 46825 Episode Num: 3639 Reward: -6.214601873348173 Epsilon: 1.0678056593699947
Total Timesteps: 46838 Episode Num: 3640 Reward: -6.607837141891758 Epsilon: 1.0675092873307999
Total Timesteps: 46852 Episode Num: 3641 Reward: -8.11600891250094 Epsilon: 1.0671903013745412
Total Timesteps: 46865 Episode Num: 3642 Reward: -7.806199388488988 Epsilon: 1.066894270777766
Total Timesteps: 46878 Episode Num: 3643 Reward: -9.15936729756835 Epsilon: 1.066598404368787
Total Timesteps: 46891 Episode Num: 3644 Reward: -5.737638227887335 Epsilon: 1.066302702011047
Total Timesteps: 46903 Episode Num: 3645 Reward: -7.292798792508717 Epsilon: 1.066029891478157
Total Timesteps: 46916 Episode Num: 3646 Reward: -6.307723987151178 Epsilon: 1.0657345042203086
Total Timesteps: 46929 Episode Num: 3647 Rewar

Total Timesteps: 47907 Episode Num: 3723 Reward: -4.181500042006758 Epsilon: 1.0436888137432943
Total Timesteps: 47920 Episode Num: 3724 Reward: -6.066452215561766 Epsilon: 1.0434056761268782
Total Timesteps: 47932 Episode Num: 3725 Reward: -5.731325904787452 Epsilon: 1.0431444546440791
Total Timesteps: 47945 Episode Num: 3726 Reward: -6.119321032319982 Epsilon: 1.0428616122640526
Total Timesteps: 47959 Episode Num: 3727 Reward: -7.57527873358382 Epsilon: 1.0425571842615569
Total Timesteps: 47971 Episode Num: 3728 Reward: -5.029978355684773 Epsilon: 1.0422963874007212
Total Timesteps: 47984 Episode Num: 3729 Reward: -8.397808306353657 Epsilon: 1.0420140046682227
Total Timesteps: 47997 Episode Num: 3730 Reward: -5.833129298024207 Epsilon: 1.041731774902598
Total Timesteps: 48011 Episode Num: 3731 Reward: -8.296121547845075 Epsilon: 1.0414280060819396
Total Timesteps: 48023 Episode Num: 3732 Reward: -5.944415211649387 Epsilon: 1.0411677737750662
Total Timesteps: 48035 Episode Num: 3733 R

Total Timesteps: 49009 Episode Num: 3809 Reward: -8.013602451989815 Epsilon: 1.0202207757758779
Total Timesteps: 49021 Episode Num: 3810 Reward: -6.57693865459696 Epsilon: 1.0199710328226679
Total Timesteps: 49034 Episode Num: 3811 Reward: -7.958942090638467 Epsilon: 1.019700615899172
Total Timesteps: 49047 Episode Num: 3812 Reward: -6.533369558171014 Epsilon: 1.0194303423247089
Total Timesteps: 49060 Episode Num: 3813 Reward: -6.814185964915568 Epsilon: 1.019160211985324
Total Timesteps: 49073 Episode Num: 3814 Reward: -7.360968088647724 Epsilon: 1.0188902247671836
Total Timesteps: 49086 Episode Num: 3815 Reward: -7.704140598171046 Epsilon: 1.0186203805565741
Total Timesteps: 49099 Episode Num: 3816 Reward: -5.39010810549757 Epsilon: 1.018350679239903
Total Timesteps: 49112 Episode Num: 3817 Reward: -7.641658866674363 Epsilon: 1.0180811207036977
Total Timesteps: 49125 Episode Num: 3818 Reward: -6.588918818957942 Epsilon: 1.0178117048346056
Total Timesteps: 49137 Episode Num: 3819 Rewa

Total Timesteps: 50085 Episode Num: 3893 Reward: -6.356869749215142 Epsilon: 0.9983028850953379
Total Timesteps: 50098 Episode Num: 3894 Reward: -6.451605806296434 Epsilon: 0.998043834085193
Total Timesteps: 50111 Episode Num: 3895 Reward: -9.445719330839964 Epsilon: 0.9977849174831873
Total Timesteps: 50124 Episode Num: 3896 Reward: -7.8404759129445445 Epsilon: 0.9975261351847419
Total Timesteps: 50137 Episode Num: 3897 Reward: -7.243252597970097 Epsilon: 0.997267487085386
Total Timesteps: 50150 Episode Num: 3898 Reward: -6.551264518523332 Epsilon: 0.9970089730807578
Total Timesteps: 50163 Episode Num: 3899 Reward: -7.451538623325737 Epsilon: 0.9967505930666029
Total Timesteps: 50176 Episode Num: 3900 Reward: -7.198591520601704 Epsilon: 0.9964923469387755
Total Timesteps: 50190 Episode Num: 3901 Reward: -5.7057056480890225 Epsilon: 0.9962143853357243
Total Timesteps: 50202 Episode Num: 3902 Reward: -8.01739295840309 Epsilon: 0.9959762559260588
Total Timesteps: 50215 Episode Num: 3903 

Total Timesteps: 51192 Episode Num: 3979 Reward: -7.750473678835586 Epsilon: 0.9767151117362087
Total Timesteps: 51205 Episode Num: 3980 Reward: -5.56559297861079 Epsilon: 0.9764671418806757
Total Timesteps: 51218 Episode Num: 3981 Reward: -9.002703335112546 Epsilon: 0.9762192979030809
Total Timesteps: 51231 Episode Num: 3982 Reward: -8.485496196859291 Epsilon: 0.975971579707599
Total Timesteps: 51244 Episode Num: 3983 Reward: -7.7847164883086 Epsilon: 0.9757239871985013
Total Timesteps: 51258 Episode Num: 3984 Reward: -6.692589718364124 Epsilon: 0.9754574895626048
Total Timesteps: 51271 Episode Num: 3985 Reward: -6.830245350482115 Epsilon: 0.9752101577890036
Total Timesteps: 51284 Episode Num: 3986 Reward: -6.873089286208229 Epsilon: 0.9749629514078465
Total Timesteps: 51297 Episode Num: 3987 Reward: -6.827956800653636 Epsilon: 0.9747158703238006
Total Timesteps: 51310 Episode Num: 3988 Reward: -7.272542362999711 Epsilon: 0.9744689144416293
Total Timesteps: 51323 Episode Num: 3989 Rew

Total Timesteps: 52296 Episode Num: 4065 Reward: -7.166530318553134 Epsilon: 0.9560960685329662
Total Timesteps: 52309 Episode Num: 4066 Reward: -5.18077346169759 Epsilon: 0.9558584564797645
Total Timesteps: 52322 Episode Num: 4067 Reward: -7.176805500926111 Epsilon: 0.9556209625014335
Total Timesteps: 52336 Episode Num: 4068 Reward: -10.140731216661173 Epsilon: 0.9553653317028432
Total Timesteps: 52349 Episode Num: 4069 Reward: -6.934853868281097 Epsilon: 0.9551280826758869
Total Timesteps: 52362 Episode Num: 4070 Reward: -7.721129941513414 Epsilon: 0.954890951453344
Total Timesteps: 52375 Episode Num: 4071 Reward: -5.599926723488058 Epsilon: 0.954653937947494
Total Timesteps: 52387 Episode Num: 4072 Reward: -6.392079066273214 Epsilon: 0.9544352606562697
Total Timesteps: 52399 Episode Num: 4073 Reward: -5.606734430991048 Epsilon: 0.9542166835244947
Total Timesteps: 52412 Episode Num: 4074 Reward: -6.7967829238149635 Epsilon: 0.953980004579104
Total Timesteps: 52425 Episode Num: 4075 R

Total Timesteps: 53416 Episode Num: 4151 Reward: -7.332459703498207 Epsilon: 0.9360491238580201
Total Timesteps: 53428 Episode Num: 4152 Reward: -6.297287030000099 Epsilon: 0.9358388859773902
Total Timesteps: 53441 Episode Num: 4153 Reward: -5.259063290457689 Epsilon: 0.9356112348197078
Total Timesteps: 53454 Episode Num: 4154 Reward: -8.190348301765885 Epsilon: 0.9353836943914393
Total Timesteps: 53468 Episode Num: 4155 Reward: -7.227874166946089 Epsilon: 0.9351387745941497
Total Timesteps: 53480 Episode Num: 4156 Reward: -6.966124252588043 Epsilon: 0.9349289454001496
Total Timesteps: 53493 Episode Num: 4157 Reward: -4.890870036307949 Epsilon: 0.9347017366758268
Total Timesteps: 53506 Episode Num: 4158 Reward: -8.441527758085057 Epsilon: 0.9344746383583149
Total Timesteps: 53519 Episode Num: 4159 Reward: -7.890029575531382 Epsilon: 0.9342476503671593
Total Timesteps: 53533 Episode Num: 4160 Reward: -7.666946243350901 Epsilon: 0.9340033250518371
Total Timesteps: 53546 Episode Num: 4161

Total Timesteps: 54525 Episode Num: 4237 Reward: -7.050910523255289 Epsilon: 0.9170105456212746
Total Timesteps: 54538 Episode Num: 4238 Reward: -7.904642061883702 Epsilon: 0.916791961568081
Total Timesteps: 54551 Episode Num: 4239 Reward: -6.882354498233037 Epsilon: 0.9165734816960276
Total Timesteps: 54563 Episode Num: 4240 Reward: -8.16590362268022 Epsilon: 0.916371900372047
Total Timesteps: 54576 Episode Num: 4241 Reward: -6.778572488915188 Epsilon: 0.9161536206391088
Total Timesteps: 54589 Episode Num: 4242 Reward: -5.984415391469358 Epsilon: 0.9159354448698456
Total Timesteps: 54601 Episode Num: 4243 Reward: -7.188870380516389 Epsilon: 0.9157341440632956
Total Timesteps: 54614 Episode Num: 4244 Reward: -7.335997246236981 Epsilon: 0.9155161680155272
Total Timesteps: 54627 Episode Num: 4245 Reward: -9.452381213989137 Epsilon: 0.9152982957145733
Total Timesteps: 54640 Episode Num: 4246 Reward: -5.948495532865578 Epsilon: 0.9150805270863837
Total Timesteps: 54654 Episode Num: 4247 Re

Total Timesteps: 55617 Episode Num: 4321 Reward: -7.107895123322894 Epsilon: 0.8990056996961361
Total Timesteps: 55629 Episode Num: 4322 Reward: -5.694078513572329 Epsilon: 0.8988117708389509
Total Timesteps: 55642 Episode Num: 4323 Reward: -8.15403360913328 Epsilon: 0.8986017756371086
Total Timesteps: 55655 Episode Num: 4324 Reward: -8.01116806164337 Epsilon: 0.898391878537418
Total Timesteps: 55667 Episode Num: 4325 Reward: -7.074167483761288 Epsilon: 0.8981982143819498
Total Timesteps: 55680 Episode Num: 4326 Reward: -6.552065297967236 Epsilon: 0.8979885057471264
Total Timesteps: 55693 Episode Num: 4327 Reward: -7.969428199422262 Epsilon: 0.897778895013736
Total Timesteps: 55706 Episode Num: 4328 Reward: -5.163221572416923 Epsilon: 0.8975693821132373
Total Timesteps: 55718 Episode Num: 4329 Reward: -5.405341574816616 Epsilon: 0.8973760723644065
Total Timesteps: 55732 Episode Num: 4330 Reward: -7.548121383036818 Epsilon: 0.8971506495370702
Total Timesteps: 55745 Episode Num: 4331 Rew

Total Timesteps: 56725 Episode Num: 4407 Reward: -7.730508206672031 Epsilon: 0.881445570736007
Total Timesteps: 56738 Episode Num: 4408 Reward: -7.123571222391436 Epsilon: 0.8812436109838203
Total Timesteps: 56751 Episode Num: 4409 Reward: -4.953872685165485 Epsilon: 0.8810417437578193
Total Timesteps: 56764 Episode Num: 4410 Reward: -5.564903460058169 Epsilon: 0.8808399689944331
Total Timesteps: 56777 Episode Num: 4411 Reward: -6.027118716311275 Epsilon: 0.8806382866301495
Total Timesteps: 56789 Episode Num: 4412 Reward: -6.769565608510981 Epsilon: 0.8804522002500484
Total Timesteps: 56802 Episode Num: 4413 Reward: -7.046866565398492 Epsilon: 0.8802506953980493
Total Timesteps: 56815 Episode Num: 4414 Reward: -4.5662512790855265 Epsilon: 0.8800492827598345
Total Timesteps: 56828 Episode Num: 4415 Reward: -6.848324675197007 Epsilon: 0.8798479622721194
Total Timesteps: 56841 Episode Num: 4416 Reward: -7.028472189301347 Epsilon: 0.8796467338716771
Total Timesteps: 56854 Episode Num: 4417

Total Timesteps: 57837 Episode Num: 4493 Reward: -7.98107181494597 Epsilon: 0.8644985044175874
Total Timesteps: 57850 Episode Num: 4494 Reward: -7.190355368183769 Epsilon: 0.8643042350907519
Total Timesteps: 57863 Episode Num: 4495 Reward: -4.791832335750927 Epsilon: 0.8641100530563572
Total Timesteps: 57876 Episode Num: 4496 Reward: -6.223821854605248 Epsilon: 0.8639159582555809
Total Timesteps: 57889 Episode Num: 4497 Reward: -5.345215839151302 Epsilon: 0.8637219506296533
Total Timesteps: 57902 Episode Num: 4498 Reward: -6.954637982521602 Epsilon: 0.8635280301198577
Total Timesteps: 57915 Episode Num: 4499 Reward: -8.215517035870885 Epsilon: 0.86333419666753
Total Timesteps: 57928 Episode Num: 4500 Reward: -7.320380017926428 Epsilon: 0.8631404502140588
Total Timesteps: 57941 Episode Num: 4501 Reward: -9.586925516620438 Epsilon: 0.8629467907008854
Total Timesteps: 57954 Episode Num: 4502 Reward: -5.135501970061851 Epsilon: 0.8627532180695034
Total Timesteps: 57966 Episode Num: 4503 Re

Total Timesteps: 58943 Episode Num: 4579 Reward: -6.280982478137054 Epsilon: 0.8482771491101573
Total Timesteps: 58956 Episode Num: 4580 Reward: -5.8554123439425885 Epsilon: 0.8480901010923401
Total Timesteps: 58969 Episode Num: 4581 Reward: -7.707369852992369 Epsilon: 0.8479031355457952
Total Timesteps: 58983 Episode Num: 4582 Reward: -7.330565425744242 Epsilon: 0.8477018802027703
Total Timesteps: 58996 Episode Num: 4583 Reward: -6.645994688795951 Epsilon: 0.8475150857685266
Total Timesteps: 59009 Episode Num: 4584 Reward: -6.906833799086655 Epsilon: 0.8473283736379197
Total Timesteps: 59022 Episode Num: 4585 Reward: -7.176939317871415 Epsilon: 0.8471417437565654
Total Timesteps: 59035 Episode Num: 4586 Reward: -7.9264831530276245 Epsilon: 0.8469551960701279
Total Timesteps: 59048 Episode Num: 4587 Reward: -6.715726588514623 Epsilon: 0.8467687305243192
Total Timesteps: 59061 Episode Num: 4588 Reward: -7.559631861357513 Epsilon: 0.846582347064899
Total Timesteps: 59073 Episode Num: 458

Total Timesteps: 60030 Episode Num: 4663 Reward: -6.841097081355627 Epsilon: 0.8329168748958854
Total Timesteps: 60043 Episode Num: 4664 Reward: -5.477736671196643 Epsilon: 0.8327365388138501
Total Timesteps: 60056 Episode Num: 4665 Reward: -6.510005694252814 Epsilon: 0.8325562808045824
Total Timesteps: 60069 Episode Num: 4666 Reward: -8.262399530818868 Epsilon: 0.8323761008173933
Total Timesteps: 60082 Episode Num: 4667 Reward: -7.159483903843726 Epsilon: 0.8321959988016377
Total Timesteps: 60094 Episode Num: 4668 Reward: -5.0647331192829235 Epsilon: 0.8320298199487469
Total Timesteps: 60106 Episode Num: 4669 Reward: -5.543497990050382 Epsilon: 0.8318637074501714
Total Timesteps: 60119 Episode Num: 4670 Reward: -6.311946447384706 Epsilon: 0.8316838270762986
Total Timesteps: 60133 Episode Num: 4671 Reward: -7.366256918808036 Epsilon: 0.8314901967305806
Total Timesteps: 60146 Episode Num: 4672 Reward: -6.854439039609983 Epsilon: 0.8313104778372626
Total Timesteps: 60159 Episode Num: 467

Total Timesteps: 61141 Episode Num: 4749 Reward: -7.904564598569471 Epsilon: 0.8177818485140904
Total Timesteps: 61154 Episode Num: 4750 Reward: -4.390447113200384 Epsilon: 0.817608006017595
Total Timesteps: 61167 Episode Num: 4751 Reward: -7.892646074579238 Epsilon: 0.8174342374156
Total Timesteps: 61180 Episode Num: 4752 Reward: -5.798149788771063 Epsilon: 0.8172605426610003
Total Timesteps: 61193 Episode Num: 4753 Reward: -3.095290678132548 Epsilon: 0.8170869217067311
Total Timesteps: 61206 Episode Num: 4754 Reward: -5.825315190858009 Epsilon: 0.8169133745057674
Total Timesteps: 61218 Episode Num: 4755 Reward: -5.000292657002349 Epsilon: 0.8167532425103727
Total Timesteps: 61231 Episode Num: 4756 Reward: -7.953603819361826 Epsilon: 0.8165798370106645
Total Timesteps: 61243 Episode Num: 4757 Reward: -6.722604347735041 Epsilon: 0.816419835736329
Total Timesteps: 61256 Episode Num: 4758 Reward: -6.933826137078291 Epsilon: 0.8162465717643986
Total Timesteps: 61269 Episode Num: 4759 Rewa

Total Timesteps: 62250 Episode Num: 4835 Reward: -5.12829750059087 Epsilon: 0.8032128514056225
Total Timesteps: 62262 Episode Num: 4836 Reward: -5.854216998417238 Epsilon: 0.8030580450354952
Total Timesteps: 62275 Episode Num: 4837 Reward: -5.914987934273269 Epsilon: 0.8028904054596547
Total Timesteps: 62287 Episode Num: 4838 Reward: -8.174131770351202 Epsilon: 0.8027357233451603
Total Timesteps: 62300 Episode Num: 4839 Reward: -8.251110089987119 Epsilon: 0.8025682182985554
Total Timesteps: 62313 Episode Num: 4840 Reward: -7.429114661614024 Epsilon: 0.8024007831431643
Total Timesteps: 62327 Episode Num: 4841 Reward: -5.678249359377795 Epsilon: 0.8022205464726363
Total Timesteps: 62340 Episode Num: 4842 Reward: -7.534287412099001 Epsilon: 0.8020532563362207
Total Timesteps: 62353 Episode Num: 4843 Reward: -7.39217792860083 Epsilon: 0.8018860359565698
Total Timesteps: 62366 Episode Num: 4844 Reward: -6.56499337757373 Epsilon: 0.8017188852900619
Total Timesteps: 62380 Episode Num: 4845 Re

Total Timesteps: 63360 Episode Num: 4921 Reward: -7.544914632803662 Epsilon: 0.7891414141414141
Total Timesteps: 63374 Episode Num: 4922 Reward: -6.442485083917266 Epsilon: 0.7889670842932432
Total Timesteps: 63387 Episode Num: 4923 Reward: -6.500051064918641 Epsilon: 0.7888052755296827
Total Timesteps: 63400 Episode Num: 4924 Reward: -8.407478590954572 Epsilon: 0.7886435331230284
Total Timesteps: 63413 Episode Num: 4925 Reward: -5.6046853732019875 Epsilon: 0.7884818570324696
Total Timesteps: 63426 Episode Num: 4926 Reward: -5.973709299891313 Epsilon: 0.7883202472172295
Total Timesteps: 63439 Episode Num: 4927 Reward: -7.049133516047006 Epsilon: 0.7881587036365643
Total Timesteps: 63452 Episode Num: 4928 Reward: -6.264215696771195 Epsilon: 0.7879972262497636
Total Timesteps: 63465 Episode Num: 4929 Reward: -6.989121868272017 Epsilon: 0.7878358150161506
Total Timesteps: 63477 Episode Num: 4930 Reward: -6.153642978592286 Epsilon: 0.7876868787119744
Total Timesteps: 63490 Episode Num: 493

Total Timesteps: 64471 Episode Num: 5007 Reward: -6.099958067609526 Epsilon: 0.7755424919731352
Total Timesteps: 64484 Episode Num: 5008 Reward: -7.031120280334728 Epsilon: 0.7753861422988648
Total Timesteps: 64497 Episode Num: 5009 Reward: -6.364661075315715 Epsilon: 0.7752298556522009
Total Timesteps: 64510 Episode Num: 5010 Reward: -5.305005179491612 Epsilon: 0.7750736319950395
Total Timesteps: 64523 Episode Num: 5011 Reward: -7.705120399047401 Epsilon: 0.7749174712893077
Total Timesteps: 64536 Episode Num: 5012 Reward: -6.383284809141935 Epsilon: 0.774761373496963
Total Timesteps: 64549 Episode Num: 5013 Reward: -6.226280728748338 Epsilon: 0.7746053385799935
Total Timesteps: 64562 Episode Num: 5014 Reward: -4.698505421860851 Epsilon: 0.7744493665004182
Total Timesteps: 64575 Episode Num: 5015 Reward: -5.586531602389007 Epsilon: 0.7742934572202865
Total Timesteps: 64587 Episode Num: 5016 Reward: -6.775395970657273 Epsilon: 0.7741495966680602
Total Timesteps: 64601 Episode Num: 5017 

Total Timesteps: 65556 Episode Num: 5091 Reward: -7.3219280317420905 Epsilon: 0.7627066935139423
Total Timesteps: 65569 Episode Num: 5092 Reward: -6.899682658129598 Epsilon: 0.7625554759108725
Total Timesteps: 65581 Episode Num: 5093 Reward: -7.31909945429665 Epsilon: 0.7624159436422134
Total Timesteps: 65594 Episode Num: 5094 Reward: -7.786726141294843 Epsilon: 0.76226484129646
Total Timesteps: 65607 Episode Num: 5095 Reward: -7.462702996501938 Epsilon: 0.7621137988324417
Total Timesteps: 65620 Episode Num: 5096 Reward: -7.149727731049267 Epsilon: 0.7619628162145687
Total Timesteps: 65633 Episode Num: 5097 Reward: -8.722298247425028 Epsilon: 0.7618118934072798
Total Timesteps: 65646 Episode Num: 5098 Reward: -6.448444271645546 Epsilon: 0.7616610303750418
Total Timesteps: 65659 Episode Num: 5099 Reward: -6.245711410912312 Epsilon: 0.7615102270823497
Total Timesteps: 65673 Episode Num: 5100 Reward: -8.273746052542858 Epsilon: 0.761347890304996
Total Timesteps: 65685 Episode Num: 5101 Re

Total Timesteps: 66661 Episode Num: 5177 Reward: -5.14592257507528 Epsilon: 0.7500637554192107
Total Timesteps: 66674 Episode Num: 5178 Reward: -9.123087692750358 Epsilon: 0.7499175090740019
Total Timesteps: 66688 Episode Num: 5179 Reward: -7.924010124659194 Epsilon: 0.7497600767754319
Total Timesteps: 66700 Episode Num: 5180 Reward: -5.003808891283647 Epsilon: 0.7496251874062968
Total Timesteps: 66713 Episode Num: 5181 Reward: -6.153186219200814 Epsilon: 0.7494791120171481
Total Timesteps: 66726 Episode Num: 5182 Reward: -6.031706440048824 Epsilon: 0.7493330935467434
Total Timesteps: 66739 Episode Num: 5183 Reward: -7.043572379435188 Epsilon: 0.7491871319618214
Total Timesteps: 66751 Episode Num: 5184 Reward: -6.124136247897899 Epsilon: 0.7490524486524547
Total Timesteps: 66763 Episode Num: 5185 Reward: -5.929210090536252 Epsilon: 0.748917813759118
Total Timesteps: 66776 Episode Num: 5186 Reward: -7.195762797097282 Epsilon: 0.7487720138972086
Total Timesteps: 66790 Episode Num: 5187 R

Total Timesteps: 67771 Episode Num: 5263 Reward: -8.030247580802106 Epsilon: 0.7377786959023771
Total Timesteps: 67784 Episode Num: 5264 Reward: -6.7354459341572674 Epsilon: 0.7376372005192966
Total Timesteps: 67797 Episode Num: 5265 Reward: -6.137387362489383 Epsilon: 0.7374957593993835
Total Timesteps: 67810 Episode Num: 5266 Reward: -5.454637998470552 Epsilon: 0.737354372511429
Total Timesteps: 67823 Episode Num: 5267 Reward: -5.033707809274709 Epsilon: 0.7372130398242485
Total Timesteps: 67837 Episode Num: 5268 Reward: -7.465310665524664 Epsilon: 0.7370608959712251
Total Timesteps: 67850 Episode Num: 5269 Reward: -7.944816044884717 Epsilon: 0.7369196757553427
Total Timesteps: 67863 Episode Num: 5270 Reward: -7.605089246131326 Epsilon: 0.7367785096444307
Total Timesteps: 67876 Episode Num: 5271 Reward: -6.745712218017254 Epsilon: 0.7366373976074018
Total Timesteps: 67889 Episode Num: 5272 Reward: -6.795840689749428 Epsilon: 0.7364963396131922
Total Timesteps: 67902 Episode Num: 5273

Total Timesteps: 68876 Episode Num: 5349 Reward: -8.804136035436702 Epsilon: 0.7259422730704455
Total Timesteps: 68889 Episode Num: 5350 Reward: -5.74342528040056 Epsilon: 0.7258052809592243
Total Timesteps: 68902 Episode Num: 5351 Reward: -7.533810491910826 Epsilon: 0.7256683405416389
Total Timesteps: 68914 Episode Num: 5352 Reward: -5.11783136377229 Epsilon: 0.7255419798589546
Total Timesteps: 68927 Episode Num: 5353 Reward: -7.990496846119753 Epsilon: 0.725405138770003
Total Timesteps: 68940 Episode Num: 5354 Reward: -6.357002335552223 Epsilon: 0.725268349289237
Total Timesteps: 68953 Episode Num: 5355 Reward: -4.6010441251495 Epsilon: 0.7251316113874668
Total Timesteps: 68967 Episode Num: 5356 Reward: -7.509346024729442 Epsilon: 0.724984412835124
Total Timesteps: 68979 Episode Num: 5357 Reward: -7.336785551446811 Epsilon: 0.7248582902042651
Total Timesteps: 68992 Episode Num: 5358 Reward: -7.068319792060067 Epsilon: 0.724721706864564
Total Timesteps: 69004 Episode Num: 5359 Reward:

Total Timesteps: 69977 Episode Num: 5435 Reward: -6.6909784535202315 Epsilon: 0.7145204853023136
Total Timesteps: 69990 Episode Num: 5436 Reward: -7.052841291818042 Epsilon: 0.7143877696813831
Total Timesteps: 70003 Episode Num: 5437 Reward: -5.819913223437596 Epsilon: 0.7142551033527135
Total timesteps: 70003 Epsilon: 0.7142551033527135
---------------------------------------
Average Reward over the Evaluation Step: -6.659671
---------------------------------------
Total Timesteps: 70016 Episode Num: 5438 Reward: -5.168512256783375 Epsilon: 0.7141224862888482
Total Timesteps: 70029 Episode Num: 5439 Reward: -5.716931990023149 Epsilon: 0.7139899184623513
Total Timesteps: 70042 Episode Num: 5440 Reward: -9.364044306391591 Epsilon: 0.7138573998458068
Total Timesteps: 70055 Episode Num: 5441 Reward: -6.56760874634011 Epsilon: 0.7137249304118193
Total Timesteps: 70068 Episode Num: 5442 Reward: -8.684008536147068 Epsilon: 0.7135925101330136
Total Timesteps: 70081 Episode Num: 5443 Reward: -

Total Timesteps: 71058 Episode Num: 5519 Reward: -7.585527900258619 Epsilon: 0.7036505389963129
Total Timesteps: 71070 Episode Num: 5520 Reward: -6.714041048742441 Epsilon: 0.7035317292809906
Total Timesteps: 71083 Episode Num: 5521 Reward: -5.432862155065825 Epsilon: 0.7034030640237469
Total Timesteps: 71096 Episode Num: 5522 Reward: -6.249502324514731 Epsilon: 0.7032744458197367
Total Timesteps: 71109 Episode Num: 5523 Reward: -6.76938470682791 Epsilon: 0.7031458746431535
Total Timesteps: 71121 Episode Num: 5524 Reward: -5.444632772139178 Epsilon: 0.7030272352750946
Total Timesteps: 71134 Episode Num: 5525 Reward: -7.0247175712241745 Epsilon: 0.7028987544634071
Total Timesteps: 71147 Episode Num: 5526 Reward: -6.147105278861842 Epsilon: 0.7027703206038203
Total Timesteps: 71160 Episode Num: 5527 Reward: -9.473168553457274 Epsilon: 0.7026419336706015
Total Timesteps: 71173 Episode Num: 5528 Reward: -4.719316114166896 Epsilon: 0.7025135936380369
Total Timesteps: 71186 Episode Num: 5529

Total Timesteps: 72161 Episode Num: 5605 Reward: -6.8343456444445705 Epsilon: 0.6928950541151038
Total Timesteps: 72174 Episode Num: 5606 Reward: -7.369817972203423 Epsilon: 0.6927702496743979
Total Timesteps: 72187 Episode Num: 5607 Reward: -6.726170477393469 Epsilon: 0.6926454901852134
Total Timesteps: 72201 Episode Num: 5608 Reward: -8.33963924333127 Epsilon: 0.6925111840556225
Total Timesteps: 72215 Episode Num: 5609 Reward: -5.428355086634585 Epsilon: 0.6923769300006923
Total Timesteps: 72228 Episode Num: 5610 Reward: -7.591643435401287 Epsilon: 0.6922523121227225
Total Timesteps: 72241 Episode Num: 5611 Reward: -6.9434190637617474 Epsilon: 0.6921277390955275
Total Timesteps: 72253 Episode Num: 5612 Reward: -4.723454690705034 Epsilon: 0.6920127883963295
Total Timesteps: 72266 Episode Num: 5613 Reward: -6.93669187805903 Epsilon: 0.6918883015525974
Total Timesteps: 72280 Episode Num: 5614 Reward: -6.39838238884504 Epsilon: 0.6917542888765911
Total Timesteps: 72293 Episode Num: 5615 

Total Timesteps: 73266 Episode Num: 5691 Reward: -7.163360426584101 Epsilon: 0.6824447902164715
Total Timesteps: 73279 Episode Num: 5692 Reward: -6.293318899055215 Epsilon: 0.6823237216665075
Total Timesteps: 73291 Episode Num: 5693 Reward: -3.504262131502187 Epsilon: 0.6822120042024259
Total Timesteps: 73304 Episode Num: 5694 Reward: -5.984144407604979 Epsilon: 0.682091018225472
Total Timesteps: 73317 Episode Num: 5695 Reward: -7.8166361379880165 Epsilon: 0.6819700751531023
Total Timesteps: 73330 Episode Num: 5696 Reward: -7.6207780531497376 Epsilon: 0.6818491749624983
Total Timesteps: 73342 Episode Num: 5697 Reward: -6.5951698860970005 Epsilon: 0.681737612827575
Total Timesteps: 73355 Episode Num: 5698 Reward: -6.272489758935795 Epsilon: 0.6816167950378297
Total Timesteps: 73368 Episode Num: 5699 Reward: -6.837876932965727 Epsilon: 0.6814960200632428
Total Timesteps: 73380 Episode Num: 5700 Reward: -7.223534105201986 Epsilon: 0.681384573453257
Total Timesteps: 73393 Episode Num: 5701

Total Timesteps: 74366 Episode Num: 5777 Reward: -6.309868356959153 Epsilon: 0.6723502675954065
Total Timesteps: 74379 Episode Num: 5778 Reward: -6.403193631762973 Epsilon: 0.6722327538686995
Total Timesteps: 74393 Episode Num: 5779 Reward: -9.80625548901999 Epsilon: 0.6721062465554555
Total Timesteps: 74406 Episode Num: 5780 Reward: -7.190508933217328 Epsilon: 0.6719888181060667
Total Timesteps: 74418 Episode Num: 5781 Reward: -6.07283492314265 Epsilon: 0.6718804590287296
Total Timesteps: 74431 Episode Num: 5782 Reward: -6.259667193722064 Epsilon: 0.6717631094570811
Total Timesteps: 74444 Episode Num: 5783 Reward: -5.13566582694815 Epsilon: 0.671645800870453
Total Timesteps: 74457 Episode Num: 5784 Reward: -7.056243661262594 Epsilon: 0.6715285332473777
Total Timesteps: 74469 Episode Num: 5785 Reward: -6.497914069233142 Epsilon: 0.6714203225503229
Total Timesteps: 74482 Episode Num: 5786 Reward: -8.270760545136485 Epsilon: 0.6713031336430278
Total Timesteps: 74495 Episode Num: 5787 Rew

Total Timesteps: 75457 Episode Num: 5861 Reward: -7.402869341124314 Epsilon: 0.6626290470069046
Total Timesteps: 75470 Episode Num: 5862 Reward: -8.33274987928126 Epsilon: 0.6625149065853981
Total Timesteps: 75482 Episode Num: 5863 Reward: -5.023352597694836 Epsilon: 0.662409581092181
Total Timesteps: 75494 Episode Num: 5864 Reward: -7.405629292710064 Epsilon: 0.6623042890825761
Total Timesteps: 75506 Episode Num: 5865 Reward: -6.295686507227739 Epsilon: 0.6621990305406192
Total Timesteps: 75518 Episode Num: 5866 Reward: -5.130578154437987 Epsilon: 0.6620938054503562
Total Timesteps: 75531 Episode Num: 5867 Reward: -6.230411510943837 Epsilon: 0.6619798493333863
Total Timesteps: 75544 Episode Num: 5868 Reward: -5.790386330358033 Epsilon: 0.6618659324367256
Total Timesteps: 75557 Episode Num: 5869 Reward: -6.155970926989789 Epsilon: 0.66175205474013
Total Timesteps: 75570 Episode Num: 5870 Reward: -6.78381989738317 Epsilon: 0.661638216223369
Total Timesteps: 75583 Episode Num: 5871 Rewar

Total Timesteps: 76558 Episode Num: 5947 Reward: -6.3087536535230395 Epsilon: 0.653099610752632
Total Timesteps: 76570 Episode Num: 5948 Reward: -6.364049650579591 Epsilon: 0.6529972574115188
Total Timesteps: 76583 Episode Num: 5949 Reward: -6.400324695664595 Epsilon: 0.6528864108222452
Total Timesteps: 76596 Episode Num: 5950 Reward: -5.27958804188493 Epsilon: 0.6527756018591049
Total Timesteps: 76609 Episode Num: 5951 Reward: -5.588332896754697 Epsilon: 0.6526648305029435
Total Timesteps: 76622 Episode Num: 5952 Reward: -5.753540442569845 Epsilon: 0.6525540967346193
Total Timesteps: 76634 Episode Num: 5953 Reward: -6.640666428767016 Epsilon: 0.6524519142939166
Total Timesteps: 76647 Episode Num: 5954 Reward: -6.7891946697132886 Epsilon: 0.6523412527561417
Total Timesteps: 76661 Episode Num: 5955 Reward: -5.3707025663962185 Epsilon: 0.6522221207654478
Total Timesteps: 76674 Episode Num: 5956 Reward: -6.787773505888872 Epsilon: 0.6521115371573154
Total Timesteps: 76687 Episode Num: 595

Total Timesteps: 77672 Episode Num: 6033 Reward: -5.4388599766506305 Epsilon: 0.643732619219281
Total Timesteps: 77685 Episode Num: 6034 Reward: -7.704417752802607 Epsilon: 0.6436248954109545
Total Timesteps: 77698 Episode Num: 6035 Reward: -6.630116843243674 Epsilon: 0.6435172076501325
Total Timesteps: 77711 Episode Num: 6036 Reward: -7.633069591771199 Epsilon: 0.6434095559187245
Total Timesteps: 77724 Episode Num: 6037 Reward: -7.329093238874793 Epsilon: 0.6433019401986516
Total Timesteps: 77737 Episode Num: 6038 Reward: -5.921770876698167 Epsilon: 0.6431943604718474
Total Timesteps: 77751 Episode Num: 6039 Reward: -8.400506686037872 Epsilon: 0.6430785456135613
Total Timesteps: 77763 Episode Num: 6040 Reward: -5.178071033081638 Epsilon: 0.6429793089258388
Total Timesteps: 77776 Episode Num: 6041 Reward: -7.294503856465541 Epsilon: 0.6428718370705616
Total Timesteps: 77789 Episode Num: 6042 Reward: -6.5666702575176075 Epsilon: 0.6427644011364074
Total Timesteps: 77802 Episode Num: 604

Total Timesteps: 78777 Episode Num: 6119 Reward: -7.77051038029742 Epsilon: 0.6347030224557929
Total Timesteps: 78790 Episode Num: 6120 Reward: -5.72369412663382 Epsilon: 0.634598299276558
Total Timesteps: 78803 Episode Num: 6121 Reward: -4.264164438456453 Epsilon: 0.6344936106493407
Total Timesteps: 78815 Episode Num: 6122 Reward: -6.611204794925738 Epsilon: 0.6343970056461333
Total Timesteps: 78827 Episode Num: 6123 Reward: -6.573670074471456 Epsilon: 0.6343004300556916
Total Timesteps: 78840 Episode Num: 6124 Reward: -5.571953526502808 Epsilon: 0.6341958396752917
Total Timesteps: 78853 Episode Num: 6125 Reward: -5.909150003202787 Epsilon: 0.6340912837812132
Total Timesteps: 78866 Episode Num: 6126 Reward: -7.101975499063886 Epsilon: 0.633986762356402
Total Timesteps: 78879 Episode Num: 6127 Reward: -8.4375207705102 Epsilon: 0.6338822753838157
Total Timesteps: 78892 Episode Num: 6128 Reward: -6.180932653578665 Epsilon: 0.6337778228464229
Total Timesteps: 78905 Episode Num: 6129 Rewar

Total Timesteps: 79880 Episode Num: 6205 Reward: -7.542545721856095 Epsilon: 0.6259389083625438
Total Timesteps: 79893 Episode Num: 6206 Reward: -7.367296456047517 Epsilon: 0.6258370570638229
Total Timesteps: 79906 Episode Num: 6207 Reward: -6.6513454189613945 Epsilon: 0.6257352389057143
Total Timesteps: 79918 Episode Num: 6208 Reward: -7.078668708202941 Epsilon: 0.6256412823143722
Total Timesteps: 79931 Episode Num: 6209 Reward: -5.601693570239894 Epsilon: 0.6255395278427643
Total Timesteps: 79944 Episode Num: 6210 Reward: -6.13097185290486 Epsilon: 0.6254378064645252
Total Timesteps: 79957 Episode Num: 6211 Reward: -6.42526141440484 Epsilon: 0.6253361181635129
Total Timesteps: 79969 Episode Num: 6212 Reward: -5.042835129947726 Epsilon: 0.6252422813840364
Total Timesteps: 79982 Episode Num: 6213 Reward: -7.489480952309465 Epsilon: 0.6251406566477458
Total Timesteps: 79995 Episode Num: 6214 Reward: -6.594497828889315 Epsilon: 0.6250390649415588
Total Timesteps: 80009 Episode Num: 6215 

Total Timesteps: 80962 Episode Num: 6289 Reward: -6.102457708772467 Epsilon: 0.6175736765396111
Total Timesteps: 80975 Episode Num: 6290 Reward: -7.3510798814573155 Epsilon: 0.6174745291756715
Total Timesteps: 80988 Episode Num: 6291 Reward: -6.448297099799374 Epsilon: 0.6173754136415271
Total Timesteps: 81001 Episode Num: 6292 Reward: -7.975904361108232 Epsilon: 0.6172763299218528
Total Timesteps: 81014 Episode Num: 6293 Reward: -4.054414319571312 Epsilon: 0.6171772780013332
Total Timesteps: 81026 Episode Num: 6294 Reward: -4.4787876155228945 Epsilon: 0.61708587367018
Total Timesteps: 81039 Episode Num: 6295 Reward: -5.502029425200535 Epsilon: 0.6169868828588704
Total Timesteps: 81052 Episode Num: 6296 Reward: -6.3888306743931835 Epsilon: 0.6168879238020036
Total Timesteps: 81065 Episode Num: 6297 Reward: -8.96528569792326 Epsilon: 0.6167889964843027
Total Timesteps: 81077 Episode Num: 6298 Reward: -6.290346155657657 Epsilon: 0.6166977071179249
Total Timesteps: 81090 Episode Num: 6299

Total Timesteps: 82076 Episode Num: 6375 Reward: -7.332732583921221 Epsilon: 0.6091914810663288
Total Timesteps: 82090 Episode Num: 6376 Reward: -7.223490276838751 Epsilon: 0.6090875867949811
Total Timesteps: 82103 Episode Num: 6377 Reward: -5.530394842791811 Epsilon: 0.6089911452687478
Total Timesteps: 82116 Episode Num: 6378 Reward: -6.599369758290685 Epsilon: 0.608894734278338
Total Timesteps: 82130 Episode Num: 6379 Reward: -8.435813718451074 Epsilon: 0.6087909411907951
Total Timesteps: 82143 Episode Num: 6380 Reward: -4.131117503852479 Epsilon: 0.6086945935746199
Total Timesteps: 82156 Episode Num: 6381 Reward: -8.753234835488085 Epsilon: 0.6085982764496811
Total Timesteps: 82169 Episode Num: 6382 Reward: -5.682499938332688 Epsilon: 0.6085019898015066
Total Timesteps: 82182 Episode Num: 6383 Reward: -7.216323146624649 Epsilon: 0.6084057336156335
Total Timesteps: 82195 Episode Num: 6384 Reward: -5.344578401811023 Epsilon: 0.6083095078776082
Total Timesteps: 82208 Episode Num: 6385 

Total Timesteps: 83188 Episode Num: 6461 Reward: -6.352538337706449 Epsilon: 0.6010482281098235
Total Timesteps: 83201 Episode Num: 6462 Reward: -6.8273762940828675 Epsilon: 0.6009543154529393
Total Timesteps: 83214 Episode Num: 6463 Reward: -5.472482721558721 Epsilon: 0.6008604321388228
Total Timesteps: 83227 Episode Num: 6464 Reward: -5.2883048079695 Epsilon: 0.6007665781537241
Total Timesteps: 83240 Episode Num: 6465 Reward: -8.182194876495332 Epsilon: 0.6006727534839019
Total Timesteps: 83252 Episode Num: 6466 Reward: -5.839793275248377 Epsilon: 0.6005861721039735
Total Timesteps: 83266 Episode Num: 6467 Reward: -5.205935450546986 Epsilon: 0.6004851920351644
Total Timesteps: 83280 Episode Num: 6468 Reward: -7.6682779824676235 Epsilon: 0.6003842459173871
Total Timesteps: 83293 Episode Num: 6469 Reward: -6.150612699273648 Epsilon: 0.6002905406216609
Total Timesteps: 83306 Episode Num: 6470 Reward: -6.6814183445514725 Epsilon: 0.6001968645715795
Total Timesteps: 83318 Episode Num: 647

Total Timesteps: 84298 Episode Num: 6547 Reward: -5.693677198962145 Epsilon: 0.5931338821798856
Total Timesteps: 84310 Episode Num: 6548 Reward: -5.433632358730702 Epsilon: 0.5930494603249911
Total Timesteps: 84323 Episode Num: 6549 Reward: -7.469755824993755 Epsilon: 0.5929580304306061
Total Timesteps: 84335 Episode Num: 6550 Reward: -5.317388088007513 Epsilon: 0.5928736586233474
Total Timesteps: 84347 Episode Num: 6551 Reward: -5.881768763032555 Epsilon: 0.5927893108231472
Total Timesteps: 84360 Episode Num: 6552 Reward: -6.310205703073269 Epsilon: 0.5926979611190137
Total Timesteps: 84373 Episode Num: 6553 Reward: -6.786840315376644 Epsilon: 0.5926066395647897
Total Timesteps: 84386 Episode Num: 6554 Reward: -8.471617723964899 Epsilon: 0.5925153461474653
Total Timesteps: 84399 Episode Num: 6555 Reward: -7.412142928775006 Epsilon: 0.5924240808540385
Total Timesteps: 84412 Episode Num: 6556 Reward: -6.16288340626226 Epsilon: 0.592332843671516
Total Timesteps: 84425 Episode Num: 6557 R

Total Timesteps: 85375 Episode Num: 6631 Reward: -7.371325783243913 Epsilon: 0.5856515373352855
Total Timesteps: 85388 Episode Num: 6632 Reward: -6.983683314340068 Epsilon: 0.5855623741040896
Total Timesteps: 85400 Episode Num: 6633 Reward: -6.556577695212477 Epsilon: 0.585480093676815
Total Timesteps: 85413 Episode Num: 6634 Reward: -4.873119039288135 Epsilon: 0.5853909826373035
Total Timesteps: 85426 Episode Num: 6635 Reward: -6.107610485467458 Epsilon: 0.5853018987193594
Total Timesteps: 85439 Episode Num: 6636 Reward: -6.260185860892517 Epsilon: 0.5852128419106029
Total Timesteps: 85452 Episode Num: 6637 Reward: -8.001535362075378 Epsilon: 0.5851238121986613
Total Timesteps: 85465 Episode Num: 6638 Reward: -5.473472837262223 Epsilon: 0.5850348095711695
Total Timesteps: 85478 Episode Num: 6639 Reward: -7.16084735671732 Epsilon: 0.5849458340157702
Total Timesteps: 85490 Episode Num: 6640 Reward: -6.630354224574694 Epsilon: 0.5848637267516669
Total Timesteps: 85502 Episode Num: 6641 R

Total Timesteps: 86481 Episode Num: 6717 Reward: -5.148498587498022 Epsilon: 0.578161677131393
Total Timesteps: 86493 Episode Num: 6718 Reward: -6.521838483033616 Epsilon: 0.5780814632397997
Total Timesteps: 86506 Episode Num: 6719 Reward: -5.631939844449069 Epsilon: 0.5779945899706379
Total Timesteps: 86519 Episode Num: 6720 Reward: -5.157931947278062 Epsilon: 0.5779077428079381
Total Timesteps: 86532 Episode Num: 6721 Reward: -5.556308040419811 Epsilon: 0.5778209217399344
Total Timesteps: 86544 Episode Num: 6722 Reward: -5.714835071188919 Epsilon: 0.5777408023664263
Total Timesteps: 86558 Episode Num: 6723 Reward: -6.469729909149466 Epsilon: 0.5776473578409852
Total Timesteps: 86571 Episode Num: 6724 Reward: -4.5321530618541335 Epsilon: 0.5775606149865429
Total Timesteps: 86584 Episode Num: 6725 Reward: -6.670973083680258 Epsilon: 0.5774738981798023
Total Timesteps: 86598 Episode Num: 6726 Reward: -6.4146616439342505 Epsilon: 0.577380539966281
Total Timesteps: 86610 Episode Num: 6727

Total Timesteps: 87591 Episode Num: 6803 Reward: -6.108003414793425 Epsilon: 0.570834903129317
Total Timesteps: 87603 Episode Num: 6804 Reward: -5.926393046619288 Epsilon: 0.5707567092451171
Total Timesteps: 87615 Episode Num: 6805 Reward: -7.494564110349801 Epsilon: 0.5706785367802317
Total Timesteps: 87628 Episode Num: 6806 Reward: -7.3360845562827395 Epsilon: 0.5705938741041676
Total Timesteps: 87641 Episode Num: 6807 Reward: -6.3622933697744255 Epsilon: 0.5705092365445397
Total Timesteps: 87654 Episode Num: 6808 Reward: -6.153272729208186 Epsilon: 0.5704246240901727
Total Timesteps: 87667 Episode Num: 6809 Reward: -9.711629895198381 Epsilon: 0.5703400367298984
Total Timesteps: 87680 Episode Num: 6810 Reward: -8.074489762721068 Epsilon: 0.5702554744525548
Total Timesteps: 87692 Episode Num: 6811 Reward: -5.4009282870535005 Epsilon: 0.5701774392190849
Total Timesteps: 87705 Episode Num: 6812 Reward: -6.328805770580758 Epsilon: 0.5700929251467989
Total Timesteps: 87718 Episode Num: 68

Total Timesteps: 88697 Episode Num: 6889 Reward: -5.8940675099580915 Epsilon: 0.5637169239094896
Total Timesteps: 88709 Episode Num: 6890 Reward: -5.645603288591966 Epsilon: 0.5636406678014632
Total Timesteps: 88721 Episode Num: 6891 Reward: -6.632763325057207 Epsilon: 0.5635644323215473
Total Timesteps: 88733 Episode Num: 6892 Reward: -5.861795352764606 Epsilon: 0.5634882174613729
Total Timesteps: 88746 Episode Num: 6893 Reward: -6.525970946166982 Epsilon: 0.5634056746219548
Total Timesteps: 88758 Episode Num: 6894 Reward: -6.530146367465336 Epsilon: 0.563329502692715
Total Timesteps: 88771 Episode Num: 6895 Reward: -6.611905036711643 Epsilon: 0.5632470063421613
Total Timesteps: 88784 Episode Num: 6896 Reward: -9.028037485238434 Epsilon: 0.5631645341502973
Total Timesteps: 88797 Episode Num: 6897 Reward: -6.674835824712314 Epsilon: 0.5630820861065126
Total Timesteps: 88810 Episode Num: 6898 Reward: -6.507025258852547 Epsilon: 0.5629996622002027
Total Timesteps: 88822 Episode Num: 6899

Total Timesteps: 89804 Episode Num: 6975 Reward: -7.0761568652834335 Epsilon: 0.5567680726916395
Total Timesteps: 89817 Episode Num: 6976 Reward: -4.88179448362258 Epsilon: 0.5566874867786722
Total Timesteps: 89830 Episode Num: 6977 Reward: -6.604070559187916 Epsilon: 0.5566069241901369
Total Timesteps: 89843 Episode Num: 6978 Reward: -4.709145056358538 Epsilon: 0.5565263849159089
Total Timesteps: 89855 Episode Num: 6979 Reward: -6.05409256104626 Epsilon: 0.5564520616548885
Total Timesteps: 89868 Episode Num: 6980 Reward: -8.702493764527713 Epsilon: 0.5563715671874304
Total Timesteps: 89881 Episode Num: 6981 Reward: -6.476697120260621 Epsilon: 0.5562910960047174
Total Timesteps: 89894 Episode Num: 6982 Reward: -7.549390034639861 Epsilon: 0.5562106480966471
Total Timesteps: 89907 Episode Num: 6983 Reward: -5.590202735271993 Epsilon: 0.5561302234531238
Total Timesteps: 89920 Episode Num: 6984 Reward: -5.445368090256591 Epsilon: 0.556049822064057
Total Timesteps: 89933 Episode Num: 6985 R

Total Timesteps: 90888 Episode Num: 7059 Reward: -8.204892867874735 Epsilon: 0.5501276296100696
Total Timesteps: 90900 Episode Num: 7060 Reward: -5.143056313654423 Epsilon: 0.5500550055005501
Total Timesteps: 90912 Episode Num: 7061 Reward: -8.249724865517177 Epsilon: 0.5499824005631819
Total Timesteps: 90925 Episode Num: 7062 Reward: -6.713441021951836 Epsilon: 0.5499037668408029
Total Timesteps: 90938 Episode Num: 7063 Reward: -6.629257955404437 Epsilon: 0.549825155600519
Total Timesteps: 90952 Episode Num: 7064 Reward: -5.935895753726673 Epsilon: 0.5497405224733926
Total Timesteps: 90965 Episode Num: 7065 Reward: -8.19820754172318 Epsilon: 0.549661957895894
Total Timesteps: 90978 Episode Num: 7066 Reward: -6.0608507047832525 Epsilon: 0.5495834157708457
Total Timesteps: 90991 Episode Num: 7067 Reward: -5.868369722835443 Epsilon: 0.5495048960886242
Total Timesteps: 91005 Episode Num: 7068 Reward: -9.32365451445589 Epsilon: 0.5494203615185979
Total Timesteps: 91018 Episode Num: 7069 Re

Total Timesteps: 91994 Episode Num: 7145 Reward: -7.201436237187211 Epsilon: 0.543513707415701
Total Timesteps: 92007 Episode Num: 7146 Reward: -6.627557704011675 Epsilon: 0.5434369124088385
Total Timesteps: 92020 Episode Num: 7147 Reward: -7.753463506265054 Epsilon: 0.5433601391001956
Total Timesteps: 92033 Episode Num: 7148 Reward: -7.261583530972487 Epsilon: 0.5432833874805776
Total Timesteps: 92046 Episode Num: 7149 Reward: -7.094052566376396 Epsilon: 0.5432066575407948
Total Timesteps: 92059 Episode Num: 7150 Reward: -8.15385851254719 Epsilon: 0.5431299492716627
Total Timesteps: 92071 Episode Num: 7151 Reward: -5.609953011567885 Epsilon: 0.5430591608649846
Total Timesteps: 92083 Episode Num: 7152 Reward: -4.964261134080079 Epsilon: 0.5429883909082024
Total Timesteps: 92096 Episode Num: 7153 Reward: -7.789956052098831 Epsilon: 0.542911744266852
Total Timesteps: 92109 Episode Num: 7154 Reward: -6.862397157398387 Epsilon: 0.5428351192608757
Total Timesteps: 92122 Episode Num: 7155 Re

Total Timesteps: 93101 Episode Num: 7231 Reward: -5.397658785769987 Epsilon: 0.5370511594934534
Total Timesteps: 93114 Episode Num: 7232 Reward: -4.731674174521053 Epsilon: 0.5369761797366669
Total Timesteps: 93127 Episode Num: 7233 Reward: -6.07188497651955 Epsilon: 0.5369012209133763
Total Timesteps: 93140 Episode Num: 7234 Reward: -7.307828011317834 Epsilon: 0.5368262830148164
Total Timesteps: 93153 Episode Num: 7235 Reward: -5.70032686617372 Epsilon: 0.5367513660322265
Total Timesteps: 93167 Episode Num: 7236 Reward: -5.773258402864893 Epsilon: 0.5366707095860123
Total Timesteps: 93180 Episode Num: 7237 Reward: -5.622283126250268 Epsilon: 0.5365958360163126
Total Timesteps: 93192 Episode Num: 7238 Reward: -4.8102253503569194 Epsilon: 0.5365267404927462
Total Timesteps: 93205 Episode Num: 7239 Reward: -5.52090820684002 Epsilon: 0.5364519070865297
Total Timesteps: 93218 Episode Num: 7240 Reward: -6.419799014173012 Epsilon: 0.5363770945525542
Total Timesteps: 93230 Episode Num: 7241 R

Total Timesteps: 94210 Episode Num: 7317 Reward: -4.721830629677908 Epsilon: 0.5307292219509606
Total Timesteps: 94223 Episode Num: 7318 Reward: -7.747564639489873 Epsilon: 0.5306559969434215
Total Timesteps: 94237 Episode Num: 7319 Reward: -5.43519185888955 Epsilon: 0.5305771618366459
Total Timesteps: 94249 Episode Num: 7320 Reward: -4.772391775474896 Epsilon: 0.5305096075289923
Total Timesteps: 94262 Episode Num: 7321 Reward: -6.430973862366793 Epsilon: 0.5304364431053871
Total Timesteps: 94275 Episode Num: 7322 Reward: -5.388260366842693 Epsilon: 0.5303632988597189
Total Timesteps: 94288 Episode Num: 7323 Reward: -6.404216633613167 Epsilon: 0.5302901747836416
Total Timesteps: 94301 Episode Num: 7324 Reward: -4.712108864276646 Epsilon: 0.5302170708688136
Total Timesteps: 94314 Episode Num: 7325 Reward: -5.149168785460704 Epsilon: 0.5301439871068983
Total Timesteps: 94327 Episode Num: 7326 Reward: -6.389074560481963 Epsilon: 0.5300709234895629
Total Timesteps: 94340 Episode Num: 7327 

Total Timesteps: 95295 Episode Num: 7401 Reward: -7.049501187295878 Epsilon: 0.5246864998163597
Total Timesteps: 95308 Episode Num: 7402 Reward: -7.008640354036093 Epsilon: 0.5246149326394427
Total Timesteps: 95321 Episode Num: 7403 Reward: -6.652212547268785 Epsilon: 0.5245433849833719
Total Timesteps: 95334 Episode Num: 7404 Reward: -7.6225261973456675 Epsilon: 0.524471856840162
Total Timesteps: 95347 Episode Num: 7405 Reward: -6.436832771876814 Epsilon: 0.5244003482018312
Total Timesteps: 95360 Episode Num: 7406 Reward: -9.190496312859233 Epsilon: 0.5243288590604027
Total Timesteps: 95373 Episode Num: 7407 Reward: -5.02320179586331 Epsilon: 0.5242573894079037
Total Timesteps: 95385 Episode Num: 7408 Reward: -5.626422621462263 Epsilon: 0.5241914347119568
Total Timesteps: 95398 Episode Num: 7409 Reward: -9.264360118542443 Epsilon: 0.5241200025157761
Total Timesteps: 95411 Episode Num: 7410 Reward: -7.09797965310241 Epsilon: 0.5240485897852449
Total Timesteps: 95423 Episode Num: 7411 R

Total Timesteps: 96397 Episode Num: 7487 Reward: -6.722624712175739 Epsilon: 0.5186883409234727
Total Timesteps: 96410 Episode Num: 7488 Reward: -6.190863757486793 Epsilon: 0.5186184005808526
Total Timesteps: 96423 Episode Num: 7489 Reward: -5.731273960565878 Epsilon: 0.5185484790973108
Total Timesteps: 96436 Episode Num: 7490 Reward: -7.576442994669255 Epsilon: 0.5184785764652204
Total Timesteps: 96449 Episode Num: 7491 Reward: -5.2851602996254625 Epsilon: 0.5184086926769588
Total Timesteps: 96462 Episode Num: 7492 Reward: -5.151474781802514 Epsilon: 0.5183388277249072
Total Timesteps: 96475 Episode Num: 7493 Reward: -5.5761215171503125 Epsilon: 0.5182689816014512
Total Timesteps: 96488 Episode Num: 7494 Reward: -5.366029223523867 Epsilon: 0.5181991542989802
Total Timesteps: 96501 Episode Num: 7495 Reward: -4.654299217294788 Epsilon: 0.518129345809888
Total Timesteps: 96514 Episode Num: 7496 Reward: -5.727889764299638 Epsilon: 0.5180595561265723
Total Timesteps: 96527 Episode Num: 749

Total Timesteps: 97499 Episode Num: 7573 Reward: -7.785945581102058 Epsilon: 0.5128257725720263
Total Timesteps: 97513 Episode Num: 7574 Reward: -6.902913130876505 Epsilon: 0.5127521458677304
Total Timesteps: 97526 Episode Num: 7575 Reward: -6.382491732902906 Epsilon: 0.5126837971412751
Total Timesteps: 97539 Episode Num: 7576 Reward: -8.119646493318328 Epsilon: 0.5126154666338593
Total Timesteps: 97552 Episode Num: 7577 Reward: -5.465731065369473 Epsilon: 0.5125471543381991
Total Timesteps: 97565 Episode Num: 7578 Reward: -7.292987127286633 Epsilon: 0.5124788602470148
Total Timesteps: 97577 Episode Num: 7579 Reward: -6.0386190153348105 Epsilon: 0.5124158356989864
Total Timesteps: 97590 Episode Num: 7580 Reward: -6.514468688183418 Epsilon: 0.5123475765959628
Total Timesteps: 97604 Episode Num: 7581 Reward: -5.975531234841836 Epsilon: 0.5122740871275767
Total Timesteps: 97617 Episode Num: 7582 Reward: -6.378055222816712 Epsilon: 0.5122058657815749
Total Timesteps: 97630 Episode Num: 758

Total Timesteps: 98612 Episode Num: 7659 Reward: -5.215606866538529 Epsilon: 0.5070376830406036
Total Timesteps: 98625 Episode Num: 7660 Reward: -6.939625080966932 Epsilon: 0.5069708491761724
Total Timesteps: 98639 Episode Num: 7661 Reward: -5.501657059512197 Epsilon: 0.5068988939466134
Total Timesteps: 98652 Episode Num: 7662 Reward: -5.487648680876019 Epsilon: 0.5068320966630174
Total Timesteps: 98664 Episode Num: 7663 Reward: -6.063597249311553 Epsilon: 0.5067704532554934
Total Timesteps: 98677 Episode Num: 7664 Reward: -5.468363060631875 Epsilon: 0.5067036898162692
Total Timesteps: 98690 Episode Num: 7665 Reward: -6.057292839844053 Epsilon: 0.506636943965954
Total Timesteps: 98704 Episode Num: 7666 Reward: -6.76913482837968 Epsilon: 0.5065650834819257
Total Timesteps: 98717 Episode Num: 7667 Reward: -5.226200016529098 Epsilon: 0.506498374140219
Total Timesteps: 98730 Episode Num: 7668 Reward: -9.256648004126337 Epsilon: 0.5064316823660489
Total Timesteps: 98743 Episode Num: 7669 Re

Total Timesteps: 99723 Episode Num: 7745 Reward: -4.962190501494596 Epsilon: 0.501388847106485
Total Timesteps: 99736 Episode Num: 7746 Reward: -6.554112877870846 Epsilon: 0.5013234940242239
Total Timesteps: 99749 Episode Num: 7747 Reward: -7.642811836464186 Epsilon: 0.5012581579765211
Total Timesteps: 99762 Episode Num: 7748 Reward: -8.062421433980735 Epsilon: 0.501192838956717
Total Timesteps: 99775 Episode Num: 7749 Reward: -7.959463841696871 Epsilon: 0.5011275369581558
Total Timesteps: 99788 Episode Num: 7750 Reward: -6.867261568623966 Epsilon: 0.5010622519741853
Total Timesteps: 99800 Episode Num: 7751 Reward: -6.160678213496657 Epsilon: 0.501002004008016
Total Timesteps: 99813 Episode Num: 7752 Reward: -4.686298896943857 Epsilon: 0.5009367517257272
Total Timesteps: 99826 Episode Num: 7753 Reward: -7.030351356726905 Epsilon: 0.5008715164386032
Total Timesteps: 99839 Episode Num: 7754 Reward: -8.03337245131268 Epsilon: 0.5008062981400054
Total Timesteps: 99851 Episode Num: 7755 Rew

Total Timesteps: 100795 Episode Num: 7828 Reward: -8.207545701848721 Epsilon: 0.4960563520015874
Total Timesteps: 100808 Episode Num: 7829 Reward: -8.773490859032087 Epsilon: 0.4959923815570193
Total Timesteps: 100820 Episode Num: 7830 Reward: -5.242959767282836 Epsilon: 0.4959333465582226
Total Timesteps: 100833 Episode Num: 7831 Reward: -6.381222303970288 Epsilon: 0.4958694078327532
Total Timesteps: 100846 Episode Num: 7832 Reward: -8.325355887743909 Epsilon: 0.4958054855918926
Total Timesteps: 100859 Episode Num: 7833 Reward: -5.803671368561287 Epsilon: 0.4957415798292666
Total Timesteps: 100871 Episode Num: 7834 Reward: -7.3489240700036085 Epsilon: 0.49568260451467716
Total Timesteps: 100884 Episode Num: 7835 Reward: -6.736731185355595 Epsilon: 0.49561873042306015
Total Timesteps: 100897 Episode Num: 7836 Reward: -6.353227584739767 Epsilon: 0.49555487279106414
Total Timesteps: 100910 Episode Num: 7837 Reward: -6.72494356124942 Epsilon: 0.4954910316123278
Total Timesteps: 100923 Epi

Total Timesteps: 101895 Episode Num: 7913 Reward: -5.4821688623749365 Epsilon: 0.49070121203199374
Total Timesteps: 101908 Episode Num: 7914 Reward: -7.447081381501063 Epsilon: 0.49063861522157237
Total Timesteps: 101921 Episode Num: 7915 Reward: -5.365490694779108 Epsilon: 0.4905760343795685
Total Timesteps: 101935 Episode Num: 7916 Reward: -7.15177574649427 Epsilon: 0.4905086574778045
Total Timesteps: 101948 Episode Num: 7917 Reward: -4.501705470714605 Epsilon: 0.4904461097814572
Total Timesteps: 101960 Episode Num: 7918 Reward: -6.3006090657903755 Epsilon: 0.49038838760298153
Total Timesteps: 101973 Episode Num: 7919 Reward: -5.360744296680697 Epsilon: 0.4903258705735832
Total Timesteps: 101986 Episode Num: 7920 Reward: -6.0677881076788545 Epsilon: 0.49026336948208576
Total Timesteps: 101999 Episode Num: 7921 Reward: -6.545467947167859 Epsilon: 0.49020088432239534
Total Timesteps: 102011 Episode Num: 7922 Reward: -6.398471003146651 Epsilon: 0.49014321984883985
Total Timesteps: 10202

Total Timesteps: 102979 Episode Num: 7998 Reward: -6.59562090925632 Epsilon: 0.4855358859573311
Total Timesteps: 102991 Episode Num: 7999 Reward: -6.534252012097931 Epsilon: 0.4854793137264421
Total Timesteps: 103004 Episode Num: 8000 Reward: -4.893976937582686 Epsilon: 0.4854180420177857
Total Timesteps: 103017 Episode Num: 8001 Reward: -6.297374725589061 Epsilon: 0.4853567857732219
Total Timesteps: 103030 Episode Num: 8002 Reward: -6.617027813062965 Epsilon: 0.48529554498689703
Total Timesteps: 103043 Episode Num: 8003 Reward: -6.2024473766532005 Epsilon: 0.48523431965296043
Total Timesteps: 103057 Episode Num: 8004 Reward: -6.742878477642829 Epsilon: 0.48516840195231764
Total Timesteps: 103069 Episode Num: 8005 Reward: -6.010482269034473 Epsilon: 0.48511191531886405
Total Timesteps: 103081 Episode Num: 8006 Reward: -5.84562968915234 Epsilon: 0.48505544183700194
Total Timesteps: 103094 Episode Num: 8007 Reward: -6.86567226663229 Epsilon: 0.4849942770675306
Total Timesteps: 103107 Epi

Total Timesteps: 104080 Episode Num: 8083 Reward: -7.370893931299214 Epsilon: 0.4803996925441968
Total Timesteps: 104093 Episode Num: 8084 Reward: -7.457413043065715 Epsilon: 0.4803396962331761
Total Timesteps: 104107 Episode Num: 8085 Reward: -7.439298717784991 Epsilon: 0.480275101578184
Total Timesteps: 104121 Episode Num: 8086 Reward: -5.8445367030965905 Epsilon: 0.4802105242938504
Total Timesteps: 104133 Episode Num: 8087 Reward: -5.930930835964439 Epsilon: 0.48015518615616565
Total Timesteps: 104145 Episode Num: 8088 Reward: -5.912753726933466 Epsilon: 0.4800998607710404
Total Timesteps: 104157 Episode Num: 8089 Reward: -6.019244553052871 Epsilon: 0.48004454813406683
Total Timesteps: 104170 Episode Num: 8090 Reward: -4.920915392814086 Epsilon: 0.47998464049150424
Total Timesteps: 104182 Episode Num: 8091 Reward: -6.004592954932381 Epsilon: 0.47992935439903245
Total Timesteps: 104196 Episode Num: 8092 Reward: -6.037462007515629 Epsilon: 0.4798648700525932
Total Timesteps: 104209 Ep

Total Timesteps: 105147 Episode Num: 8166 Reward: -6.2319468086802345 Epsilon: 0.47552474155230295
Total Timesteps: 105160 Episode Num: 8167 Reward: -6.263532078769069 Epsilon: 0.47546595663750474
Total Timesteps: 105173 Episode Num: 8168 Reward: -5.1244332477321075 Epsilon: 0.4754071862550274
Total Timesteps: 105185 Episode Num: 8169 Reward: -7.020946445060672 Epsilon: 0.47535294956505203
Total Timesteps: 105198 Episode Num: 8170 Reward: -6.661455502255559 Epsilon: 0.47529420711420367
Total Timesteps: 105211 Episode Num: 8171 Reward: -7.645559647563591 Epsilon: 0.47523547917993364
Total Timesteps: 105224 Episode Num: 8172 Reward: -7.8387775227657945 Epsilon: 0.4751767657568616
Total Timesteps: 105237 Episode Num: 8173 Reward: -5.418265781573505 Epsilon: 0.47511806683960967
Total Timesteps: 105250 Episode Num: 8174 Reward: -6.0976218386160905 Epsilon: 0.4750593824228028
Total Timesteps: 105263 Episode Num: 8175 Reward: -6.330983102653555 Epsilon: 0.47500071250106873
Total Timesteps: 10

Total Timesteps: 106239 Episode Num: 8251 Reward: -8.673737114730365 Epsilon: 0.47063696006174754
Total Timesteps: 106252 Episode Num: 8252 Reward: -6.654788225396824 Epsilon: 0.47057937732936794
Total Timesteps: 106265 Episode Num: 8253 Reward: -5.287664358646713 Epsilon: 0.4705218086858326
Total Timesteps: 106277 Episode Num: 8254 Reward: -4.868787049250956 Epsilon: 0.4704686808999125
Total Timesteps: 106290 Episode Num: 8255 Reward: -7.380892445684805 Epsilon: 0.4704111393357795
Total Timesteps: 106303 Episode Num: 8256 Reward: -5.69328359693891 Epsilon: 0.4703536118453854
Total Timesteps: 106316 Episode Num: 8257 Reward: -8.081262658164473 Epsilon: 0.47029609842356745
Total Timesteps: 106329 Episode Num: 8258 Reward: -5.810721478409879 Epsilon: 0.4702385990651657
Total Timesteps: 106341 Episode Num: 8259 Reward: -5.182617000921105 Epsilon: 0.47018553521219475
Total Timesteps: 106354 Episode Num: 8260 Reward: -5.968083987138877 Epsilon: 0.4701280628843297
Total Timesteps: 106367 Epi

Total Timesteps: 107329 Episode Num: 8336 Reward: -8.847061760306662 Epsilon: 0.4658573172208816
Total Timesteps: 107341 Episode Num: 8337 Reward: -5.8250068157893065 Epsilon: 0.4658052375140906
Total Timesteps: 107354 Episode Num: 8338 Reward: -5.020619095423336 Epsilon: 0.4657488309704343
Total Timesteps: 107367 Episode Num: 8339 Reward: -4.725338370420205 Epsilon: 0.4656924380861904
Total Timesteps: 107379 Episode Num: 8340 Reward: -6.141055662507391 Epsilon: 0.4656403952355675
Total Timesteps: 107393 Episode Num: 8341 Reward: -7.668196970013728 Epsilon: 0.46557969327609805
Total Timesteps: 107405 Episode Num: 8342 Reward: -4.664284063974161 Epsilon: 0.4655276756203156
Total Timesteps: 107418 Episode Num: 8343 Reward: -7.206566703166615 Epsilon: 0.46547133627511217
Total Timesteps: 107431 Episode Num: 8344 Reward: -6.426186978279938 Epsilon: 0.46541501056492074
Total Timesteps: 107444 Episode Num: 8345 Reward: -6.045654333666261 Epsilon: 0.46535869848479205
Total Timesteps: 107457 E

Total Timesteps: 108422 Episode Num: 8421 Reward: -5.163616264309666 Epsilon: 0.46116101898138756
Total Timesteps: 108435 Episode Num: 8422 Reward: -7.133451955015487 Epsilon: 0.4611057315442431
Total Timesteps: 108448 Episode Num: 8423 Reward: -7.570340032119097 Epsilon: 0.4610504573620537
Total Timesteps: 108462 Episode Num: 8424 Reward: -7.453044727493729 Epsilon: 0.4609909461378179
Total Timesteps: 108474 Episode Num: 8425 Reward: -5.33957893575294 Epsilon: 0.4609399487434777
Total Timesteps: 108487 Episode Num: 8426 Reward: -6.443850154892596 Epsilon: 0.4608847142975656
Total Timesteps: 108500 Episode Num: 8427 Reward: -6.6887170985642115 Epsilon: 0.4608294930875576
Total Timesteps: 108513 Episode Num: 8428 Reward: -6.342808876864151 Epsilon: 0.46077428510869667
Total Timesteps: 108526 Episode Num: 8429 Reward: -5.400507569602533 Epsilon: 0.460719090356228
Total Timesteps: 108539 Episode Num: 8430 Reward: -7.627348453165321 Epsilon: 0.46066390882539915
Total Timesteps: 108552 Epis

Total Timesteps: 109513 Episode Num: 8506 Reward: -6.118619919053482 Epsilon: 0.45656680028855023
Total Timesteps: 109525 Episode Num: 8507 Reward: -6.5533687982844455 Epsilon: 0.45651677699155446
Total Timesteps: 109538 Episode Num: 8508 Reward: -7.492023308019522 Epsilon: 0.45646259745476453
Total Timesteps: 109550 Episode Num: 8509 Reward: -5.5670454464711385 Epsilon: 0.45641259698767683
Total Timesteps: 109562 Episode Num: 8510 Reward: -5.180692994562552 Epsilon: 0.4563626074733941
Total Timesteps: 109575 Episode Num: 8511 Reward: -7.177994955650102 Epsilon: 0.4563084645220169
Total Timesteps: 109588 Episode Num: 8512 Reward: -5.8477510268649535 Epsilon: 0.45625433441617697
Total Timesteps: 109600 Episode Num: 8513 Reward: -6.365018635898199 Epsilon: 0.4562043795620438
Total Timesteps: 109612 Episode Num: 8514 Reward: -5.644972212424069 Epsilon: 0.4561544356457322
Total Timesteps: 109625 Episode Num: 8515 Reward: -5.9003511516582146 Epsilon: 0.45610034207525657
Total Timesteps: 109

Total Timesteps: 110580 Episode Num: 8589 Reward: -6.736440466860896 Epsilon: 0.4521613311629589
Total Timesteps: 110593 Episode Num: 8590 Reward: -5.731720495112702 Epsilon: 0.45210818044541695
Total Timesteps: 110607 Episode Num: 8591 Reward: -6.322607878494003 Epsilon: 0.4520509551836683
Total Timesteps: 110620 Episode Num: 8592 Reward: -6.308592690615707 Epsilon: 0.451997830410414
Total Timesteps: 110633 Episode Num: 8593 Reward: -8.877378105669562 Epsilon: 0.4519447181220793
Total Timesteps: 110645 Episode Num: 8594 Reward: -6.814040311109735 Epsilon: 0.4518957024718695
Total Timesteps: 110658 Episode Num: 8595 Reward: -6.520545073620665 Epsilon: 0.4518426141806286
Total Timesteps: 110671 Episode Num: 8596 Reward: -7.939871743055879 Epsilon: 0.4517895383614497
Total Timesteps: 110683 Episode Num: 8597 Reward: -4.588831550613782 Epsilon: 0.4517405563636692
Total Timesteps: 110696 Episode Num: 8598 Reward: -6.856510059212625 Epsilon: 0.45168750451687506
Total Timesteps: 110709 Episo

Total Timesteps: 111672 Episode Num: 8674 Reward: -5.45429647756375 Epsilon: 0.4477398094419371
Total Timesteps: 111685 Episode Num: 8675 Reward: -8.02115436952131 Epsilon: 0.44768769306531764
Total Timesteps: 111699 Episode Num: 8676 Reward: -8.830754867664014 Epsilon: 0.4476315813033241
Total Timesteps: 111712 Episode Num: 8677 Reward: -5.816002754610576 Epsilon: 0.44757949011744486
Total Timesteps: 111725 Episode Num: 8678 Reward: -7.949725765627207 Epsilon: 0.44752741105392707
Total Timesteps: 111738 Episode Num: 8679 Reward: -7.054234984941135 Epsilon: 0.4474753441085396
Total Timesteps: 111751 Episode Num: 8680 Reward: -5.263768807249287 Epsilon: 0.44742328927705344
Total Timesteps: 111764 Episode Num: 8681 Reward: -5.860179676207834 Epsilon: 0.4473712465552414
Total Timesteps: 111776 Episode Num: 8682 Reward: -7.16654573073592 Epsilon: 0.4473232178643
Total Timesteps: 111789 Episode Num: 8683 Reward: -8.640565365864235 Epsilon: 0.44727119841844903
Total Timesteps: 111801 Episode

Total Timesteps: 112765 Episode Num: 8759 Reward: -6.9683636121922845 Epsilon: 0.4433999911320002
Total Timesteps: 112778 Episode Num: 8760 Reward: -7.4004050148951235 Epsilon: 0.44334888010072887
Total Timesteps: 112791 Episode Num: 8761 Reward: -6.395569037771557 Epsilon: 0.44329778085130905
Total Timesteps: 112803 Episode Num: 8762 Reward: -7.239375736694526 Epsilon: 0.443250622767125
Total Timesteps: 112815 Episode Num: 8763 Reward: -3.5909613133802405 Epsilon: 0.44320347471524174
Total Timesteps: 112828 Episode Num: 8764 Reward: -6.464315559919179 Epsilon: 0.4431524089764952
Total Timesteps: 112841 Episode Num: 8765 Reward: -4.725642604295725 Epsilon: 0.4431013550039436
Total Timesteps: 112854 Episode Num: 8766 Reward: -7.83366036133118 Epsilon: 0.4430503127935208
Total Timesteps: 112867 Episode Num: 8767 Reward: -7.804027752154765 Epsilon: 0.4429992823411626
Total Timesteps: 112880 Episode Num: 8768 Reward: -6.074436471956966 Epsilon: 0.44294826364280654
Total Timesteps: 112893 E

Total Timesteps: 113864 Episode Num: 8844 Reward: -6.459797601423327 Epsilon: 0.43912035410665357
Total Timesteps: 113877 Episode Num: 8845 Reward: -6.931887500077206 Epsilon: 0.43907022489176917
Total Timesteps: 113890 Episode Num: 8846 Reward: -6.774953189713378 Epsilon: 0.4390201071209061
Total Timesteps: 113903 Episode Num: 8847 Reward: -7.546414510009686 Epsilon: 0.438970000790146
Total Timesteps: 113916 Episode Num: 8848 Reward: -7.135517734517035 Epsilon: 0.43891990589557217
Total Timesteps: 113929 Episode Num: 8849 Reward: -6.787717255868442 Epsilon: 0.43886982243326983
Total Timesteps: 113942 Episode Num: 8850 Reward: -5.443113199996994 Epsilon: 0.43881975039932597
Total Timesteps: 113955 Episode Num: 8851 Reward: -8.666621654039604 Epsilon: 0.43876968978982933
Total Timesteps: 113968 Episode Num: 8852 Reward: -6.54080394778967 Epsilon: 0.43871964060087043
Total Timesteps: 113981 Episode Num: 8853 Reward: -7.946031792047307 Epsilon: 0.4386696028285416
Total Timesteps: 113994 E

Total Timesteps: 114958 Episode Num: 8929 Reward: -5.041547175698189 Epsilon: 0.434941456879904
Total Timesteps: 114971 Episode Num: 8930 Reward: -5.472955942573178 Epsilon: 0.43489227718294177
Total Timesteps: 114985 Episode Num: 8931 Reward: -5.810005138527606 Epsilon: 0.434839326868722
Total Timesteps: 114998 Episode Num: 8932 Reward: -6.973792235101691 Epsilon: 0.43479017026383066
Total Timesteps: 115010 Episode Num: 8933 Reward: -4.8551387247635285 Epsilon: 0.43474480479958266
Total timesteps: 115010 Epsilon: 0.43474480479958266
---------------------------------------
Average Reward over the Evaluation Step: -5.956508
---------------------------------------
Total Timesteps: 115022 Episode Num: 8934 Reward: -7.00969738313765 Epsilon: 0.4346994488010989
Total Timesteps: 115035 Episode Num: 8935 Reward: -5.090301074485387 Epsilon: 0.43465032381449126
Total Timesteps: 115048 Episode Num: 8936 Reward: -8.947266882679013 Epsilon: 0.4346012099297684
Total Timesteps: 115061 Episode Num: 8

Total Timesteps: 116026 Episode Num: 9012 Reward: -5.240225299607501 Epsilon: 0.4309378932308276
Total Timesteps: 116039 Episode Num: 9013 Reward: -6.325481847404686 Epsilon: 0.43088961469850656
Total Timesteps: 116052 Episode Num: 9014 Reward: -6.1256161659698 Epsilon: 0.4308413469823872
Total Timesteps: 116065 Episode Num: 9015 Reward: -7.158649820064365 Epsilon: 0.43079309007883515
Total Timesteps: 116078 Episode Num: 9016 Reward: -7.431285837812238 Epsilon: 0.43074484398421753
Total Timesteps: 116091 Episode Num: 9017 Reward: -6.726469625144767 Epsilon: 0.4306966086949031
Total Timesteps: 116104 Episode Num: 9018 Reward: -6.586529578294175 Epsilon: 0.4306483842072625
Total Timesteps: 116117 Episode Num: 9019 Reward: -7.5227350239987 Epsilon: 0.4306001705176675
Total Timesteps: 116130 Episode Num: 9020 Reward: -5.729781735240418 Epsilon: 0.43055196762249204
Total Timesteps: 116142 Episode Num: 9021 Reward: -6.706845805989432 Epsilon: 0.430507482220041
Total Timesteps: 116155 Episode

Total Timesteps: 117124 Episode Num: 9097 Reward: -4.3566605356100485 Epsilon: 0.42689798845667837
Total Timesteps: 117137 Episode Num: 9098 Reward: -4.973309149043927 Epsilon: 0.42685061082322406
Total Timesteps: 117151 Episode Num: 9099 Reward: -8.203016290693885 Epsilon: 0.4267996005155739
Total Timesteps: 117164 Episode Num: 9100 Reward: -6.0837437859889825 Epsilon: 0.4267522447168072
Total Timesteps: 117177 Episode Num: 9101 Reward: -4.6760452265718495 Epsilon: 0.4267048994256552
Total Timesteps: 117191 Episode Num: 9102 Reward: -9.054127535897459 Epsilon: 0.42665392393613844
Total Timesteps: 117203 Episode Num: 9103 Reward: -4.870612098726316 Epsilon: 0.4266102403522094
Total Timesteps: 117217 Episode Num: 9104 Reward: -7.268818857640026 Epsilon: 0.4265592874753662
Total Timesteps: 117230 Episode Num: 9105 Reward: -7.541755772542905 Epsilon: 0.4265119849867781
Total Timesteps: 117242 Episode Num: 9106 Reward: -5.404829111795665 Epsilon: 0.4264683304617799
Total Timesteps: 117255 

Total Timesteps: 118224 Episode Num: 9182 Reward: -6.084397570516957 Epsilon: 0.4229259710380295
Total Timesteps: 118237 Episode Num: 9183 Reward: -6.394061145918403 Epsilon: 0.42287947089320604
Total Timesteps: 118250 Episode Num: 9184 Reward: -7.501587151938841 Epsilon: 0.42283298097251587
Total Timesteps: 118263 Episode Num: 9185 Reward: -7.363611602921188 Epsilon: 0.4227865012725874
Total Timesteps: 118276 Episode Num: 9186 Reward: -6.63753189959797 Epsilon: 0.4227400317900504
Total Timesteps: 118289 Episode Num: 9187 Reward: -7.66284898333677 Epsilon: 0.4226935725215362
Total Timesteps: 118302 Episode Num: 9188 Reward: -6.604379821757044 Epsilon: 0.4226471234636777
Total Timesteps: 118315 Episode Num: 9189 Reward: -4.619395490948367 Epsilon: 0.42260068461310907
Total Timesteps: 118328 Episode Num: 9190 Reward: -4.536285228811725 Epsilon: 0.4225542559664661
Total Timesteps: 118341 Episode Num: 9191 Reward: -5.56827845954953 Epsilon: 0.422507837520386
Total Timesteps: 118354 Episode

Total Timesteps: 119319 Episode Num: 9267 Reward: -5.474388465080965 Epsilon: 0.4190447455979349
Total Timesteps: 119332 Episode Num: 9268 Reward: -5.919798173200568 Epsilon: 0.41899909496195487
Total Timesteps: 119345 Episode Num: 9269 Reward: -8.940877996281447 Epsilon: 0.4189534542712305
Total Timesteps: 119358 Episode Num: 9270 Reward: -7.660859417560937 Epsilon: 0.4189078235225121
Total Timesteps: 119372 Episode Num: 9271 Reward: -7.3272174852217615 Epsilon: 0.4188586938310492
Total Timesteps: 119385 Episode Num: 9272 Reward: -6.380491098320441 Epsilon: 0.41881308372073545
Total Timesteps: 119397 Episode Num: 9273 Reward: -4.315851920652353 Epsilon: 0.4187709908959187
Total Timesteps: 119409 Episode Num: 9274 Reward: -5.69618940919799 Epsilon: 0.4187289065313335
Total Timesteps: 119422 Episode Num: 9275 Reward: -6.405528771381835 Epsilon: 0.41868332468054464
Total Timesteps: 119434 Episode Num: 9276 Reward: -5.251640989961815 Epsilon: 0.41864125793325185
Total Timesteps: 119447 Ep

Total Timesteps: 120384 Episode Num: 9350 Reward: -8.768339194276628 Epsilon: 0.41533758639021795
Total Timesteps: 120397 Episode Num: 9351 Reward: -6.048220118334275 Epsilon: 0.4152927398523219
Total Timesteps: 120409 Episode Num: 9352 Reward: -4.476053393850647 Epsilon: 0.4152513516431496
Total Timesteps: 120422 Episode Num: 9353 Reward: -6.324317760940851 Epsilon: 0.41520652372490074
Total Timesteps: 120435 Episode Num: 9354 Reward: -6.5099918301491435 Epsilon: 0.4151617054842861
Total Timesteps: 120447 Episode Num: 9355 Reward: -5.928905969294081 Epsilon: 0.41512034338754805
Total Timesteps: 120460 Episode Num: 9356 Reward: -8.665836295090129 Epsilon: 0.41507554374896233
Total Timesteps: 120473 Episode Num: 9357 Reward: -6.7976546107339155 Epsilon: 0.415030753778855
Total Timesteps: 120486 Episode Num: 9358 Reward: -6.908255935976773 Epsilon: 0.41498597347409655
Total Timesteps: 120498 Episode Num: 9359 Reward: -5.619273465162197 Epsilon: 0.41494464638417233
Total Timesteps: 120511

Total Timesteps: 121472 Episode Num: 9435 Reward: -5.625269459245937 Epsilon: 0.41161749209694415
Total Timesteps: 121485 Episode Num: 9436 Reward: -6.672470475285685 Epsilon: 0.41157344528131046
Total Timesteps: 121498 Episode Num: 9437 Reward: -7.733556231181845 Epsilon: 0.41152940789148795
Total Timesteps: 121511 Episode Num: 9438 Reward: -6.986383383003668 Epsilon: 0.4114853799244513
Total Timesteps: 121523 Episode Num: 9439 Reward: -4.27500649540055 Epsilon: 0.41144474708491396
Total Timesteps: 121536 Episode Num: 9440 Reward: -4.09436251550516 Epsilon: 0.4114007372301211
Total Timesteps: 121549 Episode Num: 9441 Reward: -5.486902398508103 Epsilon: 0.4113567367892784
Total Timesteps: 121563 Episode Num: 9442 Reward: -7.201520989842549 Epsilon: 0.41130936222370296
Total Timesteps: 121575 Episode Num: 9443 Reward: -7.088588397835565 Epsilon: 0.41126876413736374
Total Timesteps: 121587 Episode Num: 9444 Reward: -5.683159455162776 Epsilon: 0.4112281740646615
Total Timesteps: 121599 Ep

Total Timesteps: 122564 Episode Num: 9520 Reward: -7.198269140889129 Epsilon: 0.4079501321758428
Total Timesteps: 122577 Episode Num: 9521 Reward: -6.490965708022329 Epsilon: 0.40790686670419407
Total Timesteps: 122589 Episode Num: 9522 Reward: -6.2371492805284445 Epsilon: 0.40786693749031316
Total Timesteps: 122602 Episode Num: 9523 Reward: -6.953969221810698 Epsilon: 0.4078236896624851
Total Timesteps: 122616 Episode Num: 9524 Reward: -9.393575005148982 Epsilon: 0.40777712533437727
Total Timesteps: 122629 Episode Num: 9525 Reward: -6.469602453448516 Epsilon: 0.4077338965497558
Total Timesteps: 122641 Episode Num: 9526 Reward: -7.126784928117057 Epsilon: 0.4076940011904665
Total Timesteps: 122654 Episode Num: 9527 Reward: -7.357359475484488 Epsilon: 0.4076507900272311
Total Timesteps: 122667 Episode Num: 9528 Reward: -5.912415087950285 Epsilon: 0.4076075880228586
Total Timesteps: 122680 Episode Num: 9529 Reward: -4.481125919767173 Epsilon: 0.40756439517443754
Total Timesteps: 122693 E

Total Timesteps: 123666 Episode Num: 9605 Reward: -7.824733474689863 Epsilon: 0.4043148480584801
Total Timesteps: 123679 Episode Num: 9606 Reward: -6.555127527571939 Epsilon: 0.4042723501968806
Total Timesteps: 123692 Episode Num: 9607 Reward: -4.382962450650902 Epsilon: 0.4042298612683116
Total Timesteps: 123704 Episode Num: 9608 Reward: -5.368251329393616 Epsilon: 0.404190648645153
Total Timesteps: 123718 Episode Num: 9609 Reward: -6.581815410426233 Epsilon: 0.40414491019900095
Total Timesteps: 123731 Episode Num: 9610 Reward: -6.50688542088929 Epsilon: 0.4041024480526303
Total Timesteps: 123743 Episode Num: 9611 Reward: -5.102076819949221 Epsilon: 0.4040632601440081
Total Timesteps: 123756 Episode Num: 9612 Reward: -7.690085368876205 Epsilon: 0.40402081515239663
Total Timesteps: 123770 Episode Num: 9613 Reward: -6.755657032720793 Epsilon: 0.4039751151329078
Total Timesteps: 123783 Episode Num: 9614 Reward: -7.242342675812427 Epsilon: 0.40393268865676224
Total Timesteps: 123796 Episo

Total Timesteps: 124766 Episode Num: 9690 Reward: -8.733181880749894 Epsilon: 0.4007502043826042
Total Timesteps: 124779 Episode Num: 9691 Reward: -7.1585917766846645 Epsilon: 0.400708452544098
Total Timesteps: 124792 Episode Num: 9692 Reward: -3.6634905739572154 Epsilon: 0.400666709404449
Total Timesteps: 124805 Episode Num: 9693 Reward: -4.073917129306671 Epsilon: 0.4006249749609391
Total Timesteps: 124818 Episode Num: 9694 Reward: -6.2843716725387235 Epsilon: 0.400583249210851
Total Timesteps: 124830 Episode Num: 9695 Reward: -5.176705618217544 Epsilon: 0.40054474084755265
Total Timesteps: 124842 Episode Num: 9696 Reward: -6.824817917094134 Epsilon: 0.40050623988721745
Total Timesteps: 124855 Episode Num: 9697 Reward: -8.973981112070074 Epsilon: 0.4004645388650835
Total Timesteps: 124868 Episode Num: 9698 Reward: -8.674800384674505 Epsilon: 0.4004228465259314
Total Timesteps: 124881 Episode Num: 9699 Reward: -5.897243287918031 Epsilon: 0.40038116286704944
Total Timesteps: 124894 Epi

Total Timesteps: 125835 Episode Num: 9773 Reward: -5.726939408304925 Epsilon: 0.39734573052012556
Total Timesteps: 125848 Episode Num: 9774 Reward: -7.226588207623303 Epsilon: 0.39730468501684574
Total Timesteps: 125860 Episode Num: 9775 Reward: -7.00793979465308 Epsilon: 0.3972668043858255
Total Timesteps: 125873 Episode Num: 9776 Reward: -6.752301563273896 Epsilon: 0.39722577518610025
Total Timesteps: 125886 Episode Num: 9777 Reward: -6.088169428502064 Epsilon: 0.3971847544603848
Total Timesteps: 125899 Episode Num: 9778 Reward: -5.613283652887292 Epsilon: 0.3971437422060541
Total Timesteps: 125911 Episode Num: 9779 Reward: -5.8490144259024754 Epsilon: 0.3971058922572293
Total Timesteps: 125923 Episode Num: 9780 Reward: -5.364668703560342 Epsilon: 0.39706804952232716
Total Timesteps: 125936 Episode Num: 9781 Reward: -6.217656912972867 Epsilon: 0.3970270613645026
Total Timesteps: 125949 Episode Num: 9782 Reward: -7.999702299142483 Epsilon: 0.3969860816679767
Total Timesteps: 125961 Ep

Total Timesteps: 126924 Episode Num: 9858 Reward: -6.057523520606055 Epsilon: 0.39393652894645614
Total Timesteps: 126937 Episode Num: 9859 Reward: -5.074439189069141 Epsilon: 0.3938961847215548
Total Timesteps: 126950 Episode Num: 9860 Reward: -7.492430083721869 Epsilon: 0.3938558487593541
Total Timesteps: 126962 Episode Num: 9861 Reward: -5.214629845105762 Epsilon: 0.39381862289503944
Total Timesteps: 126975 Episode Num: 9862 Reward: -5.315895420192965 Epsilon: 0.39377830281551485
Total Timesteps: 126988 Episode Num: 9863 Reward: -5.919891012380265 Epsilon: 0.39373799099127477
Total Timesteps: 127001 Episode Num: 9864 Reward: -6.140157178791141 Epsilon: 0.3936976874197841
Total Timesteps: 127014 Episode Num: 9865 Reward: -5.715611638571158 Epsilon: 0.3936573920985088
Total Timesteps: 127027 Episode Num: 9866 Reward: -9.461479171090332 Epsilon: 0.393617105024916
Total Timesteps: 127040 Episode Num: 9867 Reward: -5.608792670960951 Epsilon: 0.39357682619647355
Total Timesteps: 127053 Ep

Total Timesteps: 128017 Episode Num: 9943 Reward: -8.733274515509551 Epsilon: 0.39057312700656943
Total Timesteps: 128030 Episode Num: 9944 Reward: -5.789812471607218 Epsilon: 0.39053346871826916
Total Timesteps: 128043 Episode Num: 9945 Reward: -6.79899188151205 Epsilon: 0.3904938184828534
Total Timesteps: 128056 Episode Num: 9946 Reward: -5.744913398939438 Epsilon: 0.39045417629786966
Total Timesteps: 128069 Episode Num: 9947 Reward: -5.571978360597544 Epsilon: 0.3904145421608664
Total Timesteps: 128082 Episode Num: 9948 Reward: -7.615690642350202 Epsilon: 0.39037491606939306
Total Timesteps: 128094 Episode Num: 9949 Reward: -5.476739510717876 Epsilon: 0.3903383452776867
Total Timesteps: 128107 Episode Num: 9950 Reward: -7.168258502847774 Epsilon: 0.3902987346515023
Total Timesteps: 128120 Episode Num: 9951 Reward: -5.388792294242929 Epsilon: 0.3902591320636903
Total Timesteps: 128133 Episode Num: 9952 Reward: -5.59027786954314 Epsilon: 0.39021953751180416
Total Timesteps: 128146 Epi

Total Timesteps: 129096 Episode Num: 10027 Reward: -5.582163894470705 Epsilon: 0.38730866951725845
Total Timesteps: 129109 Episode Num: 10028 Reward: -6.18458367045551 Epsilon: 0.38726967136295687
Total Timesteps: 129122 Episode Num: 10029 Reward: -6.824081463895846 Epsilon: 0.38723068106132186
Total Timesteps: 129134 Episode Num: 10030 Reward: -6.537144301096084 Epsilon: 0.38719469698143016
Total Timesteps: 129147 Episode Num: 10031 Reward: -6.673338605871589 Epsilon: 0.3871557217744121
Total Timesteps: 129160 Episode Num: 10032 Reward: -4.896628881068805 Epsilon: 0.387116754413131
Total Timesteps: 129172 Episode Num: 10033 Reward: -5.615925742790553 Epsilon: 0.38708079150280245
Total Timesteps: 129184 Episode Num: 10034 Reward: -6.891249076692888 Epsilon: 0.38704483527371814
Total Timesteps: 129197 Episode Num: 10035 Reward: -4.7770299335234165 Epsilon: 0.3870058902296493
Total Timesteps: 129210 Episode Num: 10036 Reward: -5.854214712234377 Epsilon: 0.3869669530222119
Total Timesteps

Total Timesteps: 130159 Episode Num: 10109 Reward: -6.495810096130768 Epsilon: 0.3841455450641139
Total Timesteps: 130172 Episode Num: 10110 Reward: -5.817258876637119 Epsilon: 0.384107181267861
Total Timesteps: 130185 Episode Num: 10111 Reward: -7.828074404065535 Epsilon: 0.3840688251334639
Total Timesteps: 130198 Episode Num: 10112 Reward: -6.101513881976037 Epsilon: 0.3840304766586276
Total Timesteps: 130211 Episode Num: 10113 Reward: -6.605972808986084 Epsilon: 0.38399213584105796
Total Timesteps: 130223 Episode Num: 10114 Reward: -6.822779276490542 Epsilon: 0.3839567511115548
Total Timesteps: 130235 Episode Num: 10115 Reward: -5.992376031499288 Epsilon: 0.3839213729028295
Total Timesteps: 130247 Episode Num: 10116 Reward: -5.101125191087444 Epsilon: 0.38388600121307975
Total Timesteps: 130259 Episode Num: 10117 Reward: -4.628336706993578 Epsilon: 0.3838506360405039
Total Timesteps: 130272 Episode Num: 10118 Reward: -6.832048260423725 Epsilon: 0.3838123311225743
Total Timesteps: 13

Total Timesteps: 131239 Episode Num: 10193 Reward: -7.086496218364562 Epsilon: 0.3809843110660703
Total Timesteps: 131251 Episode Num: 10194 Reward: -5.62294629697005 Epsilon: 0.38094947848016397
Total Timesteps: 131264 Episode Num: 10195 Reward: -7.622686351180395 Epsilon: 0.38091175036567526
Total Timesteps: 131277 Episode Num: 10196 Reward: -8.925340685306027 Epsilon: 0.3808740297234093
Total Timesteps: 131290 Episode Num: 10197 Reward: -6.589075444110275 Epsilon: 0.3808363165511463
Total Timesteps: 131303 Episode Num: 10198 Reward: -5.882257539485858 Epsilon: 0.3807986108466676
Total Timesteps: 131316 Episode Num: 10199 Reward: -6.268945097405656 Epsilon: 0.38076091260775535
Total Timesteps: 131328 Episode Num: 10200 Reward: -5.489668059246401 Epsilon: 0.3807261208576998
Total Timesteps: 131341 Episode Num: 10201 Reward: -6.607819555429763 Epsilon: 0.3806884369694155
Total Timesteps: 131354 Episode Num: 10202 Reward: -6.8204283794188365 Epsilon: 0.38065076054021957
Total Timesteps:

Total Timesteps: 132316 Episode Num: 10277 Reward: -6.317329316045127 Epsilon: 0.3778832491913299
Total Timesteps: 132328 Episode Num: 10278 Reward: -5.83530725813399 Epsilon: 0.37784898131914635
Total Timesteps: 132341 Episode Num: 10279 Reward: -6.121852258313316 Epsilon: 0.3778118648038023
Total Timesteps: 132355 Episode Num: 10280 Reward: -6.633123251527948 Epsilon: 0.3777719013259794
Total Timesteps: 132368 Episode Num: 10281 Reward: -5.315074447257258 Epsilon: 0.37773479995164994
Total Timesteps: 132381 Episode Num: 10282 Reward: -6.303740944334436 Epsilon: 0.3776977058641346
Total Timesteps: 132394 Episode Num: 10283 Reward: -8.319749753553783 Epsilon: 0.3776606190612868
Total Timesteps: 132406 Episode Num: 10284 Reward: -5.646520240988018 Epsilon: 0.3776263915532529
Total Timesteps: 132418 Episode Num: 10285 Reward: -5.265429788441777 Epsilon: 0.37759217024875774
Total Timesteps: 132430 Episode Num: 10286 Reward: -8.08686364888352 Epsilon: 0.37755795514611495
Total Timesteps: 1

Total Timesteps: 133405 Episode Num: 10361 Reward: -7.233930375563308 Epsilon: 0.37479854578164234
Total Timesteps: 133418 Episode Num: 10362 Reward: -6.375389363139458 Epsilon: 0.37476202611341797
Total Timesteps: 133432 Episode Num: 10363 Reward: -7.171430032301666 Epsilon: 0.37472270519815337
Total Timesteps: 133445 Episode Num: 10364 Reward: -6.139776292965014 Epsilon: 0.3746862003072427
Total Timesteps: 133457 Episode Num: 10365 Reward: -7.284492639833694 Epsilon: 0.37465250979716314
Total Timesteps: 133470 Episode Num: 10366 Reward: -5.014758974771824 Epsilon: 0.3746160185809545
Total Timesteps: 133482 Episode Num: 10367 Reward: -5.743478456044352 Epsilon: 0.3745823406901305
Total Timesteps: 133495 Episode Num: 10368 Reward: -6.033802529129468 Epsilon: 0.3745458631409416
Total Timesteps: 133508 Episode Num: 10369 Reward: -7.869707138441029 Epsilon: 0.3745093926955688
Total Timesteps: 133521 Episode Num: 10370 Reward: -7.917280798239635 Epsilon: 0.37447292935193716
Total Timesteps

Total Timesteps: 134484 Episode Num: 10445 Reward: -7.17279664533107 Epsilon: 0.37179143987388835
Total Timesteps: 134497 Episode Num: 10446 Reward: -7.722848891041864 Epsilon: 0.37175550384023437
Total Timesteps: 134510 Episode Num: 10447 Reward: -5.917914722090823 Epsilon: 0.3717195747528065
Total Timesteps: 134523 Episode Num: 10448 Reward: -8.084980716934446 Epsilon: 0.3716836526095909
Total Timesteps: 134535 Episode Num: 10449 Reward: -5.975379077629489 Epsilon: 0.3716504998699223
Total Timesteps: 134548 Episode Num: 10450 Reward: -5.9565030784952295 Epsilon: 0.371614591075304
Total Timesteps: 134560 Episode Num: 10451 Reward: -8.643139413881114 Epsilon: 0.37158145065398335
Total Timesteps: 134573 Episode Num: 10452 Reward: -7.315247044927827 Epsilon: 0.3715455552005231
Total Timesteps: 134586 Episode Num: 10453 Reward: -5.56310799314059 Epsilon: 0.3715096666815271
Total Timesteps: 134598 Episode Num: 10454 Reward: -6.801533395711038 Epsilon: 0.37147654497095056
Total Timesteps: 1

Total Timesteps: 135544 Episode Num: 10527 Reward: -7.446471748905779 Epsilon: 0.36888390485746325
Total Timesteps: 135557 Episode Num: 10528 Reward: -6.512278640230646 Epsilon: 0.36884852866321916
Total Timesteps: 135570 Episode Num: 10529 Reward: -7.881667855624891 Epsilon: 0.36881315925352215
Total Timesteps: 135583 Episode Num: 10530 Reward: -9.474223973790174 Epsilon: 0.36877779662642074
Total Timesteps: 135595 Episode Num: 10531 Reward: -7.053283558519484 Epsilon: 0.3687451602197721
Total Timesteps: 135608 Episode Num: 10532 Reward: -7.0955178988811385 Epsilon: 0.36870981063064123
Total Timesteps: 135621 Episode Num: 10533 Reward: -7.8789046243612075 Epsilon: 0.3686744678184057
Total Timesteps: 135634 Episode Num: 10534 Reward: -4.6860858652607735 Epsilon: 0.36863913178111685
Total Timesteps: 135647 Episode Num: 10535 Reward: -6.976510430234378 Epsilon: 0.3686038025168268
Total Timesteps: 135659 Episode Num: 10536 Reward: -8.348848308192657 Epsilon: 0.3685711968981048
Total Times

Total Timesteps: 136620 Episode Num: 10611 Reward: -6.263020544646936 Epsilon: 0.36597862684819205
Total Timesteps: 136633 Episode Num: 10612 Reward: -7.553978696720763 Epsilon: 0.3659438056692014
Total Timesteps: 136646 Episode Num: 10613 Reward: -7.054424426962349 Epsilon: 0.3659089911157297
Total Timesteps: 136659 Episode Num: 10614 Reward: -5.71231186416162 Epsilon: 0.36587418318588605
Total Timesteps: 136672 Episode Num: 10615 Reward: -7.19680650163226 Epsilon: 0.36583938187778037
Total Timesteps: 136685 Episode Num: 10616 Reward: -5.165959174211309 Epsilon: 0.36580458718952336
Total Timesteps: 136698 Episode Num: 10617 Reward: -5.361879877456201 Epsilon: 0.3657697991192263
Total Timesteps: 136712 Episode Num: 10618 Reward: -6.444485592426255 Epsilon: 0.36573234244250685
Total Timesteps: 136725 Episode Num: 10619 Reward: -7.275160313354203 Epsilon: 0.36569756811117204
Total Timesteps: 136737 Episode Num: 10620 Reward: -5.127416998036934 Epsilon: 0.3656654745972195
Total Timesteps:

Total Timesteps: 137698 Episode Num: 10695 Reward: -5.479528564289135 Epsilon: 0.36311348022483986
Total Timesteps: 137710 Episode Num: 10696 Reward: -4.886947698329263 Epsilon: 0.3630818386464309
Total Timesteps: 137722 Episode Num: 10697 Reward: -5.967232065797589 Epsilon: 0.36305020258201304
Total Timesteps: 137735 Episode Num: 10698 Reward: -8.345745167289962 Epsilon: 0.36301593639960794
Total Timesteps: 137749 Episode Num: 10699 Reward: -6.99221348230571 Epsilon: 0.3629790415901386
Total Timesteps: 137762 Episode Num: 10700 Reward: -6.532644490609257 Epsilon: 0.36294478883872183
Total Timesteps: 137775 Episode Num: 10701 Reward: -5.611595470846175 Epsilon: 0.3629105425512611
Total Timesteps: 137788 Episode Num: 10702 Reward: -3.7730264571274836 Epsilon: 0.3628763027259268
Total Timesteps: 137801 Episode Num: 10703 Reward: -6.593241930619594 Epsilon: 0.36284206936089
Total Timesteps: 137814 Episode Num: 10704 Reward: -5.671147526788659 Epsilon: 0.3628078424543225
Total Timesteps: 1

Total Timesteps: 138777 Episode Num: 10779 Reward: -5.355440075231547 Epsilon: 0.3602902498252592
Total Timesteps: 138790 Episode Num: 10780 Reward: -6.399852844834525 Epsilon: 0.36025650262987247
Total Timesteps: 138803 Episode Num: 10781 Reward: -6.199302474163655 Epsilon: 0.3602227617558698
Total Timesteps: 138816 Episode Num: 10782 Reward: -6.807446015992582 Epsilon: 0.36018902720147533
Total Timesteps: 138829 Episode Num: 10783 Reward: -6.637258605846436 Epsilon: 0.36015529896491366
Total Timesteps: 138842 Episode Num: 10784 Reward: -6.437636146679414 Epsilon: 0.36012157704441017
Total Timesteps: 138855 Episode Num: 10785 Reward: -5.727169322986267 Epsilon: 0.36008786143819094
Total Timesteps: 138867 Episode Num: 10786 Reward: -4.169755483136331 Epsilon: 0.360056744943003
Total Timesteps: 138881 Episode Num: 10787 Reward: -7.3309490213707935 Epsilon: 0.3600204491615124
Total Timesteps: 138894 Episode Num: 10788 Reward: -5.097632308932828 Epsilon: 0.35998675248750844
Total Timestep

Total Timesteps: 139863 Episode Num: 10863 Reward: -5.575368695471493 Epsilon: 0.35749268927450434
Total Timesteps: 139877 Episode Num: 10864 Reward: -6.834899361489613 Epsilon: 0.35745690856967194
Total Timesteps: 139889 Episode Num: 10865 Reward: -4.264033352464117 Epsilon: 0.3574262450943248
Total Timesteps: 139902 Episode Num: 10866 Reward: -7.11137942937704 Epsilon: 0.35739303226544294
Total Timesteps: 139915 Episode Num: 10867 Reward: -6.654118778193077 Epsilon: 0.3573598256084051
Total Timesteps: 139928 Episode Num: 10868 Reward: -6.429245421157629 Epsilon: 0.35732662512149105
Total Timesteps: 139941 Episode Num: 10869 Reward: -4.868818061339004 Epsilon: 0.35729343080298126
Total Timesteps: 139954 Episode Num: 10870 Reward: -5.675481454506605 Epsilon: 0.3572602426511568
Total Timesteps: 139967 Episode Num: 10871 Reward: -4.745690709864927 Epsilon: 0.35722706066429943
Total Timesteps: 139980 Episode Num: 10872 Reward: -6.762678831661748 Epsilon: 0.35719388484069153
Total Timestep

Total Timesteps: 140925 Episode Num: 10945 Reward: -6.899858767510713 Epsilon: 0.3547986517651233
Total Timesteps: 140937 Episode Num: 10946 Reward: -3.68453348530055 Epsilon: 0.3547684426374905
Total Timesteps: 140950 Episode Num: 10947 Reward: -7.871939358466838 Epsilon: 0.35473572188719404
Total Timesteps: 140963 Episode Num: 10948 Reward: -6.54001703694639 Epsilon: 0.3547030071720948
Total Timesteps: 140975 Episode Num: 10949 Reward: -5.723203061341706 Epsilon: 0.35467281432878167
Total Timesteps: 140988 Episode Num: 10950 Reward: -5.361868168366001 Epsilon: 0.3546401112151389
Total Timesteps: 141001 Episode Num: 10951 Reward: -5.156238450727183 Epsilon: 0.3546074141318147
Total Timesteps: 141014 Episode Num: 10952 Reward: -7.353186704812698 Epsilon: 0.3545747230771413
Total Timesteps: 141027 Episode Num: 10953 Reward: -5.543167429172057 Epsilon: 0.3545420380494515
Total Timesteps: 141040 Episode Num: 10954 Reward: -5.510830366924172 Epsilon: 0.35450935904707886
Total Timesteps: 14

Total Timesteps: 142006 Episode Num: 11029 Reward: -4.528204933034962 Epsilon: 0.35209779868456265
Total Timesteps: 142019 Episode Num: 11030 Reward: -7.117564458022676 Epsilon: 0.3520655686915131
Total Timesteps: 142032 Episode Num: 11031 Reward: -8.057562239716072 Epsilon: 0.35203334459840036
Total Timesteps: 142044 Episode Num: 11032 Reward: -4.656974903850719 Epsilon: 0.35200360451691026
Total Timesteps: 142056 Episode Num: 11033 Reward: -5.833968950239168 Epsilon: 0.3519738694599313
Total Timesteps: 142069 Episode Num: 11034 Reward: -7.865304311253077 Epsilon: 0.351941662150082
Total Timesteps: 142081 Episode Num: 11035 Reward: -5.3641573987350855 Epsilon: 0.3519119375567458
Total Timesteps: 142094 Episode Num: 11036 Reward: -6.874417030436023 Epsilon: 0.3518797415795178
Total Timesteps: 142106 Episode Num: 11037 Reward: -4.6381625878875585 Epsilon: 0.35185002744430216
Total Timesteps: 142119 Episode Num: 11038 Reward: -5.087148108662321 Epsilon: 0.35181784279371514
Total Timestep

Total Timesteps: 143089 Episode Num: 11113 Reward: -6.427568322199648 Epsilon: 0.3494328704512576
Total Timesteps: 143102 Episode Num: 11114 Reward: -3.8099174459682725 Epsilon: 0.3494011264692317
Total Timesteps: 143115 Episode Num: 11115 Reward: -6.14428010335565 Epsilon: 0.34936938825420116
Total Timesteps: 143127 Episode Num: 11116 Reward: -4.397693605987945 Epsilon: 0.3493400965576027
Total Timesteps: 143140 Episode Num: 11117 Reward: -7.653023704605539 Epsilon: 0.3493083694285315
Total Timesteps: 143153 Episode Num: 11118 Reward: -4.528258887247985 Epsilon: 0.34927664806186387
Total Timesteps: 143166 Episode Num: 11119 Reward: -6.631474642098018 Epsilon: 0.34924493245603005
Total Timesteps: 143179 Episode Num: 11120 Reward: -6.630780202811668 Epsilon: 0.34921322260946086
Total Timesteps: 143192 Episode Num: 11121 Reward: -4.955140322593278 Epsilon: 0.34918151852058776
Total Timesteps: 143205 Episode Num: 11122 Reward: -6.974657764929608 Epsilon: 0.34914982018784263
Total Timestep

Total Timesteps: 144171 Episode Num: 11197 Reward: -6.673456321405149 Epsilon: 0.34681038489016514
Total Timesteps: 144183 Episode Num: 11198 Reward: -5.313576981361738 Epsilon: 0.3467815207063246
Total Timesteps: 144196 Episode Num: 11199 Reward: -6.4549629207165715 Epsilon: 0.3467502565951899
Total Timesteps: 144209 Episode Num: 11200 Reward: -6.580664723523136 Epsilon: 0.34671899812078305
Total Timesteps: 144222 Episode Num: 11201 Reward: -6.90914947682478 Epsilon: 0.3466877452815798
Total Timesteps: 144235 Episode Num: 11202 Reward: -4.667501950005122 Epsilon: 0.3466564980760564
Total Timesteps: 144248 Episode Num: 11203 Reward: -6.345572669729121 Epsilon: 0.3466252565026898
Total Timesteps: 144261 Episode Num: 11204 Reward: -6.498047571863953 Epsilon: 0.3465940205599573
Total Timesteps: 144274 Episode Num: 11205 Reward: -7.117159602311012 Epsilon: 0.34656279024633685
Total Timesteps: 144287 Episode Num: 11206 Reward: -5.31471519465411 Epsilon: 0.3465315655603069
Total Timesteps: 1

Total Timesteps: 145227 Episode Num: 11279 Reward: -5.1635719699866485 Epsilon: 0.3442885964731076
Total Timesteps: 145239 Episode Num: 11280 Reward: -4.9056764499322645 Epsilon: 0.3442601505105378
Total Timesteps: 145252 Episode Num: 11281 Reward: -5.6073258931480465 Epsilon: 0.3442293393550519
Total Timesteps: 145264 Episode Num: 11282 Reward: -6.780696373618399 Epsilon: 0.3442009031831699
Total Timesteps: 145277 Episode Num: 11283 Reward: -7.110565038238625 Epsilon: 0.3441701026315246
Total Timesteps: 145290 Episode Num: 11284 Reward: -6.119826505456292 Epsilon: 0.34413930759171313
Total Timesteps: 145302 Episode Num: 11285 Reward: -7.080904399115138 Epsilon: 0.3441108862919987
Total Timesteps: 145316 Episode Num: 11286 Reward: -7.550699549779443 Epsilon: 0.3440777340416747
Total Timesteps: 145329 Episode Num: 11287 Reward: -7.125812143029421 Epsilon: 0.3440469555284905
Total Timesteps: 145342 Episode Num: 11288 Reward: -6.53345311110061 Epsilon: 0.3440161825212258
Total Timesteps: 

Total Timesteps: 146305 Episode Num: 11363 Reward: -8.802839226191354 Epsilon: 0.3417518198284406
Total Timesteps: 146318 Episode Num: 11364 Reward: -5.062144468798977 Epsilon: 0.3417214560067798
Total Timesteps: 146331 Episode Num: 11365 Reward: -6.206386058745937 Epsilon: 0.34169109758014365
Total Timesteps: 146344 Episode Num: 11366 Reward: -5.814115956081637 Epsilon: 0.3416607445470945
Total Timesteps: 146356 Episode Num: 11367 Reward: -5.662849425374027 Epsilon: 0.3416327311487059
Total Timesteps: 146369 Episode Num: 11368 Reward: -5.94216055454806 Epsilon: 0.3416023884839003
Total Timesteps: 146382 Episode Num: 11369 Reward: -6.06405462985971 Epsilon: 0.34157205120848194
Total Timesteps: 146395 Episode Num: 11370 Reward: -7.1844119757290885 Epsilon: 0.34154171932101507
Total Timesteps: 146407 Episode Num: 11371 Reward: -4.6679338315328085 Epsilon: 0.3415137254366253
Total Timesteps: 146420 Episode Num: 11372 Reward: -5.280972975984507 Epsilon: 0.3414834039065701
Total Timesteps: 

Total Timesteps: 147383 Episode Num: 11447 Reward: -6.900443945344984 Epsilon: 0.339252152554908
Total Timesteps: 147396 Episode Num: 11448 Reward: -6.7118333522602205 Epsilon: 0.3392222312681484
Total Timesteps: 147410 Episode Num: 11449 Reward: -8.848926509202483 Epsilon: 0.3391900142459806
Total Timesteps: 147423 Episode Num: 11450 Reward: -7.057869135699258 Epsilon: 0.33916010391865586
Total Timesteps: 147435 Episode Num: 11451 Reward: -4.516609235613146 Epsilon: 0.33913249906738563
Total Timesteps: 147448 Episode Num: 11452 Reward: -5.42249870923971 Epsilon: 0.3391025988823178
Total Timesteps: 147460 Episode Num: 11453 Reward: -6.05237759392975 Epsilon: 0.33907500339075003
Total Timesteps: 147472 Episode Num: 11454 Reward: -4.928314901987229 Epsilon: 0.33904741239014863
Total Timesteps: 147485 Episode Num: 11455 Reward: -8.08446164328485 Epsilon: 0.3390175272061566
Total Timesteps: 147498 Episode Num: 11456 Reward: -5.779164281348133 Epsilon: 0.33898764729013275
Total Timesteps: 1

Total Timesteps: 148462 Episode Num: 11531 Reward: -7.883490474592103 Epsilon: 0.33678651776212093
Total Timesteps: 148476 Episode Num: 11532 Reward: -6.880908181643764 Epsilon: 0.3367547617123306
Total Timesteps: 148490 Episode Num: 11533 Reward: -6.29303869619942 Epsilon: 0.3367230116506162
Total Timesteps: 148502 Episode Num: 11534 Reward: -4.941365679135667 Epsilon: 0.3366958020767397
Total Timesteps: 148515 Episode Num: 11535 Reward: -5.840921883317822 Epsilon: 0.3366663300003367
Total Timesteps: 148527 Episode Num: 11536 Reward: -6.526970370607385 Epsilon: 0.33663912958586656
Total Timesteps: 148540 Episode Num: 11537 Reward: -6.604602871148013 Epsilon: 0.3366096674296486
Total Timesteps: 148553 Episode Num: 11538 Reward: -5.431138691237709 Epsilon: 0.33658021042994757
Total Timesteps: 148566 Episode Num: 11539 Reward: -4.604654527041424 Epsilon: 0.33655075858540984
Total Timesteps: 148579 Episode Num: 11540 Reward: -5.788229224570577 Epsilon: 0.3365213118946823
Total Timesteps: 

Total Timesteps: 149547 Episode Num: 11615 Reward: -5.2905647295655935 Epsilon: 0.3343430493423472
Total Timesteps: 149560 Episode Num: 11616 Reward: -7.221984755259781 Epsilon: 0.33431398769724524
Total Timesteps: 149573 Episode Num: 11617 Reward: -4.749765019269414 Epsilon: 0.3342849311038757
Total Timesteps: 149586 Episode Num: 11618 Reward: -6.9402067248699275 Epsilon: 0.33425587956092145
Total Timesteps: 149599 Episode Num: 11619 Reward: -5.521210580386098 Epsilon: 0.33422683306706596
Total Timesteps: 149611 Episode Num: 11620 Reward: -5.345189433808126 Epsilon: 0.3342000253992019
Total Timesteps: 149624 Episode Num: 11621 Reward: -5.606979885892105 Epsilon: 0.3341709886114527
Total Timesteps: 149637 Episode Num: 11622 Reward: -6.407178718600665 Epsilon: 0.3341419568689562
Total Timesteps: 149650 Episode Num: 11623 Reward: -6.976647323592957 Epsilon: 0.3341129301703976
Total Timesteps: 149663 Episode Num: 11624 Reward: -6.993823023062259 Epsilon: 0.3340839085144625
Total Timesteps

Total Timesteps: 150600 Episode Num: 11697 Reward: -6.862482707989875 Epsilon: 0.33200531208499334
Total Timesteps: 150613 Episode Num: 11698 Reward: -4.526975798660205 Epsilon: 0.33197665540159216
Total Timesteps: 150626 Episode Num: 11699 Reward: -5.407855517466312 Epsilon: 0.33194800366470595
Total Timesteps: 150639 Episode Num: 11700 Reward: -5.564642092549983 Epsilon: 0.3319193568730541
Total Timesteps: 150652 Episode Num: 11701 Reward: -6.952579662370655 Epsilon: 0.33189071502535644
Total Timesteps: 150665 Episode Num: 11702 Reward: -4.777241459299305 Epsilon: 0.3318620781203332
Total Timesteps: 150677 Episode Num: 11703 Reward: -5.525617236369897 Epsilon: 0.3318356484400406
Total Timesteps: 150690 Episode Num: 11704 Reward: -7.105210387346189 Epsilon: 0.3318070210365651
Total Timesteps: 150703 Episode Num: 11705 Reward: -6.345660331056214 Epsilon: 0.3317783985720258
Total Timesteps: 150716 Episode Num: 11706 Reward: -6.904712885855293 Epsilon: 0.3317497810451445
Total Timesteps:

Total Timesteps: 151685 Episode Num: 11781 Reward: -5.236118216110887 Epsilon: 0.32963048422718133
Total Timesteps: 151698 Episode Num: 11782 Reward: -7.417334036143537 Epsilon: 0.3296022360215692
Total Timesteps: 151710 Episode Num: 11783 Reward: -5.562037155115277 Epsilon: 0.3295761650517435
Total Timesteps: 151723 Episode Num: 11784 Reward: -7.348672856753594 Epsilon: 0.3295479261549007
Total Timesteps: 151736 Episode Num: 11785 Reward: -4.519434836783642 Epsilon: 0.3295196920967997
Total Timesteps: 151749 Episode Num: 11786 Reward: -7.731979401797352 Epsilon: 0.32949146287619685
Total Timesteps: 151762 Episode Num: 11787 Reward: -6.504484302301267 Epsilon: 0.32946323849184905
Total Timesteps: 151775 Episode Num: 11788 Reward: -5.307440797621149 Epsilon: 0.3294350189425136
Total Timesteps: 151789 Episode Num: 11789 Reward: -5.437333862159148 Epsilon: 0.329404634064392
Total Timesteps: 151801 Episode Num: 11790 Reward: -4.153818112779834 Epsilon: 0.32937859434391076
Total Timesteps: 

Total Timesteps: 152765 Episode Num: 11865 Reward: -5.585878987526235 Epsilon: 0.32730010146303146
Total Timesteps: 152779 Episode Num: 11866 Reward: -7.4003083842758475 Epsilon: 0.3272701091118544
Total Timesteps: 152792 Episode Num: 11867 Reward: -6.7424512991153795 Epsilon: 0.3272422639928792
Total Timesteps: 152805 Episode Num: 11868 Reward: -5.620793535092468 Epsilon: 0.3272144236117928
Total Timesteps: 152818 Episode Num: 11869 Reward: -7.273250316683903 Epsilon: 0.32718658796738603
Total Timesteps: 152831 Episode Num: 11870 Reward: -6.775242305611878 Epsilon: 0.32715875705845016
Total Timesteps: 152844 Episode Num: 11871 Reward: -7.868825672094972 Epsilon: 0.3271309308837769
Total Timesteps: 152857 Episode Num: 11872 Reward: -7.008716921189482 Epsilon: 0.32710310944215837
Total Timesteps: 152869 Episode Num: 11873 Reward: -5.85914054158066 Epsilon: 0.32707743231132536
Total Timesteps: 152882 Episode Num: 11874 Reward: -4.842301077457784 Epsilon: 0.3270496199683416
Total Timestep

## The inference policy function

In [None]:
def evaluate_final_policy(policy,random = 1, eval_episodes=1):
  """Roll out `policy` on the global `env` and report the mean total route length.

  For each evaluation episode the environment is reset with the given seed,
  the policy is followed until the environment signals `done`, and the
  Euclidean length of every route in `env.routes` is summed.

  Args:
    policy: object exposing `select_action(obs, state_dim)` that returns an
      `(action, q_value)` pair; only the action is used here.
    random: seed forwarded to `env.reset(seed=...)`. Every episode reuses the
      same seed. Note that `VRPEnv.reset` treats the value 200 as "seed from
      the current time".
    eval_episodes: number of rollouts to average over (must be >= 1).

  Returns:
    The total route length averaged over `eval_episodes`, as a float.
  """
  distance = 0.
  for _episode in range(eval_episodes):
    obs = env.reset(seed = random)
    done = False
    while not done:
      # The Q-value estimate and the step reward are not needed for the
      # distance metric, so they are discarded.
      action, _q_value = policy.select_action(obs, state_dim)
      obs, _reward, done = env.step(action)
    total_distance = 0.
    # assumes each route is a sequence of node indices into env.VRP, whose
    # columns 0 and 1 hold the x and y coordinates -- TODO confirm against step()
    for route in env.routes:
      for start, end in zip(route, route[1:]):
        # Use the exact leg length. Coordinates are drawn from [0, 1), so
        # flooring each leg (as was done before) turned almost every leg
        # into 0 and made the reported distance meaningless.
        total_distance += math.hypot(env.VRP[start,0]-env.VRP[end,0],
                                     env.VRP[start,1]-env.VRP[end,1])
    distance += total_distance
  distance /= eval_episodes
  print ("---------------------------------------")
  print ("Average Distance over the Evaluation Step: %f" % (distance))
  print ("---------------------------------------")
  return distance

## Let's see if we can test the implementation

In [None]:
# One slot per evaluated instance; instance k is generated with seed k.
results = np.zeros(1)
# Restore the saved weights before running inference
# (file_name is defined in an earlier cell).
policy.load(file_name, './pytorch_models/')
for i in range(len(results)):
  # Keep the single-element list form so `evaluations` stays bound as before.
  evaluations = [evaluate_final_policy(policy, random=i)]
  results[i] = evaluations[-1]
print(results)