In [1]:
import importlib.util
import scipy.misc
import time

import tensorflow as tf
import numpy as np

from tqdm import trange
from IPython.display import HTML

#Import the vizdoom package as "vd" since it can't be installed normally on Windows

vd_location = 'C:/Anaconda3/envs/doom/Lib/site-packages/vizdoom/vizdoom.pyd'
vizdoom = importlib.util.spec_from_file_location('vizdoom',
                                                 vd_location)
vd = importlib.util.module_from_spec(vizdoom)
vizdoom.loader.exec_module(vd)


In [2]:
#Specify the game scenario and the screen format/resolution

game = vd.DoomGame()
game.set_screen_format(vd.ScreenFormat.BGR24)
game.set_depth_buffer_enabled(True)
game.set_screen_resolution(vd.ScreenResolution.RES_160X120)
game.load_config('deadly_corridor.cfg')

down_sample_ratio = 0.5
width = int(game.get_screen_width()*down_sample_ratio)
height = int(game.get_screen_height()*down_sample_ratio)
channels = game.get_screen_channels() + 1

#Specify the available actions in the scenario

available_actions = game.get_available_buttons()
actions = [list(ohe) for ohe in list(np.identity(len(available_actions)))]
num_actions = len(available_actions)

#Specify the Q-network learning parameters

frame_delay = 12
buffer_size = 50000
epochs = 400
steps_per_epoch = 2000
learning_rate = 0.005
gamma = 0
start_epsilon = 1.0
end_epsilon = 0.1
batch_size = 100
load_model = False
save_model = True
model_dir = './checkpoints/deadly_corridor.ckpt'


In [3]:
#Create a buffer object that holds a set of training experiences (state-action-reward tuples)

class Buffer():
    def __init__(self, size=1000):
        self.buffer = list()
        self.length = len(self.buffer)
        self.size = size
        
#Add a new experience to the buffer (remove the oldest experience if the buffer is already full)
        
    def add_experience(self, experience):
        if self.length + 1 >= self.size:
            self.buffer[0:(self.length + 1) - self.size] = []
        
        self.buffer.append(experience)
        self.length = len(self.buffer)
            
#Return a batch of experience arrays randomly sampled from the buffer
            
    def sample_buffer(self, sample_size):
        sample = np.random.randint(self.length, size=sample_size)
        s1 = np.concatenate([self.buffer[idx][0] for idx in sample], axis=0)
        a = np.array([self.buffer[idx][1] for idx in sample])
        r = np.array([self.buffer[idx][2] for idx in sample])
        s2 = np.concatenate([self.buffer[idx][3] for idx in sample], axis=0)
        terminal = np.array([self.buffer[idx][4] for idx in sample], dtype=np.int32)
        
        return s1, a, r, s2, terminal

#Downsample and normalize an image array representing the game state at a given time stamp

def preprocess(image, down_sample_ratio=1):
    if down_sample_ratio != 1:
        image = scipy.misc.imresize(image, down_sample_ratio)
    image = image.astype(np.float32)
    image /= 255.0
    image = np.expand_dims(image, axis=0)
    
    return image

#Test the agent using a currently training or previously trained model

def test_agent(model, num_episodes, load_model, training=True, session=None, model_dir=None):
    if load_model == True:
        sess = tf.Session()
        print('Loading model from', model_dir)
        tf.train.Saver().restore(sess, model_dir)
        
#Require an existing session if a pretrained model isn't provided
        
    elif load_model == False:
        sess = session

    game.set_sound_enabled(True)
    episode_rewards = list()
    
#Avoid reinitializing the game if this was already done by the training process
    
    if training == False:
        game.init()

    for i in range(num_episodes):
        game.new_episode()
    
        while not game.is_episode_finished():
            state = game.get_state()
            
            if game.is_depth_buffer_enabled() == False:
                depth_buffer = np.zeros(state.screen_buffer[:, :, :1].shape)
            else:
                depth_buffer = np.expand_dims(state.depth_buffer, axis=2)
                
            buffer = np.concatenate((state.screen_buffer, depth_buffer), axis=2)
            state1 = preprocess(buffer, down_sample_ratio)
            action = model.choose_action(sess, state1)[0]
            reward = game.make_action(actions[action])
            
#Add a delay between each time step so that the episodes occur at normal speed

            time.sleep(0.02)
        
        episode_rewards.append(game.get_total_reward())
        print('Test Episode {} Reward: {}'.format(i + 1, game.get_total_reward()))
        time.sleep(1)
    
#Avoid ending the game so that the training process can continue
    
    if training == False:
        game.close()
    
    return np.mean(episode_rewards)


In [4]:
#Create a Q-network to estimate values and choose actions for a given state

class Q_network():
    def __init__(self, network_name, height, width, channels, learning_rate=0.001):
        self.learning_rate = learning_rate
        self.s_t = tf.placeholder(tf.float32,
                                  shape=[None, height, width, channels],
                                  name=network_name + '_state'
                                 )
        self.a_t = tf.placeholder(tf.int32,
                                  shape=[None],
                                  name=network_name + '_action'
                                 )
        self.Q_target = tf.placeholder(tf.float32,
                                       shape=[None, num_actions],
                                       name=network_name + '_Q_target'
                                      )

        self.input_layer = tf.reshape(self.s_t,
                                      [-1, height, width, channels],
                                      name=network_name + '_input_layer'
                                     )
        self.conv1 = tf.layers.conv2d(inputs=self.input_layer,
                                      filters=32,
                                      kernel_size=[8, 8],
                                      strides=[4, 4],
                                      padding='valid',
                                      activation=tf.nn.relu,
                                      name=network_name + '_conv1_layer'
                                     )
        self.conv2 = tf.layers.conv2d(inputs=self.conv1,
                                      filters=64,
                                      kernel_size=[4, 4],
                                      strides=[2, 2],
                                      padding='valid',
                                      activation=tf.nn.relu,
                                      name=network_name + '_conv2_layer'
                                     )
        self.flatten = tf.reshape(self.conv2,
                                  [-1, 6*8*64],
                                  name=network_name + '_flatten'
                                 )
        self.dense = tf.layers.dense(inputs=self.flatten,
                                      units=512,
                                      activation=tf.nn.relu,
                                      name=network_name + '_dense1_layer'
                                    )
        self.Q_values = tf.layers.dense(inputs=self.dense,
                                        units=len(actions),
                                        activation=None,
                                        name=network_name + '_output_layer'
                                       )        
    
        self.best_action = tf.argmax(self.Q_values, 1)
        self.loss = tf.losses.mean_squared_error(self.Q_values,
                                                 self.Q_target)
        self.adam = tf.train.AdamOptimizer(learning_rate=self.learning_rate,
                                           name=network_name + '_adam'
                                          )
        self.train = self.adam.minimize(self.loss)
        
    def update_lr(self, epoch):
        self.learning_rate = 0.98*self.learning_rate

    def calculate_loss(self, session, s, q):
        L, _ = session.run([self.loss, self.train],
                           feed_dict={self.s_t: s,
                                      self.Q_target: q})
    
        return L

#Return the array of Q-values and the best action associated with a given state

    def get_Q_values(self, session, s):
        Q = session.run(self.Q_values,
                        feed_dict={self.s_t: s})

        return Q
    
    def choose_action(self, session, s):
        a = session.run(self.best_action,
                        feed_dict={self.s_t: s})
    
        return a
    
#Create a list of variable update operations

def update_graph(variables):
    update_ops = list()
    
#Assign weight values from the network created first to the one created second
    
    for idx, variable in enumerate(variables[:len(variables)//2]):
        op = variable.assign(variables[idx + len(variables)//2].value())
        update_ops.append(op)
    
    return update_ops

#Update the target network parameters to match those of the online network

def update_target(ops, session):
    for op in update_ops:
        session.run(op)


In [5]:
#For each time step, collect the following data:
#The current game state
#The action that was taken taken
#The reward obtained from the chosen action
#The next game state (store the first game state if the previous action ends the episode)
#A variable indicating whether the episode is over yet


tf.reset_default_graph()

#Instantiate the target network before the online network so that it's updated correctly

target_net = Q_network(network_name='target',
                       learning_rate=learning_rate,
                       height=height,
                       width=width,
                       channels=channels)
DQN = Q_network(network_name='online',
                learning_rate=learning_rate,
                height=height,
                width=width,
                channels=channels)

exp_buffer = Buffer(size=buffer_size)
session = tf.Session()
saver = tf.train.Saver()
weights = tf.trainable_variables()

update_ops = update_graph(weights)

if load_model == True:
    print('Loading model from', model_dir)
    tf.train.Saver().restore(session, model_dir)
    
elif load_model == False:
    session.run(tf.global_variables_initializer())

game.set_sound_enabled(False)
game.init()

t = 0
epoch_rank = list()

#Accumulate experiences in the buffer using an epsilon-greedy strategy with three training phases

for epoch in range(epochs):
    epoch_rewards = list()
    
    for step in trange(steps_per_epoch, leave=True):
        experience = list()
        game.new_episode()
        
        while not game.is_episode_finished():
            state = game.get_state()
            
#Substitute an array of zeros for the depth buffer if that setting is disabled

            if game.is_depth_buffer_enabled() == False:
                depth_buffer = np.zeros(state.screen_buffer[:, :, :1].shape)
            else:
                depth_buffer = np.expand_dims(state.depth_buffer, axis=2)
            
            state1 = preprocess(np.concatenate((state.screen_buffer,
                                                depth_buffer),
                                                axis=2),
                                               down_sample_ratio)
            
#Explore the environment by choosing random actions with 100% probability for the first phase of training

            if epoch < 0.3*epochs:
                action = np.random.randint(num_actions)
            
#Increase the probability of greedily choosing an action by a constant amount at each epoch in the second phase
            
            elif epoch < 0.9*epochs:
                epsilon = start_epsilon - (epoch + 1 - 0.2*epochs)*(start_epsilon-end_epsilon)/(0.7*epochs)
            
                if np.random.uniform(0, 1) <= epsilon:
                    action = np.random.randint(num_actions)
                else:
                    action = DQN.choose_action(session, state1)[0]

#Select a random action with 10% probability in the final phase of training
                
            else:
                if np.random.uniform(0, 1) <= end_epsilon:
                    action = np.random.randint(num_actions)
                else:
                    action = DQN.choose_action(session, state1)[0]

            reward = game.make_action(actions[action], frame_delay)
            done = game.is_episode_finished()
            
            if done == False:
                state = game.get_state()
                state2 = preprocess(np.concatenate((state.screen_buffer,
                                                    depth_buffer),
                                                    axis=2),
                                                    down_sample_ratio)
            elif done == True:
                state2 = state1
        
#Add the experience obtained from each time step to the buffer

            t += 1
            exp_buffer.add_experience((state1, action, reward, state2, done))
        
#Sample a minibatch from the buffer if there are enough experiences in the buffer

        if exp_buffer.length > batch_size:
            s1, a, r, s2, terminal = exp_buffer.sample_buffer(batch_size)
            
#Get the target values from the target Q-network
            
            target_Q = np.max(target_net.get_Q_values(session, s2), axis=1)
            
#Train the online Q-network by using a minibatch to update the action-value function
            
            Q2 = DQN.get_Q_values(session, s1)
            Q2[np.arange(batch_size), a] = r + gamma*(1 - terminal)*target_Q
            DQN.calculate_loss(session, s1, Q2)
            
        epoch_rewards.append(game.get_total_reward())
        
#Increase the discount factor at each epoch until it reaches 0.99
    
    if gamma < 0.99:
        gamma = 1-.98*(1-gamma)
    elif gamma >= 0.99:
        gamma = 0.99
        
#Decrease the learning rate at each epoch

    DQN.update_lr(epoch)
    target_net.update_lr(epoch)
    
    print('Epoch {} Mean Reward: {}'.format(epoch + 1, np.mean(epoch_rewards)))
    
#Save the model, update the target network, and test the agent for 10 episodes every 10 epochs
    
    if (epoch + 1) % 10 == 0 and epoch > 0:
        if save_model == True:
            checkpoint = model_dir + '-' + str(epoch + 1)
            print('Epoch {} Model saved to {}'.format(epoch + 1, model_dir))
            saver.save(session, model_dir, global_step=epoch + 1)
            
        update_target(update_ops, session)

        print('Epoch {} test:'.format(epoch + 1))
        test_reward = test_agent(DQN, num_episodes=10,
                                 training=True,
                                 load_model=False,
                                 session=session,
                                 model_dir=model_dir)
        print('Average Test Reward:', test_reward)
        epoch_rank.append((test_reward, epoch + 1))
        
#Return a sorted list of epoch checkpoints based on average test episode reward
        
print(sorted(epoch_rank, reverse=True))
print('{} time steps experienced during training'.format(t))
game.close()
    

100%|██████████| 2000/2000 [04:56<00:00,  6.75it/s]


Epoch 1 Mean Reward: 230.66210343933105


100%|██████████| 2000/2000 [03:32<00:00,  9.43it/s]


Epoch 2 Mean Reward: 231.4806551361084


100%|██████████| 2000/2000 [04:59<00:00,  6.68it/s]


Epoch 3 Mean Reward: 231.87222441101073


100%|██████████| 2000/2000 [04:44<00:00,  7.04it/s]


Epoch 4 Mean Reward: 230.3137645339966


100%|██████████| 2000/2000 [04:24<00:00,  7.57it/s]


Epoch 5 Mean Reward: 232.65606282043456


100%|██████████| 2000/2000 [04:07<00:00,  8.10it/s]


Epoch 6 Mean Reward: 230.26709364318847


100%|██████████| 2000/2000 [04:06<00:00,  8.12it/s]


Epoch 7 Mean Reward: 229.22575656890868


100%|██████████| 2000/2000 [04:09<00:00,  8.03it/s]


Epoch 8 Mean Reward: 230.29229149627685


100%|██████████| 2000/2000 [04:18<00:00,  7.73it/s]


Epoch 9 Mean Reward: 232.1329535293579


100%|██████████| 2000/2000 [03:58<00:00,  8.38it/s]


Epoch 10 Mean Reward: 231.42750119018555
Epoch 10 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 10 test:
Test Episode 1 Reward: 371.37986755371094
Test Episode 2 Reward: 333.98387145996094
Test Episode 3 Reward: 371.37986755371094
Test Episode 4 Reward: 371.37986755371094
Test Episode 5 Reward: 371.37986755371094
Test Episode 6 Reward: 365.4778594970703
Test Episode 7 Reward: 371.37986755371094
Test Episode 8 Reward: 357.3772735595703
Test Episode 9 Reward: 371.37986755371094
Test Episode 10 Reward: 371.37986755371094
Average Test Reward: 365.649807739


100%|██████████| 2000/2000 [04:23<00:00,  7.60it/s]


Epoch 11 Mean Reward: 231.45031442260742


100%|██████████| 2000/2000 [04:13<00:00,  7.87it/s]


Epoch 12 Mean Reward: 231.54884819793702


100%|██████████| 2000/2000 [03:56<00:00,  8.44it/s]


Epoch 13 Mean Reward: 232.9632762527466


100%|██████████| 2000/2000 [03:38<00:00,  9.15it/s]


Epoch 14 Mean Reward: 231.78624807739257


100%|██████████| 2000/2000 [03:50<00:00,  8.68it/s]


Epoch 15 Mean Reward: 231.6035881881714


100%|██████████| 2000/2000 [03:59<00:00,  8.35it/s]


Epoch 16 Mean Reward: 232.9123680114746


100%|██████████| 2000/2000 [03:11<00:00, 10.45it/s]


Epoch 17 Mean Reward: 233.1254228439331


100%|██████████| 2000/2000 [03:41<00:00,  9.03it/s]


Epoch 18 Mean Reward: 231.92283963775634


100%|██████████| 2000/2000 [03:34<00:00,  9.33it/s]


Epoch 19 Mean Reward: 230.2620124053955


100%|██████████| 2000/2000 [02:53<00:00, 11.53it/s]


Epoch 20 Mean Reward: 232.26358936309813
Epoch 20 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 20 test:
Test Episode 1 Reward: 428.28880310058594
Test Episode 2 Reward: 337.0444641113281
Test Episode 3 Reward: 394.17149353027344
Test Episode 4 Reward: 363.3585205078125
Test Episode 5 Reward: 394.17149353027344
Test Episode 6 Reward: 394.17149353027344
Test Episode 7 Reward: 394.17149353027344
Test Episode 8 Reward: 608.2017517089844
Test Episode 9 Reward: 334.109619140625
Test Episode 10 Reward: 394.17149353027344
Average Test Reward: 404.186062622


100%|██████████| 2000/2000 [03:01<00:00, 11.02it/s]


Epoch 21 Mean Reward: 234.00993001556395


100%|██████████| 2000/2000 [03:19<00:00, 10.02it/s]


Epoch 22 Mean Reward: 233.95630574035644


100%|██████████| 2000/2000 [02:57<00:00, 11.24it/s]


Epoch 23 Mean Reward: 230.05746424102784


100%|██████████| 2000/2000 [02:39<00:00, 12.53it/s]


Epoch 24 Mean Reward: 229.56480675506592


100%|██████████| 2000/2000 [03:02<00:00, 10.97it/s]


Epoch 25 Mean Reward: 231.49874757385254


100%|██████████| 2000/2000 [02:58<00:00, 11.21it/s]


Epoch 26 Mean Reward: 228.94968212890626


100%|██████████| 2000/2000 [03:06<00:00, 10.75it/s]


Epoch 27 Mean Reward: 231.92166245269775


100%|██████████| 2000/2000 [02:40<00:00, 12.49it/s]


Epoch 28 Mean Reward: 233.32423358154296


100%|██████████| 2000/2000 [02:46<00:00, 11.98it/s]


Epoch 29 Mean Reward: 231.52247505950928


100%|██████████| 2000/2000 [02:39<00:00, 12.50it/s]


Epoch 30 Mean Reward: 231.07545096588134
Epoch 30 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 30 test:
Test Episode 1 Reward: 388.2373352050781
Test Episode 2 Reward: 388.2373352050781
Test Episode 3 Reward: 388.2373352050781
Test Episode 4 Reward: 340.6838684082031
Test Episode 5 Reward: 367.7684631347656
Test Episode 6 Reward: 355.33363342285156
Test Episode 7 Reward: 388.2373352050781
Test Episode 8 Reward: 362.97218322753906
Test Episode 9 Reward: 608.3450012207031
Test Episode 10 Reward: 605.4576416015625
Average Test Reward: 419.351013184


100%|██████████| 2000/2000 [02:40<00:00, 12.48it/s]


Epoch 31 Mean Reward: 232.2132400970459


100%|██████████| 2000/2000 [02:40<00:00, 12.46it/s]


Epoch 32 Mean Reward: 232.0777030029297


100%|██████████| 2000/2000 [02:41<00:00, 12.37it/s]


Epoch 33 Mean Reward: 231.77355780792237


100%|██████████| 2000/2000 [02:37<00:00, 12.69it/s]


Epoch 34 Mean Reward: 232.20638486480712


100%|██████████| 2000/2000 [02:36<00:00, 12.77it/s]


Epoch 35 Mean Reward: 230.98769086456298


100%|██████████| 2000/2000 [02:29<00:00, 13.36it/s]


Epoch 36 Mean Reward: 230.3384165649414


100%|██████████| 2000/2000 [02:37<00:00, 12.70it/s]


Epoch 37 Mean Reward: 230.45450436401367


100%|██████████| 2000/2000 [02:24<00:00, 13.81it/s]


Epoch 38 Mean Reward: 231.86954176330568


100%|██████████| 2000/2000 [02:35<00:00, 12.88it/s]


Epoch 39 Mean Reward: 231.94967932891845


100%|██████████| 2000/2000 [02:11<00:00, 15.26it/s]


Epoch 40 Mean Reward: 230.94012596130372
Epoch 40 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 40 test:
Test Episode 1 Reward: 621.7326965332031
Test Episode 2 Reward: 681.0916442871094
Test Episode 3 Reward: 438.52232360839844
Test Episode 4 Reward: 540.7571411132812
Test Episode 5 Reward: 327.7820129394531
Test Episode 6 Reward: 482.25389099121094
Test Episode 7 Reward: 482.25389099121094
Test Episode 8 Reward: 482.25389099121094
Test Episode 9 Reward: 396.35121154785156
Test Episode 10 Reward: 482.25389099121094
Average Test Reward: 493.525259399


100%|██████████| 2000/2000 [02:19<00:00, 14.33it/s]


Epoch 41 Mean Reward: 231.34238191223145


100%|██████████| 2000/2000 [02:18<00:00, 14.41it/s]


Epoch 42 Mean Reward: 232.8697253265381


100%|██████████| 2000/2000 [02:18<00:00, 14.43it/s]


Epoch 43 Mean Reward: 230.3008073501587


100%|██████████| 2000/2000 [02:14<00:00, 14.87it/s]


Epoch 44 Mean Reward: 234.30252005767824


100%|██████████| 2000/2000 [02:01<00:00, 16.42it/s]


Epoch 45 Mean Reward: 230.7289229888916


100%|██████████| 2000/2000 [01:59<00:00, 16.74it/s]


Epoch 46 Mean Reward: 232.80865751647949


100%|██████████| 2000/2000 [02:02<00:00, 16.28it/s]


Epoch 47 Mean Reward: 232.70496630096434


100%|██████████| 2000/2000 [01:59<00:00, 16.69it/s]


Epoch 48 Mean Reward: 232.0334410018921


100%|██████████| 2000/2000 [02:03<00:00, 16.20it/s]


Epoch 49 Mean Reward: 230.85882691192626


100%|██████████| 2000/2000 [02:11<00:00, 15.21it/s]


Epoch 50 Mean Reward: 233.54492185211183
Epoch 50 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 50 test:
Test Episode 1 Reward: 357.19207763671875
Test Episode 2 Reward: 686.6470489501953
Test Episode 3 Reward: 357.19207763671875
Test Episode 4 Reward: 375.4600830078125
Test Episode 5 Reward: 634.2671051025391
Test Episode 6 Reward: 595.9880218505859
Test Episode 7 Reward: 543.3281555175781
Test Episode 8 Reward: 602.7923889160156
Test Episode 9 Reward: 359.7574920654297
Test Episode 10 Reward: 357.19207763671875
Average Test Reward: 486.981652832


100%|██████████| 2000/2000 [01:58<00:00, 16.87it/s]


Epoch 51 Mean Reward: 229.42836208343505


100%|██████████| 2000/2000 [01:56<00:00, 17.24it/s]


Epoch 52 Mean Reward: 230.85003554534913


100%|██████████| 2000/2000 [01:58<00:00, 16.81it/s]


Epoch 53 Mean Reward: 230.2575609664917


100%|██████████| 2000/2000 [02:01<00:00, 16.44it/s]


Epoch 54 Mean Reward: 230.15036278533935


100%|██████████| 2000/2000 [02:07<00:00, 15.68it/s]


Epoch 55 Mean Reward: 229.96632453918457


100%|██████████| 2000/2000 [02:00<00:00, 16.61it/s]


Epoch 56 Mean Reward: 231.13958375549316


100%|██████████| 2000/2000 [01:57<00:00, 17.07it/s]


Epoch 57 Mean Reward: 231.85738159179687


100%|██████████| 2000/2000 [02:02<00:00, 16.31it/s]


Epoch 58 Mean Reward: 233.29667141723633


100%|██████████| 2000/2000 [01:58<00:00, 16.88it/s]


Epoch 59 Mean Reward: 230.2468634033203


100%|██████████| 2000/2000 [01:56<00:00, 17.15it/s]


Epoch 60 Mean Reward: 231.96521727752685
Epoch 60 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 60 test:
Test Episode 1 Reward: 447.7160949707031
Test Episode 2 Reward: 447.7160949707031
Test Episode 3 Reward: 447.7160949707031
Test Episode 4 Reward: 447.7160949707031
Test Episode 5 Reward: 302.9500427246094
Test Episode 6 Reward: 447.7160949707031
Test Episode 7 Reward: 498.4619445800781
Test Episode 8 Reward: 352.26861572265625
Test Episode 9 Reward: 320.8821105957031
Test Episode 10 Reward: 625.6674041748047
Average Test Reward: 433.881059265


100%|██████████| 2000/2000 [01:58<00:00, 16.89it/s]


Epoch 61 Mean Reward: 230.18089404296876


100%|██████████| 2000/2000 [02:12<00:00, 15.06it/s]


Epoch 62 Mean Reward: 232.28445677185059


100%|██████████| 2000/2000 [02:44<00:00, 12.19it/s]


Epoch 63 Mean Reward: 231.6260863647461


100%|██████████| 2000/2000 [02:43<00:00, 12.26it/s]


Epoch 64 Mean Reward: 231.21743800354005


100%|██████████| 2000/2000 [02:39<00:00, 12.52it/s]


Epoch 65 Mean Reward: 231.92420617675782


100%|██████████| 2000/2000 [02:49<00:00, 11.79it/s]


Epoch 66 Mean Reward: 233.86291532897948


100%|██████████| 2000/2000 [02:49<00:00, 11.79it/s]


Epoch 67 Mean Reward: 233.453108833313


100%|██████████| 2000/2000 [02:34<00:00, 12.91it/s]


Epoch 68 Mean Reward: 229.86463500976564


100%|██████████| 2000/2000 [02:38<00:00, 12.64it/s]


Epoch 69 Mean Reward: 231.35418843078614


100%|██████████| 2000/2000 [02:18<00:00, 14.49it/s]


Epoch 70 Mean Reward: 230.89763745880126
Epoch 70 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 70 test:
Test Episode 1 Reward: 447.6195373535156
Test Episode 2 Reward: 347.5172119140625
Test Episode 3 Reward: 296.2043762207031
Test Episode 4 Reward: 293.23439025878906
Test Episode 5 Reward: 326.4134063720703
Test Episode 6 Reward: 519.9139404296875
Test Episode 7 Reward: 333.3016052246094
Test Episode 8 Reward: 328.3477783203125
Test Episode 9 Reward: 354.7616729736328
Test Episode 10 Reward: 519.9139404296875
Average Test Reward: 376.72278595


100%|██████████| 2000/2000 [01:58<00:00, 16.83it/s]


Epoch 71 Mean Reward: 232.989926864624


100%|██████████| 2000/2000 [02:04<00:00, 16.10it/s]


Epoch 72 Mean Reward: 229.47874309539796


100%|██████████| 2000/2000 [01:53<00:00, 17.55it/s]


Epoch 73 Mean Reward: 232.43725595092772


100%|██████████| 2000/2000 [02:07<00:00, 15.66it/s]


Epoch 74 Mean Reward: 231.5219686355591


100%|██████████| 2000/2000 [01:56<00:00, 17.16it/s]


Epoch 75 Mean Reward: 232.65160195922851


100%|██████████| 2000/2000 [01:56<00:00, 17.12it/s]


Epoch 76 Mean Reward: 231.0282942123413


100%|██████████| 2000/2000 [01:54<00:00, 17.46it/s]


Epoch 77 Mean Reward: 229.6172355117798


100%|██████████| 2000/2000 [01:54<00:00, 17.54it/s]


Epoch 78 Mean Reward: 230.86814184570312


100%|██████████| 2000/2000 [01:54<00:00, 17.40it/s]


Epoch 79 Mean Reward: 232.58894593811036


100%|██████████| 2000/2000 [01:57<00:00, 17.00it/s]


Epoch 80 Mean Reward: 231.63391960144043
Epoch 80 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 80 test:
Test Episode 1 Reward: 386.2644500732422
Test Episode 2 Reward: 305.8230438232422
Test Episode 3 Reward: 443.69879150390625
Test Episode 4 Reward: 443.69879150390625
Test Episode 5 Reward: 293.74546813964844
Test Episode 6 Reward: 524.0493316650391
Test Episode 7 Reward: 443.69879150390625
Test Episode 8 Reward: 434.3213806152344
Test Episode 9 Reward: 307.85137939453125
Test Episode 10 Reward: 394.6641082763672
Average Test Reward: 397.78155365


100%|██████████| 2000/2000 [02:42<00:00, 12.33it/s]


Epoch 81 Mean Reward: 231.41065219116211


100%|██████████| 2000/2000 [02:41<00:00, 12.37it/s]


Epoch 82 Mean Reward: 230.0602943954468


100%|██████████| 2000/2000 [02:35<00:00, 12.85it/s]


Epoch 83 Mean Reward: 232.29725813293456


100%|██████████| 2000/2000 [02:39<00:00, 12.52it/s]


Epoch 84 Mean Reward: 228.8622060394287


100%|██████████| 2000/2000 [02:39<00:00, 12.55it/s]


Epoch 85 Mean Reward: 230.12654432678224


100%|██████████| 2000/2000 [02:34<00:00, 12.95it/s]


Epoch 86 Mean Reward: 230.5075982208252


100%|██████████| 2000/2000 [02:37<00:00, 12.68it/s]


Epoch 87 Mean Reward: 229.55373350524903


100%|██████████| 2000/2000 [02:02<00:00, 16.34it/s]


Epoch 88 Mean Reward: 233.67476914215086


100%|██████████| 2000/2000 [02:34<00:00, 12.95it/s]


Epoch 89 Mean Reward: 230.38429740142823


100%|██████████| 2000/2000 [02:31<00:00, 13.22it/s]


Epoch 90 Mean Reward: 230.45227684020995
Epoch 90 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 90 test:
Test Episode 1 Reward: 467.72430419921875
Test Episode 2 Reward: 536.3099212646484
Test Episode 3 Reward: 467.72430419921875
Test Episode 4 Reward: 467.72430419921875
Test Episode 5 Reward: 467.72430419921875
Test Episode 6 Reward: 467.72430419921875
Test Episode 7 Reward: 545.5589599609375
Test Episode 8 Reward: 467.72430419921875
Test Episode 9 Reward: 467.72430419921875
Test Episode 10 Reward: 325.16221618652344
Average Test Reward: 468.110122681


100%|██████████| 2000/2000 [02:41<00:00, 12.39it/s]


Epoch 91 Mean Reward: 231.15232704925538


100%|██████████| 2000/2000 [02:22<00:00, 14.06it/s]


Epoch 92 Mean Reward: 231.03361975860597


100%|██████████| 2000/2000 [02:23<00:00, 13.90it/s]


Epoch 93 Mean Reward: 230.92301906585692


100%|██████████| 2000/2000 [02:20<00:00, 14.20it/s]


Epoch 94 Mean Reward: 230.10995333862306


100%|██████████| 2000/2000 [02:51<00:00, 11.66it/s]


Epoch 95 Mean Reward: 229.89959423065184


100%|██████████| 2000/2000 [02:49<00:00, 11.77it/s]


Epoch 96 Mean Reward: 232.7094104614258


100%|██████████| 2000/2000 [02:40<00:00, 12.43it/s]


Epoch 97 Mean Reward: 231.45797814941406


100%|██████████| 2000/2000 [02:47<00:00, 11.95it/s]


Epoch 98 Mean Reward: 232.38636171722413


100%|██████████| 2000/2000 [02:42<00:00, 12.33it/s]


Epoch 99 Mean Reward: 231.3405457458496


100%|██████████| 2000/2000 [02:51<00:00, 11.67it/s]


Epoch 100 Mean Reward: 230.25526111602784
Epoch 100 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 100 test:
Test Episode 1 Reward: 468.0237274169922
Test Episode 2 Reward: 468.0237274169922
Test Episode 3 Reward: 468.0237274169922
Test Episode 4 Reward: 277.6262969970703
Test Episode 5 Reward: 319.0111846923828
Test Episode 6 Reward: 304.74781799316406
Test Episode 7 Reward: 392.0566864013672
Test Episode 8 Reward: 468.0237274169922
Test Episode 9 Reward: 468.0237274169922
Test Episode 10 Reward: 287.2765350341797
Average Test Reward: 392.08371582


100%|██████████| 2000/2000 [02:42<00:00, 12.27it/s]


Epoch 101 Mean Reward: 230.41164656066894


100%|██████████| 2000/2000 [02:50<00:00, 11.76it/s]


Epoch 102 Mean Reward: 230.5659870300293


100%|██████████| 2000/2000 [02:13<00:00, 14.97it/s]


Epoch 103 Mean Reward: 230.25810935211183


100%|██████████| 2000/2000 [02:39<00:00, 12.54it/s]


Epoch 104 Mean Reward: 233.72196087646483


100%|██████████| 2000/2000 [02:49<00:00, 11.81it/s]


Epoch 105 Mean Reward: 231.77112287902833


100%|██████████| 2000/2000 [01:57<00:00, 16.99it/s]


Epoch 106 Mean Reward: 230.58907957458496


100%|██████████| 2000/2000 [02:01<00:00, 16.52it/s]


Epoch 107 Mean Reward: 234.16114056396484


100%|██████████| 2000/2000 [02:15<00:00, 14.77it/s]


Epoch 108 Mean Reward: 230.96114936828613


100%|██████████| 2000/2000 [01:55<00:00, 17.38it/s]


Epoch 109 Mean Reward: 233.1990902633667


100%|██████████| 2000/2000 [01:57<00:00, 17.02it/s]


Epoch 110 Mean Reward: 231.23893299102784
Epoch 110 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 110 test:
Test Episode 1 Reward: 488.48963928222656
Test Episode 2 Reward: 415.0550079345703
Test Episode 3 Reward: 596.9412536621094
Test Episode 4 Reward: 267.02899169921875
Test Episode 5 Reward: 488.48963928222656
Test Episode 6 Reward: 488.48963928222656
Test Episode 7 Reward: 316.53041076660156
Test Episode 8 Reward: 488.48963928222656
Test Episode 9 Reward: 593.4423065185547
Test Episode 10 Reward: 488.48963928222656
Average Test Reward: 463.144616699


100%|██████████| 2000/2000 [02:16<00:00, 14.62it/s]


Epoch 111 Mean Reward: 229.74248934173585


100%|██████████| 2000/2000 [02:06<00:00, 15.84it/s]


Epoch 112 Mean Reward: 230.77032624053956


100%|██████████| 2000/2000 [02:10<00:00, 15.29it/s]


Epoch 113 Mean Reward: 231.0961770629883


100%|██████████| 2000/2000 [02:05<00:00, 15.89it/s]


Epoch 114 Mean Reward: 230.44628645324707


100%|██████████| 2000/2000 [02:07<00:00, 15.66it/s]


Epoch 115 Mean Reward: 233.30797688293458


100%|██████████| 2000/2000 [02:17<00:00, 14.51it/s]


Epoch 116 Mean Reward: 233.57539664459227


100%|██████████| 2000/2000 [02:27<00:00, 13.54it/s]


Epoch 117 Mean Reward: 230.61316820526122


100%|██████████| 2000/2000 [02:33<00:00, 13.05it/s]


Epoch 118 Mean Reward: 231.16426797485352


100%|██████████| 2000/2000 [02:15<00:00, 14.80it/s]


Epoch 119 Mean Reward: 231.25731826782226


100%|██████████| 2000/2000 [02:02<00:00, 16.35it/s]


Epoch 120 Mean Reward: 232.64072059631349
Epoch 120 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 120 test:
Test Episode 1 Reward: 324.77601623535156
Test Episode 2 Reward: 304.40208435058594
Test Episode 3 Reward: 322.70252990722656
Test Episode 4 Reward: 322.70252990722656
Test Episode 5 Reward: 253.39019775390625
Test Episode 6 Reward: 322.70252990722656
Test Episode 7 Reward: 322.70252990722656
Test Episode 8 Reward: 322.70252990722656
Test Episode 9 Reward: 322.70252990722656
Test Episode 10 Reward: 338.5454406738281
Average Test Reward: 315.732891846


100%|██████████| 2000/2000 [02:09<00:00, 15.45it/s]


Epoch 121 Mean Reward: 253.1695090789795


100%|██████████| 2000/2000 [02:13<00:00, 14.94it/s]


Epoch 122 Mean Reward: 254.3503872909546


100%|██████████| 2000/2000 [02:21<00:00, 14.15it/s]


Epoch 123 Mean Reward: 253.5796632156372


100%|██████████| 2000/2000 [02:32<00:00, 13.09it/s]


Epoch 124 Mean Reward: 255.736395072937


100%|██████████| 2000/2000 [02:34<00:00, 12.94it/s]


Epoch 125 Mean Reward: 256.34288513183594


100%|██████████| 2000/2000 [02:35<00:00, 12.87it/s]


Epoch 126 Mean Reward: 258.37129588317873


100%|██████████| 2000/2000 [02:42<00:00, 12.34it/s]


Epoch 127 Mean Reward: 256.70173527526856


100%|██████████| 2000/2000 [02:42<00:00, 12.31it/s]


Epoch 128 Mean Reward: 257.9037394256592


100%|██████████| 2000/2000 [02:43<00:00, 12.24it/s]


Epoch 129 Mean Reward: 256.1978757171631


100%|██████████| 2000/2000 [02:45<00:00, 12.10it/s]


Epoch 130 Mean Reward: 259.0114175567627
Epoch 130 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 130 test:
Test Episode 1 Reward: 417.39788818359375
Test Episode 2 Reward: 291.2280731201172
Test Episode 3 Reward: 477.7781219482422
Test Episode 4 Reward: 324.86151123046875
Test Episode 5 Reward: 517.280029296875
Test Episode 6 Reward: 517.280029296875
Test Episode 7 Reward: 517.280029296875
Test Episode 8 Reward: 517.280029296875
Test Episode 9 Reward: 517.280029296875
Test Episode 10 Reward: 311.5938415527344
Average Test Reward: 440.925958252


100%|██████████| 2000/2000 [02:19<00:00, 14.32it/s]


Epoch 131 Mean Reward: 256.26371620178224


100%|██████████| 2000/2000 [02:13<00:00, 14.98it/s]


Epoch 132 Mean Reward: 259.8811771621704


100%|██████████| 2000/2000 [02:18<00:00, 14.48it/s]


Epoch 133 Mean Reward: 260.8804469985962


100%|██████████| 2000/2000 [02:15<00:00, 14.71it/s]


Epoch 134 Mean Reward: 263.096290725708


100%|██████████| 2000/2000 [02:14<00:00, 14.86it/s]


Epoch 135 Mean Reward: 262.98304007720947


100%|██████████| 2000/2000 [02:09<00:00, 15.39it/s]


Epoch 136 Mean Reward: 260.50253020477294


100%|██████████| 2000/2000 [02:23<00:00, 13.98it/s]


Epoch 137 Mean Reward: 260.2764431228638


100%|██████████| 2000/2000 [02:14<00:00, 14.85it/s]


Epoch 138 Mean Reward: 262.1732911376953


100%|██████████| 2000/2000 [02:16<00:00, 14.67it/s]


Epoch 139 Mean Reward: 260.95033811950685


100%|██████████| 2000/2000 [02:13<00:00, 14.95it/s]


Epoch 140 Mean Reward: 263.48946980285643
Epoch 140 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 140 test:
Test Episode 1 Reward: 482.9203338623047
Test Episode 2 Reward: 470.0062255859375
Test Episode 3 Reward: 273.03466796875
Test Episode 4 Reward: 470.0062255859375
Test Episode 5 Reward: 343.4011535644531
Test Episode 6 Reward: 407.23577880859375
Test Episode 7 Reward: 470.0062255859375
Test Episode 8 Reward: 470.0062255859375
Test Episode 9 Reward: 470.0062255859375
Test Episode 10 Reward: 470.0062255859375
Average Test Reward: 432.662928772


100%|██████████| 2000/2000 [02:15<00:00, 14.74it/s]


Epoch 141 Mean Reward: 264.11664948272704


100%|██████████| 2000/2000 [02:16<00:00, 14.63it/s]


Epoch 142 Mean Reward: 261.658860458374


100%|██████████| 2000/2000 [02:27<00:00, 13.56it/s]


Epoch 143 Mean Reward: 265.47258145141603


100%|██████████| 2000/2000 [02:13<00:00, 14.94it/s]


Epoch 144 Mean Reward: 262.2694768600464


100%|██████████| 2000/2000 [02:18<00:00, 14.49it/s]


Epoch 145 Mean Reward: 264.42572902679444


100%|██████████| 2000/2000 [02:11<00:00, 15.20it/s]


Epoch 146 Mean Reward: 263.38494062805177


100%|██████████| 2000/2000 [02:19<00:00, 14.30it/s]


Epoch 147 Mean Reward: 266.1325291366577


100%|██████████| 2000/2000 [02:19<00:00, 14.32it/s]


Epoch 148 Mean Reward: 261.843851272583


100%|██████████| 2000/2000 [02:21<00:00, 14.14it/s]


Epoch 149 Mean Reward: 263.2506613693237


100%|██████████| 2000/2000 [02:33<00:00, 13.05it/s]


Epoch 150 Mean Reward: 264.35564149475096
Epoch 150 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 150 test:
Test Episode 1 Reward: 411.6723327636719
Test Episode 2 Reward: 411.6723327636719
Test Episode 3 Reward: 273.7146911621094
Test Episode 4 Reward: 411.6723327636719
Test Episode 5 Reward: 411.6723327636719
Test Episode 6 Reward: 302.1946716308594
Test Episode 7 Reward: 277.97991943359375
Test Episode 8 Reward: 411.6723327636719
Test Episode 9 Reward: 411.6723327636719
Test Episode 10 Reward: 400.16114807128906
Average Test Reward: 372.408442688


100%|██████████| 2000/2000 [02:26<00:00, 13.68it/s]


Epoch 151 Mean Reward: 266.70035565948484


100%|██████████| 2000/2000 [02:26<00:00, 13.67it/s]


Epoch 152 Mean Reward: 268.09825955200193


100%|██████████| 2000/2000 [02:27<00:00, 13.52it/s]


Epoch 153 Mean Reward: 268.54726531219484


100%|██████████| 2000/2000 [02:27<00:00, 13.59it/s]


Epoch 154 Mean Reward: 268.7677564468384


100%|██████████| 2000/2000 [02:33<00:00, 13.06it/s]


Epoch 155 Mean Reward: 271.1458089752197


100%|██████████| 2000/2000 [02:24<00:00, 13.87it/s]


Epoch 156 Mean Reward: 271.7593249282837


100%|██████████| 2000/2000 [02:18<00:00, 14.40it/s]


Epoch 157 Mean Reward: 271.3402442550659


100%|██████████| 2000/2000 [02:17<00:00, 14.59it/s]


Epoch 158 Mean Reward: 270.5063362503052


100%|██████████| 2000/2000 [02:19<00:00, 14.34it/s]


Epoch 159 Mean Reward: 271.9415832366943


100%|██████████| 2000/2000 [02:21<00:00, 14.09it/s]


Epoch 160 Mean Reward: 273.9092648620605
Epoch 160 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 160 test:
Test Episode 1 Reward: 417.98524475097656
Test Episode 2 Reward: 399.7307586669922
Test Episode 3 Reward: 396.8941650390625
Test Episode 4 Reward: 287.2469787597656
Test Episode 5 Reward: 389.7740936279297
Test Episode 6 Reward: 417.98524475097656
Test Episode 7 Reward: 417.98524475097656
Test Episode 8 Reward: 417.98524475097656
Test Episode 9 Reward: 420.4980010986328
Test Episode 10 Reward: 512.6441192626953
Average Test Reward: 407.872909546


100%|██████████| 2000/2000 [02:19<00:00, 14.37it/s]


Epoch 161 Mean Reward: 276.8879597091675


100%|██████████| 2000/2000 [02:33<00:00, 13.03it/s]


Epoch 162 Mean Reward: 280.8340008163452


100%|██████████| 2000/2000 [02:24<00:00, 13.80it/s]


Epoch 163 Mean Reward: 277.6323993911743


100%|██████████| 2000/2000 [02:21<00:00, 14.16it/s]


Epoch 164 Mean Reward: 280.91590188598633


100%|██████████| 2000/2000 [02:22<00:00, 13.99it/s]


Epoch 165 Mean Reward: 281.6726383132935


100%|██████████| 2000/2000 [02:21<00:00, 14.18it/s]


Epoch 166 Mean Reward: 281.8793727874756


100%|██████████| 2000/2000 [02:21<00:00, 14.13it/s]


Epoch 167 Mean Reward: 283.40005227661135


100%|██████████| 2000/2000 [02:30<00:00, 13.33it/s]


Epoch 168 Mean Reward: 278.4927936630249


100%|██████████| 2000/2000 [02:23<00:00, 13.89it/s]


Epoch 169 Mean Reward: 283.83008741760256


100%|██████████| 2000/2000 [02:26<00:00, 13.62it/s]


Epoch 170 Mean Reward: 282.8916023330689
Epoch 170 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 170 test:
Test Episode 1 Reward: 447.9744415283203
Test Episode 2 Reward: 447.9744415283203
Test Episode 3 Reward: 447.9744415283203
Test Episode 4 Reward: 447.9744415283203
Test Episode 5 Reward: 447.9744415283203
Test Episode 6 Reward: 447.9744415283203
Test Episode 7 Reward: 349.96107482910156
Test Episode 8 Reward: 410.92823791503906
Test Episode 9 Reward: 447.9744415283203
Test Episode 10 Reward: 629.6914978027344
Average Test Reward: 452.640190125


100%|██████████| 2000/2000 [02:25<00:00, 13.72it/s]


Epoch 171 Mean Reward: 278.6240773925781


100%|██████████| 2000/2000 [02:26<00:00, 13.65it/s]


Epoch 172 Mean Reward: 277.5698410720825


100%|██████████| 2000/2000 [02:24<00:00, 13.89it/s]


Epoch 173 Mean Reward: 282.28431058502196


100%|██████████| 2000/2000 [02:26<00:00, 13.67it/s]


Epoch 174 Mean Reward: 277.5915780029297


100%|██████████| 2000/2000 [02:19<00:00, 14.36it/s]


Epoch 175 Mean Reward: 284.04135608673096


100%|██████████| 2000/2000 [02:15<00:00, 14.75it/s]


Epoch 176 Mean Reward: 283.51598655700684


100%|██████████| 2000/2000 [02:17<00:00, 14.53it/s]


Epoch 177 Mean Reward: 281.2771682052612


100%|██████████| 2000/2000 [02:21<00:00, 14.16it/s]


Epoch 178 Mean Reward: 283.96727277374265


100%|██████████| 2000/2000 [02:16<00:00, 14.69it/s]


Epoch 179 Mean Reward: 286.20552841186526


100%|██████████| 2000/2000 [02:37<00:00, 12.73it/s]


Epoch 180 Mean Reward: 286.6351648025513
Epoch 180 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 180 test:
Test Episode 1 Reward: 464.9194793701172
Test Episode 2 Reward: 483.8724670410156
Test Episode 3 Reward: 464.9194793701172
Test Episode 4 Reward: 464.9194793701172
Test Episode 5 Reward: 350.47406005859375
Test Episode 6 Reward: 323.1977081298828
Test Episode 7 Reward: 464.9194793701172
Test Episode 8 Reward: 343.5794677734375
Test Episode 9 Reward: 317.7030029296875
Test Episode 10 Reward: 464.9194793701172
Average Test Reward: 414.342410278


100%|██████████| 2000/2000 [02:28<00:00, 13.50it/s]


Epoch 181 Mean Reward: 267.3715040664673


100%|██████████| 2000/2000 [02:22<00:00, 14.07it/s]


Epoch 182 Mean Reward: 265.61404961395266


100%|██████████| 2000/2000 [02:21<00:00, 14.16it/s]


Epoch 183 Mean Reward: 267.0416158676147


100%|██████████| 2000/2000 [02:17<00:00, 14.51it/s]


Epoch 184 Mean Reward: 263.85276042938233


100%|██████████| 2000/2000 [02:21<00:00, 14.15it/s]


Epoch 185 Mean Reward: 268.414380821228


100%|██████████| 2000/2000 [02:28<00:00, 13.46it/s]


Epoch 186 Mean Reward: 269.57489739227293


100%|██████████| 2000/2000 [02:17<00:00, 14.54it/s]


Epoch 187 Mean Reward: 265.24498523712157


100%|██████████| 2000/2000 [02:17<00:00, 14.52it/s]


Epoch 188 Mean Reward: 270.7378261871338


100%|██████████| 2000/2000 [02:18<00:00, 14.40it/s]


Epoch 189 Mean Reward: 267.72829541015625


100%|██████████| 2000/2000 [02:18<00:00, 14.42it/s]


Epoch 190 Mean Reward: 272.42326522064207
Epoch 190 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 190 test:
Test Episode 1 Reward: 390.3285675048828
Test Episode 2 Reward: 216.9026641845703
Test Episode 3 Reward: 330.97705078125
Test Episode 4 Reward: 390.3285675048828
Test Episode 5 Reward: 390.3285675048828
Test Episode 6 Reward: 390.3285675048828
Test Episode 7 Reward: 228.770263671875
Test Episode 8 Reward: 326.96820068359375
Test Episode 9 Reward: 261.8727264404297
Test Episode 10 Reward: 407.92266845703125
Average Test Reward: 333.472784424


100%|██████████| 2000/2000 [02:15<00:00, 14.73it/s]


Epoch 191 Mean Reward: 269.2467882537842


100%|██████████| 2000/2000 [02:16<00:00, 14.63it/s]


Epoch 192 Mean Reward: 268.5902858886719


100%|██████████| 2000/2000 [02:26<00:00, 13.65it/s]


Epoch 193 Mean Reward: 272.71542570495603


100%|██████████| 2000/2000 [02:15<00:00, 14.73it/s]


Epoch 194 Mean Reward: 270.5689372558594


100%|██████████| 2000/2000 [02:15<00:00, 14.74it/s]


Epoch 195 Mean Reward: 271.0615194015503


100%|██████████| 2000/2000 [02:09<00:00, 15.49it/s]


Epoch 196 Mean Reward: 270.9296204833984


100%|██████████| 2000/2000 [02:11<00:00, 15.24it/s]


Epoch 197 Mean Reward: 274.7830152587891


100%|██████████| 2000/2000 [02:14<00:00, 14.87it/s]


Epoch 198 Mean Reward: 271.92485887145995


100%|██████████| 2000/2000 [02:19<00:00, 14.34it/s]


Epoch 199 Mean Reward: 273.8301266479492


100%|██████████| 2000/2000 [02:10<00:00, 15.29it/s]


Epoch 200 Mean Reward: 275.11205094909667
Epoch 200 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 200 test:
Test Episode 1 Reward: 261.8280029296875
Test Episode 2 Reward: 354.1342468261719
Test Episode 3 Reward: 463.400634765625
Test Episode 4 Reward: 298.6933135986328
Test Episode 5 Reward: 236.81259155273438
Test Episode 6 Reward: 307.6513977050781
Test Episode 7 Reward: 215.06243896484375
Test Episode 8 Reward: 371.9266052246094
Test Episode 9 Reward: 257.3985595703125
Test Episode 10 Reward: 342.5852813720703
Average Test Reward: 310.949307251


100%|██████████| 2000/2000 [02:29<00:00, 13.41it/s]


Epoch 201 Mean Reward: 262.35746672821045


100%|██████████| 2000/2000 [02:15<00:00, 14.71it/s]


Epoch 202 Mean Reward: 266.8666748046875


100%|██████████| 2000/2000 [02:15<00:00, 14.72it/s]


Epoch 203 Mean Reward: 267.89256093597413


100%|██████████| 2000/2000 [02:16<00:00, 14.67it/s]


Epoch 204 Mean Reward: 267.4143419876099


100%|██████████| 2000/2000 [02:25<00:00, 13.71it/s]


Epoch 205 Mean Reward: 265.9595801925659


100%|██████████| 2000/2000 [02:09<00:00, 15.39it/s]


Epoch 206 Mean Reward: 270.64180295562744


100%|██████████| 2000/2000 [02:20<00:00, 14.25it/s]


Epoch 207 Mean Reward: 272.3855636138916


100%|██████████| 2000/2000 [02:14<00:00, 14.89it/s]


Epoch 208 Mean Reward: 268.5103300857544


100%|██████████| 2000/2000 [02:19<00:00, 14.35it/s]


Epoch 209 Mean Reward: 266.40627220916747


100%|██████████| 2000/2000 [02:21<00:00, 14.16it/s]


Epoch 210 Mean Reward: 269.0687578811646
Epoch 210 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 210 test:
Test Episode 1 Reward: 233.51715087890625
Test Episode 2 Reward: 270.4179229736328
Test Episode 3 Reward: 214.99615478515625
Test Episode 4 Reward: 192.311767578125
Test Episode 5 Reward: 230.4794158935547
Test Episode 6 Reward: 270.4179229736328
Test Episode 7 Reward: 189.7738037109375
Test Episode 8 Reward: 270.4179229736328
Test Episode 9 Reward: 208.75987243652344
Test Episode 10 Reward: 270.4179229736328
Average Test Reward: 235.150985718


100%|██████████| 2000/2000 [03:25<00:00,  9.74it/s]


Epoch 211 Mean Reward: 275.7090296020508


100%|██████████| 2000/2000 [03:18<00:00, 10.08it/s]


Epoch 212 Mean Reward: 282.2973296203613


100%|██████████| 2000/2000 [02:56<00:00, 11.36it/s]


Epoch 213 Mean Reward: 291.7396480026245


100%|██████████| 2000/2000 [02:52<00:00, 11.62it/s]


Epoch 214 Mean Reward: 294.51812547302245


100%|██████████| 2000/2000 [02:53<00:00, 11.54it/s]


Epoch 215 Mean Reward: 293.72735987854


100%|██████████| 2000/2000 [03:02<00:00, 10.99it/s]


Epoch 216 Mean Reward: 286.9338455352783


100%|██████████| 2000/2000 [02:52<00:00, 11.60it/s]


Epoch 217 Mean Reward: 289.91525407409665


100%|██████████| 2000/2000 [02:54<00:00, 11.49it/s]


Epoch 218 Mean Reward: 288.6246545410156


100%|██████████| 2000/2000 [02:57<00:00, 11.25it/s]


Epoch 219 Mean Reward: 289.7838929672241


100%|██████████| 2000/2000 [02:50<00:00, 11.74it/s]


Epoch 220 Mean Reward: 298.45015098571776
Epoch 220 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 220 test:
Test Episode 1 Reward: 465.95098876953125
Test Episode 2 Reward: 465.95098876953125
Test Episode 3 Reward: 465.95098876953125
Test Episode 4 Reward: 465.95098876953125
Test Episode 5 Reward: 289.2190856933594
Test Episode 6 Reward: 465.95098876953125
Test Episode 7 Reward: 219.49658203125
Test Episode 8 Reward: 465.95098876953125
Test Episode 9 Reward: 319.290283203125
Test Episode 10 Reward: 465.95098876953125
Average Test Reward: 408.966287231


100%|██████████| 2000/2000 [02:50<00:00, 11.71it/s]


Epoch 221 Mean Reward: 273.7249727783203


100%|██████████| 2000/2000 [02:27<00:00, 13.55it/s]


Epoch 222 Mean Reward: 273.81741632080076


100%|██████████| 2000/2000 [02:41<00:00, 12.39it/s]


Epoch 223 Mean Reward: 277.36574307250976


100%|██████████| 2000/2000 [02:35<00:00, 12.82it/s]


Epoch 224 Mean Reward: 276.631789100647


100%|██████████| 2000/2000 [02:36<00:00, 12.79it/s]


Epoch 225 Mean Reward: 271.98700775146483


100%|██████████| 2000/2000 [02:39<00:00, 12.54it/s]


Epoch 226 Mean Reward: 278.2311002044678


100%|██████████| 2000/2000 [02:23<00:00, 13.91it/s]


Epoch 227 Mean Reward: 280.2551844940186


100%|██████████| 2000/2000 [02:26<00:00, 13.66it/s]


Epoch 228 Mean Reward: 280.8868207321167


100%|██████████| 2000/2000 [02:25<00:00, 13.70it/s]


Epoch 229 Mean Reward: 281.6178463973999


100%|██████████| 2000/2000 [02:27<00:00, 13.52it/s]


Epoch 230 Mean Reward: 280.1174423141479
Epoch 230 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 230 test:
Test Episode 1 Reward: 399.5041046142578
Test Episode 2 Reward: 385.3819885253906
Test Episode 3 Reward: 385.3819885253906
Test Episode 4 Reward: 304.48779296875
Test Episode 5 Reward: 385.3819885253906
Test Episode 6 Reward: 290.06695556640625
Test Episode 7 Reward: 385.3819885253906
Test Episode 8 Reward: 296.12269592285156
Test Episode 9 Reward: 351.45477294921875
Test Episode 10 Reward: 385.3819885253906
Average Test Reward: 356.854626465


100%|██████████| 2000/2000 [02:15<00:00, 14.78it/s]


Epoch 231 Mean Reward: 275.51179467010496


100%|██████████| 2000/2000 [02:23<00:00, 13.92it/s]


Epoch 232 Mean Reward: 277.4040172958374


100%|██████████| 2000/2000 [02:14<00:00, 14.91it/s]


Epoch 233 Mean Reward: 272.58710398864747


100%|██████████| 2000/2000 [02:13<00:00, 15.03it/s]


Epoch 234 Mean Reward: 261.53030116271975


100%|██████████| 2000/2000 [02:15<00:00, 14.73it/s]


Epoch 235 Mean Reward: 266.2183663024902


100%|██████████| 2000/2000 [02:10<00:00, 15.34it/s]


Epoch 236 Mean Reward: 273.9420973434448


100%|██████████| 2000/2000 [02:11<00:00, 15.23it/s]


Epoch 237 Mean Reward: 282.67847187805177


100%|██████████| 2000/2000 [02:07<00:00, 15.74it/s]


Epoch 238 Mean Reward: 279.8383497238159


100%|██████████| 2000/2000 [02:20<00:00, 14.28it/s]


Epoch 239 Mean Reward: 285.5565096435547


100%|██████████| 2000/2000 [02:08<00:00, 15.61it/s]


Epoch 240 Mean Reward: 286.4471667098999
Epoch 240 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 240 test:
Test Episode 1 Reward: 316.2095947265625
Test Episode 2 Reward: 265.9655456542969
Test Episode 3 Reward: 316.2095947265625
Test Episode 4 Reward: 316.2095947265625
Test Episode 5 Reward: 316.2095947265625
Test Episode 6 Reward: 235.07102966308594
Test Episode 7 Reward: 184.02008056640625
Test Episode 8 Reward: 316.2095947265625
Test Episode 9 Reward: 316.2095947265625
Test Episode 10 Reward: 292.52748107910156
Average Test Reward: 287.484170532


100%|██████████| 2000/2000 [02:40<00:00, 12.43it/s]


Epoch 241 Mean Reward: 259.8405809860229


100%|██████████| 2000/2000 [02:38<00:00, 12.65it/s]


Epoch 242 Mean Reward: 258.3239779434204


100%|██████████| 2000/2000 [03:05<00:00, 10.78it/s]


Epoch 243 Mean Reward: 261.4482760925293


100%|██████████| 2000/2000 [02:50<00:00, 11.70it/s]


Epoch 244 Mean Reward: 260.38939724731443


100%|██████████| 2000/2000 [02:42<00:00, 12.28it/s]


Epoch 245 Mean Reward: 261.5759792022705


100%|██████████| 2000/2000 [02:25<00:00, 13.79it/s]


Epoch 246 Mean Reward: 263.42726932525636


100%|██████████| 2000/2000 [02:48<00:00, 11.85it/s]


Epoch 247 Mean Reward: 265.5309780960083


100%|██████████| 2000/2000 [02:36<00:00, 12.78it/s]


Epoch 248 Mean Reward: 269.02396674346926


100%|██████████| 2000/2000 [02:43<00:00, 12.24it/s]


Epoch 249 Mean Reward: 262.26132806396487


100%|██████████| 2000/2000 [02:56<00:00, 11.31it/s]


Epoch 250 Mean Reward: 258.2394231338501
Epoch 250 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 250 test:
Test Episode 1 Reward: 237.5860595703125
Test Episode 2 Reward: 237.5860595703125
Test Episode 3 Reward: 237.5860595703125
Test Episode 4 Reward: 237.5860595703125
Test Episode 5 Reward: 237.5860595703125
Test Episode 6 Reward: 348.7232208251953
Test Episode 7 Reward: 210.24752807617188
Test Episode 8 Reward: 212.09373474121094
Test Episode 9 Reward: 237.5860595703125
Test Episode 10 Reward: 237.5860595703125
Average Test Reward: 243.416690063


100%|██████████| 2000/2000 [02:51<00:00, 11.69it/s]


Epoch 251 Mean Reward: 228.17402123260499


100%|██████████| 2000/2000 [02:47<00:00, 11.93it/s]


Epoch 252 Mean Reward: 227.35175447845458


100%|██████████| 2000/2000 [02:38<00:00, 12.65it/s]


Epoch 253 Mean Reward: 229.9113921661377


100%|██████████| 2000/2000 [02:49<00:00, 11.79it/s]


Epoch 254 Mean Reward: 227.22294290161133


100%|██████████| 2000/2000 [02:57<00:00, 11.25it/s]


Epoch 255 Mean Reward: 227.57382702636718


100%|██████████| 2000/2000 [03:04<00:00, 10.82it/s]


Epoch 256 Mean Reward: 227.7191725921631


100%|██████████| 2000/2000 [03:04<00:00, 10.82it/s]


Epoch 257 Mean Reward: 229.61319505310058


100%|██████████| 2000/2000 [03:03<00:00, 10.90it/s]


Epoch 258 Mean Reward: 230.47070620727538


100%|██████████| 2000/2000 [02:59<00:00, 11.16it/s]


Epoch 259 Mean Reward: 233.61858235168458


100%|██████████| 2000/2000 [03:11<00:00, 10.45it/s]


Epoch 260 Mean Reward: 231.34987022399903
Epoch 260 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 260 test:
Test Episode 1 Reward: 184.08804321289062
Test Episode 2 Reward: 206.71458435058594
Test Episode 3 Reward: 206.71458435058594
Test Episode 4 Reward: 206.71458435058594
Test Episode 5 Reward: 189.23919677734375
Test Episode 6 Reward: 206.71458435058594
Test Episode 7 Reward: 206.71458435058594
Test Episode 8 Reward: 188.28219604492188
Test Episode 9 Reward: 206.71458435058594
Test Episode 10 Reward: 186.52374267578125
Average Test Reward: 198.842068481


100%|██████████| 2000/2000 [02:40<00:00, 12.44it/s]


Epoch 261 Mean Reward: 224.38909420013428


100%|██████████| 2000/2000 [02:55<00:00, 11.41it/s]


Epoch 262 Mean Reward: 222.40528832244874


100%|██████████| 2000/2000 [02:48<00:00, 11.87it/s]


Epoch 263 Mean Reward: 227.7610786819458


100%|██████████| 2000/2000 [02:42<00:00, 12.28it/s]


Epoch 264 Mean Reward: 227.40443606567382


100%|██████████| 2000/2000 [02:57<00:00, 11.29it/s]


Epoch 265 Mean Reward: 227.74512890625


100%|██████████| 2000/2000 [02:42<00:00, 12.28it/s]


Epoch 266 Mean Reward: 226.08727635192872


100%|██████████| 2000/2000 [02:39<00:00, 12.55it/s]


Epoch 267 Mean Reward: 225.27478552246095


100%|██████████| 2000/2000 [02:50<00:00, 11.71it/s]


Epoch 268 Mean Reward: 225.21045671081544


100%|██████████| 2000/2000 [02:40<00:00, 12.45it/s]


Epoch 269 Mean Reward: 228.7460388031006


100%|██████████| 2000/2000 [02:57<00:00, 11.26it/s]


Epoch 270 Mean Reward: 227.91029054260255
Epoch 270 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 270 test:
Test Episode 1 Reward: 191.44671630859375
Test Episode 2 Reward: 286.73094177246094
Test Episode 3 Reward: 184.07557678222656
Test Episode 4 Reward: 199.89669799804688
Test Episode 5 Reward: 191.44671630859375
Test Episode 6 Reward: 191.44671630859375
Test Episode 7 Reward: 250.17172241210938
Test Episode 8 Reward: 184.1787567138672
Test Episode 9 Reward: 184.09014892578125
Test Episode 10 Reward: 204.23052978515625
Average Test Reward: 206.771452332


100%|██████████| 2000/2000 [03:14<00:00, 10.30it/s]


Epoch 271 Mean Reward: 261.063082862854


100%|██████████| 2000/2000 [03:10<00:00, 10.52it/s]


Epoch 272 Mean Reward: 268.33614754486086


100%|██████████| 2000/2000 [03:03<00:00, 10.92it/s]


Epoch 273 Mean Reward: 274.23632877349854


100%|██████████| 2000/2000 [02:57<00:00, 11.26it/s]


Epoch 274 Mean Reward: 270.74265866851806


100%|██████████| 2000/2000 [03:23<00:00,  9.83it/s]


Epoch 275 Mean Reward: 280.8069485473633


100%|██████████| 2000/2000 [03:02<00:00, 10.98it/s]


Epoch 276 Mean Reward: 275.9963069152832


100%|██████████| 2000/2000 [03:01<00:00, 11.03it/s]


Epoch 277 Mean Reward: 277.04055618286134


100%|██████████| 2000/2000 [03:12<00:00, 10.41it/s]


Epoch 278 Mean Reward: 284.8801071777344


100%|██████████| 2000/2000 [03:05<00:00, 10.81it/s]


Epoch 279 Mean Reward: 281.09589967346193


100%|██████████| 2000/2000 [03:07<00:00, 10.66it/s]


Epoch 280 Mean Reward: 283.3007201004028
Epoch 280 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 280 test:
Test Episode 1 Reward: 184.00184631347656
Test Episode 2 Reward: 222.83602905273438
Test Episode 3 Reward: 184.3505401611328
Test Episode 4 Reward: 222.83602905273438
Test Episode 5 Reward: 222.83602905273438
Test Episode 6 Reward: 222.83602905273438
Test Episode 7 Reward: 222.83602905273438
Test Episode 8 Reward: 278.2743835449219
Test Episode 9 Reward: 191.20211791992188
Test Episode 10 Reward: 186.41058349609375
Average Test Reward: 213.84196167


100%|██████████| 2000/2000 [03:21<00:00,  9.92it/s]


Epoch 281 Mean Reward: 274.01524753570556


100%|██████████| 2000/2000 [03:10<00:00, 10.48it/s]


Epoch 282 Mean Reward: 282.4562517166138


100%|██████████| 2000/2000 [03:48<00:00,  8.75it/s]


Epoch 283 Mean Reward: 270.73105711364747


100%|██████████| 2000/2000 [03:39<00:00,  9.13it/s]


Epoch 284 Mean Reward: 275.5243544540405


100%|██████████| 2000/2000 [03:09<00:00, 10.53it/s]


Epoch 285 Mean Reward: 273.2591522216797


100%|██████████| 2000/2000 [03:14<00:00, 10.29it/s]


Epoch 286 Mean Reward: 281.41974736785886


100%|██████████| 2000/2000 [03:34<00:00,  9.34it/s]


Epoch 287 Mean Reward: 273.079342300415


100%|██████████| 2000/2000 [03:34<00:00,  9.32it/s]


Epoch 288 Mean Reward: 273.30117137908934


100%|██████████| 2000/2000 [04:05<00:00,  8.13it/s]


Epoch 289 Mean Reward: 274.2153701019287


100%|██████████| 2000/2000 [03:17<00:00, 10.10it/s]


Epoch 290 Mean Reward: 279.53612289428713
Epoch 290 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 290 test:
Test Episode 1 Reward: 191.2440948486328
Test Episode 2 Reward: 322.1546173095703
Test Episode 3 Reward: 290.8802032470703
Test Episode 4 Reward: 191.1159210205078
Test Episode 5 Reward: 191.2440948486328
Test Episode 6 Reward: 259.2790985107422
Test Episode 7 Reward: 223.30430603027344
Test Episode 8 Reward: 191.2440948486328
Test Episode 9 Reward: 191.2440948486328
Test Episode 10 Reward: 191.2440948486328
Average Test Reward: 224.295462036


100%|██████████| 2000/2000 [02:18<00:00, 14.39it/s]


Epoch 291 Mean Reward: 275.4554753036499


100%|██████████| 2000/2000 [02:26<00:00, 13.67it/s]


Epoch 292 Mean Reward: 275.14343582916257


100%|██████████| 2000/2000 [02:40<00:00, 12.43it/s]


Epoch 293 Mean Reward: 238.57868537902831


100%|██████████| 2000/2000 [02:21<00:00, 14.10it/s]


Epoch 294 Mean Reward: 225.0725852661133


100%|██████████| 2000/2000 [02:16<00:00, 14.64it/s]


Epoch 295 Mean Reward: 212.37370693969726


100%|██████████| 2000/2000 [02:21<00:00, 14.12it/s]


Epoch 296 Mean Reward: 212.1856259841919


100%|██████████| 2000/2000 [02:55<00:00, 11.38it/s]


Epoch 297 Mean Reward: 209.24856629180908


100%|██████████| 2000/2000 [03:01<00:00, 11.01it/s]


Epoch 298 Mean Reward: 207.86576734161378


100%|██████████| 2000/2000 [02:54<00:00, 11.49it/s]


Epoch 299 Mean Reward: 205.7520897293091


100%|██████████| 2000/2000 [03:04<00:00, 10.82it/s]


Epoch 300 Mean Reward: 206.88111955261232
Epoch 300 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 300 test:
Test Episode 1 Reward: 184.02435302734375
Test Episode 2 Reward: 184.02435302734375
Test Episode 3 Reward: 184.02435302734375
Test Episode 4 Reward: 184.0184326171875
Test Episode 5 Reward: 184.01828002929688
Test Episode 6 Reward: 184.01844787597656
Test Episode 7 Reward: 184.01844787597656
Test Episode 8 Reward: 184.02435302734375
Test Episode 9 Reward: 184.62631225585938
Test Episode 10 Reward: 184.02435302734375
Average Test Reward: 184.082168579


100%|██████████| 2000/2000 [03:21<00:00,  9.92it/s]


Epoch 301 Mean Reward: 211.39828217315673


100%|██████████| 2000/2000 [03:21<00:00,  9.92it/s]


Epoch 302 Mean Reward: 217.3287361984253


100%|██████████| 2000/2000 [03:24<00:00,  9.79it/s]


Epoch 303 Mean Reward: 213.98355912017823


100%|██████████| 2000/2000 [03:18<00:00, 10.06it/s]


Epoch 304 Mean Reward: 215.3511868133545


100%|██████████| 2000/2000 [03:09<00:00, 10.54it/s]


Epoch 305 Mean Reward: 217.43757347106933


100%|██████████| 2000/2000 [03:20<00:00,  9.96it/s]


Epoch 306 Mean Reward: 216.76446137237548


100%|██████████| 2000/2000 [02:32<00:00, 13.08it/s]


Epoch 307 Mean Reward: 222.7508203125


100%|██████████| 2000/2000 [02:25<00:00, 13.71it/s]


Epoch 308 Mean Reward: 220.84151997375488


100%|██████████| 2000/2000 [02:14<00:00, 14.82it/s]


Epoch 309 Mean Reward: 220.4446851043701


100%|██████████| 2000/2000 [02:12<00:00, 15.15it/s]


Epoch 310 Mean Reward: 222.8828819656372
Epoch 310 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 310 test:
Test Episode 1 Reward: 228.88470458984375
Test Episode 2 Reward: 184.04359436035156
Test Episode 3 Reward: 184.1787567138672
Test Episode 4 Reward: 184.1787567138672
Test Episode 5 Reward: 184.1787567138672
Test Episode 6 Reward: 184.1787567138672
Test Episode 7 Reward: 184.1787567138672
Test Episode 8 Reward: 214.14076232910156
Test Episode 9 Reward: 184.1787567138672
Test Episode 10 Reward: 227.05657958984375
Average Test Reward: 195.919818115


100%|██████████| 2000/2000 [02:19<00:00, 14.36it/s]


Epoch 311 Mean Reward: 219.4222387008667


100%|██████████| 2000/2000 [02:18<00:00, 14.42it/s]


Epoch 312 Mean Reward: 219.68662164306642


100%|██████████| 2000/2000 [02:23<00:00, 13.93it/s]


Epoch 313 Mean Reward: 223.3681427154541


100%|██████████| 2000/2000 [02:35<00:00, 12.83it/s]


Epoch 314 Mean Reward: 222.3389614715576


100%|██████████| 2000/2000 [02:14<00:00, 14.86it/s]


Epoch 315 Mean Reward: 222.18213331604005


100%|██████████| 2000/2000 [02:17<00:00, 14.54it/s]


Epoch 316 Mean Reward: 227.26400759124755


100%|██████████| 2000/2000 [02:17<00:00, 14.51it/s]


Epoch 317 Mean Reward: 237.36645429992674


100%|██████████| 2000/2000 [02:16<00:00, 14.64it/s]


Epoch 318 Mean Reward: 242.79029417419434


100%|██████████| 2000/2000 [02:30<00:00, 13.31it/s]


Epoch 319 Mean Reward: 238.7188080215454


100%|██████████| 2000/2000 [02:15<00:00, 14.77it/s]


Epoch 320 Mean Reward: 240.50591734313966
Epoch 320 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 320 test:
Test Episode 1 Reward: 250.11305236816406
Test Episode 2 Reward: 250.11305236816406
Test Episode 3 Reward: 250.11305236816406
Test Episode 4 Reward: 281.7665557861328
Test Episode 5 Reward: 184.02342224121094
Test Episode 6 Reward: 250.11305236816406
Test Episode 7 Reward: 250.11305236816406
Test Episode 8 Reward: 219.95567321777344
Test Episode 9 Reward: 250.11305236816406
Test Episode 10 Reward: 196.9424285888672
Average Test Reward: 238.336639404


100%|██████████| 2000/2000 [02:14<00:00, 14.85it/s]


Epoch 321 Mean Reward: 230.3945019683838


100%|██████████| 2000/2000 [02:20<00:00, 14.19it/s]


Epoch 322 Mean Reward: 228.4534693145752


100%|██████████| 2000/2000 [02:27<00:00, 13.60it/s]


Epoch 323 Mean Reward: 227.68146687316894


100%|██████████| 2000/2000 [04:03<00:00,  8.22it/s]


Epoch 324 Mean Reward: 227.7618681869507


100%|██████████| 2000/2000 [04:29<00:00,  7.43it/s]


Epoch 325 Mean Reward: 231.94689377593994


100%|██████████| 2000/2000 [03:54<00:00,  8.53it/s]


Epoch 326 Mean Reward: 227.95642596435547


100%|██████████| 2000/2000 [02:24<00:00, 13.85it/s]


Epoch 327 Mean Reward: 233.31585765075684


100%|██████████| 2000/2000 [02:39<00:00, 12.50it/s]


Epoch 328 Mean Reward: 239.8823522491455


100%|██████████| 2000/2000 [02:13<00:00, 14.93it/s]


Epoch 329 Mean Reward: 238.06128006744385


100%|██████████| 2000/2000 [02:27<00:00, 13.55it/s]


Epoch 330 Mean Reward: 240.2442921295166
Epoch 330 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 330 test:
Test Episode 1 Reward: 199.8898162841797
Test Episode 2 Reward: 199.8898162841797
Test Episode 3 Reward: 199.8898162841797
Test Episode 4 Reward: 184.0011749267578
Test Episode 5 Reward: 199.93592834472656
Test Episode 6 Reward: 184.02256774902344
Test Episode 7 Reward: 199.8898162841797
Test Episode 8 Reward: 199.06210327148438
Test Episode 9 Reward: 199.8898162841797
Test Episode 10 Reward: 198.26266479492188
Average Test Reward: 196.473352051


100%|██████████| 2000/2000 [02:13<00:00, 14.99it/s]


Epoch 331 Mean Reward: 236.8744602279663


100%|██████████| 2000/2000 [02:13<00:00, 14.94it/s]


Epoch 332 Mean Reward: 238.58445165252687


100%|██████████| 2000/2000 [02:20<00:00, 14.23it/s]


Epoch 333 Mean Reward: 239.32516753387452


100%|██████████| 2000/2000 [02:28<00:00, 13.49it/s]


Epoch 334 Mean Reward: 234.10257709503173


100%|██████████| 2000/2000 [03:06<00:00, 10.71it/s]


Epoch 335 Mean Reward: 241.90875927734376


100%|██████████| 2000/2000 [03:20<00:00,  9.99it/s]


Epoch 336 Mean Reward: 264.8306311340332


100%|██████████| 2000/2000 [03:55<00:00,  8.49it/s]


Epoch 337 Mean Reward: 279.9583445587158


100%|██████████| 2000/2000 [04:43<00:00,  7.06it/s]


Epoch 338 Mean Reward: 278.17626264190676


100%|██████████| 2000/2000 [04:35<00:00,  7.26it/s]


Epoch 339 Mean Reward: 272.02410154724123


100%|██████████| 2000/2000 [04:42<00:00,  7.08it/s]


Epoch 340 Mean Reward: 267.3580409011841
Epoch 340 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 340 test:
Test Episode 1 Reward: 184.04440307617188
Test Episode 2 Reward: 184.1776885986328
Test Episode 3 Reward: 184.0677947998047
Test Episode 4 Reward: 184.1776885986328
Test Episode 5 Reward: 184.00143432617188
Test Episode 6 Reward: 184.1776885986328
Test Episode 7 Reward: 184.02259826660156
Test Episode 8 Reward: 184.1776885986328
Test Episode 9 Reward: 184.11422729492188
Test Episode 10 Reward: 184.02230834960938
Average Test Reward: 184.098352051


100%|██████████| 2000/2000 [05:26<00:00,  6.13it/s]


Epoch 341 Mean Reward: 295.5311011657715


100%|██████████| 2000/2000 [02:58<00:00, 11.24it/s]


Epoch 342 Mean Reward: 231.5226646347046


100%|██████████| 2000/2000 [02:56<00:00, 11.32it/s]


Epoch 343 Mean Reward: 244.88389097595214


100%|██████████| 2000/2000 [02:49<00:00, 11.80it/s]


Epoch 344 Mean Reward: 252.7317584915161


100%|██████████| 2000/2000 [02:22<00:00, 14.08it/s]


Epoch 345 Mean Reward: 235.3962071838379


100%|██████████| 2000/2000 [02:04<00:00, 16.10it/s]


Epoch 346 Mean Reward: 223.42762915802


100%|██████████| 2000/2000 [02:13<00:00, 14.98it/s]


Epoch 347 Mean Reward: 231.3709580383301


100%|██████████| 2000/2000 [02:02<00:00, 16.27it/s]


Epoch 348 Mean Reward: 239.42613529205323


100%|██████████| 2000/2000 [02:02<00:00, 16.29it/s]


Epoch 349 Mean Reward: 240.20436368560792


100%|██████████| 2000/2000 [02:03<00:00, 16.23it/s]


Epoch 350 Mean Reward: 238.8262756729126
Epoch 350 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 350 test:
Test Episode 1 Reward: 271.6267395019531
Test Episode 2 Reward: 264.3216094970703
Test Episode 3 Reward: 264.3216094970703
Test Episode 4 Reward: 264.3216094970703
Test Episode 5 Reward: 264.3216094970703
Test Episode 6 Reward: 264.3216094970703
Test Episode 7 Reward: 267.05043029785156
Test Episode 8 Reward: 264.3216094970703
Test Episode 9 Reward: 264.3216094970703
Test Episode 10 Reward: 228.5429229736328
Average Test Reward: 261.747135925


100%|██████████| 2000/2000 [02:17<00:00, 14.60it/s]


Epoch 351 Mean Reward: 271.51284264373777


100%|██████████| 2000/2000 [02:03<00:00, 16.16it/s]


Epoch 352 Mean Reward: 270.3687679824829


100%|██████████| 2000/2000 [02:04<00:00, 16.08it/s]


Epoch 353 Mean Reward: 261.2284993286133


100%|██████████| 2000/2000 [02:10<00:00, 15.37it/s]


Epoch 354 Mean Reward: 252.41142948913574


100%|██████████| 2000/2000 [02:37<00:00, 12.73it/s]


Epoch 355 Mean Reward: 247.70756566619872


100%|██████████| 2000/2000 [02:59<00:00, 11.15it/s]


Epoch 356 Mean Reward: 243.19147193145753


100%|██████████| 2000/2000 [02:30<00:00, 13.30it/s]


Epoch 357 Mean Reward: 240.47370169067383


100%|██████████| 2000/2000 [02:31<00:00, 13.21it/s]


Epoch 358 Mean Reward: 246.333314201355


100%|██████████| 2000/2000 [02:30<00:00, 13.27it/s]


Epoch 359 Mean Reward: 248.36602578735352


100%|██████████| 2000/2000 [02:42<00:00, 12.34it/s]


Epoch 360 Mean Reward: 247.11915952301027
Epoch 360 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 360 test:
Test Episode 1 Reward: 737.8287200927734
Test Episode 2 Reward: 321.54400634765625
Test Episode 3 Reward: 400.97825622558594
Test Episode 4 Reward: 431.1297912597656
Test Episode 5 Reward: 400.97825622558594
Test Episode 6 Reward: 400.97825622558594
Test Episode 7 Reward: 331.904541015625
Test Episode 8 Reward: 359.35975646972656
Test Episode 9 Reward: 235.751953125
Test Episode 10 Reward: 400.97825622558594
Average Test Reward: 402.143179321


100%|██████████| 2000/2000 [02:19<00:00, 14.31it/s]


Epoch 361 Mean Reward: 233.9303258666992


100%|██████████| 2000/2000 [02:44<00:00, 12.13it/s]


Epoch 362 Mean Reward: 205.86772158050536


100%|██████████| 2000/2000 [02:14<00:00, 14.83it/s]


Epoch 363 Mean Reward: 204.5568995819092


100%|██████████| 2000/2000 [02:16<00:00, 14.68it/s]


Epoch 364 Mean Reward: 220.403130607605


100%|██████████| 2000/2000 [02:17<00:00, 14.57it/s]


Epoch 365 Mean Reward: 227.31815498352051


100%|██████████| 2000/2000 [02:42<00:00, 12.29it/s]


Epoch 366 Mean Reward: 229.37197647857667


100%|██████████| 2000/2000 [02:41<00:00, 12.36it/s]


Epoch 367 Mean Reward: 225.52472428894043


100%|██████████| 2000/2000 [03:41<00:00,  9.05it/s]


Epoch 368 Mean Reward: 215.38916878509522


100%|██████████| 2000/2000 [03:23<00:00,  9.83it/s]


Epoch 369 Mean Reward: 221.38047763824463


100%|██████████| 2000/2000 [03:31<00:00,  9.46it/s]


Epoch 370 Mean Reward: 225.66175694274904
Epoch 370 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 370 test:
Test Episode 1 Reward: 188.3607177734375
Test Episode 2 Reward: 188.14007568359375
Test Episode 3 Reward: 184.0013885498047
Test Episode 4 Reward: 184.0031280517578
Test Episode 5 Reward: 188.3607177734375
Test Episode 6 Reward: 184.024169921875
Test Episode 7 Reward: 188.3607177734375
Test Episode 8 Reward: 187.2125244140625
Test Episode 9 Reward: 277.56724548339844
Test Episode 10 Reward: 188.3607177734375
Average Test Reward: 195.83914032


100%|██████████| 2000/2000 [02:42<00:00, 12.28it/s]


Epoch 371 Mean Reward: 214.64477264404297


100%|██████████| 2000/2000 [03:04<00:00, 10.86it/s]


Epoch 372 Mean Reward: 216.81434077453613


100%|██████████| 2000/2000 [03:08<00:00, 10.62it/s]


Epoch 373 Mean Reward: 211.8102795791626


100%|██████████| 2000/2000 [03:17<00:00, 10.15it/s]


Epoch 374 Mean Reward: 213.63882329559326


100%|██████████| 2000/2000 [04:52<00:00,  6.83it/s]


Epoch 375 Mean Reward: 218.4146025161743


100%|██████████| 2000/2000 [05:44<00:00,  5.80it/s]


Epoch 376 Mean Reward: 208.38288259887696


100%|██████████| 2000/2000 [04:09<00:00,  8.02it/s]


Epoch 377 Mean Reward: 209.64226696777342


100%|██████████| 2000/2000 [03:53<00:00,  8.56it/s]


Epoch 378 Mean Reward: 207.4894476852417


100%|██████████| 2000/2000 [03:38<00:00,  9.17it/s]


Epoch 379 Mean Reward: 210.46146472167968


100%|██████████| 2000/2000 [03:24<00:00,  9.80it/s]


Epoch 380 Mean Reward: 211.9063038711548
Epoch 380 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 380 test:
Test Episode 1 Reward: 189.5162353515625
Test Episode 2 Reward: 184.00941467285156
Test Episode 3 Reward: 189.5162353515625
Test Episode 4 Reward: 184.01939392089844
Test Episode 5 Reward: 189.5162353515625
Test Episode 6 Reward: 189.5162353515625
Test Episode 7 Reward: 189.5162353515625
Test Episode 8 Reward: 190.70477294921875
Test Episode 9 Reward: 185.1996307373047
Test Episode 10 Reward: 184.02427673339844
Average Test Reward: 187.553866577


100%|██████████| 2000/2000 [02:52<00:00, 11.61it/s]


Epoch 381 Mean Reward: 200.82374008178712


100%|██████████| 2000/2000 [02:44<00:00, 12.15it/s]


Epoch 382 Mean Reward: 200.62385860443115


100%|██████████| 2000/2000 [02:20<00:00, 14.20it/s]


Epoch 383 Mean Reward: 207.7128653793335


100%|██████████| 2000/2000 [02:21<00:00, 14.13it/s]


Epoch 384 Mean Reward: 214.12997682189942


100%|██████████| 2000/2000 [02:18<00:00, 14.47it/s]


Epoch 385 Mean Reward: 218.2719836730957


100%|██████████| 2000/2000 [02:51<00:00, 11.63it/s]


Epoch 386 Mean Reward: 211.65309590148925


100%|██████████| 2000/2000 [02:10<00:00, 15.34it/s]


Epoch 387 Mean Reward: 224.19964833831787


100%|██████████| 2000/2000 [02:18<00:00, 14.49it/s]


Epoch 388 Mean Reward: 226.8049266052246


100%|██████████| 2000/2000 [02:28<00:00, 13.45it/s]


Epoch 389 Mean Reward: 258.46042816925046


100%|██████████| 2000/2000 [02:33<00:00, 13.02it/s]


Epoch 390 Mean Reward: 246.08829788970948
Epoch 390 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 390 test:
Test Episode 1 Reward: 184.02401733398438
Test Episode 2 Reward: 184.0223388671875
Test Episode 3 Reward: 184.02410888671875
Test Episode 4 Reward: 184.0236358642578
Test Episode 5 Reward: 184.0223388671875
Test Episode 6 Reward: 184.0223388671875
Test Episode 7 Reward: 184.02149963378906
Test Episode 8 Reward: 184.40155029296875
Test Episode 9 Reward: 184.06549072265625
Test Episode 10 Reward: 184.0244140625
Average Test Reward: 184.06517334


100%|██████████| 2000/2000 [03:08<00:00, 10.63it/s]


Epoch 391 Mean Reward: 208.1687859802246


100%|██████████| 2000/2000 [03:01<00:00, 11.03it/s]


Epoch 392 Mean Reward: 221.5876455154419


100%|██████████| 2000/2000 [03:01<00:00, 11.02it/s]


Epoch 393 Mean Reward: 224.46991511535646


100%|██████████| 2000/2000 [02:56<00:00, 11.33it/s]


Epoch 394 Mean Reward: 231.27776061248778


100%|██████████| 2000/2000 [02:26<00:00, 13.62it/s]


Epoch 395 Mean Reward: 230.28793294525147


100%|██████████| 2000/2000 [01:54<00:00, 17.44it/s]


Epoch 396 Mean Reward: 294.0383425750732


100%|██████████| 2000/2000 [02:07<00:00, 15.73it/s]


Epoch 397 Mean Reward: 343.59964649963376


100%|██████████| 2000/2000 [02:05<00:00, 15.96it/s]


Epoch 398 Mean Reward: 327.96560642242434


100%|██████████| 2000/2000 [02:17<00:00, 14.55it/s]


Epoch 399 Mean Reward: 450.52866078186037


100%|██████████| 2000/2000 [02:25<00:00, 13.74it/s]


Epoch 400 Mean Reward: 432.7785758972168
Epoch 400 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 400 test:
Test Episode 1 Reward: 184.05523681640625
Test Episode 2 Reward: 185.73544311523438
Test Episode 3 Reward: 184.0030059814453
Test Episode 4 Reward: 185.2953338623047
Test Episode 5 Reward: 203.65235900878906
Test Episode 6 Reward: 184.59326171875
Test Episode 7 Reward: 185.59378051757812
Test Episode 8 Reward: 184.00242614746094
Test Episode 9 Reward: 184.06715393066406
Test Episode 10 Reward: 184.26736450195312
Average Test Reward: 186.52653656
[(493.52525939941404, 40), (486.98165283203127, 50), (468.11012268066406, 90), (463.14461669921877, 110), (452.64019012451172, 170), (440.92595825195315, 130), (433.88105926513674, 60), (432.66292877197264, 140), (419.35101318359375, 30), (414.3424102783203, 180), (408.96628723144534, 220), (407.87290954589844, 160), (404.18606262207032, 20), (402.14317932128904, 360), (397.78155364990232, 80), (392.08371582031248, 100), (376.722

In [None]:
#Get a list of checkpoints saved during training

ckpts = tf.train.get_checkpoint_state('checkpoints').all_model_checkpoint_paths

#Test the trained model from a certain checkpoint by only choosing actions with a greedy strategy

test_reward = test_agent(DQN, num_episodes=20, training=False, load_model=True, model_dir=ckpts[-1])
print('Average Test Reward:', test_reward)


Loading model from checkpoints\deadly_corridor.ckpt-400
INFO:tensorflow:Restoring parameters from checkpoints\deadly_corridor.ckpt-400
Test Episode 1 Reward: 184.0704803466797
Test Episode 2 Reward: 184.00071716308594
Test Episode 3 Reward: 184.0704803466797
Test Episode 4 Reward: 184.1337127685547
Test Episode 5 Reward: 184.0543670654297
Test Episode 6 Reward: 184.0704803466797
Test Episode 7 Reward: 184.0704803466797
Test Episode 8 Reward: 184.00802612304688
Test Episode 9 Reward: 184.0704803466797
Test Episode 10 Reward: 184.0704803466797
Test Episode 11 Reward: 184.02439880371094
Test Episode 12 Reward: 184.0021209716797
Test Episode 13 Reward: 184.45973205566406
Test Episode 14 Reward: 184.09071350097656
