In [1]:
import importlib.util
import time

import tensorflow as tf
import numpy as np

from skimage.transform import rescale
from tqdm import trange
from IPython.display import HTML

#Import the vizdoom package as "vd" since it can't be installed normally on Windows

vd_location = 'C:/Anaconda3/envs/doom/Lib/site-packages/vizdoom/vizdoom.pyd'
vizdoom = importlib.util.spec_from_file_location('vizdoom',
                                                 vd_location)
vd = importlib.util.module_from_spec(vizdoom)
vizdoom.loader.exec_module(vd)


In [2]:
#Specify the game scenario and the screen format/resolution

game = vd.DoomGame()
game.set_screen_format(vd.ScreenFormat.GRAY8)
game.set_depth_buffer_enabled(True)
game.set_screen_resolution(vd.ScreenResolution.RES_160X120)
game.load_config('deadly_corridor.cfg')

down_sample_ratio = 1
width = int(game.get_screen_width()*down_sample_ratio)
height = int(game.get_screen_height()*down_sample_ratio)
channels = game.get_screen_channels() + int(game.is_depth_buffer_enabled())

#Specify the available actions in the scenario

available_actions = game.get_available_buttons()
actions = [list(ohe) for ohe in list(np.identity(len(available_actions)))]
num_actions = len(available_actions)

#Specify the Q-network learning parameters

frame_delay = 12
buffer_size = 75000
epochs = 800
steps_per_epoch = 2000
learning_rate = 0.0025
gamma = 0
start_epsilon = 1.0
end_epsilon = 0.1
batch_size = 100
load_model = False
save_model = True
model_dir = './checkpoints/deadly_corridor.ckpt'
num_ckpts = 20


In [3]:
#Create a buffer object that holds a set of training experiences (state-action-reward tuples)

class Buffer():
    def __init__(self, size=1000):
        self.buffer = list()
        self.length = len(self.buffer)
        self.size = size
        
#Add a new experience to the buffer (remove the oldest experience if the buffer is already full)
        
    def add_experience(self, experience):
        if self.length + 1 >= self.size:
            self.buffer[0:(self.length + 1) - self.size] = []
        
        self.buffer.append(experience)
        self.length = len(self.buffer)
            
#Return a batch of experience arrays randomly sampled from the buffer
            
    def sample_buffer(self, sample_size):
        sample = np.random.randint(self.length, size=sample_size)
        s1 = np.concatenate([self.buffer[idx][0] for idx in sample], axis=0)
        a = np.array([self.buffer[idx][1] for idx in sample])
        r = np.array([self.buffer[idx][2] for idx in sample])
        s2 = np.concatenate([self.buffer[idx][3] for idx in sample], axis=0)
        terminal = np.array([self.buffer[idx][4] for idx in sample], dtype=np.int32)
        
        return s1, a, r, s2, terminal

#Downsample and normalize an image array representing the game state at a given time stamp

def preprocess(image, down_sample_ratio=1):
    if float(down_sample_ratio) != 1.0:
        image = rescale(image=image, scale=down_sample_ratio, mode='reflect')
    image = image.astype(np.float32)
    image = np.expand_dims(image, axis=0)

    return image

#Test the agent using a currently training or previously trained model

def test_agent(model, num_episodes, load_model, depth, training=True, session=None, model_dir=None):
    if load_model == True:
        sess = tf.Session()
        print('Loading model from', model_dir)
        tf.train.Saver().restore(sess, model_dir)
        
#Require an existing session if a pretrained model isn't provided
        
    elif load_model == False:
        sess = session

    game.set_sound_enabled(True)
    episode_rewards = list()
    
#Avoid reinitializing the game if this was already done by the training process
    
    if training == False:
        game.init()

    for i in range(num_episodes):
        game.new_episode()
    
        while not game.is_episode_finished():
            state = game.get_state()
            
            if depth == False:
                depth_buffer = np.zeros(state.screen_buffer.shape)
            elif depth == True:
                depth_buffer = state.depth_buffer
                
            state_buffer = np.stack((state.screen_buffer,
                                     depth_buffer), axis=-1)
            state1 = preprocess(state_buffer, down_sample_ratio)
            action = model.choose_action(sess, state1)[0]
            reward = game.make_action(actions[action])
            
#Add a delay between each time step so that the episodes occur at normal speed

            time.sleep(0.02)
        
        episode_rewards.append(game.get_total_reward())
        print('Test Episode {} Reward: {}'.format(i + 1, game.get_total_reward()))
        time.sleep(1)
    
#Avoid ending the game so that the training process can continue
    
    if training == False:
        game.close()
    
    return np.mean(episode_rewards)


In [4]:
#Create a Q-network to estimate values and choose actions for a given state

class Q_network():
    def __init__(self, network_name, height, width, channels, learning_rate=0.001):
        self.learning_rate = learning_rate
        self.s_t = tf.placeholder(tf.float32,
                                  shape=[None, height, width, channels],
                                  name=network_name + '_state'
                                 )
        self.a_t = tf.placeholder(tf.int32,
                                  shape=[None],
                                  name=network_name + '_action'
                                 )
        self.Q_target = tf.placeholder(tf.float32,
                                       shape=[None, num_actions],
                                       name=network_name + '_Q_target'
                                      )

        self.input_layer = tf.reshape(self.s_t,
                                      [-1, height, width, channels],
                                      name=network_name + '_input_layer'
                                     )
        self.conv1 = tf.layers.conv2d(inputs=self.input_layer,
                                      filters=32,
                                      kernel_size=[8, 8],
                                      strides=[4, 4],
                                      padding='valid',
                                      activation=tf.nn.relu,
                                      name=network_name + '_conv1_layer'
                                     )
        self.conv2 = tf.layers.conv2d(inputs=self.conv1,
                                      filters=64,
                                      kernel_size=[4, 4],
                                      strides=[2, 2],
                                      padding='valid',
                                      activation=tf.nn.relu,
                                      name=network_name + '_conv2_layer'
                                     )
        self.flatten = tf.layers.flatten(self.conv2,
                                         name=network_name + '_flatten'
                                        )
        self.dense = tf.layers.dense(inputs=self.flatten,
                                      units=512,
                                      activation=tf.nn.relu,
                                      name=network_name + '_dense1_layer'
                                    )
        self.Q_values = tf.layers.dense(inputs=self.dense,
                                        units=len(actions),
                                        activation=None,
                                        name=network_name + '_output_layer'
                                       )        
    
        self.best_action = tf.argmax(self.Q_values, 1)
        self.loss = tf.losses.mean_squared_error(self.Q_values,
                                                 self.Q_target)
        self.adam = tf.train.AdamOptimizer(learning_rate=self.learning_rate,
                                           name=network_name + '_adam'
                                          )
        self.train = self.adam.minimize(self.loss)
        
    def update_lr(self):
        self.learning_rate = 0.98*self.learning_rate
        
        return self.learning_rate

    def calculate_loss(self, session, s, q):
        L, _ = session.run([self.loss, self.train],
                           feed_dict={self.s_t: s,
                                      self.Q_target: q})
    
        return L

#Return the array of Q-values and the best action associated with a given state

    def get_Q_values(self, session, s):
        Q = session.run(self.Q_values,
                        feed_dict={self.s_t: s})

        return Q
    
    def choose_action(self, session, s):
        a = session.run(self.best_action,
                        feed_dict={self.s_t: s})
    
        return a
    
#Create a list of variable update operations

def update_graph(variables):
    update_ops = list()
    
#Assign weight values from the network created first to the one created second
    
    for idx, variable in enumerate(variables[:len(variables)//2]):
        op = variable.assign(variables[idx + len(variables)//2].value())
        update_ops.append(op)
    
    return update_ops

#Update the target network parameters to match those of the online network

def update_target(ops, session):
    for op in update_ops:
        session.run(op)


In [5]:
#For each time step, collect the following data:
#The current game state
#The action that was taken taken
#The reward obtained from the chosen action
#The next game state (store the first game state if the previous action ends the episode)
#A variable indicating whether the episode is over yet


tf.reset_default_graph()

#Instantiate the target network before the online network so that it's updated correctly

target_net = Q_network(network_name='target',
                       learning_rate=learning_rate,
                       height=height,
                       width=width,
                       channels=channels)
DQN = Q_network(network_name='online',
                learning_rate=learning_rate,
                height=height,
                width=width,
                channels=channels)

exp_buffer = Buffer(size=buffer_size)
session = tf.Session()
saver = tf.train.Saver(max_to_keep=num_ckpts, reshape=True)
weights = tf.trainable_variables()

update_ops = update_graph(weights)

if load_model == True:
    print('Loading model from', model_dir)
    tf.train.Saver().restore(session, model_dir)
    
elif load_model == False:
    session.run(tf.global_variables_initializer())

game.set_sound_enabled(False)
game.init()

t = 0
epoch_rank = list()
epoch_rank_depth = list()

#Accumulate experiences in the buffer using an epsilon-greedy strategy with three training phases

for epoch in range(epochs):
    epoch_rewards = list()
    
    for step in trange(steps_per_epoch, leave=True):
        experience = list()
        game.new_episode()
        
        while not game.is_episode_finished():
            state = game.get_state()
            
#Substitute an array of zeros for the depth buffer if that setting is disabled

            if game.is_depth_buffer_enabled() == False:
                depth_buffer = np.zeros(state.screen_buffer.shape)
            else:
                depth_buffer = state.depth_buffer
            
            state1_buffer = np.stack((state.screen_buffer, depth_buffer), axis=-1)
            state1 = preprocess(state1_buffer, down_sample_ratio)
            
#Explore the environment by choosing random actions with 100% probability for the first phase of training

            if epoch < 0.15*epochs:
                action = np.random.randint(num_actions)
            
#Increase the probability of greedily choosing an action by a constant amount at each epoch in the second phase
            
            elif epoch < 0.45*epochs:
                epsilon = start_epsilon - (epoch + 1 - 0.2*epochs)*(start_epsilon-end_epsilon)/(0.7*epochs)
            
                if np.random.uniform(0, 1) <= epsilon:
                    action = np.random.randint(num_actions)
                else:
                    action = DQN.choose_action(session, state1)[0]

#Select a random action with 10% probability in the final phase of training
                
            else:
                if np.random.uniform(0, 1) <= end_epsilon:
                    action = np.random.randint(num_actions)
                else:
                    action = DQN.choose_action(session, state1)[0]

            reward = game.make_action(actions[action], frame_delay)
            done = game.is_episode_finished()
            
            if done == False:
                state = game.get_state()
                state2_buffer = np.stack((state.screen_buffer, state.depth_buffer), axis=-1)
                state2 = preprocess(state2_buffer, down_sample_ratio)
                
            elif done == True:
                state2 = state1
        
#Add the experience obtained from each time step to the buffer

            t += 1
            exp_buffer.add_experience((state1, action, reward, state2, done))
        
#Sample a minibatch from the buffer if there are enough experiences in the buffer

        if exp_buffer.length > batch_size:
            s1, a, r, s2, terminal = exp_buffer.sample_buffer(batch_size)
            
#Get the target values from the target Q-network
            
            target_Q = np.max(target_net.get_Q_values(session, s2), axis=1)
            
#Train the online Q-network by using a minibatch to update the action-value function
            
            Q2 = DQN.get_Q_values(session, s1)
            Q2[np.arange(batch_size), a] = r + gamma*(1 - terminal)*target_Q
            DQN.calculate_loss(session, s1, Q2)
            
        epoch_rewards.append(game.get_total_reward())
        
#Increase the discount factor at each epoch until it reaches 0.99
    
    if gamma < 0.99:
        gamma = 1-.98*(1-gamma)
    elif gamma >= 0.99:
        gamma = 0.99
        
#Decrease the learning rate at each epoch

    DQN.update_lr()
    target_net.update_lr()
    
    print('Epoch {} Mean Reward: {}'.format(epoch + 1, np.mean(epoch_rewards)))
    
#Update the target network every 10 epochs
    
    if (epoch + 1) % 10 == 0 and epoch > 0:
        update_target(update_ops, session)
        
#Save the model and test the agent for 10 episodes every 20 epochs
    
    if (epoch + 1) % 20 == 0 and epoch > 0:
        if save_model == True:
            checkpoint = model_dir + '-' + str(epoch + 1)
            print('Epoch {} Model saved to {}'.format(epoch + 1, model_dir))
            saver.save(session, model_dir, global_step=epoch + 1)
            
        update_target(update_ops, session)

#Test the agent both with and without the depth buffer given
        
        print('Epoch {} test with depth buffer:'.format(epoch + 1))
        test_reward_depth = test_agent(DQN, num_episodes=10,
                                       training=True,
                                       load_model=False,
                                       depth=True,
                                       session=session,
                                       model_dir=model_dir)
        print('Average Test Reward (with depth buffer:)', test_reward_depth)
        
        print('Epoch {} test without depth buffer:'.format(epoch + 1))
        test_reward = test_agent(DQN, num_episodes=10,
                                 training=True,
                                 load_model=False,
                                 depth=False,
                                 session=session,
                                 model_dir=model_dir)
        print('Average Test Reward (without depth buffer):', test_reward)
        
        epoch_rank_depth.append((test_reward_depth, epoch + 1))
        epoch_rank.append((test_reward, epoch + 1))
        
#Return a sorted list of epoch checkpoints based on average test episode reward
        
print(sorted(epoch_rank, reverse=True))
print('{} time steps experienced during training'.format(t))
game.close()
    

100%|██████████| 2000/2000 [01:28<00:00, 22.65it/s]


Epoch 1 Mean Reward: -71.7723299331665


100%|██████████| 2000/2000 [01:28<00:00, 22.62it/s]


Epoch 2 Mean Reward: -68.08960192108154


100%|██████████| 2000/2000 [01:28<00:00, 22.50it/s]


Epoch 3 Mean Reward: -66.95969484710693


100%|██████████| 2000/2000 [01:28<00:00, 22.52it/s]


Epoch 4 Mean Reward: -67.66074543762207


100%|██████████| 2000/2000 [01:32<00:00, 21.64it/s]


Epoch 5 Mean Reward: -68.00603645324708


100%|██████████| 2000/2000 [01:32<00:00, 21.66it/s]


Epoch 6 Mean Reward: -70.99401164245606


100%|██████████| 2000/2000 [01:32<00:00, 21.60it/s]


Epoch 7 Mean Reward: -68.20411168670654


100%|██████████| 2000/2000 [01:32<00:00, 21.69it/s]


Epoch 8 Mean Reward: -68.22234018707276


100%|██████████| 2000/2000 [01:32<00:00, 21.59it/s]


Epoch 9 Mean Reward: -68.26078393554687


100%|██████████| 2000/2000 [01:32<00:00, 21.59it/s]


Epoch 10 Mean Reward: -72.71765885162354


100%|██████████| 2000/2000 [01:33<00:00, 21.41it/s]


Epoch 11 Mean Reward: -67.30392169952393


100%|██████████| 2000/2000 [01:33<00:00, 21.49it/s]


Epoch 12 Mean Reward: -67.80514653015136


100%|██████████| 2000/2000 [01:33<00:00, 21.37it/s]


Epoch 13 Mean Reward: -68.25544995117187


100%|██████████| 2000/2000 [01:33<00:00, 21.45it/s]


Epoch 14 Mean Reward: -70.36271810150147


100%|██████████| 2000/2000 [01:33<00:00, 21.41it/s]


Epoch 15 Mean Reward: -68.0859175491333


100%|██████████| 2000/2000 [01:34<00:00, 21.10it/s]


Epoch 16 Mean Reward: -70.02226620483398


100%|██████████| 2000/2000 [01:34<00:00, 21.25it/s]


Epoch 17 Mean Reward: -69.10864595031738


100%|██████████| 2000/2000 [01:36<00:00, 20.77it/s]


Epoch 18 Mean Reward: -69.10532443237305


100%|██████████| 2000/2000 [01:32<00:00, 21.62it/s]


Epoch 19 Mean Reward: -70.8557664489746


100%|██████████| 2000/2000 [01:33<00:00, 21.47it/s]


Epoch 20 Mean Reward: -67.94546231842041
Epoch 20 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 20 test with depth buffer:
Test Episode 1 Reward: 77.68296813964844
Test Episode 2 Reward: 109.44517517089844
Test Episode 3 Reward: 419.0507354736328
Test Episode 4 Reward: 77.68296813964844
Test Episode 5 Reward: 90.18168640136719
Test Episode 6 Reward: 67.26510620117188
Test Episode 7 Reward: 77.68296813964844
Test Episode 8 Reward: 712.0427093505859
Test Episode 9 Reward: 386.82147216796875
Test Episode 10 Reward: 77.68296813964844
Average Test Reward (with depth buffer:) 209.5538757324219
Epoch 20 test without depth buffer:
Test Episode 1 Reward: 61.44795227050781
Test Episode 2 Reward: 198.34213256835938
Test Episode 3 Reward: 42.88966369628906
Test Episode 4 Reward: 156.76377868652344
Test Episode 5 Reward: 198.34213256835938
Test Episode 6 Reward: 119.19599914550781
Test Episode 7 Reward: 198.34213256835938
Test Episode 8 Reward: 236.8585205078125
Test Episode 9 Reward: 47.

100%|██████████| 2000/2000 [01:33<00:00, 21.39it/s]


Epoch 21 Mean Reward: -68.96082043457031


100%|██████████| 2000/2000 [01:33<00:00, 21.40it/s]


Epoch 22 Mean Reward: -67.32137704467773


100%|██████████| 2000/2000 [01:32<00:00, 21.59it/s]


Epoch 23 Mean Reward: -67.84787602233887


100%|██████████| 2000/2000 [01:32<00:00, 21.68it/s]


Epoch 24 Mean Reward: -66.21031099700927


100%|██████████| 2000/2000 [01:32<00:00, 21.58it/s]


Epoch 25 Mean Reward: -65.95270806121826


100%|██████████| 2000/2000 [01:32<00:00, 21.71it/s]


Epoch 26 Mean Reward: -67.0930030670166


100%|██████████| 2000/2000 [01:31<00:00, 21.76it/s]


Epoch 27 Mean Reward: -67.23853860473633


100%|██████████| 2000/2000 [01:32<00:00, 21.62it/s]


Epoch 28 Mean Reward: -69.15214166259766


100%|██████████| 2000/2000 [01:34<00:00, 21.24it/s]


Epoch 29 Mean Reward: -69.79384915924072


100%|██████████| 2000/2000 [01:35<00:00, 21.04it/s]


Epoch 30 Mean Reward: -65.5354264831543


100%|██████████| 2000/2000 [01:34<00:00, 21.10it/s]


Epoch 31 Mean Reward: -68.33076431274414


100%|██████████| 2000/2000 [01:33<00:00, 21.28it/s]


Epoch 32 Mean Reward: -68.38371350860595


100%|██████████| 2000/2000 [01:34<00:00, 21.21it/s]


Epoch 33 Mean Reward: -67.35250746154784


100%|██████████| 2000/2000 [01:33<00:00, 21.33it/s]


Epoch 34 Mean Reward: -69.94941307830811


100%|██████████| 2000/2000 [01:34<00:00, 21.28it/s]


Epoch 35 Mean Reward: -69.32490654754639


100%|██████████| 2000/2000 [01:33<00:00, 21.33it/s]


Epoch 36 Mean Reward: -69.13461208343506


100%|██████████| 2000/2000 [01:32<00:00, 21.60it/s]


Epoch 37 Mean Reward: -71.25241701507568


100%|██████████| 2000/2000 [01:33<00:00, 21.30it/s]


Epoch 38 Mean Reward: -68.99720141601563


100%|██████████| 2000/2000 [01:34<00:00, 21.14it/s]


Epoch 39 Mean Reward: -68.35597760009766


100%|██████████| 2000/2000 [01:34<00:00, 21.15it/s]


Epoch 40 Mean Reward: -69.91691772460938
Epoch 40 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 40 test with depth buffer:
Test Episode 1 Reward: 94.95274353027344
Test Episode 2 Reward: 94.95274353027344
Test Episode 3 Reward: 94.95274353027344
Test Episode 4 Reward: 55.079498291015625
Test Episode 5 Reward: 94.95274353027344
Test Episode 6 Reward: 424.0656433105469
Test Episode 7 Reward: 94.95274353027344
Test Episode 8 Reward: 94.95274353027344
Test Episode 9 Reward: 86.00949096679688
Test Episode 10 Reward: 105.15556335449219
Average Test Reward (with depth buffer:) 124.00266571044922
Epoch 40 test without depth buffer:
Test Episode 1 Reward: 46.96142578125
Test Episode 2 Reward: 142.27621459960938
Test Episode 3 Reward: 58.94834899902344
Test Episode 4 Reward: 65.60400390625
Test Episode 5 Reward: -60.651885986328125
Test Episode 6 Reward: 58.94834899902344
Test Episode 7 Reward: 58.94834899902344
Test Episode 8 Reward: 58.94834899902344
Test Episode 9 Reward: 58.9483489

100%|██████████| 2000/2000 [01:33<00:00, 21.46it/s]


Epoch 41 Mean Reward: -67.42309189605713


100%|██████████| 2000/2000 [01:32<00:00, 21.55it/s]


Epoch 42 Mean Reward: -66.67631170654298


100%|██████████| 2000/2000 [01:32<00:00, 21.65it/s]


Epoch 43 Mean Reward: -72.43237075042724


100%|██████████| 2000/2000 [01:32<00:00, 21.56it/s]


Epoch 44 Mean Reward: -70.03455041503906


100%|██████████| 2000/2000 [01:33<00:00, 21.41it/s]


Epoch 45 Mean Reward: -65.83945931243896


100%|██████████| 2000/2000 [01:33<00:00, 21.41it/s]


Epoch 46 Mean Reward: -69.49733882904053


100%|██████████| 2000/2000 [01:29<00:00, 22.39it/s]


Epoch 47 Mean Reward: -69.881220703125


100%|██████████| 2000/2000 [01:29<00:00, 22.34it/s]


Epoch 48 Mean Reward: -69.72628965759277


100%|██████████| 2000/2000 [01:29<00:00, 22.45it/s]


Epoch 49 Mean Reward: -66.24973609924317


100%|██████████| 2000/2000 [01:30<00:00, 22.02it/s]


Epoch 50 Mean Reward: -69.5956856918335


100%|██████████| 2000/2000 [01:28<00:00, 22.49it/s]


Epoch 51 Mean Reward: -68.5138470916748


100%|██████████| 2000/2000 [01:29<00:00, 22.36it/s]


Epoch 52 Mean Reward: -67.63519902801514


100%|██████████| 2000/2000 [01:29<00:00, 22.34it/s]


Epoch 53 Mean Reward: -65.61308724975586


100%|██████████| 2000/2000 [01:30<00:00, 22.21it/s]


Epoch 54 Mean Reward: -67.7833957824707


100%|██████████| 2000/2000 [01:31<00:00, 21.89it/s]


Epoch 55 Mean Reward: -66.34425045776368


100%|██████████| 2000/2000 [01:33<00:00, 21.45it/s]


Epoch 56 Mean Reward: -67.22839701843262


100%|██████████| 2000/2000 [01:32<00:00, 21.57it/s]


Epoch 57 Mean Reward: -69.26890861511231


100%|██████████| 2000/2000 [01:33<00:00, 21.45it/s]


Epoch 58 Mean Reward: -68.13827237701416


100%|██████████| 2000/2000 [01:34<00:00, 21.26it/s]


Epoch 59 Mean Reward: -69.22283953857422


100%|██████████| 2000/2000 [01:31<00:00, 21.79it/s]


Epoch 60 Mean Reward: -69.10209534454346
Epoch 60 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 60 test with depth buffer:
Test Episode 1 Reward: 494.83082580566406
Test Episode 2 Reward: 77.20150756835938
Test Episode 3 Reward: 284.62945556640625
Test Episode 4 Reward: 86.33395385742188
Test Episode 5 Reward: 77.20150756835938
Test Episode 6 Reward: 77.20150756835938
Test Episode 7 Reward: 77.20150756835938
Test Episode 8 Reward: 77.20150756835938
Test Episode 9 Reward: 9.889633178710938
Test Episode 10 Reward: 244.4847412109375
Average Test Reward (with depth buffer:) 150.61761474609375
Epoch 60 test without depth buffer:
Test Episode 1 Reward: -72.87185668945312
Test Episode 2 Reward: -44.744964599609375
Test Episode 3 Reward: -4.28173828125
Test Episode 4 Reward: 24.011215209960938
Test Episode 5 Reward: -4.28173828125
Test Episode 6 Reward: -35.93257141113281
Test Episode 7 Reward: -30.03778076171875
Test Episode 8 Reward: -37.301788330078125
Test Episode 9 Reward: -78.2

100%|██████████| 2000/2000 [01:32<00:00, 21.57it/s]


Epoch 61 Mean Reward: -68.73267512512207


100%|██████████| 2000/2000 [01:32<00:00, 21.66it/s]


Epoch 62 Mean Reward: -68.89656484985352


100%|██████████| 2000/2000 [01:32<00:00, 21.66it/s]


Epoch 63 Mean Reward: -66.89470792388916


100%|██████████| 2000/2000 [01:32<00:00, 21.60it/s]


Epoch 64 Mean Reward: -68.59179695129394


100%|██████████| 2000/2000 [01:32<00:00, 21.56it/s]


Epoch 65 Mean Reward: -66.97938678741455


100%|██████████| 2000/2000 [01:32<00:00, 21.65it/s]


Epoch 66 Mean Reward: -69.13534844207764


100%|██████████| 2000/2000 [01:32<00:00, 21.58it/s]


Epoch 67 Mean Reward: -67.74809828948975


100%|██████████| 2000/2000 [01:33<00:00, 21.47it/s]


Epoch 68 Mean Reward: -68.34348472595215


100%|██████████| 2000/2000 [01:33<00:00, 21.47it/s]


Epoch 69 Mean Reward: -67.28965956115722


100%|██████████| 2000/2000 [01:33<00:00, 21.43it/s]


Epoch 70 Mean Reward: -68.88194091796875


100%|██████████| 2000/2000 [01:32<00:00, 21.54it/s]


Epoch 71 Mean Reward: -67.9069845275879


100%|██████████| 2000/2000 [01:32<00:00, 21.51it/s]


Epoch 72 Mean Reward: -66.20480226898194


100%|██████████| 2000/2000 [01:32<00:00, 21.64it/s]


Epoch 73 Mean Reward: -69.23584916687011


100%|██████████| 2000/2000 [01:32<00:00, 21.53it/s]


Epoch 74 Mean Reward: -68.59347702026368


100%|██████████| 2000/2000 [01:33<00:00, 21.45it/s]


Epoch 75 Mean Reward: -67.1404367980957


100%|██████████| 2000/2000 [01:32<00:00, 21.59it/s]


Epoch 76 Mean Reward: -68.03064991760255


100%|██████████| 2000/2000 [01:32<00:00, 21.59it/s]


Epoch 77 Mean Reward: -68.67229071044922


100%|██████████| 2000/2000 [01:32<00:00, 21.53it/s]


Epoch 78 Mean Reward: -69.67452452850341


100%|██████████| 2000/2000 [01:32<00:00, 21.60it/s]


Epoch 79 Mean Reward: -70.64357261657715


100%|██████████| 2000/2000 [01:32<00:00, 21.57it/s]


Epoch 80 Mean Reward: -68.509455909729
Epoch 80 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 80 test with depth buffer:
Test Episode 1 Reward: 94.95274353027344
Test Episode 2 Reward: 94.95274353027344
Test Episode 3 Reward: 94.95274353027344
Test Episode 4 Reward: 94.95274353027344
Test Episode 5 Reward: 36.8477783203125
Test Episode 6 Reward: 86.56748962402344
Test Episode 7 Reward: 94.95274353027344
Test Episode 8 Reward: 94.95274353027344
Test Episode 9 Reward: 66.57345581054688
Test Episode 10 Reward: 94.95274353027344
Average Test Reward (with depth buffer:) 85.46579284667969
Epoch 80 test without depth buffer:
Test Episode 1 Reward: 246.21990966796875
Test Episode 2 Reward: 15.302398681640625
Test Episode 3 Reward: 35.16912841796875
Test Episode 4 Reward: 152.2734375
Test Episode 5 Reward: -13.8106689453125
Test Episode 6 Reward: 143.41329956054688
Test Episode 7 Reward: 56.311859130859375
Test Episode 8 Reward: 246.21990966796875
Test Episode 9 Reward: 53.37768554687

100%|██████████| 2000/2000 [01:32<00:00, 21.54it/s]


Epoch 81 Mean Reward: -65.62190814971923


100%|██████████| 2000/2000 [01:32<00:00, 21.64it/s]


Epoch 82 Mean Reward: -68.72876753997802


100%|██████████| 2000/2000 [01:32<00:00, 21.70it/s]


Epoch 83 Mean Reward: -69.50456632995605


100%|██████████| 2000/2000 [01:32<00:00, 21.73it/s]


Epoch 84 Mean Reward: -66.73974114990234


100%|██████████| 2000/2000 [01:32<00:00, 21.60it/s]


Epoch 85 Mean Reward: -67.3935004119873


100%|██████████| 2000/2000 [01:32<00:00, 21.60it/s]


Epoch 86 Mean Reward: -69.54541695404053


100%|██████████| 2000/2000 [01:32<00:00, 21.59it/s]


Epoch 87 Mean Reward: -70.81983925628663


100%|██████████| 2000/2000 [01:31<00:00, 21.76it/s]


Epoch 88 Mean Reward: -68.97588793945313


100%|██████████| 2000/2000 [01:31<00:00, 21.91it/s]


Epoch 89 Mean Reward: -69.3069713973999


100%|██████████| 2000/2000 [01:33<00:00, 21.47it/s]


Epoch 90 Mean Reward: -67.3738037109375


100%|██████████| 2000/2000 [01:32<00:00, 21.60it/s]


Epoch 91 Mean Reward: -68.01958951568604


100%|██████████| 2000/2000 [01:32<00:00, 21.67it/s]


Epoch 92 Mean Reward: -69.0018703918457


100%|██████████| 2000/2000 [01:31<00:00, 21.75it/s]


Epoch 93 Mean Reward: -69.72180841064453


100%|██████████| 2000/2000 [01:32<00:00, 21.68it/s]


Epoch 94 Mean Reward: -68.2394663696289


100%|██████████| 2000/2000 [01:32<00:00, 21.56it/s]


Epoch 95 Mean Reward: -69.07913105010986


100%|██████████| 2000/2000 [01:32<00:00, 21.72it/s]


Epoch 96 Mean Reward: -68.66876081848145


100%|██████████| 2000/2000 [01:32<00:00, 21.72it/s]


Epoch 97 Mean Reward: -69.88316845703125


100%|██████████| 2000/2000 [01:32<00:00, 21.52it/s]


Epoch 98 Mean Reward: -68.78927138519288


100%|██████████| 2000/2000 [01:32<00:00, 21.64it/s]


Epoch 99 Mean Reward: -66.11177454376221


100%|██████████| 2000/2000 [01:31<00:00, 21.83it/s]


Epoch 100 Mean Reward: -68.97166075897216
Epoch 100 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 100 test with depth buffer:
Test Episode 1 Reward: 74.130126953125
Test Episode 2 Reward: 47.93217468261719
Test Episode 3 Reward: -33.39179992675781
Test Episode 4 Reward: 74.130126953125
Test Episode 5 Reward: 74.130126953125
Test Episode 6 Reward: 85.49685668945312
Test Episode 7 Reward: 85.88615417480469
Test Episode 8 Reward: 70.1273193359375
Test Episode 9 Reward: 74.130126953125
Test Episode 10 Reward: 262.3621368408203
Average Test Reward (with depth buffer:) 81.4933349609375
Epoch 100 test without depth buffer:
Test Episode 1 Reward: 18.407928466796875
Test Episode 2 Reward: 18.407928466796875
Test Episode 3 Reward: 82.08808898925781
Test Episode 4 Reward: 18.407928466796875
Test Episode 5 Reward: 18.407928466796875
Test Episode 6 Reward: 18.407928466796875
Test Episode 7 Reward: 18.407928466796875
Test Episode 8 Reward: -36.23326110839844
Test Episode 9 Reward: -27.6969

100%|██████████| 2000/2000 [01:31<00:00, 21.80it/s]


Epoch 101 Mean Reward: -68.78725482177734


100%|██████████| 2000/2000 [01:32<00:00, 21.72it/s]


Epoch 102 Mean Reward: -69.19259749603272


100%|██████████| 2000/2000 [01:32<00:00, 21.73it/s]


Epoch 103 Mean Reward: -66.6386382598877


100%|██████████| 2000/2000 [01:31<00:00, 21.77it/s]


Epoch 104 Mean Reward: -68.79679183197021


100%|██████████| 2000/2000 [01:32<00:00, 21.58it/s]


Epoch 105 Mean Reward: -67.4124828414917


100%|██████████| 2000/2000 [01:32<00:00, 21.61it/s]


Epoch 106 Mean Reward: -67.92575073242188


100%|██████████| 2000/2000 [01:32<00:00, 21.53it/s]


Epoch 107 Mean Reward: -68.4958189239502


100%|██████████| 2000/2000 [01:32<00:00, 21.53it/s]


Epoch 108 Mean Reward: -68.11510194396973


100%|██████████| 2000/2000 [01:31<00:00, 21.81it/s]


Epoch 109 Mean Reward: -70.6740124053955


100%|██████████| 2000/2000 [01:33<00:00, 21.50it/s]


Epoch 110 Mean Reward: -69.0802898864746


100%|██████████| 2000/2000 [01:33<00:00, 21.49it/s]


Epoch 111 Mean Reward: -68.45460022735595


100%|██████████| 2000/2000 [01:31<00:00, 21.74it/s]


Epoch 112 Mean Reward: -70.26265335845947


100%|██████████| 2000/2000 [01:32<00:00, 21.55it/s]


Epoch 113 Mean Reward: -69.29335558319092


100%|██████████| 2000/2000 [01:32<00:00, 21.71it/s]


Epoch 114 Mean Reward: -69.153035446167


100%|██████████| 2000/2000 [01:32<00:00, 21.60it/s]


Epoch 115 Mean Reward: -69.68707434844971


100%|██████████| 2000/2000 [01:32<00:00, 21.72it/s]


Epoch 116 Mean Reward: -67.21255345153808


100%|██████████| 2000/2000 [01:32<00:00, 21.53it/s]


Epoch 117 Mean Reward: -67.53516345214844


100%|██████████| 2000/2000 [01:32<00:00, 21.66it/s]


Epoch 118 Mean Reward: -66.79406520843506


100%|██████████| 2000/2000 [01:32<00:00, 21.71it/s]


Epoch 119 Mean Reward: -68.70361431884766


100%|██████████| 2000/2000 [01:32<00:00, 21.59it/s]


Epoch 120 Mean Reward: -66.68904273223878
Epoch 120 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 120 test with depth buffer:
Test Episode 1 Reward: 202.76834106445312
Test Episode 2 Reward: 263.58642578125
Test Episode 3 Reward: -21.12054443359375
Test Episode 4 Reward: 202.76834106445312
Test Episode 5 Reward: 13.861114501953125
Test Episode 6 Reward: 202.76834106445312
Test Episode 7 Reward: 245.865966796875
Test Episode 8 Reward: 202.76834106445312
Test Episode 9 Reward: 202.76834106445312
Test Episode 10 Reward: 202.76834106445312
Average Test Reward (with depth buffer:) 171.88030090332032
Epoch 120 test without depth buffer:
Test Episode 1 Reward: 82.88076782226562
Test Episode 2 Reward: -7.3395538330078125
Test Episode 3 Reward: -17.788818359375
Test Episode 4 Reward: -19.690414428710938
Test Episode 5 Reward: 16.842254638671875
Test Episode 6 Reward: 16.842254638671875
Test Episode 7 Reward: 182.30369567871094
Test Episode 8 Reward: 170.25440979003906
Test Episode 9 R

100%|██████████| 2000/2000 [01:32<00:00, 21.51it/s]


Epoch 121 Mean Reward: -66.63986041259766


100%|██████████| 2000/2000 [01:31<00:00, 21.75it/s]


Epoch 122 Mean Reward: -68.68215813446045


100%|██████████| 2000/2000 [01:32<00:00, 21.63it/s]


Epoch 123 Mean Reward: -66.00277959442138


100%|██████████| 2000/2000 [01:32<00:00, 21.63it/s]


Epoch 124 Mean Reward: -67.7039894104004


100%|██████████| 2000/2000 [01:33<00:00, 21.35it/s]


Epoch 125 Mean Reward: -66.04121174621582


100%|██████████| 2000/2000 [01:32<00:00, 21.54it/s]


Epoch 126 Mean Reward: -68.76510559844971


100%|██████████| 2000/2000 [01:31<00:00, 21.75it/s]


Epoch 127 Mean Reward: -67.99772305297851


100%|██████████| 2000/2000 [01:31<00:00, 21.85it/s]


Epoch 128 Mean Reward: -67.9171911239624


100%|██████████| 2000/2000 [01:32<00:00, 21.69it/s]


Epoch 129 Mean Reward: -68.70655006408691


100%|██████████| 2000/2000 [01:32<00:00, 21.61it/s]


Epoch 130 Mean Reward: -67.98623014068603


100%|██████████| 2000/2000 [01:32<00:00, 21.69it/s]


Epoch 131 Mean Reward: -68.80373809814454


100%|██████████| 2000/2000 [01:32<00:00, 21.65it/s]


Epoch 132 Mean Reward: -68.8050369644165


100%|██████████| 2000/2000 [01:32<00:00, 21.53it/s]


Epoch 133 Mean Reward: -67.4446954574585


100%|██████████| 2000/2000 [01:32<00:00, 21.57it/s]


Epoch 134 Mean Reward: -68.8522548828125


100%|██████████| 2000/2000 [01:32<00:00, 21.69it/s]


Epoch 135 Mean Reward: -66.8094433517456


100%|██████████| 2000/2000 [01:31<00:00, 21.79it/s]


Epoch 136 Mean Reward: -68.79095175933838


100%|██████████| 2000/2000 [01:31<00:00, 21.83it/s]


Epoch 137 Mean Reward: -66.66077460479737


100%|██████████| 2000/2000 [01:31<00:00, 21.76it/s]


Epoch 138 Mean Reward: -70.91633780670166


100%|██████████| 2000/2000 [01:31<00:00, 21.77it/s]


Epoch 139 Mean Reward: -68.91928708648682


100%|██████████| 2000/2000 [01:31<00:00, 21.90it/s]


Epoch 140 Mean Reward: -67.8981473236084
Epoch 140 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 140 test with depth buffer:
Test Episode 1 Reward: 303.7802429199219
Test Episode 2 Reward: 154.46669006347656
Test Episode 3 Reward: 154.46669006347656
Test Episode 4 Reward: 154.46669006347656
Test Episode 5 Reward: 154.46669006347656
Test Episode 6 Reward: 154.46669006347656
Test Episode 7 Reward: 109.0374755859375
Test Episode 8 Reward: 94.04307556152344
Test Episode 9 Reward: 154.46669006347656
Test Episode 10 Reward: -30.23468017578125
Average Test Reward (with depth buffer:) 140.3426254272461
Epoch 140 test without depth buffer:
Test Episode 1 Reward: -27.279693603515625
Test Episode 2 Reward: 111.32199096679688
Test Episode 3 Reward: -29.371261596679688
Test Episode 4 Reward: 111.32199096679688
Test Episode 5 Reward: 111.32199096679688
Test Episode 6 Reward: 111.32199096679688
Test Episode 7 Reward: 111.32199096679688
Test Episode 8 Reward: 111.32199096679688
Test Episode 

100%|██████████| 2000/2000 [01:32<00:00, 21.55it/s]


Epoch 141 Mean Reward: -67.69900074768067


100%|██████████| 2000/2000 [01:32<00:00, 21.63it/s]


Epoch 142 Mean Reward: -67.74404962921143


100%|██████████| 2000/2000 [01:33<00:00, 21.49it/s]


Epoch 143 Mean Reward: -68.37077710723877


100%|██████████| 2000/2000 [01:32<00:00, 21.72it/s]


Epoch 144 Mean Reward: -66.63714044189453


100%|██████████| 2000/2000 [01:31<00:00, 21.78it/s]


Epoch 145 Mean Reward: -66.55131575775147


100%|██████████| 2000/2000 [01:31<00:00, 21.75it/s]


Epoch 146 Mean Reward: -68.52134126281739


100%|██████████| 2000/2000 [01:31<00:00, 21.79it/s]


Epoch 147 Mean Reward: -68.75133708953858


100%|██████████| 2000/2000 [01:31<00:00, 21.78it/s]


Epoch 148 Mean Reward: -68.12024773406982


100%|██████████| 2000/2000 [01:32<00:00, 21.73it/s]


Epoch 149 Mean Reward: -67.47473664093017


100%|██████████| 2000/2000 [01:32<00:00, 21.54it/s]


Epoch 150 Mean Reward: -68.59917261505127


100%|██████████| 2000/2000 [01:32<00:00, 21.65it/s]


Epoch 151 Mean Reward: -69.6433690109253


100%|██████████| 2000/2000 [01:32<00:00, 21.56it/s]


Epoch 152 Mean Reward: -67.66923049926758


100%|██████████| 2000/2000 [01:33<00:00, 21.49it/s]


Epoch 153 Mean Reward: -66.60605596160889


100%|██████████| 2000/2000 [01:32<00:00, 21.57it/s]


Epoch 154 Mean Reward: -69.74818204498291


100%|██████████| 2000/2000 [01:32<00:00, 21.66it/s]


Epoch 155 Mean Reward: -72.15481113433837


100%|██████████| 2000/2000 [01:32<00:00, 21.73it/s]


Epoch 156 Mean Reward: -68.522107421875


100%|██████████| 2000/2000 [01:32<00:00, 21.60it/s]


Epoch 157 Mean Reward: -68.31739118957519


100%|██████████| 2000/2000 [01:32<00:00, 21.58it/s]


Epoch 158 Mean Reward: -68.4215991821289


100%|██████████| 2000/2000 [01:32<00:00, 21.55it/s]


Epoch 159 Mean Reward: -70.1356321182251


100%|██████████| 2000/2000 [01:32<00:00, 21.64it/s]


Epoch 160 Mean Reward: -69.37020346832276
Epoch 160 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 160 test with depth buffer:
Test Episode 1 Reward: 43.50311279296875
Test Episode 2 Reward: -64.87741088867188
Test Episode 3 Reward: 43.50311279296875
Test Episode 4 Reward: 198.6910858154297
Test Episode 5 Reward: 43.50311279296875
Test Episode 6 Reward: 43.50311279296875
Test Episode 7 Reward: 43.50311279296875
Test Episode 8 Reward: 43.50311279296875
Test Episode 9 Reward: -33.58808898925781
Test Episode 10 Reward: 81.25273132324219
Average Test Reward (with depth buffer:) 44.249699401855466
Epoch 160 test without depth buffer:
Test Episode 1 Reward: 151.0391387939453
Test Episode 2 Reward: 151.0391387939453
Test Episode 3 Reward: -7.8167572021484375
Test Episode 4 Reward: 151.0391387939453
Test Episode 5 Reward: 151.0391387939453
Test Episode 6 Reward: 151.0391387939453
Test Episode 7 Reward: 151.0391387939453
Test Episode 8 Reward: 151.0391387939453
Test Episode 9 Reward: 1

100%|██████████| 2000/2000 [01:32<00:00, 21.52it/s]


Epoch 161 Mean Reward: -66.85625447082519


100%|██████████| 2000/2000 [01:33<00:00, 21.48it/s]


Epoch 162 Mean Reward: -68.4946446685791


100%|██████████| 2000/2000 [01:32<00:00, 21.68it/s]


Epoch 163 Mean Reward: -66.1844164352417


100%|██████████| 2000/2000 [01:32<00:00, 21.56it/s]


Epoch 164 Mean Reward: -67.73945596313476


100%|██████████| 2000/2000 [01:32<00:00, 21.58it/s]


Epoch 165 Mean Reward: -70.856711227417


100%|██████████| 2000/2000 [01:33<00:00, 21.39it/s]


Epoch 166 Mean Reward: -67.557563621521


100%|██████████| 2000/2000 [01:33<00:00, 21.49it/s]


Epoch 167 Mean Reward: -68.46169747924804


100%|██████████| 2000/2000 [01:33<00:00, 21.43it/s]


Epoch 168 Mean Reward: -66.03375082397461


100%|██████████| 2000/2000 [01:33<00:00, 21.35it/s]


Epoch 169 Mean Reward: -65.30990522003174


100%|██████████| 2000/2000 [01:33<00:00, 21.31it/s]


Epoch 170 Mean Reward: -66.54423419952393


100%|██████████| 2000/2000 [01:34<00:00, 21.19it/s]


Epoch 171 Mean Reward: -68.49418176269532


100%|██████████| 2000/2000 [01:34<00:00, 21.23it/s]


Epoch 172 Mean Reward: -69.49437005615235


100%|██████████| 2000/2000 [01:34<00:00, 21.21it/s]


Epoch 173 Mean Reward: -67.30139949035645


100%|██████████| 2000/2000 [01:34<00:00, 21.21it/s]


Epoch 174 Mean Reward: -66.43759828186035


100%|██████████| 2000/2000 [01:34<00:00, 21.14it/s]


Epoch 175 Mean Reward: -67.15590537261963


100%|██████████| 2000/2000 [01:34<00:00, 21.06it/s]


Epoch 176 Mean Reward: -66.51130003356934


100%|██████████| 2000/2000 [01:34<00:00, 21.09it/s]


Epoch 177 Mean Reward: -65.49968392181397


100%|██████████| 2000/2000 [01:34<00:00, 21.07it/s]


Epoch 178 Mean Reward: -66.3213982849121


100%|██████████| 2000/2000 [01:36<00:00, 20.74it/s]


Epoch 179 Mean Reward: -64.26736868286133


100%|██████████| 2000/2000 [01:36<00:00, 20.79it/s]


Epoch 180 Mean Reward: -65.0023675994873
Epoch 180 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 180 test with depth buffer:
Test Episode 1 Reward: 229.08132934570312
Test Episode 2 Reward: 166.68475341796875
Test Episode 3 Reward: 166.68475341796875
Test Episode 4 Reward: 128.41506958007812
Test Episode 5 Reward: 68.56318664550781
Test Episode 6 Reward: 126.59355163574219
Test Episode 7 Reward: 166.68475341796875
Test Episode 8 Reward: 219.47821044921875
Test Episode 9 Reward: 350.2407989501953
Test Episode 10 Reward: 12.183563232421875
Average Test Reward (with depth buffer:) 163.46099700927735
Epoch 180 test without depth buffer:
Test Episode 1 Reward: 105.43763732910156
Test Episode 2 Reward: -87.13067626953125
Test Episode 3 Reward: 190.69406127929688
Test Episode 4 Reward: 105.43763732910156
Test Episode 5 Reward: 105.43763732910156
Test Episode 6 Reward: 621.5503692626953
Test Episode 7 Reward: 105.43763732910156
Test Episode 8 Reward: 105.43763732910156
Test Episode 9

100%|██████████| 2000/2000 [01:35<00:00, 20.96it/s]


Epoch 181 Mean Reward: -68.4746775817871


100%|██████████| 2000/2000 [01:36<00:00, 20.71it/s]


Epoch 182 Mean Reward: -64.49946273040771


100%|██████████| 2000/2000 [01:35<00:00, 20.89it/s]


Epoch 183 Mean Reward: -65.32061386871338


100%|██████████| 2000/2000 [01:35<00:00, 20.84it/s]


Epoch 184 Mean Reward: -64.92079148864747


100%|██████████| 2000/2000 [01:37<00:00, 20.52it/s]


Epoch 185 Mean Reward: -65.9994633102417


100%|██████████| 2000/2000 [01:37<00:00, 20.55it/s]


Epoch 186 Mean Reward: -64.25890619659424


100%|██████████| 2000/2000 [01:36<00:00, 20.65it/s]


Epoch 187 Mean Reward: -66.54925118255615


100%|██████████| 2000/2000 [01:37<00:00, 20.56it/s]


Epoch 188 Mean Reward: -63.39891191101074


100%|██████████| 2000/2000 [01:37<00:00, 20.50it/s]


Epoch 189 Mean Reward: -65.39831955718994


100%|██████████| 2000/2000 [01:39<00:00, 20.15it/s]


Epoch 190 Mean Reward: -63.0129130783081


100%|██████████| 2000/2000 [01:38<00:00, 20.38it/s]


Epoch 191 Mean Reward: -64.98956665039063


100%|██████████| 2000/2000 [01:38<00:00, 20.33it/s]


Epoch 192 Mean Reward: -62.998204582214356


100%|██████████| 2000/2000 [01:37<00:00, 20.49it/s]


Epoch 193 Mean Reward: -63.63940160369873


100%|██████████| 2000/2000 [01:37<00:00, 20.52it/s]


Epoch 194 Mean Reward: -63.99559415435791


100%|██████████| 2000/2000 [01:38<00:00, 20.30it/s]


Epoch 195 Mean Reward: -63.722767448425294


100%|██████████| 2000/2000 [01:38<00:00, 20.35it/s]


Epoch 196 Mean Reward: -66.99295331573487


100%|██████████| 2000/2000 [01:38<00:00, 20.33it/s]


Epoch 197 Mean Reward: -65.40493132781982


100%|██████████| 2000/2000 [01:39<00:00, 20.06it/s]


Epoch 198 Mean Reward: -61.75107567596436


100%|██████████| 2000/2000 [01:39<00:00, 20.04it/s]


Epoch 199 Mean Reward: -60.5776883392334


100%|██████████| 2000/2000 [01:39<00:00, 20.12it/s]


Epoch 200 Mean Reward: -65.00290711975097
Epoch 200 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 200 test with depth buffer:
Test Episode 1 Reward: 0.277374267578125
Test Episode 2 Reward: 107.879638671875
Test Episode 3 Reward: 107.879638671875
Test Episode 4 Reward: 107.879638671875
Test Episode 5 Reward: 107.879638671875
Test Episode 6 Reward: 107.879638671875
Test Episode 7 Reward: 207.0556182861328
Test Episode 8 Reward: 107.879638671875
Test Episode 9 Reward: -0.4499053955078125
Test Episode 10 Reward: -107.42820739746094
Average Test Reward (with depth buffer:) 74.67327117919922
Epoch 200 test without depth buffer:
Test Episode 1 Reward: -48.55039978027344
Test Episode 2 Reward: -48.55039978027344
Test Episode 3 Reward: -55.1602783203125
Test Episode 4 Reward: 13.582855224609375
Test Episode 5 Reward: -48.55039978027344
Test Episode 6 Reward: 149.58953857421875
Test Episode 7 Reward: -48.55039978027344
Test Episode 8 Reward: -48.55039978027344
Test Episode 9 Reward: -

100%|██████████| 2000/2000 [01:40<00:00, 19.99it/s]


Epoch 201 Mean Reward: -63.6645570526123


100%|██████████| 2000/2000 [01:40<00:00, 20.00it/s]


Epoch 202 Mean Reward: -67.24033708953857


100%|██████████| 2000/2000 [01:39<00:00, 20.03it/s]


Epoch 203 Mean Reward: -64.81917192077637


100%|██████████| 2000/2000 [01:40<00:00, 19.88it/s]


Epoch 204 Mean Reward: -63.754093788146974


100%|██████████| 2000/2000 [01:40<00:00, 19.83it/s]


Epoch 205 Mean Reward: -61.62314591217041


100%|██████████| 2000/2000 [01:42<00:00, 19.46it/s]


Epoch 206 Mean Reward: -63.218850914001465


100%|██████████| 2000/2000 [01:41<00:00, 19.67it/s]


Epoch 207 Mean Reward: -62.359063682556155


100%|██████████| 2000/2000 [01:41<00:00, 19.67it/s]


Epoch 208 Mean Reward: -62.45801731872559


100%|██████████| 2000/2000 [01:42<00:00, 19.60it/s]


Epoch 209 Mean Reward: -61.4758830871582


100%|██████████| 2000/2000 [01:41<00:00, 19.79it/s]


Epoch 210 Mean Reward: -62.499484657287596


100%|██████████| 2000/2000 [01:41<00:00, 19.65it/s]


Epoch 211 Mean Reward: -61.374601646423336


100%|██████████| 2000/2000 [01:42<00:00, 19.49it/s]


Epoch 212 Mean Reward: -61.45141177368164


100%|██████████| 2000/2000 [01:41<00:00, 19.66it/s]


Epoch 213 Mean Reward: -59.00604356384277


100%|██████████| 2000/2000 [01:40<00:00, 19.87it/s]


Epoch 214 Mean Reward: -64.04067958068848


100%|██████████| 2000/2000 [01:44<00:00, 19.23it/s]


Epoch 215 Mean Reward: -61.238729637145994


100%|██████████| 2000/2000 [01:42<00:00, 19.43it/s]


Epoch 216 Mean Reward: -60.7939395904541


100%|██████████| 2000/2000 [01:44<00:00, 19.18it/s]


Epoch 217 Mean Reward: -60.70703443908691


100%|██████████| 2000/2000 [01:42<00:00, 19.48it/s]


Epoch 218 Mean Reward: -63.71954273223877


100%|██████████| 2000/2000 [01:44<00:00, 19.12it/s]


Epoch 219 Mean Reward: -62.91332833099365


100%|██████████| 2000/2000 [01:44<00:00, 19.22it/s]


Epoch 220 Mean Reward: -61.05880884552002
Epoch 220 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 220 test with depth buffer:
Test Episode 1 Reward: -105.35275268554688
Test Episode 2 Reward: -105.35275268554688
Test Episode 3 Reward: -115.99516296386719
Test Episode 4 Reward: -115.99571228027344
Test Episode 5 Reward: -105.35275268554688
Test Episode 6 Reward: -84.9366455078125
Test Episode 7 Reward: -95.15119934082031
Test Episode 8 Reward: -115.98110961914062
Test Episode 9 Reward: -105.35275268554688
Test Episode 10 Reward: -105.35275268554688
Average Test Reward (with depth buffer:) -105.48235931396485
Epoch 220 test without depth buffer:
Test Episode 1 Reward: -96.83575439453125
Test Episode 2 Reward: 12.138809204101562
Test Episode 3 Reward: -20.009292602539062
Test Episode 4 Reward: -115.99430847167969
Test Episode 5 Reward: -21.966781616210938
Test Episode 6 Reward: -65.02024841308594
Test Episode 7 Reward: -20.009292602539062
Test Episode 8 Reward: -25.9625549316406

100%|██████████| 2000/2000 [01:42<00:00, 19.57it/s]


Epoch 221 Mean Reward: -61.56331565856934


100%|██████████| 2000/2000 [01:42<00:00, 19.50it/s]


Epoch 222 Mean Reward: -63.547259223937985


100%|██████████| 2000/2000 [01:43<00:00, 19.24it/s]


Epoch 223 Mean Reward: -62.00494775390625


100%|██████████| 2000/2000 [01:44<00:00, 19.20it/s]


Epoch 224 Mean Reward: -61.68399546051025


100%|██████████| 2000/2000 [01:43<00:00, 19.24it/s]


Epoch 225 Mean Reward: -63.40936960601807


100%|██████████| 2000/2000 [01:46<00:00, 18.76it/s]


Epoch 226 Mean Reward: -59.40645694732666


100%|██████████| 2000/2000 [01:44<00:00, 19.16it/s]


Epoch 227 Mean Reward: -60.64406677246094


100%|██████████| 2000/2000 [01:44<00:00, 19.05it/s]


Epoch 228 Mean Reward: -60.613455047607424


100%|██████████| 2000/2000 [01:47<00:00, 18.65it/s]


Epoch 229 Mean Reward: -60.83235954284668


100%|██████████| 2000/2000 [01:45<00:00, 19.03it/s]


Epoch 230 Mean Reward: -60.39495000457764


100%|██████████| 2000/2000 [01:45<00:00, 18.90it/s]


Epoch 231 Mean Reward: -61.42270530700684


100%|██████████| 2000/2000 [01:47<00:00, 18.63it/s]


Epoch 232 Mean Reward: -59.72635247039795


100%|██████████| 2000/2000 [01:45<00:00, 18.93it/s]


Epoch 233 Mean Reward: -60.23610453796387


100%|██████████| 2000/2000 [01:46<00:00, 18.73it/s]


Epoch 234 Mean Reward: -64.09212357330323


100%|██████████| 2000/2000 [01:46<00:00, 18.69it/s]


Epoch 235 Mean Reward: -59.78343519592285


100%|██████████| 2000/2000 [01:48<00:00, 18.51it/s]


Epoch 236 Mean Reward: -61.249190734863284


100%|██████████| 2000/2000 [01:47<00:00, 18.58it/s]


Epoch 237 Mean Reward: -63.24893991088867


100%|██████████| 2000/2000 [01:48<00:00, 18.43it/s]


Epoch 238 Mean Reward: -63.18143058013916


100%|██████████| 2000/2000 [01:44<00:00, 19.10it/s]


Epoch 239 Mean Reward: -62.98768832397461


100%|██████████| 2000/2000 [01:47<00:00, 18.59it/s]


Epoch 240 Mean Reward: -60.67601996612549
Epoch 240 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 240 test with depth buffer:
Test Episode 1 Reward: -115.98362731933594
Test Episode 2 Reward: -115.98362731933594
Test Episode 3 Reward: -115.97853088378906
Test Episode 4 Reward: -115.98362731933594
Test Episode 5 Reward: -115.98362731933594
Test Episode 6 Reward: -115.98362731933594
Test Episode 7 Reward: -115.9915771484375
Test Episode 8 Reward: -115.98362731933594
Test Episode 9 Reward: -115.98362731933594
Test Episode 10 Reward: -115.98362731933594
Average Test Reward (with depth buffer:) -115.98391265869141
Epoch 240 test without depth buffer:
Test Episode 1 Reward: -115.98080444335938
Test Episode 2 Reward: -63.82661437988281
Test Episode 3 Reward: -115.98080444335938
Test Episode 4 Reward: -115.98080444335938
Test Episode 5 Reward: -115.98080444335938
Test Episode 6 Reward: -115.98080444335938
Test Episode 7 Reward: -114.68418884277344
Test Episode 8 Reward: -113.93577575

100%|██████████| 2000/2000 [01:47<00:00, 18.53it/s]


Epoch 241 Mean Reward: -58.131127883911134


100%|██████████| 2000/2000 [01:47<00:00, 18.67it/s]


Epoch 242 Mean Reward: -60.35116173553467


100%|██████████| 2000/2000 [01:47<00:00, 18.58it/s]


Epoch 243 Mean Reward: -60.40836231994629


100%|██████████| 2000/2000 [01:48<00:00, 18.42it/s]


Epoch 244 Mean Reward: -59.8964296875


100%|██████████| 2000/2000 [01:48<00:00, 18.51it/s]


Epoch 245 Mean Reward: -61.49993939971924


100%|██████████| 2000/2000 [01:49<00:00, 18.30it/s]


Epoch 246 Mean Reward: -61.88705632019043


100%|██████████| 2000/2000 [01:49<00:00, 18.32it/s]


Epoch 247 Mean Reward: -58.30224018859863


100%|██████████| 2000/2000 [01:51<00:00, 17.94it/s]


Epoch 248 Mean Reward: -61.60915651702881


100%|██████████| 2000/2000 [01:49<00:00, 18.23it/s]


Epoch 249 Mean Reward: -59.02783688354492


100%|██████████| 2000/2000 [01:50<00:00, 18.07it/s]


Epoch 250 Mean Reward: -59.959290878295896


100%|██████████| 2000/2000 [01:51<00:00, 17.93it/s]


Epoch 251 Mean Reward: -57.71589493560791


100%|██████████| 2000/2000 [01:53<00:00, 17.64it/s]


Epoch 252 Mean Reward: -56.865543815612796


100%|██████████| 2000/2000 [01:53<00:00, 17.68it/s]


Epoch 253 Mean Reward: -53.76454808807373


100%|██████████| 2000/2000 [01:53<00:00, 17.69it/s]


Epoch 254 Mean Reward: -57.491899490356445


100%|██████████| 2000/2000 [01:53<00:00, 17.56it/s]


Epoch 255 Mean Reward: -55.64597946929932


100%|██████████| 2000/2000 [01:53<00:00, 17.60it/s]


Epoch 256 Mean Reward: -56.516880310058596


100%|██████████| 2000/2000 [01:56<00:00, 17.19it/s]


Epoch 257 Mean Reward: -54.98130168914795


100%|██████████| 2000/2000 [01:54<00:00, 17.53it/s]


Epoch 258 Mean Reward: -57.55000061035156


100%|██████████| 2000/2000 [01:55<00:00, 17.33it/s]


Epoch 259 Mean Reward: -56.61996537017822


100%|██████████| 2000/2000 [01:55<00:00, 17.29it/s]


Epoch 260 Mean Reward: -56.3652428894043
Epoch 260 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 260 test with depth buffer:
Test Episode 1 Reward: -107.32633972167969
Test Episode 2 Reward: -107.32633972167969
Test Episode 3 Reward: -115.99571228027344
Test Episode 4 Reward: -107.32633972167969
Test Episode 5 Reward: -115.97782897949219
Test Episode 6 Reward: -107.32633972167969
Test Episode 7 Reward: -107.32633972167969
Test Episode 8 Reward: -115.99822998046875
Test Episode 9 Reward: -107.32633972167969
Test Episode 10 Reward: -107.32633972167969
Average Test Reward (with depth buffer:) -109.92561492919921
Epoch 260 test without depth buffer:
Test Episode 1 Reward: -115.93446350097656
Test Episode 2 Reward: 106.31538391113281
Test Episode 3 Reward: 106.31538391113281
Test Episode 4 Reward: 106.31538391113281
Test Episode 5 Reward: 10.349075317382812
Test Episode 6 Reward: 106.31538391113281
Test Episode 7 Reward: -90.90228271484375
Test Episode 8 Reward: 106.31538391113281

100%|██████████| 2000/2000 [01:57<00:00, 17.04it/s]


Epoch 261 Mean Reward: -54.477202522277835


100%|██████████| 2000/2000 [01:56<00:00, 17.21it/s]


Epoch 262 Mean Reward: -53.44721314239502


100%|██████████| 2000/2000 [01:59<00:00, 16.68it/s]


Epoch 263 Mean Reward: -52.516200637817384


100%|██████████| 2000/2000 [01:59<00:00, 16.79it/s]


Epoch 264 Mean Reward: -54.061205001831055


100%|██████████| 2000/2000 [01:59<00:00, 16.80it/s]


Epoch 265 Mean Reward: -51.426564323425296


100%|██████████| 2000/2000 [01:58<00:00, 16.94it/s]


Epoch 266 Mean Reward: -52.829949836730954


100%|██████████| 2000/2000 [01:58<00:00, 16.93it/s]


Epoch 267 Mean Reward: -53.34785851287842


100%|██████████| 2000/2000 [01:57<00:00, 17.01it/s]


Epoch 268 Mean Reward: -55.769765403747556


100%|██████████| 2000/2000 [01:58<00:00, 16.86it/s]


Epoch 269 Mean Reward: -53.83731065368652


100%|██████████| 2000/2000 [01:58<00:00, 16.89it/s]


Epoch 270 Mean Reward: -53.313349235534666


100%|██████████| 2000/2000 [01:58<00:00, 16.84it/s]


Epoch 271 Mean Reward: -54.78732972717285


100%|██████████| 2000/2000 [01:56<00:00, 17.15it/s]


Epoch 272 Mean Reward: -56.753385787963865


100%|██████████| 2000/2000 [01:56<00:00, 17.23it/s]


Epoch 273 Mean Reward: -57.92283155059815


100%|██████████| 2000/2000 [01:57<00:00, 17.01it/s]


Epoch 274 Mean Reward: -58.26067984008789


100%|██████████| 2000/2000 [01:58<00:00, 16.83it/s]


Epoch 275 Mean Reward: -54.75757514190674


100%|██████████| 2000/2000 [01:57<00:00, 17.06it/s]


Epoch 276 Mean Reward: -60.209455039978025


100%|██████████| 2000/2000 [01:58<00:00, 16.84it/s]


Epoch 277 Mean Reward: -50.323615913391116


100%|██████████| 2000/2000 [01:56<00:00, 17.23it/s]


Epoch 278 Mean Reward: -56.672457344055175


100%|██████████| 2000/2000 [01:57<00:00, 17.07it/s]


Epoch 279 Mean Reward: -57.4029818649292


100%|██████████| 2000/2000 [01:56<00:00, 17.09it/s]


Epoch 280 Mean Reward: -58.27508575439453
Epoch 280 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 280 test with depth buffer:
Test Episode 1 Reward: -19.0963134765625
Test Episode 2 Reward: 158.11082458496094
Test Episode 3 Reward: 177.64324951171875
Test Episode 4 Reward: -10.915679931640625
Test Episode 5 Reward: 177.64324951171875
Test Episode 6 Reward: 177.64324951171875
Test Episode 7 Reward: 177.64324951171875
Test Episode 8 Reward: 177.64324951171875
Test Episode 9 Reward: 177.64324951171875
Test Episode 10 Reward: 177.64324951171875
Average Test Reward (with depth buffer:) 137.1601577758789
Epoch 280 test without depth buffer:
Test Episode 1 Reward: -31.689010620117188
Test Episode 2 Reward: -31.689010620117188
Test Episode 3 Reward: -31.689010620117188
Test Episode 4 Reward: -75.6849365234375
Test Episode 5 Reward: 93.94090270996094
Test Episode 6 Reward: -21.835479736328125
Test Episode 7 Reward: -31.689010620117188
Test Episode 8 Reward: -94.15234375
Test Episode 9

100%|██████████| 2000/2000 [01:56<00:00, 17.13it/s]


Epoch 281 Mean Reward: -56.532900451660154


100%|██████████| 2000/2000 [01:56<00:00, 17.12it/s]


Epoch 282 Mean Reward: -52.504502044677736


100%|██████████| 2000/2000 [01:57<00:00, 16.96it/s]


Epoch 283 Mean Reward: -53.43738317108154


100%|██████████| 2000/2000 [01:59<00:00, 16.76it/s]


Epoch 284 Mean Reward: -53.08585898590088


100%|██████████| 2000/2000 [01:58<00:00, 16.85it/s]


Epoch 285 Mean Reward: -56.19040782165527


100%|██████████| 2000/2000 [01:57<00:00, 17.00it/s]


Epoch 286 Mean Reward: -53.320627868652345


100%|██████████| 2000/2000 [01:58<00:00, 16.88it/s]


Epoch 287 Mean Reward: -49.414372436523436


100%|██████████| 2000/2000 [01:59<00:00, 16.74it/s]


Epoch 288 Mean Reward: -54.00270460510254


100%|██████████| 2000/2000 [02:00<00:00, 16.63it/s]


Epoch 289 Mean Reward: -57.34956694030762


100%|██████████| 2000/2000 [02:01<00:00, 16.44it/s]


Epoch 290 Mean Reward: -53.900897262573245


100%|██████████| 2000/2000 [02:01<00:00, 16.53it/s]


Epoch 291 Mean Reward: -52.50724626159668


100%|██████████| 2000/2000 [02:01<00:00, 16.44it/s]


Epoch 292 Mean Reward: -55.27520989990234


100%|██████████| 2000/2000 [02:03<00:00, 16.25it/s]


Epoch 293 Mean Reward: -52.86504933166504


100%|██████████| 2000/2000 [02:03<00:00, 16.13it/s]


Epoch 294 Mean Reward: -56.03011042785644


100%|██████████| 2000/2000 [02:03<00:00, 16.23it/s]


Epoch 295 Mean Reward: -50.90050199890137


100%|██████████| 2000/2000 [02:03<00:00, 16.16it/s]


Epoch 296 Mean Reward: -53.78257666778565


100%|██████████| 2000/2000 [02:05<00:00, 15.99it/s]


Epoch 297 Mean Reward: -48.42226906585693


100%|██████████| 2000/2000 [02:02<00:00, 16.32it/s]


Epoch 298 Mean Reward: -52.86192073059082


100%|██████████| 2000/2000 [02:03<00:00, 16.25it/s]


Epoch 299 Mean Reward: -51.52077778625488


100%|██████████| 2000/2000 [02:08<00:00, 15.54it/s]


Epoch 300 Mean Reward: -51.715919921875
Epoch 300 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 300 test with depth buffer:
Test Episode 1 Reward: 162.64630126953125
Test Episode 2 Reward: 22.496612548828125
Test Episode 3 Reward: -74.32781982421875
Test Episode 4 Reward: -115.99824523925781
Test Episode 5 Reward: -64.91195678710938
Test Episode 6 Reward: 99.40730285644531
Test Episode 7 Reward: 34.72117614746094
Test Episode 8 Reward: 162.64630126953125
Test Episode 9 Reward: -18.040924072265625
Test Episode 10 Reward: 162.64630126953125
Average Test Reward (with depth buffer:) 37.12850494384766
Epoch 300 test without depth buffer:
Test Episode 1 Reward: -60.093841552734375
Test Episode 2 Reward: -65.84616088867188
Test Episode 3 Reward: -115.97061157226562
Test Episode 4 Reward: 25.223541259765625
Test Episode 5 Reward: -98.33235168457031
Test Episode 6 Reward: -10.109268188476562
Test Episode 7 Reward: 25.223541259765625
Test Episode 8 Reward: -45.96186828613281
Test Episo

100%|██████████| 2000/2000 [02:05<00:00, 15.88it/s]


Epoch 301 Mean Reward: -51.91193663024902


100%|██████████| 2000/2000 [02:05<00:00, 15.88it/s]


Epoch 302 Mean Reward: -54.11690594482422


100%|██████████| 2000/2000 [02:06<00:00, 15.78it/s]


Epoch 303 Mean Reward: -51.81156019592285


100%|██████████| 2000/2000 [02:04<00:00, 16.05it/s]


Epoch 304 Mean Reward: -53.77712375640869


100%|██████████| 2000/2000 [02:05<00:00, 15.94it/s]


Epoch 305 Mean Reward: -50.746206260681156


100%|██████████| 2000/2000 [02:09<00:00, 15.40it/s]


Epoch 306 Mean Reward: -50.20194944763183


100%|██████████| 2000/2000 [02:06<00:00, 15.78it/s]


Epoch 307 Mean Reward: -48.092462158203126


100%|██████████| 2000/2000 [02:08<00:00, 15.56it/s]


Epoch 308 Mean Reward: -44.99289072418213


100%|██████████| 2000/2000 [02:08<00:00, 15.54it/s]


Epoch 309 Mean Reward: -48.72785301971435


100%|██████████| 2000/2000 [02:10<00:00, 15.29it/s]


Epoch 310 Mean Reward: -46.04348648071289


100%|██████████| 2000/2000 [02:15<00:00, 14.72it/s]


Epoch 311 Mean Reward: -46.85607691955566


100%|██████████| 2000/2000 [02:17<00:00, 14.60it/s]


Epoch 312 Mean Reward: -45.58286045837402


100%|██████████| 2000/2000 [02:21<00:00, 14.15it/s]


Epoch 313 Mean Reward: -46.70073419189453


100%|██████████| 2000/2000 [02:23<00:00, 13.96it/s]


Epoch 314 Mean Reward: -45.37212295532227


100%|██████████| 2000/2000 [02:23<00:00, 13.90it/s]


Epoch 315 Mean Reward: -39.58426586151123


100%|██████████| 2000/2000 [02:20<00:00, 14.21it/s]


Epoch 316 Mean Reward: -43.02480621337891


100%|██████████| 2000/2000 [02:23<00:00, 13.97it/s]


Epoch 317 Mean Reward: -42.40511608886719


100%|██████████| 2000/2000 [02:20<00:00, 14.26it/s]


Epoch 318 Mean Reward: -40.27562707519531


100%|██████████| 2000/2000 [02:27<00:00, 13.57it/s]


Epoch 319 Mean Reward: -43.83120281982422


100%|██████████| 2000/2000 [02:25<00:00, 13.79it/s]


Epoch 320 Mean Reward: -40.44348593902588
Epoch 320 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 320 test with depth buffer:
Test Episode 1 Reward: -115.9681396484375
Test Episode 2 Reward: -115.9681396484375
Test Episode 3 Reward: -115.9681396484375
Test Episode 4 Reward: -100.72061157226562
Test Episode 5 Reward: -115.99935913085938
Test Episode 6 Reward: -115.97769165039062
Test Episode 7 Reward: -110.18058776855469
Test Episode 8 Reward: -43.49748229980469
Test Episode 9 Reward: -115.9681396484375
Test Episode 10 Reward: -115.84843444824219
Average Test Reward (with depth buffer:) -106.60967254638672
Epoch 320 test without depth buffer:
Test Episode 1 Reward: -108.964111328125
Test Episode 2 Reward: -97.82316589355469
Test Episode 3 Reward: -108.89164733886719
Test Episode 4 Reward: -83.95884704589844
Test Episode 5 Reward: -59.963043212890625
Test Episode 6 Reward: -111.44496154785156
Test Episode 7 Reward: -108.964111328125
Test Episode 8 Reward: -115.94293212890625
Te

100%|██████████| 2000/2000 [02:34<00:00, 12.95it/s]


Epoch 321 Mean Reward: -37.173696296691894


100%|██████████| 2000/2000 [02:30<00:00, 13.25it/s]


Epoch 322 Mean Reward: -32.45593854522705


100%|██████████| 2000/2000 [02:31<00:00, 13.23it/s]


Epoch 323 Mean Reward: -41.20036405944824


100%|██████████| 2000/2000 [02:28<00:00, 13.48it/s]


Epoch 324 Mean Reward: -33.86139321136475


100%|██████████| 2000/2000 [02:19<00:00, 14.34it/s]


Epoch 325 Mean Reward: -34.62672933197022


100%|██████████| 2000/2000 [02:31<00:00, 13.19it/s]


Epoch 326 Mean Reward: -37.783956787109375


100%|██████████| 2000/2000 [02:31<00:00, 13.22it/s]


Epoch 327 Mean Reward: -35.68038732147217


100%|██████████| 2000/2000 [02:32<00:00, 13.12it/s]


Epoch 328 Mean Reward: -39.627654739379885


100%|██████████| 2000/2000 [02:40<00:00, 12.49it/s]


Epoch 329 Mean Reward: -33.31504391479492


100%|██████████| 2000/2000 [02:33<00:00, 12.99it/s]


Epoch 330 Mean Reward: -31.97442897796631


100%|██████████| 2000/2000 [02:36<00:00, 12.76it/s]


Epoch 331 Mean Reward: -40.86308769989014


100%|██████████| 2000/2000 [02:33<00:00, 13.00it/s]


Epoch 332 Mean Reward: -40.35909720611572


100%|██████████| 2000/2000 [02:47<00:00, 11.91it/s]


Epoch 333 Mean Reward: -39.89912493133545


100%|██████████| 2000/2000 [02:46<00:00, 12.03it/s]


Epoch 334 Mean Reward: -41.47916033935547


100%|██████████| 2000/2000 [02:40<00:00, 12.43it/s]


Epoch 335 Mean Reward: -38.46663572692871


100%|██████████| 2000/2000 [02:47<00:00, 11.93it/s]


Epoch 336 Mean Reward: -32.34002338409424


100%|██████████| 2000/2000 [02:49<00:00, 11.82it/s]


Epoch 337 Mean Reward: -34.50284506225586


100%|██████████| 2000/2000 [02:49<00:00, 11.78it/s]


Epoch 338 Mean Reward: -36.78048069000244


100%|██████████| 2000/2000 [02:50<00:00, 11.71it/s]


Epoch 339 Mean Reward: -31.27611619567871


100%|██████████| 2000/2000 [02:41<00:00, 12.35it/s]


Epoch 340 Mean Reward: -34.13534005737305
Epoch 340 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 340 test with depth buffer:
Test Episode 1 Reward: 227.18809509277344
Test Episode 2 Reward: 227.18809509277344
Test Episode 3 Reward: -31.4718017578125
Test Episode 4 Reward: 227.18809509277344
Test Episode 5 Reward: 18.959793090820312
Test Episode 6 Reward: 227.18809509277344
Test Episode 7 Reward: 227.18809509277344
Test Episode 8 Reward: -115.98980712890625
Test Episode 9 Reward: 227.18809509277344
Test Episode 10 Reward: -14.899520874023438
Average Test Reward (with depth buffer:) 121.97272338867188
Epoch 340 test without depth buffer:
Test Episode 1 Reward: 270.1346130371094
Test Episode 2 Reward: 227.56666564941406
Test Episode 3 Reward: 68.03338623046875
Test Episode 4 Reward: 227.56666564941406
Test Episode 5 Reward: 227.325439453125
Test Episode 6 Reward: 273.57151794433594
Test Episode 7 Reward: 227.56666564941406
Test Episode 8 Reward: 227.56666564941406
Test Episode 

100%|██████████| 2000/2000 [02:54<00:00, 11.46it/s]


Epoch 341 Mean Reward: -32.28550301361084


100%|██████████| 2000/2000 [02:38<00:00, 12.63it/s]


Epoch 342 Mean Reward: -36.786087936401366


100%|██████████| 2000/2000 [02:58<00:00, 11.20it/s]


Epoch 343 Mean Reward: -31.836786453247072


100%|██████████| 2000/2000 [02:55<00:00, 11.41it/s]


Epoch 344 Mean Reward: -30.671359703063963


100%|██████████| 2000/2000 [02:48<00:00, 11.87it/s]


Epoch 345 Mean Reward: -35.46156032562256


100%|██████████| 2000/2000 [02:54<00:00, 11.46it/s]


Epoch 346 Mean Reward: -31.594173240661622


100%|██████████| 2000/2000 [02:59<00:00, 11.12it/s]


Epoch 347 Mean Reward: -30.054408149719237


100%|██████████| 2000/2000 [02:54<00:00, 11.47it/s]


Epoch 348 Mean Reward: -30.506925117492674


100%|██████████| 2000/2000 [02:58<00:00, 11.18it/s]


Epoch 349 Mean Reward: -32.93300199127197


100%|██████████| 2000/2000 [03:01<00:00, 10.99it/s]


Epoch 350 Mean Reward: -31.730046607971193


100%|██████████| 2000/2000 [03:00<00:00, 11.06it/s]


Epoch 351 Mean Reward: -41.3854220199585


100%|██████████| 2000/2000 [02:57<00:00, 11.28it/s]


Epoch 352 Mean Reward: -34.51412289428711


100%|██████████| 2000/2000 [03:12<00:00, 10.38it/s]


Epoch 353 Mean Reward: -38.23271385192871


100%|██████████| 2000/2000 [03:07<00:00, 10.65it/s]


Epoch 354 Mean Reward: -37.60701826477051


100%|██████████| 2000/2000 [03:02<00:00, 10.98it/s]


Epoch 355 Mean Reward: -42.804050994873045


100%|██████████| 2000/2000 [03:01<00:00, 11.01it/s]


Epoch 356 Mean Reward: -39.29703539276123


100%|██████████| 2000/2000 [03:01<00:00, 11.04it/s]


Epoch 357 Mean Reward: -35.89434775543213


100%|██████████| 2000/2000 [03:01<00:00, 11.00it/s]


Epoch 358 Mean Reward: -37.10672873687744


100%|██████████| 2000/2000 [02:52<00:00, 11.58it/s]


Epoch 359 Mean Reward: -36.59916400909424


100%|██████████| 2000/2000 [02:52<00:00, 11.56it/s]


Epoch 360 Mean Reward: -38.56300284576416
Epoch 360 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 360 test with depth buffer:
Test Episode 1 Reward: -28.636367797851562
Test Episode 2 Reward: -93.68292236328125
Test Episode 3 Reward: -115.97569274902344
Test Episode 4 Reward: -93.68292236328125
Test Episode 5 Reward: -91.20927429199219
Test Episode 6 Reward: -93.68292236328125
Test Episode 7 Reward: -93.68292236328125
Test Episode 8 Reward: -47.91654968261719
Test Episode 9 Reward: 34.85050964355469
Test Episode 10 Reward: -62.962432861328125
Average Test Reward (with depth buffer:) -68.65814971923828
Epoch 360 test without depth buffer:
Test Episode 1 Reward: 68.34602355957031
Test Episode 2 Reward: 50.44346618652344
Test Episode 3 Reward: 17.84088134765625
Test Episode 4 Reward: 50.44346618652344
Test Episode 5 Reward: -67.98483276367188
Test Episode 6 Reward: 50.44346618652344
Test Episode 7 Reward: -83.145751953125
Test Episode 8 Reward: 174.5702362060547
Test Episode 9 R

100%|██████████| 2000/2000 [15:11<00:00,  2.19it/s]


Epoch 361 Mean Reward: 124.03267659759521


100%|██████████| 2000/2000 [19:19<00:00,  1.73it/s]


Epoch 362 Mean Reward: 134.53697649383545


100%|██████████| 2000/2000 [15:28<00:00,  2.15it/s]


Epoch 363 Mean Reward: 154.08721317291258


100%|██████████| 2000/2000 [15:16<00:00,  2.18it/s]


Epoch 364 Mean Reward: 168.49158350372315


100%|██████████| 2000/2000 [15:10<00:00,  2.20it/s]


Epoch 365 Mean Reward: 188.14123636627198


100%|██████████| 2000/2000 [14:32<00:00,  2.29it/s]


Epoch 366 Mean Reward: 202.0800589752197


100%|██████████| 2000/2000 [16:07<00:00,  2.07it/s]


Epoch 367 Mean Reward: 160.79930953979493


100%|██████████| 2000/2000 [14:02<00:00,  2.37it/s]


Epoch 368 Mean Reward: 194.25526551818848


100%|██████████| 2000/2000 [15:43<00:00,  2.12it/s]


Epoch 369 Mean Reward: 172.0617008743286


100%|██████████| 2000/2000 [15:31<00:00,  2.15it/s]


Epoch 370 Mean Reward: 173.76111837005615


100%|██████████| 2000/2000 [13:44<00:00,  2.43it/s]


Epoch 371 Mean Reward: 211.3187996520996


100%|██████████| 2000/2000 [14:15<00:00,  2.34it/s]


Epoch 372 Mean Reward: 196.4561019821167


100%|██████████| 2000/2000 [14:30<00:00,  2.30it/s]


Epoch 373 Mean Reward: 195.2460344848633


100%|██████████| 2000/2000 [12:50<00:00,  2.60it/s]


Epoch 374 Mean Reward: 229.2297834854126


100%|██████████| 2000/2000 [14:21<00:00,  2.32it/s]


Epoch 375 Mean Reward: 177.43141456604005


100%|██████████| 2000/2000 [14:16<00:00,  2.33it/s]


Epoch 376 Mean Reward: 208.79611656188965


100%|██████████| 2000/2000 [14:19<00:00,  2.33it/s]


Epoch 377 Mean Reward: 229.07490083312987


100%|██████████| 2000/2000 [14:20<00:00,  2.32it/s]


Epoch 378 Mean Reward: 217.75255410003663


100%|██████████| 2000/2000 [14:48<00:00,  2.25it/s]


Epoch 379 Mean Reward: 209.57873342895508


100%|██████████| 2000/2000 [13:36<00:00,  2.45it/s]


Epoch 380 Mean Reward: 234.17862949371337
Epoch 380 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 380 test with depth buffer:
Test Episode 1 Reward: -115.76071166992188
Test Episode 2 Reward: -115.76071166992188
Test Episode 3 Reward: -97.87606811523438
Test Episode 4 Reward: -115.76071166992188
Test Episode 5 Reward: -115.76071166992188
Test Episode 6 Reward: -115.76071166992188
Test Episode 7 Reward: -88.40705871582031
Test Episode 8 Reward: -115.76071166992188
Test Episode 9 Reward: 117.66700744628906
Test Episode 10 Reward: -115.99281311035156
Average Test Reward (with depth buffer:) -87.91732025146484
Epoch 380 test without depth buffer:
Test Episode 1 Reward: -81.62344360351562
Test Episode 2 Reward: -81.62344360351562
Test Episode 3 Reward: -81.62344360351562
Test Episode 4 Reward: -31.420440673828125
Test Episode 5 Reward: -81.62344360351562
Test Episode 6 Reward: -81.62344360351562
Test Episode 7 Reward: -71.75711059570312
Test Episode 8 Reward: -81.62344360351562
Te

100%|██████████| 2000/2000 [14:10<00:00,  2.35it/s]


Epoch 381 Mean Reward: 229.38612577056884


100%|██████████| 2000/2000 [15:56<00:00,  2.09it/s]


Epoch 382 Mean Reward: 191.10980965423585


100%|██████████| 2000/2000 [15:55<00:00,  2.09it/s]


Epoch 383 Mean Reward: 205.393241355896


100%|██████████| 2000/2000 [15:10<00:00,  2.20it/s]


Epoch 384 Mean Reward: 230.84335305786132


100%|██████████| 2000/2000 [16:05<00:00,  2.07it/s]


Epoch 385 Mean Reward: 240.69347504425048


100%|██████████| 2000/2000 [15:19<00:00,  2.18it/s]


Epoch 386 Mean Reward: 232.90324892425537


100%|██████████| 2000/2000 [14:46<00:00,  2.26it/s]


Epoch 387 Mean Reward: 228.3543370819092


100%|██████████| 2000/2000 [14:45<00:00,  2.26it/s]


Epoch 388 Mean Reward: 224.07948900604248


100%|██████████| 2000/2000 [15:08<00:00,  2.20it/s]


Epoch 389 Mean Reward: 252.94507271575927


100%|██████████| 2000/2000 [12:25<00:00,  2.68it/s]


Epoch 390 Mean Reward: 256.0786752319336


100%|██████████| 2000/2000 [11:51<00:00,  2.81it/s]


Epoch 391 Mean Reward: 244.1194331436157


100%|██████████| 2000/2000 [10:29<00:00,  3.18it/s]


Epoch 392 Mean Reward: 246.39980488586426


100%|██████████| 2000/2000 [09:49<00:00,  3.39it/s]


Epoch 393 Mean Reward: 240.81878231811524


100%|██████████| 2000/2000 [11:05<00:00,  3.00it/s]


Epoch 394 Mean Reward: 237.8986647491455


100%|██████████| 2000/2000 [14:07<00:00,  2.36it/s]


Epoch 395 Mean Reward: 272.4770442504883


100%|██████████| 2000/2000 [11:35<00:00,  2.88it/s]


Epoch 396 Mean Reward: 263.81549279785156


100%|██████████| 2000/2000 [11:25<00:00,  2.92it/s]


Epoch 397 Mean Reward: 278.55409899902344


100%|██████████| 2000/2000 [13:15<00:00,  2.51it/s]


Epoch 398 Mean Reward: 263.5747544250488


100%|██████████| 2000/2000 [11:17<00:00,  2.95it/s]


Epoch 399 Mean Reward: 292.87703024291994


100%|██████████| 2000/2000 [11:17<00:00,  2.95it/s]


Epoch 400 Mean Reward: 299.9022977294922
Epoch 400 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 400 test with depth buffer:
Test Episode 1 Reward: -21.1109619140625
Test Episode 2 Reward: 38.007080078125
Test Episode 3 Reward: -21.1109619140625
Test Episode 4 Reward: -33.5538330078125
Test Episode 5 Reward: 37.67597961425781
Test Episode 6 Reward: -21.1109619140625
Test Episode 7 Reward: -21.1109619140625
Test Episode 8 Reward: -21.1109619140625
Test Episode 9 Reward: -0.300689697265625
Test Episode 10 Reward: 47.06463623046875
Average Test Reward (with depth buffer:) -1.6661636352539062
Epoch 400 test without depth buffer:
Test Episode 1 Reward: -32.38520812988281
Test Episode 2 Reward: 78.63236999511719
Test Episode 3 Reward: -21.8055419921875
Test Episode 4 Reward: -21.8055419921875
Test Episode 5 Reward: -76.53143310546875
Test Episode 6 Reward: 14.895767211914062
Test Episode 7 Reward: 40.34979248046875
Test Episode 8 Reward: -21.8055419921875
Test Episode 9 Reward: -21

100%|██████████| 2000/2000 [09:43<00:00,  3.43it/s]


Epoch 401 Mean Reward: 302.60671765899656


100%|██████████| 2000/2000 [10:30<00:00,  3.17it/s]


Epoch 402 Mean Reward: 322.94823712158205


100%|██████████| 2000/2000 [10:39<00:00,  3.13it/s]


Epoch 403 Mean Reward: 314.5988103103638


100%|██████████| 2000/2000 [11:06<00:00,  3.00it/s]


Epoch 404 Mean Reward: 309.755663230896


100%|██████████| 2000/2000 [11:58<00:00,  2.78it/s]


Epoch 405 Mean Reward: 319.94831172180176


100%|██████████| 2000/2000 [07:25<00:00,  4.49it/s]


Epoch 406 Mean Reward: 341.24599171447755


100%|██████████| 2000/2000 [10:13<00:00,  3.26it/s]


Epoch 407 Mean Reward: 313.3037664871216


100%|██████████| 2000/2000 [09:28<00:00,  3.52it/s]


Epoch 408 Mean Reward: 309.61315863800047


100%|██████████| 2000/2000 [10:05<00:00,  3.30it/s]


Epoch 409 Mean Reward: 331.59262954711915


100%|██████████| 2000/2000 [09:20<00:00,  3.57it/s]


Epoch 410 Mean Reward: 323.2955040435791


100%|██████████| 2000/2000 [13:32<00:00,  2.46it/s]


Epoch 411 Mean Reward: 263.04600119018556


100%|██████████| 2000/2000 [09:11<00:00,  3.62it/s]


Epoch 412 Mean Reward: 249.46563956451416


100%|██████████| 2000/2000 [08:35<00:00,  3.88it/s]


Epoch 413 Mean Reward: 275.7681996383667


100%|██████████| 2000/2000 [10:29<00:00,  3.18it/s]


Epoch 414 Mean Reward: 281.2698695220947


100%|██████████| 2000/2000 [17:22<00:00,  1.92it/s]


Epoch 415 Mean Reward: 267.1602936248779


100%|██████████| 2000/2000 [20:50<00:00,  1.60it/s]


Epoch 416 Mean Reward: 274.85457383728027


100%|██████████| 2000/2000 [13:48<00:00,  2.41it/s]


Epoch 417 Mean Reward: 259.8055206756592


100%|██████████| 2000/2000 [10:13<00:00,  3.26it/s]


Epoch 418 Mean Reward: 273.0385018615723


100%|██████████| 2000/2000 [08:17<00:00,  4.02it/s]


Epoch 419 Mean Reward: 264.89253067779543


100%|██████████| 2000/2000 [08:35<00:00,  3.88it/s]


Epoch 420 Mean Reward: 235.00939318847657
Epoch 420 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 420 test with depth buffer:
Test Episode 1 Reward: -115.99786376953125
Test Episode 2 Reward: -115.99786376953125
Test Episode 3 Reward: -51.98619079589844
Test Episode 4 Reward: -115.99786376953125
Test Episode 5 Reward: 33.94770812988281
Test Episode 6 Reward: -115.99786376953125
Test Episode 7 Reward: -7.3206329345703125
Test Episode 8 Reward: -31.077804565429688
Test Episode 9 Reward: -115.90097045898438
Test Episode 10 Reward: -115.99786376953125
Average Test Reward (with depth buffer:) -75.23272094726562
Epoch 420 test without depth buffer:
Test Episode 1 Reward: -33.81129455566406
Test Episode 2 Reward: 176.65533447265625
Test Episode 3 Reward: 48.74822998046875
Test Episode 4 Reward: 23.504241943359375
Test Episode 5 Reward: -33.168670654296875
Test Episode 6 Reward: 127.80181884765625
Test Episode 7 Reward: 176.65533447265625
Test Episode 8 Reward: 176.65533447265625
Tes

100%|██████████| 2000/2000 [11:20<00:00,  2.94it/s]


Epoch 421 Mean Reward: 308.0674987411499


100%|██████████| 2000/2000 [12:30<00:00,  2.67it/s]


Epoch 422 Mean Reward: 286.5719908981323


100%|██████████| 2000/2000 [12:41<00:00,  2.63it/s]


Epoch 423 Mean Reward: 279.53954090118407


100%|██████████| 2000/2000 [12:20<00:00,  2.70it/s]


Epoch 424 Mean Reward: 305.47732289886477


100%|██████████| 2000/2000 [11:22<00:00,  2.93it/s]


Epoch 425 Mean Reward: 305.5998751068115


100%|██████████| 2000/2000 [11:47<00:00,  2.83it/s]


Epoch 426 Mean Reward: 288.8342704696655


100%|██████████| 2000/2000 [12:05<00:00,  2.76it/s]


Epoch 427 Mean Reward: 274.78582264709473


100%|██████████| 2000/2000 [12:19<00:00,  2.70it/s]


Epoch 428 Mean Reward: 256.07055715179445


100%|██████████| 2000/2000 [11:31<00:00,  2.89it/s]


Epoch 429 Mean Reward: 288.4997194519043


100%|██████████| 2000/2000 [12:13<00:00,  2.73it/s]


Epoch 430 Mean Reward: 267.4303212738037


100%|██████████| 2000/2000 [11:55<00:00,  2.80it/s]


Epoch 431 Mean Reward: 264.8003517303467


100%|██████████| 2000/2000 [11:28<00:00,  2.91it/s]


Epoch 432 Mean Reward: 264.9190506668091


100%|██████████| 2000/2000 [10:29<00:00,  3.18it/s]


Epoch 433 Mean Reward: 224.0348708114624


100%|██████████| 2000/2000 [10:23<00:00,  3.21it/s]


Epoch 434 Mean Reward: 240.30060943603516


100%|██████████| 2000/2000 [10:36<00:00,  3.14it/s]


Epoch 435 Mean Reward: 236.00427688598631


100%|██████████| 2000/2000 [11:00<00:00,  3.03it/s]


Epoch 436 Mean Reward: 213.28240100097656


100%|██████████| 2000/2000 [10:59<00:00,  3.03it/s]


Epoch 437 Mean Reward: 220.03547235107422


100%|██████████| 2000/2000 [10:37<00:00,  3.13it/s]


Epoch 438 Mean Reward: 208.46049781799317


100%|██████████| 2000/2000 [11:21<00:00,  2.94it/s]


Epoch 439 Mean Reward: 221.2129567565918


100%|██████████| 2000/2000 [10:05<00:00,  3.30it/s]


Epoch 440 Mean Reward: 241.17954251098632
Epoch 440 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 440 test with depth buffer:
Test Episode 1 Reward: 15.849227905273438
Test Episode 2 Reward: -65.11872863769531
Test Episode 3 Reward: -10.825653076171875
Test Episode 4 Reward: -110.30876159667969
Test Episode 5 Reward: -17.440750122070312
Test Episode 6 Reward: -10.825653076171875
Test Episode 7 Reward: -10.825653076171875
Test Episode 8 Reward: -70.26451110839844
Test Episode 9 Reward: -10.825653076171875
Test Episode 10 Reward: -10.825653076171875
Average Test Reward (with depth buffer:) -30.14117889404297
Epoch 440 test without depth buffer:
Test Episode 1 Reward: -7.3447723388671875
Test Episode 2 Reward: -7.3447723388671875
Test Episode 3 Reward: -7.3447723388671875
Test Episode 4 Reward: -7.3447723388671875
Test Episode 5 Reward: -7.3447723388671875
Test Episode 6 Reward: -99.49191284179688
Test Episode 7 Reward: -20.468307495117188
Test Episode 8 Reward: -42.028198242187

100%|██████████| 2000/2000 [11:17<00:00,  2.95it/s]


Epoch 441 Mean Reward: 155.24367195129395


100%|██████████| 2000/2000 [11:13<00:00,  2.97it/s]


Epoch 442 Mean Reward: 126.00261769866944


100%|██████████| 2000/2000 [11:11<00:00,  2.98it/s]


Epoch 443 Mean Reward: 87.36025345611573


100%|██████████| 2000/2000 [10:47<00:00,  3.09it/s]


Epoch 444 Mean Reward: 81.53129389190674


100%|██████████| 2000/2000 [10:17<00:00,  3.24it/s]


Epoch 445 Mean Reward: 90.43902296447754


100%|██████████| 2000/2000 [11:00<00:00,  3.03it/s]


Epoch 446 Mean Reward: 79.61588945007324


100%|██████████| 2000/2000 [11:05<00:00,  3.00it/s]


Epoch 447 Mean Reward: 102.17961166381836


100%|██████████| 2000/2000 [11:21<00:00,  2.94it/s]


Epoch 448 Mean Reward: 81.85823034667969


100%|██████████| 2000/2000 [10:52<00:00,  3.07it/s]


Epoch 449 Mean Reward: 100.30754042816162


100%|██████████| 2000/2000 [10:19<00:00,  3.23it/s]


Epoch 450 Mean Reward: 92.66811157226563


100%|██████████| 2000/2000 [13:20<00:00,  2.50it/s]


Epoch 451 Mean Reward: 178.8447855606079


100%|██████████| 2000/2000 [12:01<00:00,  2.77it/s]


Epoch 452 Mean Reward: 204.33972394561766


100%|██████████| 2000/2000 [12:46<00:00,  2.61it/s]


Epoch 453 Mean Reward: 198.0420687789917


100%|██████████| 2000/2000 [13:30<00:00,  2.47it/s]


Epoch 454 Mean Reward: 200.79312117004395


100%|██████████| 2000/2000 [13:04<00:00,  2.55it/s]


Epoch 455 Mean Reward: 201.95825231170653


100%|██████████| 2000/2000 [12:35<00:00,  2.65it/s]


Epoch 456 Mean Reward: 196.3453667907715


100%|██████████| 2000/2000 [12:11<00:00,  2.74it/s]


Epoch 457 Mean Reward: 202.58700535583495


100%|██████████| 2000/2000 [12:26<00:00,  2.68it/s]


Epoch 458 Mean Reward: 175.531084815979


100%|██████████| 2000/2000 [12:09<00:00,  2.74it/s]


Epoch 459 Mean Reward: 212.04878870391846


100%|██████████| 2000/2000 [12:16<00:00,  2.71it/s]


Epoch 460 Mean Reward: 189.07512033843994
Epoch 460 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 460 test with depth buffer:
Test Episode 1 Reward: -115.97401428222656
Test Episode 2 Reward: -115.97401428222656
Test Episode 3 Reward: -39.65467834472656
Test Episode 4 Reward: -62.93525695800781
Test Episode 5 Reward: -115.97401428222656
Test Episode 6 Reward: -115.99977111816406
Test Episode 7 Reward: -115.97401428222656
Test Episode 8 Reward: -115.99861145019531
Test Episode 9 Reward: -87.49165344238281
Test Episode 10 Reward: -115.97573852539062
Average Test Reward (with depth buffer:) -100.19517669677734
Epoch 460 test without depth buffer:
Test Episode 1 Reward: -115.9757080078125
Test Episode 2 Reward: -115.9757080078125
Test Episode 3 Reward: -115.9757080078125
Test Episode 4 Reward: -115.9976806640625
Test Episode 5 Reward: -115.9757080078125
Test Episode 6 Reward: -83.48350524902344
Test Episode 7 Reward: -92.74786376953125
Test Episode 8 Reward: -89.92617797851562
Te

100%|██████████| 2000/2000 [13:17<00:00,  2.51it/s]


Epoch 461 Mean Reward: 132.5855959701538


100%|██████████| 2000/2000 [14:43<00:00,  2.26it/s]


Epoch 462 Mean Reward: 118.55371676635743


100%|██████████| 2000/2000 [13:37<00:00,  2.45it/s]


Epoch 463 Mean Reward: 157.88871778869628


100%|██████████| 2000/2000 [12:35<00:00,  2.65it/s]


Epoch 464 Mean Reward: 162.90500527954103


100%|██████████| 2000/2000 [12:59<00:00,  2.57it/s]


Epoch 465 Mean Reward: 148.57180041503906


100%|██████████| 2000/2000 [13:19<00:00,  2.50it/s]


Epoch 466 Mean Reward: 160.00858889007569


100%|██████████| 2000/2000 [13:54<00:00,  2.40it/s]


Epoch 467 Mean Reward: 139.89876541137696


100%|██████████| 2000/2000 [14:56<00:00,  2.23it/s]


Epoch 468 Mean Reward: 134.79810341644287


100%|██████████| 2000/2000 [14:03<00:00,  2.37it/s]


Epoch 469 Mean Reward: 152.49246430206298


100%|██████████| 2000/2000 [14:31<00:00,  2.29it/s]


Epoch 470 Mean Reward: 95.7232841720581


100%|██████████| 2000/2000 [13:43<00:00,  2.43it/s]


Epoch 471 Mean Reward: 121.27880353546142


100%|██████████| 2000/2000 [12:54<00:00,  2.58it/s]


Epoch 472 Mean Reward: 120.41183261108398


100%|██████████| 2000/2000 [12:51<00:00,  2.59it/s]


Epoch 473 Mean Reward: 140.75974196624756


100%|██████████| 2000/2000 [17:19<00:00,  1.92it/s]


Epoch 474 Mean Reward: 134.98773129272462


100%|██████████| 2000/2000 [21:29<00:00,  1.55it/s]


Epoch 475 Mean Reward: 163.1095506439209


100%|██████████| 2000/2000 [15:55<00:00,  2.09it/s]


Epoch 476 Mean Reward: 131.52333951568605


100%|██████████| 2000/2000 [26:21<00:00,  1.26it/s]


Epoch 477 Mean Reward: 128.09962477874757


100%|██████████| 2000/2000 [25:07<00:00,  1.33it/s]


Epoch 478 Mean Reward: 141.4029761505127


100%|██████████| 2000/2000 [13:56<00:00,  2.39it/s]


Epoch 479 Mean Reward: 145.542366104126


100%|██████████| 2000/2000 [10:17<00:00,  3.24it/s]


Epoch 480 Mean Reward: 171.49647006225587
Epoch 480 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 480 test with depth buffer:
Test Episode 1 Reward: -115.91545104980469
Test Episode 2 Reward: -115.91545104980469
Test Episode 3 Reward: -115.91545104980469
Test Episode 4 Reward: -115.91545104980469
Test Episode 5 Reward: -113.09199523925781
Test Episode 6 Reward: -20.045745849609375
Test Episode 7 Reward: -115.91545104980469
Test Episode 8 Reward: -115.99569702148438
Test Episode 9 Reward: -115.99552917480469
Test Episode 10 Reward: -115.91545104980469
Average Test Reward (with depth buffer:) -106.06216735839844
Epoch 480 test without depth buffer:
Test Episode 1 Reward: -113.44548034667969
Test Episode 2 Reward: -113.44548034667969
Test Episode 3 Reward: -113.44548034667969
Test Episode 4 Reward: -113.44548034667969
Test Episode 5 Reward: -110.89773559570312
Test Episode 6 Reward: -109.67509460449219
Test Episode 7 Reward: -89.24993896484375
Test Episode 8 Reward: -41.03462219

100%|██████████| 2000/2000 [22:56<00:00,  1.45it/s]


Epoch 481 Mean Reward: 92.56007717895508


100%|██████████| 2000/2000 [17:42<00:00,  1.88it/s]


Epoch 482 Mean Reward: 113.22382674407959


100%|██████████| 2000/2000 [17:39<00:00,  1.89it/s]


Epoch 483 Mean Reward: 84.97806758880616


100%|██████████| 2000/2000 [17:13<00:00,  1.93it/s]


Epoch 484 Mean Reward: 110.25876775360108


100%|██████████| 2000/2000 [16:51<00:00,  1.98it/s]


Epoch 485 Mean Reward: 90.39329996490478


100%|██████████| 2000/2000 [17:15<00:00,  1.93it/s]


Epoch 486 Mean Reward: 87.30334746551513


100%|██████████| 2000/2000 [12:32<00:00,  2.66it/s]


Epoch 487 Mean Reward: 105.55084873962403


100%|██████████| 2000/2000 [16:37<00:00,  2.00it/s]


Epoch 488 Mean Reward: 82.88699298858643


100%|██████████| 2000/2000 [18:11<00:00,  1.83it/s]


Epoch 489 Mean Reward: 85.94650125122071


100%|██████████| 2000/2000 [17:33<00:00,  1.90it/s]


Epoch 490 Mean Reward: 93.30154484558105


100%|██████████| 2000/2000 [19:42<00:00,  1.69it/s]


Epoch 491 Mean Reward: 108.34451327514648


100%|██████████| 2000/2000 [18:20<00:00,  1.82it/s]


Epoch 492 Mean Reward: 80.8452576751709


100%|██████████| 2000/2000 [15:30<00:00,  2.15it/s]


Epoch 493 Mean Reward: 91.89417547607422


100%|██████████| 2000/2000 [18:57<00:00,  1.76it/s]


Epoch 494 Mean Reward: 129.4777942123413


100%|██████████| 2000/2000 [16:12<00:00,  2.06it/s]


Epoch 495 Mean Reward: 132.58990954589845


100%|██████████| 2000/2000 [17:28<00:00,  1.91it/s]


Epoch 496 Mean Reward: 127.61308130645752


100%|██████████| 2000/2000 [20:06<00:00,  1.66it/s]


Epoch 497 Mean Reward: 78.53285031890869


100%|██████████| 2000/2000 [18:20<00:00,  1.82it/s]


Epoch 498 Mean Reward: 85.83511784362793


100%|██████████| 2000/2000 [13:32<00:00,  2.46it/s]


Epoch 499 Mean Reward: 123.28570285797119


100%|██████████| 2000/2000 [18:04<00:00,  1.84it/s]


Epoch 500 Mean Reward: 85.66741563415528
Epoch 500 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 500 test with depth buffer:
Test Episode 1 Reward: -115.37098693847656
Test Episode 2 Reward: -8.654678344726562
Test Episode 3 Reward: 83.68191528320312
Test Episode 4 Reward: -22.61431884765625
Test Episode 5 Reward: -8.654678344726562
Test Episode 6 Reward: -115.86866760253906
Test Episode 7 Reward: -56.531097412109375
Test Episode 8 Reward: -8.654678344726562
Test Episode 9 Reward: -8.654678344726562
Test Episode 10 Reward: 25.524032592773438
Average Test Reward (with depth buffer:) -23.579783630371093
Epoch 500 test without depth buffer:
Test Episode 1 Reward: -39.78779602050781
Test Episode 2 Reward: -39.78779602050781
Test Episode 3 Reward: -39.78779602050781
Test Episode 4 Reward: -43.18597412109375
Test Episode 5 Reward: -39.78779602050781
Test Episode 6 Reward: 260.53814697265625
Test Episode 7 Reward: -39.78779602050781
Test Episode 8 Reward: -39.78779602050781
Test Epi

100%|██████████| 2000/2000 [18:24<00:00,  1.81it/s]


Epoch 501 Mean Reward: 129.99615976715089


100%|██████████| 2000/2000 [18:45<00:00,  1.78it/s]


Epoch 502 Mean Reward: 97.49757927703857


100%|██████████| 2000/2000 [19:42<00:00,  1.69it/s]


Epoch 503 Mean Reward: 134.67385278320313


100%|██████████| 2000/2000 [17:56<00:00,  1.86it/s]


Epoch 504 Mean Reward: 183.9560479660034


100%|██████████| 2000/2000 [18:49<00:00,  1.77it/s]


Epoch 505 Mean Reward: 121.54155982208252


100%|██████████| 2000/2000 [17:42<00:00,  1.88it/s]


Epoch 506 Mean Reward: 160.51552783966065


100%|██████████| 2000/2000 [25:24<00:00,  1.31it/s]


Epoch 507 Mean Reward: 132.21390842437745


100%|██████████| 2000/2000 [20:53<00:00,  1.60it/s]


Epoch 508 Mean Reward: 143.53929943084717


100%|██████████| 2000/2000 [32:10<00:00,  1.04it/s]


Epoch 509 Mean Reward: 144.52536602020263


100%|██████████| 2000/2000 [23:32<00:00,  1.42it/s]


Epoch 510 Mean Reward: 84.34476250457763


100%|██████████| 2000/2000 [19:58<00:00,  1.67it/s]


Epoch 511 Mean Reward: 40.50223863220215


100%|██████████| 2000/2000 [19:23<00:00,  1.72it/s]


Epoch 512 Mean Reward: 49.672226432800294


100%|██████████| 2000/2000 [11:57<00:00,  2.79it/s]


Epoch 513 Mean Reward: 62.92394494628906


100%|██████████| 2000/2000 [11:48<00:00,  2.82it/s]


Epoch 514 Mean Reward: 66.68932978057862


100%|██████████| 2000/2000 [11:34<00:00,  2.88it/s]


Epoch 515 Mean Reward: 49.71265367126465


100%|██████████| 2000/2000 [11:31<00:00,  2.89it/s]


Epoch 516 Mean Reward: 67.3932213897705


100%|██████████| 2000/2000 [13:16<00:00,  2.51it/s]


Epoch 517 Mean Reward: 97.33432513427735


100%|██████████| 2000/2000 [18:19<00:00,  1.82it/s]


Epoch 518 Mean Reward: 73.66040671539307


100%|██████████| 2000/2000 [16:22<00:00,  2.03it/s]


Epoch 519 Mean Reward: 87.55395565795898


100%|██████████| 2000/2000 [15:59<00:00,  2.08it/s]


Epoch 520 Mean Reward: 70.71428227233886
Epoch 520 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 520 test with depth buffer:
Test Episode 1 Reward: -104.9554443359375
Test Episode 2 Reward: -98.26736450195312
Test Episode 3 Reward: -43.15260314941406
Test Episode 4 Reward: -66.73385620117188
Test Episode 5 Reward: -112.53524780273438
Test Episode 6 Reward: -104.9554443359375
Test Episode 7 Reward: -92.50833129882812
Test Episode 8 Reward: -104.9554443359375
Test Episode 9 Reward: -104.9554443359375
Test Episode 10 Reward: -104.9554443359375
Average Test Reward (with depth buffer:) -93.7974624633789
Epoch 520 test without depth buffer:
Test Episode 1 Reward: 44.327880859375
Test Episode 2 Reward: -38.108734130859375
Test Episode 3 Reward: -43.58404541015625
Test Episode 4 Reward: -7.3536529541015625
Test Episode 5 Reward: -98.920654296875
Test Episode 6 Reward: -28.915008544921875
Test Episode 7 Reward: 44.327880859375
Test Episode 8 Reward: 44.327880859375
Test Episode 9 Rewa

100%|██████████| 2000/2000 [16:39<00:00,  2.00it/s]


Epoch 521 Mean Reward: 67.53092964935303


100%|██████████| 2000/2000 [17:38<00:00,  1.89it/s]


Epoch 522 Mean Reward: 41.47960931396484


100%|██████████| 2000/2000 [17:26<00:00,  1.91it/s]


Epoch 523 Mean Reward: 54.70424584960937


100%|██████████| 2000/2000 [17:26<00:00,  1.91it/s]


Epoch 524 Mean Reward: 64.49100988006592


100%|██████████| 2000/2000 [15:38<00:00,  2.13it/s]


Epoch 525 Mean Reward: 96.81181049346924


100%|██████████| 2000/2000 [16:22<00:00,  2.04it/s]


Epoch 526 Mean Reward: 79.09578433990478


100%|██████████| 2000/2000 [16:47<00:00,  1.99it/s]


Epoch 527 Mean Reward: 64.15946036529542


100%|██████████| 2000/2000 [17:12<00:00,  1.94it/s]


Epoch 528 Mean Reward: 73.70015296173095


100%|██████████| 2000/2000 [16:15<00:00,  2.05it/s]


Epoch 529 Mean Reward: 76.8485453491211


100%|██████████| 2000/2000 [16:48<00:00,  1.98it/s]


Epoch 530 Mean Reward: 66.03123263549804


100%|██████████| 2000/2000 [10:46<00:00,  3.10it/s]


Epoch 531 Mean Reward: 23.622266593933105


100%|██████████| 2000/2000 [07:35<00:00,  4.39it/s]


Epoch 532 Mean Reward: -23.99529132080078


100%|██████████| 2000/2000 [06:59<00:00,  4.76it/s]


Epoch 533 Mean Reward: -25.116519088745118


100%|██████████| 2000/2000 [08:38<00:00,  3.85it/s]


Epoch 534 Mean Reward: -18.83523176574707


100%|██████████| 2000/2000 [09:47<00:00,  3.40it/s]


Epoch 535 Mean Reward: -19.687977416992187


100%|██████████| 2000/2000 [09:26<00:00,  3.53it/s]


Epoch 536 Mean Reward: -9.580542289733886


100%|██████████| 2000/2000 [11:40<00:00,  2.85it/s]


Epoch 537 Mean Reward: -8.015378623962402


100%|██████████| 2000/2000 [11:16<00:00,  2.96it/s]


Epoch 538 Mean Reward: -9.807808158874511


100%|██████████| 2000/2000 [10:19<00:00,  3.23it/s]


Epoch 539 Mean Reward: -5.885731590270996


100%|██████████| 2000/2000 [10:55<00:00,  3.05it/s]


Epoch 540 Mean Reward: -22.163633827209473
Epoch 540 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 540 test with depth buffer:
Test Episode 1 Reward: -3.573760986328125
Test Episode 2 Reward: -55.64668273925781
Test Episode 3 Reward: -30.393341064453125
Test Episode 4 Reward: 264.12037658691406
Test Episode 5 Reward: -46.691986083984375
Test Episode 6 Reward: 264.12037658691406
Test Episode 7 Reward: -79.84552001953125
Test Episode 8 Reward: 264.12037658691406
Test Episode 9 Reward: -32.24945068359375
Test Episode 10 Reward: -0.4190673828125
Average Test Reward (with depth buffer:) 54.354132080078124
Epoch 540 test without depth buffer:
Test Episode 1 Reward: -115.99655151367188
Test Episode 2 Reward: -2.9144287109375
Test Episode 3 Reward: -115.99655151367188
Test Episode 4 Reward: -115.99655151367188
Test Episode 5 Reward: -115.99655151367188
Test Episode 6 Reward: -115.99655151367188
Test Episode 7 Reward: -115.99655151367188
Test Episode 8 Reward: 46.9603271484375
Test Ep

100%|██████████| 2000/2000 [13:09<00:00,  2.53it/s]


Epoch 541 Mean Reward: 19.064390327453612


100%|██████████| 2000/2000 [13:46<00:00,  2.42it/s]


Epoch 542 Mean Reward: 66.87544348144532


100%|██████████| 2000/2000 [15:13<00:00,  2.19it/s]


Epoch 543 Mean Reward: 68.28118201446533


100%|██████████| 2000/2000 [14:10<00:00,  2.35it/s]


Epoch 544 Mean Reward: 65.50128813171386


100%|██████████| 2000/2000 [15:27<00:00,  2.16it/s]


Epoch 545 Mean Reward: 81.04695854949951


100%|██████████| 2000/2000 [17:15<00:00,  1.93it/s]


Epoch 546 Mean Reward: 56.96214458465576


100%|██████████| 2000/2000 [17:49<00:00,  1.87it/s]


Epoch 547 Mean Reward: 86.84070278167725


100%|██████████| 2000/2000 [15:18<00:00,  2.18it/s]


Epoch 548 Mean Reward: 89.33984399414062


100%|██████████| 2000/2000 [14:09<00:00,  2.36it/s]


Epoch 549 Mean Reward: 93.67009452056885


100%|██████████| 2000/2000 [13:52<00:00,  2.40it/s]


Epoch 550 Mean Reward: 103.8648473739624


100%|██████████| 2000/2000 [13:58<00:00,  2.39it/s]


Epoch 551 Mean Reward: 45.91436386871338


100%|██████████| 2000/2000 [14:49<00:00,  2.25it/s]


Epoch 552 Mean Reward: 67.59859625244141


100%|██████████| 2000/2000 [13:58<00:00,  2.39it/s]


Epoch 553 Mean Reward: 66.54984645843506


100%|██████████| 2000/2000 [25:46<00:00,  1.29it/s]


Epoch 554 Mean Reward: 69.51211654663086


100%|██████████| 2000/2000 [15:38<00:00,  2.13it/s]


Epoch 555 Mean Reward: 52.87163409423828


100%|██████████| 2000/2000 [14:36<00:00,  2.28it/s]


Epoch 556 Mean Reward: 43.617519882202146


100%|██████████| 2000/2000 [15:09<00:00,  2.20it/s]


Epoch 557 Mean Reward: 54.44732775115967


100%|██████████| 2000/2000 [15:09<00:00,  2.20it/s]


Epoch 558 Mean Reward: 63.18805836486816


100%|██████████| 2000/2000 [15:46<00:00,  2.11it/s]


Epoch 559 Mean Reward: 72.00880480194091


100%|██████████| 2000/2000 [15:22<00:00,  2.17it/s]


Epoch 560 Mean Reward: 69.04081443786622
Epoch 560 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 560 test with depth buffer:
Test Episode 1 Reward: -20.008926391601562
Test Episode 2 Reward: -32.484832763671875
Test Episode 3 Reward: -65.88536071777344
Test Episode 4 Reward: -65.88536071777344
Test Episode 5 Reward: -65.88536071777344
Test Episode 6 Reward: -58.437225341796875
Test Episode 7 Reward: -77.51319885253906
Test Episode 8 Reward: -65.88536071777344
Test Episode 9 Reward: -65.88536071777344
Test Episode 10 Reward: -47.598876953125
Average Test Reward (with depth buffer:) -56.54698638916015
Epoch 560 test without depth buffer:
Test Episode 1 Reward: -26.674713134765625
Test Episode 2 Reward: -35.69642639160156
Test Episode 3 Reward: -35.69642639160156
Test Episode 4 Reward: -93.81166076660156
Test Episode 5 Reward: -35.69642639160156
Test Episode 6 Reward: 215.89413452148438
Test Episode 7 Reward: 74.60989379882812
Test Episode 8 Reward: 15.30023193359375
Test Episod

100%|██████████| 2000/2000 [17:33<00:00,  1.90it/s]


Epoch 561 Mean Reward: 53.72929264831543


100%|██████████| 2000/2000 [16:46<00:00,  1.99it/s]


Epoch 562 Mean Reward: 88.881110206604


100%|██████████| 2000/2000 [17:33<00:00,  1.90it/s]


Epoch 563 Mean Reward: 81.23277889251709


100%|██████████| 2000/2000 [17:32<00:00,  1.90it/s]


Epoch 564 Mean Reward: 71.3196425628662


100%|██████████| 2000/2000 [30:29<00:00,  1.09it/s]


Epoch 565 Mean Reward: 88.14705786895752


100%|██████████| 2000/2000 [30:13<00:00,  1.10it/s]


Epoch 566 Mean Reward: 89.63796313476563


100%|██████████| 2000/2000 [29:41<00:00,  1.12it/s]


Epoch 567 Mean Reward: 84.58859789276123


100%|██████████| 2000/2000 [28:15<00:00,  1.18it/s]


Epoch 568 Mean Reward: 87.27166486358642


100%|██████████| 2000/2000 [33:28<00:00,  1.00s/it]


Epoch 569 Mean Reward: 98.0665465927124


100%|██████████| 2000/2000 [35:52<00:00,  1.08s/it]


Epoch 570 Mean Reward: 96.01897969055176


100%|██████████| 2000/2000 [32:02<00:00,  1.04it/s]


Epoch 571 Mean Reward: 87.5825792388916


100%|██████████| 2000/2000 [21:00<00:00,  1.59it/s]


Epoch 572 Mean Reward: 13.641261810302735


100%|██████████| 2000/2000 [14:26<00:00,  2.31it/s]


Epoch 573 Mean Reward: -2.4784762420654296


100%|██████████| 2000/2000 [12:07<00:00,  2.75it/s]


Epoch 574 Mean Reward: -6.682778579711914


100%|██████████| 2000/2000 [11:25<00:00,  2.92it/s]


Epoch 575 Mean Reward: -9.112695899963379


100%|██████████| 2000/2000 [11:18<00:00,  2.95it/s]


Epoch 576 Mean Reward: 32.53147338867188


100%|██████████| 2000/2000 [11:28<00:00,  2.90it/s]


Epoch 577 Mean Reward: 8.434595504760741


100%|██████████| 2000/2000 [12:05<00:00,  2.76it/s]


Epoch 578 Mean Reward: 29.703536041259767


100%|██████████| 2000/2000 [12:21<00:00,  2.70it/s]


Epoch 579 Mean Reward: 5.440813133239746


100%|██████████| 2000/2000 [12:44<00:00,  2.61it/s]


Epoch 580 Mean Reward: 21.571042373657228
Epoch 580 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 580 test with depth buffer:
Test Episode 1 Reward: -94.71012878417969
Test Episode 2 Reward: 1.681488037109375
Test Episode 3 Reward: 55.08348083496094
Test Episode 4 Reward: -94.71012878417969
Test Episode 5 Reward: -94.71012878417969
Test Episode 6 Reward: -33.20790100097656
Test Episode 7 Reward: 17.239776611328125
Test Episode 8 Reward: -94.71012878417969
Test Episode 9 Reward: -94.71012878417969
Test Episode 10 Reward: -94.71012878417969
Average Test Reward (with depth buffer:) -52.746392822265626
Epoch 580 test without depth buffer:
Test Episode 1 Reward: -97.6900634765625
Test Episode 2 Reward: -97.6900634765625
Test Episode 3 Reward: -97.6900634765625
Test Episode 4 Reward: -22.049713134765625
Test Episode 5 Reward: -97.6900634765625
Test Episode 6 Reward: -97.6900634765625
Test Episode 7 Reward: -97.6900634765625
Test Episode 8 Reward: 29.531890869140625
Test Episode 9 R

100%|██████████| 2000/2000 [20:24<00:00,  1.63it/s]


Epoch 581 Mean Reward: 66.20085948181152


100%|██████████| 2000/2000 [30:17<00:00,  1.10it/s]


Epoch 582 Mean Reward: 133.3184849319458


100%|██████████| 2000/2000 [33:50<00:00,  1.02s/it]


Epoch 583 Mean Reward: 64.28479563903808


100%|██████████| 2000/2000 [19:27<00:00,  1.71it/s]


Epoch 584 Mean Reward: 104.90518257141113


100%|██████████| 2000/2000 [25:50<00:00,  1.29it/s]


Epoch 585 Mean Reward: 91.53625932312012


100%|██████████| 2000/2000 [32:17<00:00,  1.03it/s]


Epoch 586 Mean Reward: 101.06820517730714


100%|██████████| 2000/2000 [17:59<00:00,  1.85it/s]


Epoch 587 Mean Reward: 129.24268902587892


100%|██████████| 2000/2000 [15:07<00:00,  2.20it/s]


Epoch 588 Mean Reward: 128.61099619293213


100%|██████████| 2000/2000 [20:46<00:00,  1.60it/s]


Epoch 589 Mean Reward: 112.0139807434082


100%|██████████| 2000/2000 [24:18<00:00,  1.37it/s]


Epoch 590 Mean Reward: 95.19169996643066


100%|██████████| 2000/2000 [22:22<00:00,  1.49it/s]


Epoch 591 Mean Reward: 110.03178471374511


100%|██████████| 2000/2000 [15:16<00:00,  2.18it/s]


Epoch 592 Mean Reward: 126.53943098449707


100%|██████████| 2000/2000 [21:45<00:00,  1.53it/s]


Epoch 593 Mean Reward: 99.97534845733642


100%|██████████| 2000/2000 [19:16<00:00,  1.73it/s]


Epoch 594 Mean Reward: 100.26875804138183


100%|██████████| 2000/2000 [25:27<00:00,  1.31it/s]


Epoch 595 Mean Reward: 92.58444008636475


100%|██████████| 2000/2000 [18:08<00:00,  1.84it/s]


Epoch 596 Mean Reward: 47.86266618347168


100%|██████████| 2000/2000 [17:38<00:00,  1.89it/s]


Epoch 597 Mean Reward: 76.87915020751953


100%|██████████| 2000/2000 [16:23<00:00,  2.03it/s]


Epoch 598 Mean Reward: 125.37593048095704


100%|██████████| 2000/2000 [14:09<00:00,  2.36it/s]


Epoch 599 Mean Reward: 135.33331134796143


100%|██████████| 2000/2000 [17:45<00:00,  1.88it/s]


Epoch 600 Mean Reward: 86.31661486816407
Epoch 600 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 600 test with depth buffer:
Test Episode 1 Reward: -39.565032958984375
Test Episode 2 Reward: -95.96487426757812
Test Episode 3 Reward: 11.0675048828125
Test Episode 4 Reward: -5.5045623779296875
Test Episode 5 Reward: -95.96487426757812
Test Episode 6 Reward: -95.96487426757812
Test Episode 7 Reward: -115.20054626464844
Test Episode 8 Reward: 17.232192993164062
Test Episode 9 Reward: 34.55628967285156
Test Episode 10 Reward: -95.96487426757812
Average Test Reward (with depth buffer:) -48.12736511230469
Epoch 600 test without depth buffer:
Test Episode 1 Reward: -115.98674011230469
Test Episode 2 Reward: -115.98674011230469
Test Episode 3 Reward: 113.11170959472656
Test Episode 4 Reward: -115.98674011230469
Test Episode 5 Reward: -115.98674011230469
Test Episode 6 Reward: -115.98674011230469
Test Episode 7 Reward: -115.98674011230469
Test Episode 8 Reward: 12.960235595703125
Test 

100%|██████████| 2000/2000 [17:55<00:00,  1.86it/s]


Epoch 601 Mean Reward: 106.45961110687256


100%|██████████| 2000/2000 [17:56<00:00,  1.86it/s]


Epoch 602 Mean Reward: 100.56493552398682


100%|██████████| 2000/2000 [17:05<00:00,  1.95it/s]


Epoch 603 Mean Reward: 120.38205197143554


100%|██████████| 2000/2000 [16:21<00:00,  2.04it/s]


Epoch 604 Mean Reward: 135.77214599609374


100%|██████████| 2000/2000 [16:28<00:00,  2.02it/s]


Epoch 605 Mean Reward: 133.20546627044678


100%|██████████| 2000/2000 [17:17<00:00,  1.93it/s]


Epoch 606 Mean Reward: 91.32326961517334


100%|██████████| 2000/2000 [16:39<00:00,  2.00it/s]


Epoch 607 Mean Reward: 150.26317516326904


100%|██████████| 2000/2000 [16:39<00:00,  2.00it/s]


Epoch 608 Mean Reward: 127.19700220489501


100%|██████████| 2000/2000 [18:00<00:00,  1.85it/s]


Epoch 609 Mean Reward: 138.37345162200927


100%|██████████| 2000/2000 [17:24<00:00,  1.91it/s]


Epoch 610 Mean Reward: 144.96380766296386


100%|██████████| 2000/2000 [20:06<00:00,  1.66it/s]


Epoch 611 Mean Reward: 151.00641622161865


100%|██████████| 2000/2000 [18:20<00:00,  1.82it/s]


Epoch 612 Mean Reward: 141.9131918563843


100%|██████████| 2000/2000 [16:37<00:00,  2.00it/s]


Epoch 613 Mean Reward: 152.8047105484009


100%|██████████| 2000/2000 [18:17<00:00,  1.82it/s]


Epoch 614 Mean Reward: 126.74733024597168


100%|██████████| 2000/2000 [16:40<00:00,  2.00it/s]


Epoch 615 Mean Reward: 184.23896612548828


100%|██████████| 2000/2000 [16:04<00:00,  2.07it/s]


Epoch 616 Mean Reward: 140.28435119628907


100%|██████████| 2000/2000 [17:42<00:00,  1.88it/s]


Epoch 617 Mean Reward: 102.10220677947999


100%|██████████| 2000/2000 [17:36<00:00,  1.89it/s]


Epoch 618 Mean Reward: 149.72777632141114


100%|██████████| 2000/2000 [16:39<00:00,  2.00it/s]


Epoch 619 Mean Reward: 154.86837718200684


100%|██████████| 2000/2000 [17:01<00:00,  1.96it/s]


Epoch 620 Mean Reward: 159.35755757904053
Epoch 620 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 620 test with depth buffer:
Test Episode 1 Reward: 90.99954223632812
Test Episode 2 Reward: 332.2722625732422
Test Episode 3 Reward: 77.06663513183594
Test Episode 4 Reward: -95.450927734375
Test Episode 5 Reward: 135.5748748779297
Test Episode 6 Reward: 135.5748748779297
Test Episode 7 Reward: 135.5748748779297
Test Episode 8 Reward: 135.5748748779297
Test Episode 9 Reward: 135.5748748779297
Test Episode 10 Reward: -21.074264526367188
Average Test Reward (with depth buffer:) 106.16876220703125
Epoch 620 test without depth buffer:
Test Episode 1 Reward: -12.809844970703125
Test Episode 2 Reward: 33.280303955078125
Test Episode 3 Reward: 33.280303955078125
Test Episode 4 Reward: 31.77960205078125
Test Episode 5 Reward: 80.36111450195312
Test Episode 6 Reward: 33.280303955078125
Test Episode 7 Reward: -10.46368408203125
Test Episode 8 Reward: 45.54179382324219
Test Episode 9 Reward

100%|██████████| 2000/2000 [16:01<00:00,  2.08it/s]


Epoch 621 Mean Reward: 72.06015800476074


100%|██████████| 2000/2000 [15:56<00:00,  2.09it/s]


Epoch 622 Mean Reward: 88.50587399291992


100%|██████████| 2000/2000 [19:10<00:00,  1.74it/s]


Epoch 623 Mean Reward: 75.20925083160401


100%|██████████| 2000/2000 [21:03<00:00,  1.58it/s]


Epoch 624 Mean Reward: 76.22599542236328


100%|██████████| 2000/2000 [21:18<00:00,  1.56it/s]


Epoch 625 Mean Reward: 50.78673400878906


100%|██████████| 2000/2000 [21:38<00:00,  1.54it/s]


Epoch 626 Mean Reward: 59.90564235687256


100%|██████████| 2000/2000 [21:56<00:00,  1.52it/s]


Epoch 627 Mean Reward: 44.77944129180908


100%|██████████| 2000/2000 [19:25<00:00,  1.72it/s]


Epoch 628 Mean Reward: 97.98557064819336


100%|██████████| 2000/2000 [20:21<00:00,  1.64it/s]


Epoch 629 Mean Reward: 79.88436392211914


100%|██████████| 2000/2000 [29:22<00:00,  1.13it/s]


Epoch 630 Mean Reward: 75.88805688476563


100%|██████████| 2000/2000 [25:10<00:00,  1.32it/s]


Epoch 631 Mean Reward: 107.47658126831055


100%|██████████| 2000/2000 [22:46<00:00,  1.46it/s]


Epoch 632 Mean Reward: 109.5843740310669


100%|██████████| 2000/2000 [24:30<00:00,  1.36it/s]


Epoch 633 Mean Reward: 86.0924298171997


100%|██████████| 2000/2000 [29:02<00:00,  1.15it/s]


Epoch 634 Mean Reward: 101.76932778167725


100%|██████████| 2000/2000 [23:24<00:00,  1.42it/s]


Epoch 635 Mean Reward: 79.14474587249755


100%|██████████| 2000/2000 [24:15<00:00,  1.37it/s]


Epoch 636 Mean Reward: 90.041384765625


100%|██████████| 2000/2000 [23:24<00:00,  1.42it/s]


Epoch 637 Mean Reward: 95.58714163970947


100%|██████████| 2000/2000 [23:17<00:00,  1.43it/s]


Epoch 638 Mean Reward: 89.65977295684814


100%|██████████| 2000/2000 [22:14<00:00,  1.50it/s]


Epoch 639 Mean Reward: 102.54130960083008


100%|██████████| 2000/2000 [24:07<00:00,  1.38it/s]


Epoch 640 Mean Reward: 47.419989852905275
Epoch 640 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 640 test with depth buffer:
Test Episode 1 Reward: -115.98283386230469
Test Episode 2 Reward: -37.269805908203125
Test Episode 3 Reward: -115.98283386230469
Test Episode 4 Reward: -115.98283386230469
Test Episode 5 Reward: -4.9392852783203125
Test Episode 6 Reward: -115.98283386230469
Test Episode 7 Reward: 0.817138671875
Test Episode 8 Reward: 272.6472473144531
Test Episode 9 Reward: -115.98283386230469
Test Episode 10 Reward: -115.98283386230469
Average Test Reward (with depth buffer:) -46.46417083740234
Epoch 640 test without depth buffer:
Test Episode 1 Reward: -81.94874572753906
Test Episode 2 Reward: -81.94874572753906
Test Episode 3 Reward: -115.98362731933594
Test Episode 4 Reward: 31.7958984375
Test Episode 5 Reward: -81.94874572753906
Test Episode 6 Reward: -81.94874572753906
Test Episode 7 Reward: -87.9437255859375
Test Episode 8 Reward: -115.96609497070312
Test Episod

100%|██████████| 2000/2000 [24:33<00:00,  1.36it/s]


Epoch 641 Mean Reward: 75.8365107421875


100%|██████████| 2000/2000 [24:01<00:00,  1.39it/s]


Epoch 642 Mean Reward: 99.9058984298706


100%|██████████| 2000/2000 [23:12<00:00,  1.44it/s]


Epoch 643 Mean Reward: 130.7281398086548


100%|██████████| 2000/2000 [23:25<00:00,  1.42it/s]


Epoch 644 Mean Reward: 101.7898572921753


100%|██████████| 2000/2000 [23:05<00:00,  1.44it/s]


Epoch 645 Mean Reward: 89.96411670684815


100%|██████████| 2000/2000 [25:09<00:00,  1.32it/s]


Epoch 646 Mean Reward: 82.61873350524903


100%|██████████| 2000/2000 [24:15<00:00,  1.37it/s]


Epoch 647 Mean Reward: 111.74670109558106


100%|██████████| 2000/2000 [24:52<00:00,  1.34it/s]


Epoch 648 Mean Reward: 122.57371028900147


100%|██████████| 2000/2000 [23:24<00:00,  1.42it/s]


Epoch 649 Mean Reward: 105.94458131408692


100%|██████████| 2000/2000 [23:42<00:00,  1.41it/s]


Epoch 650 Mean Reward: 85.975506980896


100%|██████████| 2000/2000 [23:36<00:00,  1.41it/s]


Epoch 651 Mean Reward: 122.11826490783692


100%|██████████| 2000/2000 [24:44<00:00,  1.35it/s]


Epoch 652 Mean Reward: 133.2492008972168


100%|██████████| 2000/2000 [25:39<00:00,  1.30it/s]


Epoch 653 Mean Reward: 82.56588372802734


100%|██████████| 2000/2000 [26:53<00:00,  1.24it/s]


Epoch 654 Mean Reward: 94.88817157745362


100%|██████████| 2000/2000 [25:14<00:00,  1.32it/s]


Epoch 655 Mean Reward: 67.90135636901856


100%|██████████| 2000/2000 [35:24<00:00,  1.06s/it]


Epoch 656 Mean Reward: 98.35569133758545


100%|██████████| 2000/2000 [27:34<00:00,  1.21it/s]


Epoch 657 Mean Reward: 55.83406269836426


100%|██████████| 2000/2000 [25:10<00:00,  1.32it/s]


Epoch 658 Mean Reward: 72.87939279937744


100%|██████████| 2000/2000 [24:07<00:00,  1.38it/s]


Epoch 659 Mean Reward: 115.32045668792725


100%|██████████| 2000/2000 [32:27<00:00,  1.03it/s]


Epoch 660 Mean Reward: 131.76605548858643
Epoch 660 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 660 test with depth buffer:
Test Episode 1 Reward: 22.334991455078125
Test Episode 2 Reward: -115.99432373046875
Test Episode 3 Reward: -115.99432373046875
Test Episode 4 Reward: 22.650466918945312
Test Episode 5 Reward: -108.91926574707031
Test Episode 6 Reward: -115.99432373046875
Test Episode 7 Reward: -44.22377014160156
Test Episode 8 Reward: -47.30183410644531
Test Episode 9 Reward: -115.99432373046875
Test Episode 10 Reward: -115.97587585449219
Average Test Reward (with depth buffer:) -73.5412582397461
Epoch 660 test without depth buffer:
Test Episode 1 Reward: -115.73959350585938
Test Episode 2 Reward: -115.73959350585938
Test Episode 3 Reward: -115.97926330566406
Test Episode 4 Reward: -9.371826171875
Test Episode 5 Reward: -4.1090545654296875
Test Episode 6 Reward: -115.73959350585938
Test Episode 7 Reward: -49.804443359375
Test Episode 8 Reward: -115.73959350585938
Test

100%|██████████| 2000/2000 [35:24<00:00,  1.06s/it]


Epoch 661 Mean Reward: 75.08505500030518


100%|██████████| 2000/2000 [40:37<00:00,  1.22s/it]


Epoch 662 Mean Reward: 36.3979575958252


100%|██████████| 2000/2000 [24:41<00:00,  1.35it/s]


Epoch 663 Mean Reward: 31.635557258605957


100%|██████████| 2000/2000 [12:19<00:00,  2.70it/s]


Epoch 664 Mean Reward: 25.44496248626709


100%|██████████| 2000/2000 [12:15<00:00,  2.72it/s]


Epoch 665 Mean Reward: 49.377548431396484


100%|██████████| 2000/2000 [12:34<00:00,  2.65it/s]


Epoch 666 Mean Reward: 47.68574413299561


100%|██████████| 2000/2000 [22:33<00:00,  1.48it/s]


Epoch 667 Mean Reward: 22.604829025268554


100%|██████████| 2000/2000 [21:25<00:00,  1.56it/s]


Epoch 668 Mean Reward: 26.949697814941405


100%|██████████| 2000/2000 [19:59<00:00,  1.67it/s]


Epoch 669 Mean Reward: 41.39798935699463


100%|██████████| 2000/2000 [20:21<00:00,  1.64it/s]


Epoch 670 Mean Reward: 56.90867942047119


100%|██████████| 2000/2000 [16:34<00:00,  2.01it/s]


Epoch 671 Mean Reward: 81.03308572387695


100%|██████████| 2000/2000 [17:57<00:00,  1.86it/s]


Epoch 672 Mean Reward: 75.84894877624512


100%|██████████| 2000/2000 [17:45<00:00,  1.88it/s]


Epoch 673 Mean Reward: 69.22758190917969


100%|██████████| 2000/2000 [16:52<00:00,  1.97it/s]


Epoch 674 Mean Reward: 71.0726117477417


100%|██████████| 2000/2000 [17:29<00:00,  1.91it/s]


Epoch 675 Mean Reward: 52.43657514190674


100%|██████████| 2000/2000 [17:53<00:00,  1.86it/s]


Epoch 676 Mean Reward: 58.876421531677245


100%|██████████| 2000/2000 [16:58<00:00,  1.96it/s]


Epoch 677 Mean Reward: 73.0941176147461


100%|██████████| 2000/2000 [17:24<00:00,  1.91it/s]


Epoch 678 Mean Reward: 61.10403937530518


100%|██████████| 2000/2000 [17:29<00:00,  1.91it/s]


Epoch 679 Mean Reward: 79.21310285949707


100%|██████████| 2000/2000 [17:28<00:00,  1.91it/s]


Epoch 680 Mean Reward: 87.8566658782959
Epoch 680 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 680 test with depth buffer:
Test Episode 1 Reward: 25.678253173828125
Test Episode 2 Reward: -115.99432373046875
Test Episode 3 Reward: 27.560943603515625
Test Episode 4 Reward: -115.99432373046875
Test Episode 5 Reward: -27.799057006835938
Test Episode 6 Reward: -115.99432373046875
Test Episode 7 Reward: 3.7785491943359375
Test Episode 8 Reward: -18.870513916015625
Test Episode 9 Reward: -115.99432373046875
Test Episode 10 Reward: -115.99432373046875
Average Test Reward (with depth buffer:) -56.962344360351565
Epoch 680 test without depth buffer:
Test Episode 1 Reward: -114.27468872070312
Test Episode 2 Reward: -14.715606689453125
Test Episode 3 Reward: 26.71002197265625
Test Episode 4 Reward: -12.417953491210938
Test Episode 5 Reward: -114.27468872070312
Test Episode 6 Reward: 18.274078369140625
Test Episode 7 Reward: -114.27468872070312
Test Episode 8 Reward: -114.27468872070312

100%|██████████| 2000/2000 [19:05<00:00,  1.75it/s]


Epoch 681 Mean Reward: 44.55902276611328


100%|██████████| 2000/2000 [18:54<00:00,  1.76it/s]


Epoch 682 Mean Reward: 44.75604026031494


100%|██████████| 2000/2000 [18:23<00:00,  1.81it/s]


Epoch 683 Mean Reward: 39.1462370223999


100%|██████████| 2000/2000 [18:22<00:00,  1.81it/s]


Epoch 684 Mean Reward: 11.93814974975586


100%|██████████| 2000/2000 [18:40<00:00,  1.79it/s]


Epoch 685 Mean Reward: 34.29029249572754


100%|██████████| 2000/2000 [18:13<00:00,  1.83it/s]


Epoch 686 Mean Reward: 51.31032292175293


100%|██████████| 2000/2000 [17:53<00:00,  1.86it/s]


Epoch 687 Mean Reward: 31.69300373840332


100%|██████████| 2000/2000 [16:59<00:00,  1.96it/s]


Epoch 688 Mean Reward: 37.92448149108887


100%|██████████| 2000/2000 [17:27<00:00,  1.91it/s]


Epoch 689 Mean Reward: 15.112093841552735


100%|██████████| 2000/2000 [22:55<00:00,  1.45it/s]


Epoch 690 Mean Reward: 18.76194896697998


100%|██████████| 2000/2000 [20:56<00:00,  1.59it/s]


Epoch 691 Mean Reward: 33.715715881347656


100%|██████████| 2000/2000 [19:53<00:00,  1.68it/s]


Epoch 692 Mean Reward: 21.576040740966796


100%|██████████| 2000/2000 [16:05<00:00,  2.07it/s]


Epoch 693 Mean Reward: 35.775812324523926


100%|██████████| 2000/2000 [17:13<00:00,  1.93it/s]


Epoch 694 Mean Reward: 15.166252151489259


100%|██████████| 2000/2000 [17:51<00:00,  1.87it/s]


Epoch 695 Mean Reward: 26.440825790405274


100%|██████████| 2000/2000 [23:09<00:00,  1.44it/s]


Epoch 696 Mean Reward: 11.303585159301758


100%|██████████| 2000/2000 [21:32<00:00,  1.55it/s]


Epoch 697 Mean Reward: 9.689432640075683


100%|██████████| 2000/2000 [38:32<00:00,  1.16s/it]


Epoch 698 Mean Reward: 12.989798851013184


100%|██████████| 2000/2000 [25:29<00:00,  1.31it/s]


Epoch 699 Mean Reward: 26.676347373962404


100%|██████████| 2000/2000 [23:00<00:00,  1.45it/s]


Epoch 700 Mean Reward: 8.076030250549316
Epoch 700 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 700 test with depth buffer:
Test Episode 1 Reward: -92.76988220214844
Test Episode 2 Reward: 53.63639831542969
Test Episode 3 Reward: -84.52760314941406
Test Episode 4 Reward: -92.76988220214844
Test Episode 5 Reward: -92.76988220214844
Test Episode 6 Reward: -92.76988220214844
Test Episode 7 Reward: -92.76988220214844
Test Episode 8 Reward: -92.76988220214844
Test Episode 9 Reward: -92.76988220214844
Test Episode 10 Reward: 24.695693969726562
Average Test Reward (with depth buffer:) -65.55846862792968
Epoch 700 test without depth buffer:
Test Episode 1 Reward: 40.16215515136719
Test Episode 2 Reward: 40.16215515136719
Test Episode 3 Reward: 66.24662780761719
Test Episode 4 Reward: 40.16215515136719
Test Episode 5 Reward: 40.16215515136719
Test Episode 6 Reward: 7.55377197265625
Test Episode 7 Reward: 40.16215515136719
Test Episode 8 Reward: -111.21755981445312
Test Episode 9 Rewa

100%|██████████| 2000/2000 [21:46<00:00,  1.53it/s]


Epoch 701 Mean Reward: 41.262461738586424


100%|██████████| 2000/2000 [22:32<00:00,  1.48it/s]


Epoch 702 Mean Reward: 43.40277784729004


100%|██████████| 2000/2000 [21:55<00:00,  1.52it/s]


Epoch 703 Mean Reward: 50.01367323303223


100%|██████████| 2000/2000 [23:28<00:00,  1.42it/s]


Epoch 704 Mean Reward: 45.19113607788086


100%|██████████| 2000/2000 [21:41<00:00,  1.54it/s]


Epoch 705 Mean Reward: 30.84267943572998


100%|██████████| 2000/2000 [22:24<00:00,  1.49it/s]


Epoch 706 Mean Reward: 44.92321465301514


100%|██████████| 2000/2000 [21:46<00:00,  1.53it/s]


Epoch 707 Mean Reward: 52.704435195922855


100%|██████████| 2000/2000 [20:41<00:00,  1.61it/s]


Epoch 708 Mean Reward: 52.76175427246094


100%|██████████| 2000/2000 [20:36<00:00,  1.62it/s]


Epoch 709 Mean Reward: 37.90297833251953


100%|██████████| 2000/2000 [21:14<00:00,  1.57it/s]


Epoch 710 Mean Reward: 24.055689682006836


100%|██████████| 2000/2000 [22:47<00:00,  1.46it/s]


Epoch 711 Mean Reward: 51.106491790771486


100%|██████████| 2000/2000 [21:18<00:00,  1.56it/s]


Epoch 712 Mean Reward: 88.81249747467041


100%|██████████| 2000/2000 [22:29<00:00,  1.48it/s]


Epoch 713 Mean Reward: 79.10213314056396


100%|██████████| 2000/2000 [21:19<00:00,  1.56it/s]


Epoch 714 Mean Reward: 63.399128814697264


100%|██████████| 2000/2000 [20:09<00:00,  1.65it/s]


Epoch 715 Mean Reward: 61.20968148040772


100%|██████████| 2000/2000 [17:31<00:00,  1.90it/s]


Epoch 716 Mean Reward: 19.347790473937987


100%|██████████| 2000/2000 [16:45<00:00,  1.99it/s]


Epoch 717 Mean Reward: 13.269684211730958


100%|██████████| 2000/2000 [15:38<00:00,  2.13it/s]


Epoch 718 Mean Reward: 10.368951301574707


100%|██████████| 2000/2000 [27:30<00:00,  1.21it/s]


Epoch 719 Mean Reward: 27.163397789001465


100%|██████████| 2000/2000 [26:43<00:00,  1.25it/s]


Epoch 720 Mean Reward: 31.118589820861818
Epoch 720 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 720 test with depth buffer:
Test Episode 1 Reward: 23.300460815429688
Test Episode 2 Reward: 30.386016845703125
Test Episode 3 Reward: 96.55271911621094
Test Episode 4 Reward: -98.90727233886719
Test Episode 5 Reward: -24.501922607421875
Test Episode 6 Reward: -98.90727233886719
Test Episode 7 Reward: -16.4302978515625
Test Episode 8 Reward: -95.79220581054688
Test Episode 9 Reward: -30.43353271484375
Test Episode 10 Reward: -98.90727233886719
Average Test Reward (with depth buffer:) -31.36405792236328
Epoch 720 test without depth buffer:
Test Episode 1 Reward: -21.649322509765625
Test Episode 2 Reward: 5.9858245849609375
Test Episode 3 Reward: -115.4254150390625
Test Episode 4 Reward: -115.4254150390625
Test Episode 5 Reward: -115.4254150390625
Test Episode 6 Reward: -115.4254150390625
Test Episode 7 Reward: -115.4254150390625
Test Episode 8 Reward: -115.4254150390625
Test Episo

100%|██████████| 2000/2000 [21:12<00:00,  1.57it/s]


Epoch 721 Mean Reward: 67.35145809173584


100%|██████████| 2000/2000 [30:40<00:00,  1.09it/s]


Epoch 722 Mean Reward: 41.122126258850095


100%|██████████| 2000/2000 [33:20<00:00,  1.00s/it]


Epoch 723 Mean Reward: 43.50893435668945


100%|██████████| 2000/2000 [28:42<00:00,  1.16it/s]


Epoch 724 Mean Reward: 63.20879425811768


100%|██████████| 2000/2000 [31:39<00:00,  1.05it/s]


Epoch 725 Mean Reward: 26.843808166503905


100%|██████████| 2000/2000 [14:51<00:00,  2.24it/s]


Epoch 726 Mean Reward: 36.887479301452636


100%|██████████| 2000/2000 [11:38<00:00,  2.86it/s]


Epoch 727 Mean Reward: 72.50199272918701


100%|██████████| 2000/2000 [12:19<00:00,  2.71it/s]


Epoch 728 Mean Reward: 35.260677436828615


100%|██████████| 2000/2000 [15:53<00:00,  2.10it/s]


Epoch 729 Mean Reward: 59.22575388336182


100%|██████████| 2000/2000 [21:22<00:00,  1.56it/s]


Epoch 730 Mean Reward: 46.91584147644043


100%|██████████| 2000/2000 [29:54<00:00,  1.11it/s]


Epoch 731 Mean Reward: 62.99047199249267


100%|██████████| 2000/2000 [22:24<00:00,  1.49it/s]


Epoch 732 Mean Reward: 54.5724997253418


100%|██████████| 2000/2000 [19:45<00:00,  1.69it/s]


Epoch 733 Mean Reward: 46.49786638641358


100%|██████████| 2000/2000 [17:25<00:00,  1.91it/s]


Epoch 734 Mean Reward: 53.21686295318604


100%|██████████| 2000/2000 [21:24<00:00,  1.56it/s]


Epoch 735 Mean Reward: 29.804854919433595


100%|██████████| 2000/2000 [18:55<00:00,  1.76it/s]


Epoch 736 Mean Reward: 51.746689613342284


100%|██████████| 2000/2000 [17:15<00:00,  1.93it/s]


Epoch 737 Mean Reward: 32.7599080581665


100%|██████████| 2000/2000 [17:13<00:00,  1.93it/s]


Epoch 738 Mean Reward: 42.844823791503906


100%|██████████| 2000/2000 [17:12<00:00,  1.94it/s]


Epoch 739 Mean Reward: 32.59034617614746


100%|██████████| 2000/2000 [17:52<00:00,  1.87it/s]


Epoch 740 Mean Reward: 46.25777265930176
Epoch 740 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 740 test with depth buffer:
Test Episode 1 Reward: -52.649169921875
Test Episode 2 Reward: -52.649169921875
Test Episode 3 Reward: -52.649169921875
Test Episode 4 Reward: -52.649169921875
Test Episode 5 Reward: -54.83174133300781
Test Episode 6 Reward: 38.234710693359375
Test Episode 7 Reward: -52.649169921875
Test Episode 8 Reward: -52.649169921875
Test Episode 9 Reward: -52.649169921875
Test Episode 10 Reward: -102.01039123535156
Average Test Reward (with depth buffer:) -48.7151611328125
Epoch 740 test without depth buffer:
Test Episode 1 Reward: -92.84638977050781
Test Episode 2 Reward: -90.14244079589844
Test Episode 3 Reward: -28.068023681640625
Test Episode 4 Reward: -84.12286376953125
Test Episode 5 Reward: -113.37724304199219
Test Episode 6 Reward: -115.98773193359375
Test Episode 7 Reward: -107.74009704589844
Test Episode 8 Reward: -115.98811340332031
Test Episode 9 Rewar

100%|██████████| 2000/2000 [16:24<00:00,  2.03it/s]


Epoch 741 Mean Reward: 76.09747392272949


100%|██████████| 2000/2000 [15:43<00:00,  2.12it/s]


Epoch 742 Mean Reward: 73.56214418792725


100%|██████████| 2000/2000 [14:33<00:00,  2.29it/s]


Epoch 743 Mean Reward: 71.30915802764892


100%|██████████| 2000/2000 [13:06<00:00,  2.54it/s]


Epoch 744 Mean Reward: 109.5158465499878


100%|██████████| 2000/2000 [13:58<00:00,  2.38it/s]


Epoch 745 Mean Reward: 101.99529084014893


100%|██████████| 2000/2000 [13:53<00:00,  2.40it/s]


Epoch 746 Mean Reward: 108.44581241607666


100%|██████████| 2000/2000 [15:02<00:00,  2.22it/s]


Epoch 747 Mean Reward: 72.99468124389648


100%|██████████| 2000/2000 [14:33<00:00,  2.29it/s]


Epoch 748 Mean Reward: 97.00010134124756


100%|██████████| 2000/2000 [13:38<00:00,  2.44it/s]


Epoch 749 Mean Reward: 93.05229027557372


100%|██████████| 2000/2000 [15:08<00:00,  2.20it/s]


Epoch 750 Mean Reward: 81.34364498901367


100%|██████████| 2000/2000 [16:09<00:00,  2.06it/s]


Epoch 751 Mean Reward: 99.4204245147705


100%|██████████| 2000/2000 [17:51<00:00,  1.87it/s]


Epoch 752 Mean Reward: 132.59986245727538


100%|██████████| 2000/2000 [16:41<00:00,  2.00it/s]


Epoch 753 Mean Reward: 116.30586557006836


100%|██████████| 2000/2000 [16:27<00:00,  2.03it/s]


Epoch 754 Mean Reward: 94.95166171264648


100%|██████████| 2000/2000 [17:44<00:00,  1.88it/s]


Epoch 755 Mean Reward: 94.6536591796875


100%|██████████| 2000/2000 [17:10<00:00,  1.94it/s]


Epoch 756 Mean Reward: 132.40667488098146


100%|██████████| 2000/2000 [16:31<00:00,  2.02it/s]


Epoch 757 Mean Reward: 147.53843647766112


100%|██████████| 2000/2000 [16:39<00:00,  2.00it/s]


Epoch 758 Mean Reward: 100.79384805297852


100%|██████████| 2000/2000 [15:10<00:00,  2.20it/s]


Epoch 759 Mean Reward: 120.49726682281494


100%|██████████| 2000/2000 [14:39<00:00,  2.27it/s]


Epoch 760 Mean Reward: 141.9652543411255
Epoch 760 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 760 test with depth buffer:
Test Episode 1 Reward: -115.98960876464844
Test Episode 2 Reward: -115.99557495117188
Test Episode 3 Reward: -115.99557495117188
Test Episode 4 Reward: -115.99557495117188
Test Episode 5 Reward: -115.99557495117188
Test Episode 6 Reward: -115.99557495117188
Test Episode 7 Reward: -115.99716186523438
Test Episode 8 Reward: -115.97715759277344
Test Episode 9 Reward: -17.1475830078125
Test Episode 10 Reward: -115.99557495117188
Average Test Reward (with depth buffer:) -106.10849609375
Epoch 760 test without depth buffer:
Test Episode 1 Reward: -93.6158447265625
Test Episode 2 Reward: -93.6158447265625
Test Episode 3 Reward: -93.6158447265625
Test Episode 4 Reward: -93.6158447265625
Test Episode 5 Reward: 188.04367065429688
Test Episode 6 Reward: 93.506591796875
Test Episode 7 Reward: -93.6158447265625
Test Episode 8 Reward: 264.5420379638672
Test Episode 9

100%|██████████| 2000/2000 [16:13<00:00,  2.05it/s]


Epoch 761 Mean Reward: 89.67448545074463


100%|██████████| 2000/2000 [16:43<00:00,  1.99it/s]


Epoch 762 Mean Reward: 56.99277982330322


100%|██████████| 2000/2000 [16:37<00:00,  2.01it/s]


Epoch 763 Mean Reward: 68.00264896392822


100%|██████████| 2000/2000 [15:07<00:00,  2.20it/s]


Epoch 764 Mean Reward: 104.44005847930909


100%|██████████| 2000/2000 [14:52<00:00,  2.24it/s]


Epoch 765 Mean Reward: 122.00817618560791


100%|██████████| 2000/2000 [15:29<00:00,  2.15it/s]


Epoch 766 Mean Reward: 85.83514225769044


100%|██████████| 2000/2000 [15:55<00:00,  2.09it/s]


Epoch 767 Mean Reward: 73.47627837371826


100%|██████████| 2000/2000 [15:25<00:00,  2.16it/s]


Epoch 768 Mean Reward: 104.40550343322754


100%|██████████| 2000/2000 [15:23<00:00,  2.16it/s]


Epoch 769 Mean Reward: 120.03670322418213


100%|██████████| 2000/2000 [15:00<00:00,  2.22it/s]


Epoch 770 Mean Reward: 110.7371215133667


100%|██████████| 2000/2000 [17:02<00:00,  1.96it/s]


Epoch 771 Mean Reward: 106.24475870513916


100%|██████████| 2000/2000 [18:27<00:00,  1.81it/s]


Epoch 772 Mean Reward: 115.27384718322755


100%|██████████| 2000/2000 [20:21<00:00,  1.64it/s]


Epoch 773 Mean Reward: 144.86359047698974


100%|██████████| 2000/2000 [20:24<00:00,  1.63it/s]


Epoch 774 Mean Reward: 136.9851689529419


100%|██████████| 2000/2000 [21:15<00:00,  1.57it/s]


Epoch 775 Mean Reward: 137.57352121734618


100%|██████████| 2000/2000 [20:46<00:00,  1.61it/s]


Epoch 776 Mean Reward: 151.34679611206056


100%|██████████| 2000/2000 [20:07<00:00,  1.66it/s]


Epoch 777 Mean Reward: 112.762522605896


100%|██████████| 2000/2000 [16:22<00:00,  2.04it/s]


Epoch 778 Mean Reward: 144.54669921875


100%|██████████| 2000/2000 [15:37<00:00,  2.13it/s]


Epoch 779 Mean Reward: 134.80381463623047


100%|██████████| 2000/2000 [14:52<00:00,  2.24it/s]


Epoch 780 Mean Reward: 142.375649269104
Epoch 780 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 780 test with depth buffer:
Test Episode 1 Reward: 101.92189025878906
Test Episode 2 Reward: 101.92189025878906
Test Episode 3 Reward: 101.92189025878906
Test Episode 4 Reward: -10.142074584960938
Test Episode 5 Reward: -51.24005126953125
Test Episode 6 Reward: 101.92189025878906
Test Episode 7 Reward: -111.17709350585938
Test Episode 8 Reward: -64.21632385253906
Test Episode 9 Reward: 101.92189025878906
Test Episode 10 Reward: -12.232879638671875
Average Test Reward (with depth buffer:) 26.06010284423828
Epoch 780 test without depth buffer:
Test Episode 1 Reward: 35.81721496582031
Test Episode 2 Reward: -43.940765380859375
Test Episode 3 Reward: -43.940765380859375
Test Episode 4 Reward: 35.502685546875
Test Episode 5 Reward: -43.940765380859375
Test Episode 6 Reward: -115.99226379394531
Test Episode 7 Reward: -43.940765380859375
Test Episode 8 Reward: -115.71388244628906
Test Epi

100%|██████████| 2000/2000 [16:32<00:00,  2.01it/s]


Epoch 781 Mean Reward: 72.64876225280761


100%|██████████| 2000/2000 [15:47<00:00,  2.11it/s]


Epoch 782 Mean Reward: 91.96622479248047


100%|██████████| 2000/2000 [16:00<00:00,  2.08it/s]


Epoch 783 Mean Reward: 83.33280110168457


100%|██████████| 2000/2000 [15:39<00:00,  2.13it/s]


Epoch 784 Mean Reward: 93.65678923797607


100%|██████████| 2000/2000 [14:43<00:00,  2.26it/s]


Epoch 785 Mean Reward: 137.5120312805176


100%|██████████| 2000/2000 [15:33<00:00,  2.14it/s]


Epoch 786 Mean Reward: 111.30679461669922


100%|██████████| 2000/2000 [15:45<00:00,  2.12it/s]


Epoch 787 Mean Reward: 113.12024949645996


100%|██████████| 2000/2000 [15:31<00:00,  2.15it/s]


Epoch 788 Mean Reward: 113.49595520019531


100%|██████████| 2000/2000 [15:53<00:00,  2.10it/s]


Epoch 789 Mean Reward: 91.40468016052246


100%|██████████| 2000/2000 [15:23<00:00,  2.17it/s]


Epoch 790 Mean Reward: 111.55480368041992


100%|██████████| 2000/2000 [14:29<00:00,  2.30it/s]


Epoch 791 Mean Reward: 115.34173579406739


100%|██████████| 2000/2000 [15:29<00:00,  2.15it/s]


Epoch 792 Mean Reward: 91.72096906280518


100%|██████████| 2000/2000 [14:34<00:00,  2.29it/s]


Epoch 793 Mean Reward: 103.48754898834228


100%|██████████| 2000/2000 [14:47<00:00,  2.25it/s]


Epoch 794 Mean Reward: 117.849969581604


100%|██████████| 2000/2000 [16:20<00:00,  2.04it/s]


Epoch 795 Mean Reward: 55.81482398986817


100%|██████████| 2000/2000 [14:59<00:00,  2.22it/s]


Epoch 796 Mean Reward: 95.15359393310547


100%|██████████| 2000/2000 [16:00<00:00,  2.08it/s]


Epoch 797 Mean Reward: 85.41759976959229


100%|██████████| 2000/2000 [15:35<00:00,  2.14it/s]


Epoch 798 Mean Reward: 102.64886380767823


100%|██████████| 2000/2000 [17:51<00:00,  1.87it/s]


Epoch 799 Mean Reward: 68.1441583404541


100%|██████████| 2000/2000 [16:41<00:00,  2.00it/s]


Epoch 800 Mean Reward: 86.59523091125489
Epoch 800 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 800 test with depth buffer:
Test Episode 1 Reward: -66.40925598144531
Test Episode 2 Reward: -115.19439697265625
Test Episode 3 Reward: -66.40925598144531
Test Episode 4 Reward: -66.40925598144531
Test Episode 5 Reward: -66.40925598144531
Test Episode 6 Reward: -66.40925598144531
Test Episode 7 Reward: -66.40925598144531
Test Episode 8 Reward: -66.40925598144531
Test Episode 9 Reward: -66.40925598144531
Test Episode 10 Reward: -17.944000244140625
Average Test Reward (with depth buffer:) -66.44124450683594
Epoch 800 test without depth buffer:
Test Episode 1 Reward: -44.82696533203125
Test Episode 2 Reward: -44.82696533203125
Test Episode 3 Reward: -18.439163208007812
Test Episode 4 Reward: -44.82696533203125
Test Episode 5 Reward: -44.82696533203125
Test Episode 6 Reward: -31.977081298828125
Test Episode 7 Reward: 2.681060791015625
Test Episode 8 Reward: -44.82696533203125
Test Epi

In [6]:
#Get a list of checkpoints saved during training

ckpts = tf.train.get_checkpoint_state('checkpoints').all_model_checkpoint_paths

#Test the trained model from a certain checkpoint by only choosing actions with a greedy strategy

for i in range(len(ckpts)):
    test_reward = test_agent(DQN, num_episodes=20,
                             training=False,
                             load_model=True,
                             depth=True,
                             model_dir=ckpts[i])
    print('Average Test Reward (with depth buffer):', test_reward)

    test_reward = test_agent(DQN, num_episodes=20,
                             training=False,
                             load_model=True,
                             depth=False,
                             model_dir=ckpts[i])
    print('Average Test Reward (without depth buffer):', test_reward)


Loading model from checkpoints\deadly_corridor.ckpt-420
INFO:tensorflow:Restoring parameters from checkpoints\deadly_corridor.ckpt-420
Test Episode 1 Reward: -50.31547546386719
Test Episode 2 Reward: -104.49089050292969
Test Episode 3 Reward: -104.49089050292969
Test Episode 4 Reward: -104.49089050292969
Test Episode 5 Reward: -11.073715209960938
Test Episode 6 Reward: -104.49089050292969
Test Episode 7 Reward: -52.17103576660156
Test Episode 8 Reward: -5.919708251953125
Test Episode 9 Reward: -104.49089050292969
Test Episode 10 Reward: -105.98927307128906
Test Episode 11 Reward: -108.3341064453125
Test Episode 12 Reward: -104.49089050292969
Test Episode 13 Reward: -114.02394104003906
Test Episode 14 Reward: -115.97746276855469
Test Episode 15 Reward: -104.49089050292969
Test Episode 16 Reward: -104.49089050292969
Test Episode 17 Reward: -102.34445190429688
Test Episode 18 Reward: -104.49089050292969
Test Episode 19 Reward: -104.49089050292969
Test Episode 20 Reward: -104.4908905029296

Test Episode 16 Reward: -110.24053955078125
Test Episode 17 Reward: -41.64593505859375
Test Episode 18 Reward: -53.71795654296875
Test Episode 19 Reward: -110.24053955078125
Test Episode 20 Reward: -98.64634704589844
Average Test Reward (without depth buffer): -72.8001205444336
Loading model from checkpoints\deadly_corridor.ckpt-500
INFO:tensorflow:Restoring parameters from checkpoints\deadly_corridor.ckpt-500
Test Episode 1 Reward: -75.71820068359375
Test Episode 2 Reward: -75.71820068359375
Test Episode 3 Reward: -42.46998596191406
Test Episode 4 Reward: -75.71820068359375
Test Episode 5 Reward: 218.8099365234375
Test Episode 6 Reward: 8.820419311523438
Test Episode 7 Reward: 1.2514801025390625
Test Episode 8 Reward: -75.71820068359375
Test Episode 9 Reward: -33.17030334472656
Test Episode 10 Reward: -8.01446533203125
Test Episode 11 Reward: -56.063262939453125
Test Episode 12 Reward: -1.1046142578125
Test Episode 13 Reward: -115.99250793457031
Test Episode 14 Reward: -111.2575531005

Test Episode 11 Reward: -76.25241088867188
Test Episode 12 Reward: -108.96290588378906
Test Episode 13 Reward: -52.89399719238281
Test Episode 14 Reward: -3.409149169921875
Test Episode 15 Reward: -3.409149169921875
Test Episode 16 Reward: -3.409149169921875
Test Episode 17 Reward: -3.409149169921875
Test Episode 18 Reward: -3.409149169921875
Test Episode 19 Reward: -3.409149169921875
Test Episode 20 Reward: -3.409149169921875
Average Test Reward (without depth buffer): -16.835844421386717
Loading model from checkpoints\deadly_corridor.ckpt-580
INFO:tensorflow:Restoring parameters from checkpoints\deadly_corridor.ckpt-580
Test Episode 1 Reward: -78.36085510253906
Test Episode 2 Reward: -78.36085510253906
Test Episode 3 Reward: 10.797943115234375
Test Episode 4 Reward: -78.36085510253906
Test Episode 5 Reward: -113.39244079589844
Test Episode 6 Reward: 22.413848876953125
Test Episode 7 Reward: -14.540802001953125
Test Episode 8 Reward: -115.93325805664062
Test Episode 9 Reward: -78.3608

Test Episode 5 Reward: -80.00469970703125
Test Episode 6 Reward: 12.20904541015625
Test Episode 7 Reward: 4.9593658447265625
Test Episode 8 Reward: 7.1689300537109375
Test Episode 9 Reward: -31.99688720703125
Test Episode 10 Reward: -80.00469970703125
Test Episode 11 Reward: -28.858154296875
Test Episode 12 Reward: -108.89469909667969
Test Episode 13 Reward: -80.00469970703125
Test Episode 14 Reward: -80.00469970703125
Test Episode 15 Reward: -80.00469970703125
Test Episode 16 Reward: -80.00469970703125
Test Episode 17 Reward: -80.00469970703125
Test Episode 18 Reward: -69.26762390136719
Test Episode 19 Reward: -80.00469970703125
Test Episode 20 Reward: -80.00469970703125
Average Test Reward (without depth buffer): -47.94685440063476
Loading model from checkpoints\deadly_corridor.ckpt-660
INFO:tensorflow:Restoring parameters from checkpoints\deadly_corridor.ckpt-660
Test Episode 1 Reward: -115.97689819335938
Test Episode 2 Reward: -39.63262939453125
Test Episode 3 Reward: -115.97689819

Test Episode 1 Reward: -50.73466491699219
Test Episode 2 Reward: -115.97090148925781
Test Episode 3 Reward: -115.99362182617188
Test Episode 4 Reward: -115.97090148925781
Test Episode 5 Reward: 1.652069091796875
Test Episode 6 Reward: 4.5254974365234375
Test Episode 7 Reward: -50.81666564941406
Test Episode 8 Reward: -104.46783447265625
Test Episode 9 Reward: 78.68965148925781
Test Episode 10 Reward: -115.97090148925781
Test Episode 11 Reward: -50.38909912109375
Test Episode 12 Reward: -115.97090148925781
Test Episode 13 Reward: -20.673110961914062
Test Episode 14 Reward: -31.453216552734375
Test Episode 15 Reward: -115.97090148925781
Test Episode 16 Reward: -91.30203247070312
Test Episode 17 Reward: -115.97090148925781
Test Episode 18 Reward: 276.3733215332031
Test Episode 19 Reward: -38.200042724609375
Test Episode 20 Reward: -115.97090148925781
Average Test Reward (without depth buffer): -50.229302978515626
Loading model from checkpoints\deadly_corridor.ckpt-740
INFO:tensorflow:Rest

Test Episode 19 Reward: 34.233184814453125
Test Episode 20 Reward: -44.0439453125
Average Test Reward (with depth buffer): -26.17293701171875
Loading model from checkpoints\deadly_corridor.ckpt-800
INFO:tensorflow:Restoring parameters from checkpoints\deadly_corridor.ckpt-800
Test Episode 1 Reward: 15.135162353515625
Test Episode 2 Reward: -61.18501281738281
Test Episode 3 Reward: -67.76820373535156
Test Episode 4 Reward: 20.694473266601562
Test Episode 5 Reward: 15.135162353515625
Test Episode 6 Reward: 15.135162353515625
Test Episode 7 Reward: -57.496124267578125
Test Episode 8 Reward: -24.960617065429688
Test Episode 9 Reward: 14.34906005859375
Test Episode 10 Reward: 15.135162353515625
Test Episode 11 Reward: 15.135162353515625
Test Episode 12 Reward: 6.143341064453125
Test Episode 13 Reward: 15.135162353515625
Test Episode 14 Reward: 15.135162353515625
Test Episode 15 Reward: 15.135162353515625
Test Episode 16 Reward: -24.176712036132812
Test Episode 17 Reward: -33.49822998046875
