In [1]:
import importlib.util
import time

import tensorflow as tf
import numpy as np

from skimage.transform import rescale
from tqdm import trange
from IPython.display import HTML

#Import the vizdoom package as "vd" since it can't be installed normally on Windows

vd_location = 'C:/Anaconda3/envs/doom/Lib/site-packages/vizdoom/vizdoom.pyd'
vizdoom = importlib.util.spec_from_file_location('vizdoom',
                                                 vd_location)
vd = importlib.util.module_from_spec(vizdoom)
vizdoom.loader.exec_module(vd)


In [2]:
#Specify the game scenario and the screen format/resolution

game = vd.DoomGame()
game.set_screen_format(vd.ScreenFormat.GRAY8)
game.set_depth_buffer_enabled(True)
game.set_screen_resolution(vd.ScreenResolution.RES_160X120)
game.load_config('deadly_corridor.cfg')

down_sample_ratio = 1
width = int(game.get_screen_width()*down_sample_ratio)
height = int(game.get_screen_height()*down_sample_ratio)
channels = game.get_screen_channels() + int(game.is_depth_buffer_enabled())

#Specify the available actions in the scenario

available_actions = game.get_available_buttons()
actions = [list(ohe) for ohe in list(np.identity(len(available_actions)))]
num_actions = len(available_actions)

#Specify the Q-network learning parameters

frame_delay = 12
buffer_size = 75000
epochs = 800
steps_per_epoch = 2000
learning_rate = 0.0025
gamma = 0
start_epsilon = 1.0
end_epsilon = 0.1
batch_size = 100
load_model = False
save_model = True
model_dir = './checkpoints/deadly_corridor.ckpt'
num_ckpts = 20


In [3]:
#Create a buffer object that holds a set of training experiences (state-action-reward tuples)

class Buffer():
    def __init__(self, size=1000):
        self.buffer = list()
        self.length = len(self.buffer)
        self.size = size
        
#Add a new experience to the buffer (remove the oldest experience if the buffer is already full)
        
    def add_experience(self, experience):
        if self.length + 1 >= self.size:
            self.buffer[0:(self.length + 1) - self.size] = []
        
        self.buffer.append(experience)
        self.length = len(self.buffer)
            
#Return a batch of experience arrays randomly sampled from the buffer
            
    def sample_buffer(self, sample_size):
        sample = np.random.randint(self.length, size=sample_size)
        s1 = np.concatenate([self.buffer[idx][0] for idx in sample], axis=0)
        a = np.array([self.buffer[idx][1] for idx in sample])
        r = np.array([self.buffer[idx][2] for idx in sample])
        s2 = np.concatenate([self.buffer[idx][3] for idx in sample], axis=0)
        terminal = np.array([self.buffer[idx][4] for idx in sample], dtype=np.int32)
        
        return s1, a, r, s2, terminal

#Downsample and normalize an image array representing the game state at a given time stamp

def preprocess(image, down_sample_ratio=1):
    if float(down_sample_ratio) != 1.0:
        image = rescale(image=image, scale=down_sample_ratio, mode='reflect')
    image = image.astype(np.float32)
    image = np.expand_dims(image, axis=0)

    return image

#Test the agent using a currently training or previously trained model

def test_agent(model, num_episodes, load_model, depth, training=True, session=None, model_dir=None):
    if load_model == True:
        sess = tf.Session()
        print('Loading model from', model_dir)
        tf.train.Saver().restore(sess, model_dir)
        
#Require an existing session if a pretrained model isn't provided
        
    elif load_model == False:
        sess = session

    game.set_sound_enabled(True)
    episode_rewards = list()
    
#Avoid reinitializing the game if this was already done by the training process
    
    if training == False:
        game.init()

    for i in range(num_episodes):
        game.new_episode()
    
        while not game.is_episode_finished():
            state = game.get_state()
            
            if depth == False:
                depth_buffer = np.zeros(state.screen_buffer.shape)
            elif depth == True:
                depth_buffer = state.depth_buffer
                
            state_buffer = np.stack((state.screen_buffer,
                                     depth_buffer), axis=-1)
            state1 = preprocess(state_buffer, down_sample_ratio)
            action = model.choose_action(sess, state1)[0]
            reward = game.make_action(actions[action])
            
#Add a delay between each time step so that the episodes occur at normal speed

            time.sleep(0.02)
        
        episode_rewards.append(game.get_total_reward())
        print('Test Episode {} Reward: {}'.format(i + 1, game.get_total_reward()))
        time.sleep(1)
    
#Avoid ending the game so that the training process can continue
    
    if training == False:
        game.close()
    
    return np.mean(episode_rewards)


In [4]:
#Create a Q-network to estimate values and choose actions for a given state

class Q_network():
    def __init__(self, network_name, height, width, channels, learning_rate=0.001):
        self.learning_rate = learning_rate
        self.s_t = tf.placeholder(tf.float32,
                                  shape=[None, height, width, channels],
                                  name=network_name + '_state'
                                 )
        self.a_t = tf.placeholder(tf.int32,
                                  shape=[None],
                                  name=network_name + '_action'
                                 )
        self.Q_target = tf.placeholder(tf.float32,
                                       shape=[None, num_actions],
                                       name=network_name + '_Q_target'
                                      )

        self.input_layer = tf.reshape(self.s_t,
                                      [-1, height, width, channels],
                                      name=network_name + '_input_layer'
                                     )
        self.conv1 = tf.layers.conv2d(inputs=self.input_layer,
                                      filters=32,
                                      kernel_size=[8, 8],
                                      strides=[4, 4],
                                      padding='valid',
                                      activation=tf.nn.relu,
                                      name=network_name + '_conv1_layer'
                                     )
        self.conv2 = tf.layers.conv2d(inputs=self.conv1,
                                      filters=64,
                                      kernel_size=[4, 4],
                                      strides=[2, 2],
                                      padding='valid',
                                      activation=tf.nn.relu,
                                      name=network_name + '_conv2_layer'
                                     )
        self.flatten = tf.layers.flatten(self.conv2,
                                         name=network_name + '_flatten'
                                        )
        self.dense = tf.layers.dense(inputs=self.flatten,
                                      units=512,
                                      activation=tf.nn.relu,
                                      name=network_name + '_dense1_layer'
                                    )
        self.Q_values = tf.layers.dense(inputs=self.dense,
                                        units=len(actions),
                                        activation=None,
                                        name=network_name + '_output_layer'
                                       )        
    
        self.best_action = tf.argmax(self.Q_values, 1)
        self.loss = tf.losses.mean_squared_error(self.Q_values,
                                                 self.Q_target)
        self.adam = tf.train.AdamOptimizer(learning_rate=self.learning_rate,
                                           name=network_name + '_adam'
                                          )
        self.train = self.adam.minimize(self.loss)
        
    def update_lr(self):
        self.learning_rate = 0.98*self.learning_rate
        
        return self.learning_rate

    def calculate_loss(self, session, s, q):
        L, _ = session.run([self.loss, self.train],
                           feed_dict={self.s_t: s,
                                      self.Q_target: q})
    
        return L

#Return the array of Q-values and the best action associated with a given state

    def get_Q_values(self, session, s):
        Q = session.run(self.Q_values,
                        feed_dict={self.s_t: s})

        return Q
    
    def choose_action(self, session, s):
        a = session.run(self.best_action,
                        feed_dict={self.s_t: s})
    
        return a
    
#Create a list of variable update operations

def update_graph(variables):
    update_ops = list()
    
#Assign weight values from the network created first to the one created second
    
    for idx, variable in enumerate(variables[:len(variables)//2]):
        op = variable.assign(variables[idx + len(variables)//2].value())
        update_ops.append(op)
    
    return update_ops

#Update the target network parameters to match those of the online network

def update_target(ops, session):
    for op in update_ops:
        session.run(op)


In [5]:
#For each time step, collect the following data:
#The current game state
#The action that was taken taken
#The reward obtained from the chosen action
#The next game state (store the first game state if the previous action ends the episode)
#A variable indicating whether the episode is over yet


tf.reset_default_graph()

#Instantiate the target network before the online network so that it's updated correctly

target_net = Q_network(network_name='target',
                       learning_rate=learning_rate,
                       height=height,
                       width=width,
                       channels=channels)
DQN = Q_network(network_name='online',
                learning_rate=learning_rate,
                height=height,
                width=width,
                channels=channels)

exp_buffer = Buffer(size=buffer_size)
session = tf.Session()
saver = tf.train.Saver(max_to_keep=num_ckpts, reshape=True)
weights = tf.trainable_variables()

update_ops = update_graph(weights)

if load_model == True:
    print('Loading model from', model_dir)
    tf.train.Saver().restore(session, model_dir)
    
elif load_model == False:
    session.run(tf.global_variables_initializer())

game.set_sound_enabled(False)
game.init()

t = 0
epoch_rank = list()
epoch_rank_depth = list()

#Accumulate experiences in the buffer using an epsilon-greedy strategy with three training phases

for epoch in range(epochs):
    epoch_rewards = list()
    
    for step in trange(steps_per_epoch, leave=True):
        experience = list()
        game.new_episode()
        
        while not game.is_episode_finished():
            state = game.get_state()
            
#Substitute an array of zeros for the depth buffer if that setting is disabled

            if game.is_depth_buffer_enabled() == False:
                depth_buffer = np.zeros(state.screen_buffer.shape)
            else:
                depth_buffer = state.depth_buffer
            
            state1_buffer = np.stack((state.screen_buffer, depth_buffer), axis=-1)
            state1 = preprocess(state1_buffer, down_sample_ratio)
            
#Explore the environment by choosing random actions with 100% probability for the first phase of training

            if epoch < 0.3*epochs:
                action = np.random.randint(num_actions)
            
#Increase the probability of greedily choosing an action by a constant amount at each epoch in the second phase
            
            elif epoch < 0.9*epochs:
                epsilon = start_epsilon - (epoch + 1 - 0.2*epochs)*(start_epsilon-end_epsilon)/(0.7*epochs)
            
                if np.random.uniform(0, 1) <= epsilon:
                    action = np.random.randint(num_actions)
                else:
                    action = DQN.choose_action(session, state1)[0]

#Select a random action with 10% probability in the final phase of training
                
            else:
                if np.random.uniform(0, 1) <= end_epsilon:
                    action = np.random.randint(num_actions)
                else:
                    action = DQN.choose_action(session, state1)[0]

            reward = game.make_action(actions[action], frame_delay)
            done = game.is_episode_finished()
            
            if done == False:
                state = game.get_state()
                state2_buffer = np.stack((state.screen_buffer, state.depth_buffer), axis=-1)
                state2 = preprocess(state2_buffer, down_sample_ratio)
                
            elif done == True:
                state2 = state1
        
#Add the experience obtained from each time step to the buffer

            t += 1
            exp_buffer.add_experience((state1, action, reward, state2, done))
        
#Sample a minibatch from the buffer if there are enough experiences in the buffer

        if exp_buffer.length > batch_size:
            s1, a, r, s2, terminal = exp_buffer.sample_buffer(batch_size)
            
#Get the target values from the target Q-network
            
            target_Q = np.max(target_net.get_Q_values(session, s2), axis=1)
            
#Train the online Q-network by using a minibatch to update the action-value function
            
            Q2 = DQN.get_Q_values(session, s1)
            Q2[np.arange(batch_size), a] = r + gamma*(1 - terminal)*target_Q
            DQN.calculate_loss(session, s1, Q2)
            
        epoch_rewards.append(game.get_total_reward())
        
#Increase the discount factor at each epoch until it reaches 0.99
    
    if gamma < 0.99:
        gamma = 1-.98*(1-gamma)
    elif gamma >= 0.99:
        gamma = 0.99
        
#Decrease the learning rate at each epoch

    DQN.update_lr()
    target_net.update_lr()
    
    print('Epoch {} Mean Reward: {}'.format(epoch + 1, np.mean(epoch_rewards)))
    
#Update the target network every 10 epochs
    
    if (epoch + 1) % 10 == 0 and epoch > 0:
        update_target(update_ops, session)
        
#Save the model and test the agent for 10 episodes every 20 epochs
    
    if (epoch + 1) % 20 == 0 and epoch > 0:
        if save_model == True:
            checkpoint = model_dir + '-' + str(epoch + 1)
            print('Epoch {} Model saved to {}'.format(epoch + 1, model_dir))
            saver.save(session, model_dir, global_step=epoch + 1)
            
        update_target(update_ops, session)

#Test the agent both with and without the depth buffer given
        
        print('Epoch {} test with depth buffer:'.format(epoch + 1))
        test_reward_depth = test_agent(DQN, num_episodes=10,
                                       training=True,
                                       load_model=False,
                                       depth=True,
                                       session=session,
                                       model_dir=model_dir)
        print('Average Test Reward (with depth buffer:)', test_reward_depth)
        
        print('Epoch {} test without depth buffer:'.format(epoch + 1))
        test_reward = test_agent(DQN, num_episodes=10,
                                 training=True,
                                 load_model=False,
                                 depth=False,
                                 session=session,
                                 model_dir=model_dir)
        print('Average Test Reward (without depth buffer):', test_reward)
        
        epoch_rank_depth.append((test_reward_depth, epoch + 1))
        epoch_rank.append((test_reward, epoch + 1))
        
#Return a sorted list of epoch checkpoints based on average test episode reward
        
print(sorted(epoch_rank, reverse=True))
print('{} time steps experienced during training'.format(t))
game.close()
    

100%|██████████| 2000/2000 [01:45<00:00, 18.91it/s]


Epoch 1 Mean Reward: -70.14543531799316


100%|██████████| 2000/2000 [01:36<00:00, 20.67it/s]


Epoch 2 Mean Reward: -67.7360872039795


100%|██████████| 2000/2000 [01:32<00:00, 21.64it/s]


Epoch 3 Mean Reward: -68.24378144073486


100%|██████████| 2000/2000 [01:33<00:00, 21.50it/s]


Epoch 4 Mean Reward: -68.85452091217041


100%|██████████| 2000/2000 [01:46<00:00, 18.85it/s]


Epoch 5 Mean Reward: -68.6224732208252


100%|██████████| 2000/2000 [01:53<00:00, 17.55it/s]


Epoch 6 Mean Reward: -67.45589167022705


100%|██████████| 2000/2000 [01:35<00:00, 21.04it/s]


Epoch 7 Mean Reward: -68.36672199249267


100%|██████████| 2000/2000 [01:35<00:00, 20.88it/s]


Epoch 8 Mean Reward: -66.91481105804444


100%|██████████| 2000/2000 [02:01<00:00, 16.50it/s]


Epoch 9 Mean Reward: -69.2730106124878


100%|██████████| 2000/2000 [01:35<00:00, 20.87it/s]


Epoch 10 Mean Reward: -71.83629892730713


100%|██████████| 2000/2000 [01:37<00:00, 20.42it/s]


Epoch 11 Mean Reward: -67.99077415466309


100%|██████████| 2000/2000 [01:34<00:00, 21.20it/s]


Epoch 12 Mean Reward: -69.85088845062256


100%|██████████| 2000/2000 [01:34<00:00, 21.24it/s]


Epoch 13 Mean Reward: -69.4206921005249


100%|██████████| 2000/2000 [02:04<00:00, 16.01it/s]


Epoch 14 Mean Reward: -68.37532199859619


100%|██████████| 2000/2000 [02:24<00:00, 13.80it/s]


Epoch 15 Mean Reward: -67.74096681213379


100%|██████████| 2000/2000 [02:44<00:00, 12.16it/s]


Epoch 16 Mean Reward: -68.9996884689331


100%|██████████| 2000/2000 [01:53<00:00, 17.64it/s]


Epoch 17 Mean Reward: -68.52553997039794


100%|██████████| 2000/2000 [01:41<00:00, 19.73it/s]


Epoch 18 Mean Reward: -68.04762963867188


100%|██████████| 2000/2000 [02:05<00:00, 15.89it/s]


Epoch 19 Mean Reward: -67.26277614593506


100%|██████████| 2000/2000 [01:29<00:00, 22.30it/s]


Epoch 20 Mean Reward: -69.89900773620606
Epoch 20 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 20 test with depth buffer:
Test Episode 1 Reward: 90.69096374511719
Test Episode 2 Reward: 120.35203552246094
Test Episode 3 Reward: 52.4249267578125
Test Episode 4 Reward: 90.69096374511719
Test Episode 5 Reward: 438.7526092529297
Test Episode 6 Reward: 90.69096374511719
Test Episode 7 Reward: 42.77778625488281
Test Episode 8 Reward: 90.69096374511719
Test Episode 9 Reward: 90.69096374511719
Test Episode 10 Reward: 90.04365539550781
Average Test Reward (with depth buffer:) 119.78058319091797
Epoch 20 test without depth buffer:
Test Episode 1 Reward: 66.06320190429688
Test Episode 2 Reward: 135.17001342773438
Test Episode 3 Reward: 58.683837890625
Test Episode 4 Reward: 16.507125854492188
Test Episode 5 Reward: 15.260177612304688
Test Episode 6 Reward: 59.56553649902344
Test Episode 7 Reward: 8.531494140625
Test Episode 8 Reward: -11.343109130859375
Test Episode 9 Reward: 135.17001

100%|██████████| 2000/2000 [01:34<00:00, 21.20it/s]


Epoch 21 Mean Reward: -70.52224814605712


100%|██████████| 2000/2000 [01:35<00:00, 20.92it/s]


Epoch 22 Mean Reward: -70.1377102355957


100%|██████████| 2000/2000 [01:36<00:00, 20.67it/s]


Epoch 23 Mean Reward: -69.20314004516601


100%|██████████| 2000/2000 [01:33<00:00, 21.32it/s]


Epoch 24 Mean Reward: -69.63417882537841


100%|██████████| 2000/2000 [01:35<00:00, 20.90it/s]


Epoch 25 Mean Reward: -69.27937935638428


100%|██████████| 2000/2000 [01:39<00:00, 20.01it/s]


Epoch 26 Mean Reward: -66.36160682678222


100%|██████████| 2000/2000 [01:35<00:00, 21.02it/s]


Epoch 27 Mean Reward: -68.9486916732788


100%|██████████| 2000/2000 [01:36<00:00, 20.70it/s]


Epoch 28 Mean Reward: -67.14885174560547


100%|██████████| 2000/2000 [01:36<00:00, 20.72it/s]


Epoch 29 Mean Reward: -68.11575672149658


100%|██████████| 2000/2000 [01:35<00:00, 21.05it/s]


Epoch 30 Mean Reward: -69.98779563903808


100%|██████████| 2000/2000 [01:45<00:00, 19.01it/s]


Epoch 31 Mean Reward: -69.34321003723144


100%|██████████| 2000/2000 [01:34<00:00, 21.10it/s]


Epoch 32 Mean Reward: -67.57150075531005


100%|██████████| 2000/2000 [01:33<00:00, 21.30it/s]


Epoch 33 Mean Reward: -68.50345252990722


100%|██████████| 2000/2000 [01:35<00:00, 21.02it/s]


Epoch 34 Mean Reward: -67.59668308258057


100%|██████████| 2000/2000 [01:38<00:00, 20.23it/s]


Epoch 35 Mean Reward: -68.8330452041626


100%|██████████| 2000/2000 [01:34<00:00, 21.12it/s]


Epoch 36 Mean Reward: -68.23242985534668


100%|██████████| 2000/2000 [01:57<00:00, 17.03it/s]


Epoch 37 Mean Reward: -69.59331219482422


100%|██████████| 2000/2000 [01:45<00:00, 18.90it/s]


Epoch 38 Mean Reward: -69.3366153717041


100%|██████████| 2000/2000 [01:35<00:00, 20.95it/s]


Epoch 39 Mean Reward: -66.87505702209472


100%|██████████| 2000/2000 [01:35<00:00, 21.03it/s]


Epoch 40 Mean Reward: -68.53577494812012
Epoch 40 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 40 test with depth buffer:
Test Episode 1 Reward: 96.66981506347656
Test Episode 2 Reward: 73.09768676757812
Test Episode 3 Reward: 87.59173583984375
Test Episode 4 Reward: 87.59173583984375
Test Episode 5 Reward: 440.95143127441406
Test Episode 6 Reward: 87.59173583984375
Test Episode 7 Reward: 95.23233032226562
Test Episode 8 Reward: 87.59173583984375
Test Episode 9 Reward: 106.63491821289062
Test Episode 10 Reward: 87.59173583984375
Average Test Reward (with depth buffer:) 125.05448608398437
Epoch 40 test without depth buffer:
Test Episode 1 Reward: 40.73944091796875
Test Episode 2 Reward: 40.73944091796875
Test Episode 3 Reward: -9.682037353515625
Test Episode 4 Reward: 40.73944091796875
Test Episode 5 Reward: -0.2173919677734375
Test Episode 6 Reward: 40.73944091796875
Test Episode 7 Reward: 40.73944091796875
Test Episode 8 Reward: 40.73944091796875
Test Episode 9 Reward: 40.7

100%|██████████| 2000/2000 [01:38<00:00, 20.21it/s]


Epoch 41 Mean Reward: -70.14783712768555


100%|██████████| 2000/2000 [01:34<00:00, 21.07it/s]


Epoch 42 Mean Reward: -68.7408060836792


100%|██████████| 2000/2000 [01:34<00:00, 21.06it/s]


Epoch 43 Mean Reward: -68.46956340026856


100%|██████████| 2000/2000 [01:34<00:00, 21.21it/s]


Epoch 44 Mean Reward: -68.62553568267822


100%|██████████| 2000/2000 [01:33<00:00, 21.33it/s]


Epoch 45 Mean Reward: -69.9545622253418


100%|██████████| 2000/2000 [01:34<00:00, 21.19it/s]


Epoch 46 Mean Reward: -68.90749161529541


100%|██████████| 2000/2000 [01:33<00:00, 21.28it/s]


Epoch 47 Mean Reward: -69.25712066650391


100%|██████████| 2000/2000 [01:34<00:00, 21.11it/s]


Epoch 48 Mean Reward: -67.81345086669921


100%|██████████| 2000/2000 [01:35<00:00, 20.87it/s]


Epoch 49 Mean Reward: -68.35318211364746


100%|██████████| 2000/2000 [01:33<00:00, 21.35it/s]


Epoch 50 Mean Reward: -67.10083042907715


100%|██████████| 2000/2000 [01:33<00:00, 21.33it/s]


Epoch 51 Mean Reward: -70.49911997985839


100%|██████████| 2000/2000 [01:33<00:00, 21.28it/s]


Epoch 52 Mean Reward: -71.13608712768554


100%|██████████| 2000/2000 [01:33<00:00, 21.34it/s]


Epoch 53 Mean Reward: -69.3590057220459


100%|██████████| 2000/2000 [01:33<00:00, 21.29it/s]


Epoch 54 Mean Reward: -68.8629607849121


100%|██████████| 2000/2000 [01:34<00:00, 21.28it/s]


Epoch 55 Mean Reward: -67.06705626678466


100%|██████████| 2000/2000 [01:34<00:00, 21.22it/s]


Epoch 56 Mean Reward: -69.19256245422363


100%|██████████| 2000/2000 [01:34<00:00, 21.25it/s]


Epoch 57 Mean Reward: -65.76867047119141


100%|██████████| 2000/2000 [01:33<00:00, 21.44it/s]


Epoch 58 Mean Reward: -71.74183332061767


100%|██████████| 2000/2000 [01:34<00:00, 21.13it/s]


Epoch 59 Mean Reward: -67.85960649108887


100%|██████████| 2000/2000 [01:33<00:00, 21.29it/s]


Epoch 60 Mean Reward: -67.18161991882324
Epoch 60 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 60 test with depth buffer:
Test Episode 1 Reward: 89.53369140625
Test Episode 2 Reward: 89.53369140625
Test Episode 3 Reward: 739.4353637695312
Test Episode 4 Reward: 89.53369140625
Test Episode 5 Reward: 89.53369140625
Test Episode 6 Reward: 99.79766845703125
Test Episode 7 Reward: 89.53369140625
Test Episode 8 Reward: 114.55625915527344
Test Episode 9 Reward: 89.53369140625
Test Episode 10 Reward: 89.53369140625
Average Test Reward (with depth buffer:) 158.0525131225586
Epoch 60 test without depth buffer:
Test Episode 1 Reward: 24.403976440429688
Test Episode 2 Reward: -4.4254913330078125
Test Episode 3 Reward: 1.8692779541015625
Test Episode 4 Reward: -42.14753723144531
Test Episode 5 Reward: 24.403976440429688
Test Episode 6 Reward: 24.403976440429688
Test Episode 7 Reward: -57.51435852050781
Test Episode 8 Reward: 24.403976440429688
Test Episode 9 Reward: 24.403976440429688
Te

100%|██████████| 2000/2000 [01:33<00:00, 21.38it/s]


Epoch 61 Mean Reward: -69.11170134735107


100%|██████████| 2000/2000 [01:34<00:00, 21.13it/s]


Epoch 62 Mean Reward: -65.54253535461426


100%|██████████| 2000/2000 [01:33<00:00, 21.40it/s]


Epoch 63 Mean Reward: -69.048164894104


100%|██████████| 2000/2000 [01:35<00:00, 20.91it/s]


Epoch 64 Mean Reward: -70.18610736846924


100%|██████████| 2000/2000 [01:35<00:00, 20.97it/s]


Epoch 65 Mean Reward: -70.42312810516357


100%|██████████| 2000/2000 [01:35<00:00, 20.87it/s]


Epoch 66 Mean Reward: -66.81742742156982


100%|██████████| 2000/2000 [01:36<00:00, 20.65it/s]


Epoch 67 Mean Reward: -68.84854863739014


100%|██████████| 2000/2000 [01:36<00:00, 20.65it/s]


Epoch 68 Mean Reward: -67.6436662902832


100%|██████████| 2000/2000 [01:34<00:00, 21.11it/s]


Epoch 69 Mean Reward: -69.3063846206665


100%|██████████| 2000/2000 [01:34<00:00, 21.07it/s]


Epoch 70 Mean Reward: -69.36422961425781


100%|██████████| 2000/2000 [01:34<00:00, 21.06it/s]


Epoch 71 Mean Reward: -68.04117516326905


100%|██████████| 2000/2000 [01:34<00:00, 21.10it/s]


Epoch 72 Mean Reward: -68.3797469329834


100%|██████████| 2000/2000 [01:35<00:00, 21.02it/s]


Epoch 73 Mean Reward: -67.0732082824707


100%|██████████| 2000/2000 [01:34<00:00, 21.10it/s]


Epoch 74 Mean Reward: -69.89504414367676


100%|██████████| 2000/2000 [01:34<00:00, 21.23it/s]


Epoch 75 Mean Reward: -68.78780390167236


100%|██████████| 2000/2000 [01:34<00:00, 21.22it/s]


Epoch 76 Mean Reward: -68.76813946533203


100%|██████████| 2000/2000 [01:34<00:00, 21.21it/s]


Epoch 77 Mean Reward: -68.95317709350586


100%|██████████| 2000/2000 [01:40<00:00, 19.90it/s]


Epoch 78 Mean Reward: -70.05364456939698


100%|██████████| 2000/2000 [03:00<00:00, 11.09it/s]


Epoch 79 Mean Reward: -71.57072350311279


100%|██████████| 2000/2000 [02:44<00:00, 12.17it/s]


Epoch 80 Mean Reward: -67.59708671569824
Epoch 80 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 80 test with depth buffer:
Test Episode 1 Reward: 98.39788818359375
Test Episode 2 Reward: 75.037109375
Test Episode 3 Reward: 75.037109375
Test Episode 4 Reward: 75.037109375
Test Episode 5 Reward: 72.29826354980469
Test Episode 6 Reward: 75.037109375
Test Episode 7 Reward: 79.68605041503906
Test Episode 8 Reward: 410.44065856933594
Test Episode 9 Reward: 296.8779602050781
Test Episode 10 Reward: 392.17127990722656
Average Test Reward (with depth buffer:) 165.0020538330078
Epoch 80 test without depth buffer:
Test Episode 1 Reward: 125.82078552246094
Test Episode 2 Reward: -89.32633972167969
Test Episode 3 Reward: 67.62338256835938
Test Episode 4 Reward: 67.62338256835938
Test Episode 5 Reward: 67.62338256835938
Test Episode 6 Reward: 67.62338256835938
Test Episode 7 Reward: 67.62338256835938
Test Episode 8 Reward: 35.35111999511719
Test Episode 9 Reward: 70.93983459472656
Test Epi

100%|██████████| 2000/2000 [02:25<00:00, 13.75it/s]


Epoch 81 Mean Reward: -69.70812541198731


100%|██████████| 2000/2000 [02:54<00:00, 11.46it/s]


Epoch 82 Mean Reward: -69.31600485229492


100%|██████████| 2000/2000 [02:55<00:00, 11.41it/s]


Epoch 83 Mean Reward: -67.0740062789917


100%|██████████| 2000/2000 [03:01<00:00, 11.02it/s]


Epoch 84 Mean Reward: -67.22680403900146


100%|██████████| 2000/2000 [02:55<00:00, 11.41it/s]


Epoch 85 Mean Reward: -67.13313544464111


100%|██████████| 2000/2000 [02:20<00:00, 14.26it/s]


Epoch 86 Mean Reward: -67.04319611358643


100%|██████████| 2000/2000 [02:26<00:00, 13.65it/s]


Epoch 87 Mean Reward: -70.33011853790283


100%|██████████| 2000/2000 [02:09<00:00, 15.49it/s]


Epoch 88 Mean Reward: -67.95720905303955


100%|██████████| 2000/2000 [01:34<00:00, 21.10it/s]


Epoch 89 Mean Reward: -68.57920423126221


100%|██████████| 2000/2000 [01:35<00:00, 20.95it/s]


Epoch 90 Mean Reward: -70.2944528427124


100%|██████████| 2000/2000 [01:35<00:00, 20.94it/s]


Epoch 91 Mean Reward: -68.07101128387451


100%|██████████| 2000/2000 [01:33<00:00, 21.28it/s]


Epoch 92 Mean Reward: -68.95610564422607


100%|██████████| 2000/2000 [01:33<00:00, 21.30it/s]


Epoch 93 Mean Reward: -69.67234490966797


100%|██████████| 2000/2000 [01:34<00:00, 21.12it/s]


Epoch 94 Mean Reward: -68.80461177062988


100%|██████████| 2000/2000 [01:34<00:00, 21.20it/s]


Epoch 95 Mean Reward: -68.7199199295044


100%|██████████| 2000/2000 [01:34<00:00, 21.24it/s]


Epoch 96 Mean Reward: -69.95597274017334


100%|██████████| 2000/2000 [01:34<00:00, 21.09it/s]


Epoch 97 Mean Reward: -67.40199068450927


100%|██████████| 2000/2000 [01:34<00:00, 21.23it/s]


Epoch 98 Mean Reward: -67.12477504730225


100%|██████████| 2000/2000 [01:33<00:00, 21.35it/s]


Epoch 99 Mean Reward: -67.95571581268311


100%|██████████| 2000/2000 [01:33<00:00, 21.28it/s]


Epoch 100 Mean Reward: -68.8387484664917
Epoch 100 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 100 test with depth buffer:
Test Episode 1 Reward: 90.31744384765625
Test Episode 2 Reward: 63.36651611328125
Test Episode 3 Reward: 504.9720916748047
Test Episode 4 Reward: 288.63232421875
Test Episode 5 Reward: 59.78233337402344
Test Episode 6 Reward: 72.9691162109375
Test Episode 7 Reward: 225.29330444335938
Test Episode 8 Reward: 83.97064208984375
Test Episode 9 Reward: 472.7585754394531
Test Episode 10 Reward: 496.428466796875
Average Test Reward (with depth buffer:) 235.84908142089844
Epoch 100 test without depth buffer:
Test Episode 1 Reward: 11.092926025390625
Test Episode 2 Reward: 8.383529663085938
Test Episode 3 Reward: 8.383529663085938
Test Episode 4 Reward: 145.72711181640625
Test Episode 5 Reward: 8.383529663085938
Test Episode 6 Reward: -66.33677673339844
Test Episode 7 Reward: 22.931076049804688
Test Episode 8 Reward: 8.383529663085938
Test Episode 9 Reward: -58.4

100%|██████████| 2000/2000 [01:34<00:00, 21.16it/s]


Epoch 101 Mean Reward: -71.60747357940674


100%|██████████| 2000/2000 [01:34<00:00, 21.23it/s]


Epoch 102 Mean Reward: -68.44557042694092


100%|██████████| 2000/2000 [01:34<00:00, 21.15it/s]


Epoch 103 Mean Reward: -67.87405416107178


100%|██████████| 2000/2000 [01:34<00:00, 21.11it/s]


Epoch 104 Mean Reward: -67.81524617767334


100%|██████████| 2000/2000 [01:33<00:00, 21.32it/s]


Epoch 105 Mean Reward: -70.03169431304931


100%|██████████| 2000/2000 [01:33<00:00, 21.34it/s]


Epoch 106 Mean Reward: -70.4081220779419


100%|██████████| 2000/2000 [01:33<00:00, 21.31it/s]


Epoch 107 Mean Reward: -69.61275331115722


100%|██████████| 2000/2000 [01:34<00:00, 21.12it/s]


Epoch 108 Mean Reward: -66.6655587387085


100%|██████████| 2000/2000 [01:34<00:00, 21.20it/s]


Epoch 109 Mean Reward: -68.81080982971191


100%|██████████| 2000/2000 [01:37<00:00, 20.54it/s]


Epoch 110 Mean Reward: -65.33226991271972


100%|██████████| 2000/2000 [01:34<00:00, 21.19it/s]


Epoch 111 Mean Reward: -70.5972770767212


100%|██████████| 2000/2000 [01:33<00:00, 21.29it/s]


Epoch 112 Mean Reward: -65.07913459777832


100%|██████████| 2000/2000 [01:34<00:00, 21.24it/s]


Epoch 113 Mean Reward: -70.66214287567139


100%|██████████| 2000/2000 [01:34<00:00, 21.25it/s]


Epoch 114 Mean Reward: -67.12101123046875


100%|██████████| 2000/2000 [01:34<00:00, 21.07it/s]


Epoch 115 Mean Reward: -67.45329657745361


100%|██████████| 2000/2000 [01:34<00:00, 21.27it/s]


Epoch 116 Mean Reward: -69.01121883392334


100%|██████████| 2000/2000 [01:34<00:00, 21.28it/s]


Epoch 117 Mean Reward: -69.11557313537598


100%|██████████| 2000/2000 [01:33<00:00, 21.37it/s]


Epoch 118 Mean Reward: -68.60669157409669


100%|██████████| 2000/2000 [01:34<00:00, 21.16it/s]


Epoch 119 Mean Reward: -66.24329705810547


100%|██████████| 2000/2000 [01:34<00:00, 21.19it/s]


Epoch 120 Mean Reward: -69.29305257415771
Epoch 120 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 120 test with depth buffer:
Test Episode 1 Reward: 246.78268432617188
Test Episode 2 Reward: 246.78268432617188
Test Episode 3 Reward: 390.8327178955078
Test Episode 4 Reward: 326.25892639160156
Test Episode 5 Reward: 246.78268432617188
Test Episode 6 Reward: 23.706634521484375
Test Episode 7 Reward: 426.6015319824219
Test Episode 8 Reward: 246.78268432617188
Test Episode 9 Reward: 246.78268432617188
Test Episode 10 Reward: 246.78268432617188
Average Test Reward (with depth buffer:) 264.80959167480466
Epoch 120 test without depth buffer:
Test Episode 1 Reward: 47.69773864746094
Test Episode 2 Reward: 26.762893676757812
Test Episode 3 Reward: 113.77027893066406
Test Episode 4 Reward: 10.319473266601562
Test Episode 5 Reward: 144.65846252441406
Test Episode 6 Reward: 48.7305908203125
Test Episode 7 Reward: 26.762893676757812
Test Episode 8 Reward: 26.762893676757812
Test Episode 9 

100%|██████████| 2000/2000 [01:34<00:00, 21.23it/s]


Epoch 121 Mean Reward: -70.55637199401855


100%|██████████| 2000/2000 [01:33<00:00, 21.35it/s]


Epoch 122 Mean Reward: -67.60998043060303


100%|██████████| 2000/2000 [01:34<00:00, 21.24it/s]


Epoch 123 Mean Reward: -69.18000503540038


100%|██████████| 2000/2000 [01:34<00:00, 21.23it/s]


Epoch 124 Mean Reward: -69.38825367736817


100%|██████████| 2000/2000 [01:33<00:00, 21.31it/s]


Epoch 125 Mean Reward: -69.19721504211425


100%|██████████| 2000/2000 [01:33<00:00, 21.41it/s]


Epoch 126 Mean Reward: -69.80804718780517


100%|██████████| 2000/2000 [01:33<00:00, 21.38it/s]


Epoch 127 Mean Reward: -68.65645899200439


100%|██████████| 2000/2000 [01:33<00:00, 21.37it/s]


Epoch 128 Mean Reward: -67.53071031951905


100%|██████████| 2000/2000 [01:35<00:00, 20.85it/s]


Epoch 129 Mean Reward: -69.08932875823974


100%|██████████| 2000/2000 [01:34<00:00, 21.22it/s]


Epoch 130 Mean Reward: -69.1183030166626


100%|██████████| 2000/2000 [01:33<00:00, 21.28it/s]


Epoch 131 Mean Reward: -68.35886264038086


100%|██████████| 2000/2000 [01:33<00:00, 21.35it/s]


Epoch 132 Mean Reward: -65.43621058654786


100%|██████████| 2000/2000 [01:33<00:00, 21.43it/s]


Epoch 133 Mean Reward: -69.78872979736329


100%|██████████| 2000/2000 [01:33<00:00, 21.44it/s]


Epoch 134 Mean Reward: -68.04401145935059


100%|██████████| 2000/2000 [01:33<00:00, 21.48it/s]


Epoch 135 Mean Reward: -69.7866220703125


100%|██████████| 2000/2000 [01:33<00:00, 21.49it/s]


Epoch 136 Mean Reward: -68.7874951171875


100%|██████████| 2000/2000 [01:33<00:00, 21.34it/s]


Epoch 137 Mean Reward: -68.15156537628174


100%|██████████| 2000/2000 [01:33<00:00, 21.47it/s]


Epoch 138 Mean Reward: -67.9126761856079


100%|██████████| 2000/2000 [01:34<00:00, 21.25it/s]


Epoch 139 Mean Reward: -70.2403289642334


100%|██████████| 2000/2000 [01:33<00:00, 21.29it/s]


Epoch 140 Mean Reward: -68.78711563110352
Epoch 140 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 140 test with depth buffer:
Test Episode 1 Reward: 56.29534912109375
Test Episode 2 Reward: 56.29534912109375
Test Episode 3 Reward: 56.29534912109375
Test Episode 4 Reward: 379.34185791015625
Test Episode 5 Reward: 360.69573974609375
Test Episode 6 Reward: 56.29534912109375
Test Episode 7 Reward: 56.29534912109375
Test Episode 8 Reward: 56.29534912109375
Test Episode 9 Reward: 65.52293395996094
Test Episode 10 Reward: 80.54385375976562
Average Test Reward (with depth buffer:) 122.38764801025391
Epoch 140 test without depth buffer:
Test Episode 1 Reward: 39.45855712890625
Test Episode 2 Reward: 98.48841857910156
Test Episode 3 Reward: 98.48841857910156
Test Episode 4 Reward: 218.07180786132812
Test Episode 5 Reward: 45.20623779296875
Test Episode 6 Reward: 19.276626586914062
Test Episode 7 Reward: 98.48841857910156
Test Episode 8 Reward: 98.48841857910156
Test Episode 9 Reward: 2

100%|██████████| 2000/2000 [01:33<00:00, 21.29it/s]


Epoch 141 Mean Reward: -68.93841101074219


100%|██████████| 2000/2000 [01:33<00:00, 21.48it/s]


Epoch 142 Mean Reward: -68.70295092773438


100%|██████████| 2000/2000 [01:33<00:00, 21.40it/s]


Epoch 143 Mean Reward: -67.66199263763427


100%|██████████| 2000/2000 [01:33<00:00, 21.49it/s]


Epoch 144 Mean Reward: -69.40384948730468


100%|██████████| 2000/2000 [01:33<00:00, 21.47it/s]


Epoch 145 Mean Reward: -66.52091705322266


100%|██████████| 2000/2000 [01:32<00:00, 21.59it/s]


Epoch 146 Mean Reward: -69.64277408599854


100%|██████████| 2000/2000 [01:33<00:00, 21.43it/s]


Epoch 147 Mean Reward: -67.82143586730957


100%|██████████| 2000/2000 [01:33<00:00, 21.36it/s]


Epoch 148 Mean Reward: -69.87629683685303


100%|██████████| 2000/2000 [01:35<00:00, 20.97it/s]


Epoch 149 Mean Reward: -69.87468506622315


100%|██████████| 2000/2000 [01:34<00:00, 21.18it/s]


Epoch 150 Mean Reward: -65.13403093719482


100%|██████████| 2000/2000 [01:34<00:00, 21.28it/s]


Epoch 151 Mean Reward: -68.58115367126464


100%|██████████| 2000/2000 [01:33<00:00, 21.41it/s]


Epoch 152 Mean Reward: -67.87777426147461


100%|██████████| 2000/2000 [01:32<00:00, 21.53it/s]


Epoch 153 Mean Reward: -69.10747425079346


100%|██████████| 2000/2000 [01:33<00:00, 21.30it/s]


Epoch 154 Mean Reward: -66.7829688949585


100%|██████████| 2000/2000 [01:33<00:00, 21.37it/s]


Epoch 155 Mean Reward: -69.07972522735595


100%|██████████| 2000/2000 [01:33<00:00, 21.39it/s]


Epoch 156 Mean Reward: -68.89421756744385


100%|██████████| 2000/2000 [01:34<00:00, 21.25it/s]


Epoch 157 Mean Reward: -68.90051769256591


100%|██████████| 2000/2000 [01:33<00:00, 21.44it/s]


Epoch 158 Mean Reward: -69.10119567108154


100%|██████████| 2000/2000 [01:33<00:00, 21.46it/s]


Epoch 159 Mean Reward: -67.79641404724121


100%|██████████| 2000/2000 [01:33<00:00, 21.32it/s]


Epoch 160 Mean Reward: -68.40611581420899
Epoch 160 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 160 test with depth buffer:
Test Episode 1 Reward: 292.6826629638672
Test Episode 2 Reward: 116.76336669921875
Test Episode 3 Reward: 116.76336669921875
Test Episode 4 Reward: 116.76336669921875
Test Episode 5 Reward: 266.7795104980469
Test Episode 6 Reward: 28.667343139648438
Test Episode 7 Reward: 52.273956298828125
Test Episode 8 Reward: 116.76336669921875
Test Episode 9 Reward: 116.76336669921875
Test Episode 10 Reward: 116.76336669921875
Average Test Reward (with depth buffer:) 134.0983673095703
Epoch 160 test without depth buffer:
Test Episode 1 Reward: 135.3899383544922
Test Episode 2 Reward: -76.01161193847656
Test Episode 3 Reward: 113.07785034179688
Test Episode 4 Reward: 43.2447509765625
Test Episode 5 Reward: -81.92521667480469
Test Episode 6 Reward: -57.077117919921875
Test Episode 7 Reward: 113.07785034179688
Test Episode 8 Reward: 175.07839965820312
Test Episode 9 

100%|██████████| 2000/2000 [01:33<00:00, 21.40it/s]


Epoch 161 Mean Reward: -67.5317827911377


100%|██████████| 2000/2000 [01:33<00:00, 21.37it/s]


Epoch 162 Mean Reward: -66.83014787292481


100%|██████████| 2000/2000 [01:33<00:00, 21.29it/s]


Epoch 163 Mean Reward: -66.91339562988281


100%|██████████| 2000/2000 [01:33<00:00, 21.49it/s]


Epoch 164 Mean Reward: -70.71816938781738


100%|██████████| 2000/2000 [01:33<00:00, 21.43it/s]


Epoch 165 Mean Reward: -68.36113333129883


100%|██████████| 2000/2000 [01:33<00:00, 21.28it/s]


Epoch 166 Mean Reward: -67.39040452575684


100%|██████████| 2000/2000 [01:33<00:00, 21.31it/s]


Epoch 167 Mean Reward: -68.20274848175049


100%|██████████| 2000/2000 [01:34<00:00, 21.17it/s]


Epoch 168 Mean Reward: -69.31307760620118


100%|██████████| 2000/2000 [01:33<00:00, 21.29it/s]


Epoch 169 Mean Reward: -69.0272305984497


100%|██████████| 2000/2000 [01:33<00:00, 21.49it/s]


Epoch 170 Mean Reward: -69.68347047424317


100%|██████████| 2000/2000 [01:33<00:00, 21.48it/s]


Epoch 171 Mean Reward: -71.15734996795655


100%|██████████| 2000/2000 [01:32<00:00, 21.53it/s]


Epoch 172 Mean Reward: -69.75501387023925


100%|██████████| 2000/2000 [01:33<00:00, 21.48it/s]


Epoch 173 Mean Reward: -67.58803519439697


100%|██████████| 2000/2000 [01:33<00:00, 21.46it/s]


Epoch 174 Mean Reward: -68.62513475036621


100%|██████████| 2000/2000 [01:33<00:00, 21.35it/s]


Epoch 175 Mean Reward: -67.44608913421631


100%|██████████| 2000/2000 [01:33<00:00, 21.34it/s]


Epoch 176 Mean Reward: -68.24016496276856


100%|██████████| 2000/2000 [01:32<00:00, 21.67it/s]


Epoch 177 Mean Reward: -69.65641344451905


100%|██████████| 2000/2000 [01:32<00:00, 21.69it/s]


Epoch 178 Mean Reward: -68.68967950439453


100%|██████████| 2000/2000 [01:33<00:00, 21.39it/s]


Epoch 179 Mean Reward: -70.03522605895996


100%|██████████| 2000/2000 [01:32<00:00, 21.67it/s]


Epoch 180 Mean Reward: -68.70090384674073
Epoch 180 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 180 test with depth buffer:
Test Episode 1 Reward: 154.26559448242188
Test Episode 2 Reward: 205.85289001464844
Test Episode 3 Reward: 205.85289001464844
Test Episode 4 Reward: 28.318008422851562
Test Episode 5 Reward: 3.8423919677734375
Test Episode 6 Reward: 94.10366821289062
Test Episode 7 Reward: 29.7054443359375
Test Episode 8 Reward: 110.94699096679688
Test Episode 9 Reward: 205.85289001464844
Test Episode 10 Reward: 205.85289001464844
Average Test Reward (with depth buffer:) 124.45936584472656
Epoch 180 test without depth buffer:
Test Episode 1 Reward: -88.98762512207031
Test Episode 2 Reward: -115.23985290527344
Test Episode 3 Reward: 20.139602661132812
Test Episode 4 Reward: -48.97953796386719
Test Episode 5 Reward: 20.139602661132812
Test Episode 6 Reward: -115.98771667480469
Test Episode 7 Reward: -64.75077819824219
Test Episode 8 Reward: -49.19572448730469
Test Episod

100%|██████████| 2000/2000 [01:33<00:00, 21.30it/s]


Epoch 181 Mean Reward: -68.4361485748291


100%|██████████| 2000/2000 [01:34<00:00, 21.18it/s]


Epoch 182 Mean Reward: -68.31800015258789


100%|██████████| 2000/2000 [01:33<00:00, 21.38it/s]


Epoch 183 Mean Reward: -67.64556064605713


100%|██████████| 2000/2000 [01:34<00:00, 21.27it/s]


Epoch 184 Mean Reward: -70.46322966766357


100%|██████████| 2000/2000 [01:34<00:00, 21.21it/s]


Epoch 185 Mean Reward: -69.21075709533692


100%|██████████| 2000/2000 [01:32<00:00, 21.71it/s]


Epoch 186 Mean Reward: -69.50837326812744


100%|██████████| 2000/2000 [01:32<00:00, 21.72it/s]


Epoch 187 Mean Reward: -68.53074854278564


100%|██████████| 2000/2000 [01:34<00:00, 21.12it/s]


Epoch 188 Mean Reward: -66.71233545684814


100%|██████████| 2000/2000 [01:32<00:00, 21.61it/s]


Epoch 189 Mean Reward: -66.72889653015137


100%|██████████| 2000/2000 [01:32<00:00, 21.59it/s]


Epoch 190 Mean Reward: -66.97810070800782


100%|██████████| 2000/2000 [01:32<00:00, 21.72it/s]


Epoch 191 Mean Reward: -67.79217695617676


100%|██████████| 2000/2000 [01:32<00:00, 21.73it/s]


Epoch 192 Mean Reward: -67.62771996307373


100%|██████████| 2000/2000 [01:32<00:00, 21.52it/s]


Epoch 193 Mean Reward: -67.9867774963379


100%|██████████| 2000/2000 [01:33<00:00, 21.42it/s]


Epoch 194 Mean Reward: -68.95557040405274


100%|██████████| 2000/2000 [01:32<00:00, 21.53it/s]


Epoch 195 Mean Reward: -66.93251959991456


100%|██████████| 2000/2000 [01:33<00:00, 21.48it/s]


Epoch 196 Mean Reward: -70.67005470275879


100%|██████████| 2000/2000 [01:33<00:00, 21.43it/s]


Epoch 197 Mean Reward: -71.08695423889161


100%|██████████| 2000/2000 [01:33<00:00, 21.39it/s]


Epoch 198 Mean Reward: -67.88280473327637


100%|██████████| 2000/2000 [01:33<00:00, 21.47it/s]


Epoch 199 Mean Reward: -70.10064419555664


100%|██████████| 2000/2000 [01:32<00:00, 21.61it/s]


Epoch 200 Mean Reward: -67.78575552368164
Epoch 200 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 200 test with depth buffer:
Test Episode 1 Reward: 58.84431457519531
Test Episode 2 Reward: -43.88362121582031
Test Episode 3 Reward: 53.53401184082031
Test Episode 4 Reward: 145.243896484375
Test Episode 5 Reward: 145.243896484375
Test Episode 6 Reward: 145.243896484375
Test Episode 7 Reward: 145.243896484375
Test Episode 8 Reward: 177.192138671875
Test Episode 9 Reward: 145.243896484375
Test Episode 10 Reward: 39.2396240234375
Average Test Reward (with depth buffer:) 101.11459503173828
Epoch 200 test without depth buffer:
Test Episode 1 Reward: 82.28526306152344
Test Episode 2 Reward: 2.393798828125
Test Episode 3 Reward: -43.07600402832031
Test Episode 4 Reward: -14.732009887695312
Test Episode 5 Reward: 27.969970703125
Test Episode 6 Reward: 82.28526306152344
Test Episode 7 Reward: -114.05119323730469
Test Episode 8 Reward: 82.28526306152344
Test Episode 9 Reward: -114.166061

100%|██████████| 2000/2000 [01:33<00:00, 21.36it/s]


Epoch 201 Mean Reward: -70.0330722579956


100%|██████████| 2000/2000 [01:33<00:00, 21.31it/s]


Epoch 202 Mean Reward: -65.93933126831055


100%|██████████| 2000/2000 [01:33<00:00, 21.37it/s]


Epoch 203 Mean Reward: -69.29987104797364


100%|██████████| 2000/2000 [01:33<00:00, 21.31it/s]


Epoch 204 Mean Reward: -67.48560176849365


100%|██████████| 2000/2000 [01:32<00:00, 21.62it/s]


Epoch 205 Mean Reward: -67.80180569458008


100%|██████████| 2000/2000 [01:32<00:00, 21.65it/s]


Epoch 206 Mean Reward: -70.30542363739013


100%|██████████| 2000/2000 [01:33<00:00, 21.44it/s]


Epoch 207 Mean Reward: -69.12935824584962


100%|██████████| 2000/2000 [01:34<00:00, 21.12it/s]


Epoch 208 Mean Reward: -69.61216136932373


100%|██████████| 2000/2000 [01:33<00:00, 21.44it/s]


Epoch 209 Mean Reward: -68.74280809783936


100%|██████████| 2000/2000 [01:33<00:00, 21.43it/s]


Epoch 210 Mean Reward: -67.11985604858398


100%|██████████| 2000/2000 [01:32<00:00, 21.53it/s]


Epoch 211 Mean Reward: -68.807497215271


100%|██████████| 2000/2000 [01:33<00:00, 21.28it/s]


Epoch 212 Mean Reward: -70.60498387908936


100%|██████████| 2000/2000 [01:34<00:00, 21.21it/s]


Epoch 213 Mean Reward: -66.61619668579101


100%|██████████| 2000/2000 [01:33<00:00, 21.37it/s]


Epoch 214 Mean Reward: -69.72048590087891


100%|██████████| 2000/2000 [01:33<00:00, 21.44it/s]


Epoch 215 Mean Reward: -67.00341780090332


100%|██████████| 2000/2000 [01:32<00:00, 21.56it/s]


Epoch 216 Mean Reward: -68.79642055511475


100%|██████████| 2000/2000 [01:33<00:00, 21.42it/s]


Epoch 217 Mean Reward: -67.71110422515869


100%|██████████| 2000/2000 [01:33<00:00, 21.43it/s]


Epoch 218 Mean Reward: -68.74018788909912


100%|██████████| 2000/2000 [01:32<00:00, 21.52it/s]


Epoch 219 Mean Reward: -66.48790756225586


100%|██████████| 2000/2000 [01:33<00:00, 21.43it/s]


Epoch 220 Mean Reward: -67.10676698303223
Epoch 220 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 220 test with depth buffer:
Test Episode 1 Reward: -71.54542541503906
Test Episode 2 Reward: 225.666259765625
Test Episode 3 Reward: 225.666259765625
Test Episode 4 Reward: 225.666259765625
Test Episode 5 Reward: 225.666259765625
Test Episode 6 Reward: 225.666259765625
Test Episode 7 Reward: 225.666259765625
Test Episode 8 Reward: 225.666259765625
Test Episode 9 Reward: 122.85923767089844
Test Episode 10 Reward: 99.315185546875
Average Test Reward (with depth buffer:) 173.02928161621094
Epoch 220 test without depth buffer:
Test Episode 1 Reward: -115.94844055175781
Test Episode 2 Reward: -115.94844055175781
Test Episode 3 Reward: -115.94844055175781
Test Episode 4 Reward: -112.03010559082031
Test Episode 5 Reward: -115.94844055175781
Test Episode 6 Reward: -115.98204040527344
Test Episode 7 Reward: -114.24839782714844
Test Episode 8 Reward: -115.94844055175781
Test Episode 9 Rewa

100%|██████████| 2000/2000 [01:33<00:00, 21.39it/s]


Epoch 221 Mean Reward: -67.7849825592041


100%|██████████| 2000/2000 [01:33<00:00, 21.42it/s]


Epoch 222 Mean Reward: -66.65598023986817


100%|██████████| 2000/2000 [01:32<00:00, 21.53it/s]


Epoch 223 Mean Reward: -69.58684197235108


100%|██████████| 2000/2000 [01:32<00:00, 21.56it/s]


Epoch 224 Mean Reward: -69.44147253417968


100%|██████████| 2000/2000 [01:32<00:00, 21.51it/s]


Epoch 225 Mean Reward: -68.10967699432373


100%|██████████| 2000/2000 [01:33<00:00, 21.40it/s]


Epoch 226 Mean Reward: -66.9043783493042


100%|██████████| 2000/2000 [01:34<00:00, 21.20it/s]


Epoch 227 Mean Reward: -67.70239073944092


100%|██████████| 2000/2000 [01:33<00:00, 21.45it/s]


Epoch 228 Mean Reward: -67.18629296875


100%|██████████| 2000/2000 [01:33<00:00, 21.40it/s]


Epoch 229 Mean Reward: -66.92459244537353


100%|██████████| 2000/2000 [01:33<00:00, 21.42it/s]


Epoch 230 Mean Reward: -69.83732749176025


100%|██████████| 2000/2000 [01:33<00:00, 21.49it/s]


Epoch 231 Mean Reward: -67.32506084442139


100%|██████████| 2000/2000 [01:33<00:00, 21.47it/s]


Epoch 232 Mean Reward: -66.17858959197999


100%|██████████| 2000/2000 [01:32<00:00, 21.53it/s]


Epoch 233 Mean Reward: -69.56878173065185


100%|██████████| 2000/2000 [01:32<00:00, 21.55it/s]


Epoch 234 Mean Reward: -67.85507450866699


100%|██████████| 2000/2000 [01:32<00:00, 21.60it/s]


Epoch 235 Mean Reward: -67.1268442993164


100%|██████████| 2000/2000 [01:32<00:00, 21.71it/s]


Epoch 236 Mean Reward: -69.98714641571046


100%|██████████| 2000/2000 [01:33<00:00, 21.44it/s]


Epoch 237 Mean Reward: -71.26228812408448


100%|██████████| 2000/2000 [01:32<00:00, 21.51it/s]


Epoch 238 Mean Reward: -68.52584777832031


100%|██████████| 2000/2000 [01:32<00:00, 21.60it/s]


Epoch 239 Mean Reward: -66.8488958053589


100%|██████████| 2000/2000 [01:33<00:00, 21.32it/s]


Epoch 240 Mean Reward: -66.86202373504639
Epoch 240 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 240 test with depth buffer:
Test Episode 1 Reward: 54.1220703125
Test Episode 2 Reward: 297.33006286621094
Test Episode 3 Reward: 175.72219848632812
Test Episode 4 Reward: 54.1220703125
Test Episode 5 Reward: 287.08717346191406
Test Episode 6 Reward: 54.1220703125
Test Episode 7 Reward: 0.5543670654296875
Test Episode 8 Reward: 54.1220703125
Test Episode 9 Reward: 54.1220703125
Test Episode 10 Reward: 54.1220703125
Average Test Reward (with depth buffer:) 108.54262237548828
Epoch 240 test without depth buffer:
Test Episode 1 Reward: -115.4813232421875
Test Episode 2 Reward: -115.4813232421875
Test Episode 3 Reward: -115.50325012207031
Test Episode 4 Reward: -98.96327209472656
Test Episode 5 Reward: -115.4813232421875
Test Episode 6 Reward: -38.80876159667969
Test Episode 7 Reward: 155.92259216308594
Test Episode 8 Reward: -74.12690734863281
Test Episode 9 Reward: -92.362808227539

100%|██████████| 2000/2000 [01:47<00:00, 18.62it/s]


Epoch 241 Mean Reward: -58.33362260437012


100%|██████████| 2000/2000 [01:46<00:00, 18.78it/s]


Epoch 242 Mean Reward: -58.525804901123045


100%|██████████| 2000/2000 [01:47<00:00, 18.65it/s]


Epoch 243 Mean Reward: -56.584776710510255


100%|██████████| 2000/2000 [01:46<00:00, 18.72it/s]


Epoch 244 Mean Reward: -57.39444657897949


100%|██████████| 2000/2000 [01:49<00:00, 18.24it/s]


Epoch 245 Mean Reward: -55.493118858337404


100%|██████████| 2000/2000 [01:46<00:00, 18.70it/s]


Epoch 246 Mean Reward: -58.025412460327146


100%|██████████| 2000/2000 [01:47<00:00, 18.54it/s]


Epoch 247 Mean Reward: -54.08027785491943


100%|██████████| 2000/2000 [01:51<00:00, 17.98it/s]


Epoch 248 Mean Reward: -57.27702870178223


100%|██████████| 2000/2000 [01:48<00:00, 18.40it/s]


Epoch 249 Mean Reward: -57.45045369720459


100%|██████████| 2000/2000 [01:50<00:00, 18.15it/s]


Epoch 250 Mean Reward: -54.8737523727417


100%|██████████| 2000/2000 [01:49<00:00, 18.30it/s]


Epoch 251 Mean Reward: -53.67108280181885


100%|██████████| 2000/2000 [01:51<00:00, 18.00it/s]


Epoch 252 Mean Reward: -53.61313798522949


100%|██████████| 2000/2000 [01:51<00:00, 17.86it/s]


Epoch 253 Mean Reward: -51.001619735717775


100%|██████████| 2000/2000 [01:52<00:00, 17.82it/s]


Epoch 254 Mean Reward: -54.29248984527588


100%|██████████| 2000/2000 [01:53<00:00, 17.61it/s]


Epoch 255 Mean Reward: -49.389189254760744


100%|██████████| 2000/2000 [01:52<00:00, 17.77it/s]


Epoch 256 Mean Reward: -53.920741752624515


100%|██████████| 2000/2000 [01:53<00:00, 17.59it/s]


Epoch 257 Mean Reward: -55.819068252563476


100%|██████████| 2000/2000 [01:52<00:00, 17.78it/s]


Epoch 258 Mean Reward: -49.41915419006347


100%|██████████| 2000/2000 [01:52<00:00, 17.82it/s]


Epoch 259 Mean Reward: -48.2443472366333


100%|██████████| 2000/2000 [01:55<00:00, 17.26it/s]


Epoch 260 Mean Reward: -49.360065979003906
Epoch 260 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 260 test with depth buffer:
Test Episode 1 Reward: 77.33639526367188
Test Episode 2 Reward: 233.6959991455078
Test Episode 3 Reward: 4.767822265625
Test Episode 4 Reward: 77.33639526367188
Test Episode 5 Reward: 77.33639526367188
Test Episode 6 Reward: -65.4888916015625
Test Episode 7 Reward: -64.31031799316406
Test Episode 8 Reward: 58.32318115234375
Test Episode 9 Reward: 77.33639526367188
Test Episode 10 Reward: 77.33639526367188
Average Test Reward (with depth buffer:) 55.36697692871094
Epoch 260 test without depth buffer:
Test Episode 1 Reward: -3.1613006591796875
Test Episode 2 Reward: -112.77607727050781
Test Episode 3 Reward: -75.07443237304688
Test Episode 4 Reward: -85.22802734375
Test Episode 5 Reward: -3.1613006591796875
Test Episode 6 Reward: -3.1613006591796875
Test Episode 7 Reward: -3.1613006591796875
Test Episode 8 Reward: -115.70538330078125
Test Episode 9 Rewa

100%|██████████| 2000/2000 [01:51<00:00, 17.89it/s]


Epoch 261 Mean Reward: -58.54992919921875


100%|██████████| 2000/2000 [01:53<00:00, 17.65it/s]


Epoch 262 Mean Reward: -56.90250713348389


100%|██████████| 2000/2000 [01:53<00:00, 17.61it/s]


Epoch 263 Mean Reward: -57.21600937652588


100%|██████████| 2000/2000 [01:51<00:00, 18.00it/s]


Epoch 264 Mean Reward: -57.13267346191406


100%|██████████| 2000/2000 [01:50<00:00, 18.04it/s]


Epoch 265 Mean Reward: -61.0973081741333


100%|██████████| 2000/2000 [01:53<00:00, 17.58it/s]


Epoch 266 Mean Reward: -56.286922538757324


100%|██████████| 2000/2000 [01:53<00:00, 17.70it/s]


Epoch 267 Mean Reward: -58.12243658447266


100%|██████████| 2000/2000 [01:54<00:00, 17.40it/s]


Epoch 268 Mean Reward: -53.12682138824463


100%|██████████| 2000/2000 [01:55<00:00, 17.26it/s]


Epoch 269 Mean Reward: -53.797854530334476


100%|██████████| 2000/2000 [01:55<00:00, 17.37it/s]


Epoch 270 Mean Reward: -54.81971957397461


100%|██████████| 2000/2000 [02:04<00:00, 16.03it/s]


Epoch 271 Mean Reward: -51.366832321166996


100%|██████████| 2000/2000 [02:24<00:00, 13.82it/s]


Epoch 272 Mean Reward: -51.13732776641846


100%|██████████| 2000/2000 [02:02<00:00, 16.35it/s]


Epoch 273 Mean Reward: -47.33930623626709


100%|██████████| 2000/2000 [02:02<00:00, 16.35it/s]


Epoch 274 Mean Reward: -50.506453918457034


100%|██████████| 2000/2000 [01:58<00:00, 16.81it/s]


Epoch 275 Mean Reward: -47.030900382995604


100%|██████████| 2000/2000 [02:01<00:00, 16.44it/s]


Epoch 276 Mean Reward: -44.7530901260376


100%|██████████| 2000/2000 [02:03<00:00, 16.16it/s]


Epoch 277 Mean Reward: -45.94830931091309


100%|██████████| 2000/2000 [02:00<00:00, 16.66it/s]


Epoch 278 Mean Reward: -47.95554486846924


100%|██████████| 2000/2000 [02:01<00:00, 16.46it/s]


Epoch 279 Mean Reward: -44.04282928466797


100%|██████████| 2000/2000 [02:01<00:00, 16.47it/s]


Epoch 280 Mean Reward: -47.52275901794434
Epoch 280 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 280 test with depth buffer:
Test Episode 1 Reward: -0.8689422607421875
Test Episode 2 Reward: 125.51893615722656
Test Episode 3 Reward: 125.51893615722656
Test Episode 4 Reward: 304.75669860839844
Test Episode 5 Reward: -4.34051513671875
Test Episode 6 Reward: -10.669601440429688
Test Episode 7 Reward: 277.15916442871094
Test Episode 8 Reward: 125.51893615722656
Test Episode 9 Reward: 125.51893615722656
Test Episode 10 Reward: 96.07530212402344
Average Test Reward (with depth buffer:) 116.41878509521484
Epoch 280 test without depth buffer:
Test Episode 1 Reward: 233.43382263183594
Test Episode 2 Reward: 83.87101745605469
Test Episode 3 Reward: 57.01008605957031
Test Episode 4 Reward: 57.01008605957031
Test Episode 5 Reward: -10.48052978515625
Test Episode 6 Reward: 57.01008605957031
Test Episode 7 Reward: 4.655487060546875
Test Episode 8 Reward: 119.79995727539062
Test Episode 9 

100%|██████████| 2000/2000 [02:08<00:00, 15.57it/s]


Epoch 281 Mean Reward: -44.75630953979492


100%|██████████| 2000/2000 [02:04<00:00, 16.01it/s]


Epoch 282 Mean Reward: -46.193312629699705


100%|██████████| 2000/2000 [02:01<00:00, 16.46it/s]


Epoch 283 Mean Reward: -49.47257395935058


100%|██████████| 2000/2000 [02:07<00:00, 15.65it/s]


Epoch 284 Mean Reward: -41.8058482208252


100%|██████████| 2000/2000 [02:06<00:00, 15.75it/s]


Epoch 285 Mean Reward: -42.954833358764645


100%|██████████| 2000/2000 [02:05<00:00, 15.88it/s]


Epoch 286 Mean Reward: -43.050649154663084


100%|██████████| 2000/2000 [02:04<00:00, 16.07it/s]


Epoch 287 Mean Reward: -43.932128730773925


100%|██████████| 2000/2000 [02:10<00:00, 15.36it/s]


Epoch 288 Mean Reward: -41.084288330078124


100%|██████████| 2000/2000 [02:07<00:00, 15.71it/s]


Epoch 289 Mean Reward: -48.66330610656738


100%|██████████| 2000/2000 [02:10<00:00, 15.28it/s]


Epoch 290 Mean Reward: -44.38966954040527


100%|██████████| 2000/2000 [02:08<00:00, 15.59it/s]


Epoch 291 Mean Reward: -47.41254302215576


100%|██████████| 2000/2000 [02:08<00:00, 15.62it/s]


Epoch 292 Mean Reward: -45.28600634002686


100%|██████████| 2000/2000 [02:07<00:00, 15.70it/s]


Epoch 293 Mean Reward: -45.157159736633304


100%|██████████| 2000/2000 [02:08<00:00, 15.54it/s]


Epoch 294 Mean Reward: -44.55884898376465


100%|██████████| 2000/2000 [02:09<00:00, 15.48it/s]


Epoch 295 Mean Reward: -44.00709506225586


100%|██████████| 2000/2000 [02:10<00:00, 15.38it/s]


Epoch 296 Mean Reward: -47.079160301208496


100%|██████████| 2000/2000 [02:13<00:00, 14.97it/s]


Epoch 297 Mean Reward: -41.635132362365724


100%|██████████| 2000/2000 [02:12<00:00, 15.12it/s]


Epoch 298 Mean Reward: -45.448249481201174


100%|██████████| 2000/2000 [02:09<00:00, 15.42it/s]


Epoch 299 Mean Reward: -43.784757835388184


100%|██████████| 2000/2000 [02:14<00:00, 14.86it/s]


Epoch 300 Mean Reward: -39.66772447967529
Epoch 300 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 300 test with depth buffer:
Test Episode 1 Reward: -59.45887756347656
Test Episode 2 Reward: -50.623809814453125
Test Episode 3 Reward: -50.623809814453125
Test Episode 4 Reward: -50.623809814453125
Test Episode 5 Reward: -83.035888671875
Test Episode 6 Reward: -115.98536682128906
Test Episode 7 Reward: -50.623809814453125
Test Episode 8 Reward: -50.623809814453125
Test Episode 9 Reward: -115.72959899902344
Test Episode 10 Reward: -115.91278076171875
Average Test Reward (with depth buffer:) -74.32415618896485
Epoch 300 test without depth buffer:
Test Episode 1 Reward: 48.82307434082031
Test Episode 2 Reward: 48.82307434082031
Test Episode 3 Reward: -76.95109558105469
Test Episode 4 Reward: -115.99549865722656
Test Episode 5 Reward: 48.82307434082031
Test Episode 6 Reward: 24.258255004882812
Test Episode 7 Reward: 48.82307434082031
Test Episode 8 Reward: 48.82307434082031
Test Epi

100%|██████████| 2000/2000 [02:13<00:00, 15.01it/s]


Epoch 301 Mean Reward: -45.22851705169678


100%|██████████| 2000/2000 [02:11<00:00, 15.25it/s]


Epoch 302 Mean Reward: -44.48801335144043


100%|██████████| 2000/2000 [02:14<00:00, 14.82it/s]


Epoch 303 Mean Reward: -39.50685321807861


100%|██████████| 2000/2000 [02:16<00:00, 14.61it/s]


Epoch 304 Mean Reward: -40.659690048217776


100%|██████████| 2000/2000 [02:14<00:00, 14.86it/s]


Epoch 305 Mean Reward: -38.622332298278806


100%|██████████| 2000/2000 [02:14<00:00, 14.82it/s]


Epoch 306 Mean Reward: -42.10381693267822


100%|██████████| 2000/2000 [02:16<00:00, 14.61it/s]


Epoch 307 Mean Reward: -37.13005727386474


100%|██████████| 2000/2000 [02:16<00:00, 14.64it/s]


Epoch 308 Mean Reward: -37.38165030670166


100%|██████████| 2000/2000 [02:17<00:00, 14.52it/s]


Epoch 309 Mean Reward: -39.92918202209473


100%|██████████| 2000/2000 [02:20<00:00, 14.28it/s]


Epoch 310 Mean Reward: -38.999618949890134


100%|██████████| 2000/2000 [02:31<00:00, 13.20it/s]


Epoch 311 Mean Reward: -31.302432945251464


100%|██████████| 2000/2000 [02:30<00:00, 13.26it/s]


Epoch 312 Mean Reward: -34.04190245819092


100%|██████████| 2000/2000 [02:31<00:00, 13.22it/s]


Epoch 313 Mean Reward: -32.34373671722412


100%|██████████| 2000/2000 [02:32<00:00, 13.09it/s]


Epoch 314 Mean Reward: -29.43504849243164


100%|██████████| 2000/2000 [02:32<00:00, 13.10it/s]


Epoch 315 Mean Reward: -33.01091938781738


100%|██████████| 2000/2000 [02:33<00:00, 13.03it/s]


Epoch 316 Mean Reward: -29.8219796295166


100%|██████████| 2000/2000 [02:26<00:00, 13.68it/s]


Epoch 317 Mean Reward: -39.40572668457031


100%|██████████| 2000/2000 [02:29<00:00, 13.38it/s]


Epoch 318 Mean Reward: -35.32775750732422


100%|██████████| 2000/2000 [02:31<00:00, 13.17it/s]


Epoch 319 Mean Reward: -30.471783058166505


100%|██████████| 2000/2000 [02:35<00:00, 12.86it/s]


Epoch 320 Mean Reward: -27.042687034606935
Epoch 320 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 320 test with depth buffer:
Test Episode 1 Reward: -101.19758605957031
Test Episode 2 Reward: -101.19758605957031
Test Episode 3 Reward: -100.76948547363281
Test Episode 4 Reward: -107.39816284179688
Test Episode 5 Reward: -101.19758605957031
Test Episode 6 Reward: -101.19758605957031
Test Episode 7 Reward: -101.19758605957031
Test Episode 8 Reward: -101.19758605957031
Test Episode 9 Reward: -101.19758605957031
Test Episode 10 Reward: -115.9571533203125
Average Test Reward (with depth buffer:) -103.25079040527343
Epoch 320 test without depth buffer:
Test Episode 1 Reward: 8.005813598632812
Test Episode 2 Reward: -91.71945190429688
Test Episode 3 Reward: -91.71945190429688
Test Episode 4 Reward: -91.71945190429688
Test Episode 5 Reward: -91.71945190429688
Test Episode 6 Reward: -115.99623107910156
Test Episode 7 Reward: -113.44981384277344
Test Episode 8 Reward: -69.0247344970703

100%|██████████| 2000/2000 [02:26<00:00, 13.68it/s]


Epoch 321 Mean Reward: -35.78165593719483


100%|██████████| 2000/2000 [02:32<00:00, 13.08it/s]


Epoch 322 Mean Reward: -32.81118886566162


100%|██████████| 2000/2000 [02:29<00:00, 13.41it/s]


Epoch 323 Mean Reward: -34.356118774414064


100%|██████████| 2000/2000 [02:32<00:00, 13.15it/s]


Epoch 324 Mean Reward: -34.757612968444825


100%|██████████| 2000/2000 [02:30<00:00, 13.31it/s]


Epoch 325 Mean Reward: -36.45879859924317


100%|██████████| 2000/2000 [02:30<00:00, 13.31it/s]


Epoch 326 Mean Reward: -40.02320554351807


100%|██████████| 2000/2000 [02:32<00:00, 13.09it/s]


Epoch 327 Mean Reward: -37.14059035491943


100%|██████████| 2000/2000 [02:33<00:00, 13.05it/s]


Epoch 328 Mean Reward: -35.43164837646484


100%|██████████| 2000/2000 [02:34<00:00, 12.92it/s]


Epoch 329 Mean Reward: -35.10780143737793


100%|██████████| 2000/2000 [02:30<00:00, 13.28it/s]


Epoch 330 Mean Reward: -40.2101840133667


100%|██████████| 2000/2000 [02:32<00:00, 13.11it/s]


Epoch 331 Mean Reward: -38.74173316192627


100%|██████████| 2000/2000 [02:43<00:00, 12.20it/s]


Epoch 332 Mean Reward: -33.514317665100094


100%|██████████| 2000/2000 [02:43<00:00, 12.20it/s]


Epoch 333 Mean Reward: -32.617498512268064


100%|██████████| 2000/2000 [02:42<00:00, 12.33it/s]


Epoch 334 Mean Reward: -32.80471300506592


100%|██████████| 2000/2000 [02:45<00:00, 12.09it/s]


Epoch 335 Mean Reward: -33.94796873474121


100%|██████████| 2000/2000 [02:46<00:00, 12.03it/s]


Epoch 336 Mean Reward: -32.19256338500976


100%|██████████| 2000/2000 [02:45<00:00, 12.11it/s]


Epoch 337 Mean Reward: -29.589170463562013


100%|██████████| 2000/2000 [02:43<00:00, 12.23it/s]


Epoch 338 Mean Reward: -33.53502532196045


100%|██████████| 2000/2000 [02:39<00:00, 12.56it/s]


Epoch 339 Mean Reward: -32.90150458526611


100%|██████████| 2000/2000 [02:41<00:00, 12.37it/s]


Epoch 340 Mean Reward: -33.8305905380249
Epoch 340 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 340 test with depth buffer:
Test Episode 1 Reward: -115.89210510253906
Test Episode 2 Reward: -112.37985229492188
Test Episode 3 Reward: -112.37985229492188
Test Episode 4 Reward: -115.95631408691406
Test Episode 5 Reward: -60.466033935546875
Test Episode 6 Reward: -115.97789001464844
Test Episode 7 Reward: -115.955810546875
Test Episode 8 Reward: -112.37985229492188
Test Episode 9 Reward: -115.99856567382812
Test Episode 10 Reward: -74.06425476074219
Average Test Reward (with depth buffer:) -105.14505310058594
Epoch 340 test without depth buffer:
Test Episode 1 Reward: -101.69076538085938
Test Episode 2 Reward: -92.24763488769531
Test Episode 3 Reward: -48.345703125
Test Episode 4 Reward: -105.6875
Test Episode 5 Reward: -110.1014404296875
Test Episode 6 Reward: -48.345703125
Test Episode 7 Reward: -114.36714172363281
Test Episode 8 Reward: -112.87808227539062
Test Episode 9 Rewa

100%|██████████| 2000/2000 [02:45<00:00, 12.10it/s]


Epoch 341 Mean Reward: -32.646196754455566


100%|██████████| 2000/2000 [02:48<00:00, 11.88it/s]


Epoch 342 Mean Reward: -27.521873207092284


100%|██████████| 2000/2000 [02:46<00:00, 12.02it/s]


Epoch 343 Mean Reward: -25.53842600250244


100%|██████████| 2000/2000 [02:53<00:00, 11.55it/s]


Epoch 344 Mean Reward: -31.05707511138916


100%|██████████| 2000/2000 [02:49<00:00, 11.78it/s]


Epoch 345 Mean Reward: -30.95993618774414


100%|██████████| 2000/2000 [02:55<00:00, 11.42it/s]


Epoch 346 Mean Reward: -26.693217346191407


100%|██████████| 2000/2000 [02:50<00:00, 11.71it/s]


Epoch 347 Mean Reward: -31.67609061431885


100%|██████████| 2000/2000 [02:52<00:00, 11.59it/s]


Epoch 348 Mean Reward: -24.03287028503418


100%|██████████| 2000/2000 [02:53<00:00, 11.50it/s]


Epoch 349 Mean Reward: -25.976110626220702


100%|██████████| 2000/2000 [02:47<00:00, 11.95it/s]


Epoch 350 Mean Reward: -32.58780606842041


100%|██████████| 2000/2000 [02:54<00:00, 11.44it/s]


Epoch 351 Mean Reward: -24.34611589050293


100%|██████████| 2000/2000 [02:56<00:00, 11.34it/s]


Epoch 352 Mean Reward: -33.11117303466797


100%|██████████| 2000/2000 [02:51<00:00, 11.66it/s]


Epoch 353 Mean Reward: -29.51383708190918


100%|██████████| 2000/2000 [03:00<00:00, 11.11it/s]


Epoch 354 Mean Reward: -24.07830490875244


100%|██████████| 2000/2000 [02:57<00:00, 11.27it/s]


Epoch 355 Mean Reward: -25.522049446105957


100%|██████████| 2000/2000 [02:56<00:00, 11.35it/s]


Epoch 356 Mean Reward: -26.901173751831056


100%|██████████| 2000/2000 [03:05<00:00, 10.81it/s]


Epoch 357 Mean Reward: -23.55707893371582


100%|██████████| 2000/2000 [02:56<00:00, 11.32it/s]


Epoch 358 Mean Reward: -23.541461166381836


100%|██████████| 2000/2000 [03:01<00:00, 11.00it/s]


Epoch 359 Mean Reward: -29.887457374572755


100%|██████████| 2000/2000 [03:00<00:00, 11.11it/s]


Epoch 360 Mean Reward: -26.22022455596924
Epoch 360 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 360 test with depth buffer:
Test Episode 1 Reward: -115.99716186523438
Test Episode 2 Reward: -115.97740173339844
Test Episode 3 Reward: -115.97477722167969
Test Episode 4 Reward: -115.97477722167969
Test Episode 5 Reward: -86.50796508789062
Test Episode 6 Reward: -115.90779113769531
Test Episode 7 Reward: -115.91291809082031
Test Episode 8 Reward: -89.8646240234375
Test Episode 9 Reward: -109.56710815429688
Test Episode 10 Reward: -115.97477722167969
Average Test Reward (with depth buffer:) -109.76593017578125
Epoch 360 test without depth buffer:
Test Episode 1 Reward: -95.15676879882812
Test Episode 2 Reward: -95.15676879882812
Test Episode 3 Reward: -90.66635131835938
Test Episode 4 Reward: -92.76126098632812
Test Episode 5 Reward: -95.15676879882812
Test Episode 6 Reward: -95.15676879882812
Test Episode 7 Reward: -95.15676879882812
Test Episode 8 Reward: -96.11308288574219
Te

100%|██████████| 2000/2000 [03:02<00:00, 10.94it/s]


Epoch 361 Mean Reward: -27.652457374572755


100%|██████████| 2000/2000 [03:02<00:00, 10.95it/s]


Epoch 362 Mean Reward: -27.230609161376954


100%|██████████| 2000/2000 [02:54<00:00, 11.48it/s]


Epoch 363 Mean Reward: -28.397612228393555


100%|██████████| 2000/2000 [03:02<00:00, 10.99it/s]


Epoch 364 Mean Reward: -24.52871368408203


100%|██████████| 2000/2000 [03:05<00:00, 10.78it/s]


Epoch 365 Mean Reward: -28.24755248260498


100%|██████████| 2000/2000 [03:05<00:00, 10.78it/s]


Epoch 366 Mean Reward: -26.2667820892334


100%|██████████| 2000/2000 [03:13<00:00, 10.31it/s]


Epoch 367 Mean Reward: -21.215468215942384


100%|██████████| 2000/2000 [03:03<00:00, 10.92it/s]


Epoch 368 Mean Reward: -28.277755012512205


100%|██████████| 2000/2000 [03:07<00:00, 10.68it/s]


Epoch 369 Mean Reward: -23.250159339904783


100%|██████████| 2000/2000 [03:11<00:00, 10.42it/s]


Epoch 370 Mean Reward: -19.4954019241333


100%|██████████| 2000/2000 [03:09<00:00, 10.55it/s]


Epoch 371 Mean Reward: -32.40404914855957


100%|██████████| 2000/2000 [03:08<00:00, 10.60it/s]


Epoch 372 Mean Reward: -30.393417800903322


100%|██████████| 2000/2000 [03:20<00:00,  9.99it/s]


Epoch 373 Mean Reward: -31.635911361694337


100%|██████████| 2000/2000 [03:11<00:00, 10.47it/s]


Epoch 374 Mean Reward: -31.45566967010498


100%|██████████| 2000/2000 [03:18<00:00, 10.09it/s]


Epoch 375 Mean Reward: -24.90693359375


100%|██████████| 2000/2000 [03:13<00:00, 10.35it/s]


Epoch 376 Mean Reward: -31.42431406402588


100%|██████████| 2000/2000 [03:13<00:00, 10.36it/s]


Epoch 377 Mean Reward: -24.87888592529297


100%|██████████| 2000/2000 [03:18<00:00, 10.09it/s]


Epoch 378 Mean Reward: -34.30138870239258


100%|██████████| 2000/2000 [03:20<00:00,  9.98it/s]


Epoch 379 Mean Reward: -24.203320220947266


100%|██████████| 2000/2000 [03:13<00:00, 10.34it/s]


Epoch 380 Mean Reward: -26.934408111572267
Epoch 380 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 380 test with depth buffer:
Test Episode 1 Reward: -115.97640991210938
Test Episode 2 Reward: -115.97640991210938
Test Episode 3 Reward: -110.997802734375
Test Episode 4 Reward: -115.97640991210938
Test Episode 5 Reward: -115.97640991210938
Test Episode 6 Reward: -115.97640991210938
Test Episode 7 Reward: -109.78236389160156
Test Episode 8 Reward: -115.97640991210938
Test Episode 9 Reward: -115.96528625488281
Test Episode 10 Reward: -115.93206787109375
Average Test Reward (with depth buffer:) -114.85359802246094
Epoch 380 test without depth buffer:
Test Episode 1 Reward: -115.99592590332031
Test Episode 2 Reward: -115.99052429199219
Test Episode 3 Reward: -111.138916015625
Test Episode 4 Reward: -114.28689575195312
Test Episode 5 Reward: -111.138916015625
Test Episode 6 Reward: -111.138916015625
Test Episode 7 Reward: -111.138916015625
Test Episode 8 Reward: -111.138916015625
Te

100%|██████████| 2000/2000 [03:10<00:00, 10.51it/s]


Epoch 381 Mean Reward: -34.97055963897705


100%|██████████| 2000/2000 [03:22<00:00,  9.86it/s]


Epoch 382 Mean Reward: -31.529389457702635


100%|██████████| 2000/2000 [03:17<00:00, 10.11it/s]


Epoch 383 Mean Reward: -29.611512229919434


100%|██████████| 2000/2000 [03:20<00:00, 10.00it/s]


Epoch 384 Mean Reward: -32.24065099334717


100%|██████████| 2000/2000 [03:23<00:00,  9.81it/s]


Epoch 385 Mean Reward: -27.084148231506347


100%|██████████| 2000/2000 [03:19<00:00, 10.02it/s]


Epoch 386 Mean Reward: -36.192392738342285


100%|██████████| 2000/2000 [03:14<00:00, 10.26it/s]


Epoch 387 Mean Reward: -32.00995294189453


100%|██████████| 2000/2000 [03:18<00:00, 10.09it/s]


Epoch 388 Mean Reward: -31.680898643493652


100%|██████████| 2000/2000 [03:19<00:00, 10.03it/s]


Epoch 389 Mean Reward: -29.58292876434326


100%|██████████| 2000/2000 [03:24<00:00,  9.76it/s]


Epoch 390 Mean Reward: -23.88666535949707


100%|██████████| 2000/2000 [03:19<00:00, 10.03it/s]


Epoch 391 Mean Reward: -35.01193212890625


100%|██████████| 2000/2000 [03:20<00:00,  9.99it/s]


Epoch 392 Mean Reward: -37.237622581481936


100%|██████████| 2000/2000 [03:22<00:00,  9.89it/s]


Epoch 393 Mean Reward: -38.45503940582275


100%|██████████| 2000/2000 [03:17<00:00, 10.11it/s]


Epoch 394 Mean Reward: -34.905222595214845


100%|██████████| 2000/2000 [03:28<00:00,  9.58it/s]


Epoch 395 Mean Reward: -32.51854473114014


100%|██████████| 2000/2000 [03:18<00:00, 10.08it/s]


Epoch 396 Mean Reward: -32.99785640716553


100%|██████████| 2000/2000 [03:22<00:00,  9.86it/s]


Epoch 397 Mean Reward: -33.31618029785156


100%|██████████| 2000/2000 [03:22<00:00,  9.86it/s]


Epoch 398 Mean Reward: -37.62375547790527


100%|██████████| 2000/2000 [03:30<00:00,  9.49it/s]


Epoch 399 Mean Reward: -35.34025357818604


100%|██████████| 2000/2000 [03:24<00:00,  9.78it/s]


Epoch 400 Mean Reward: -38.06660108184814
Epoch 400 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 400 test with depth buffer:
Test Episode 1 Reward: -102.6314697265625
Test Episode 2 Reward: -115.99932861328125
Test Episode 3 Reward: -115.99653625488281
Test Episode 4 Reward: -115.9979248046875
Test Episode 5 Reward: -109.87423706054688
Test Episode 6 Reward: -109.87423706054688
Test Episode 7 Reward: -109.87423706054688
Test Episode 8 Reward: -109.87423706054688
Test Episode 9 Reward: -109.87423706054688
Test Episode 10 Reward: -109.87423706054688
Average Test Reward (with depth buffer:) -110.98706817626953
Epoch 400 test without depth buffer:
Test Episode 1 Reward: -115.99089050292969
Test Episode 2 Reward: -115.9862060546875
Test Episode 3 Reward: -115.99729919433594
Test Episode 4 Reward: -115.99729919433594
Test Episode 5 Reward: -115.97721862792969
Test Episode 6 Reward: -115.97633361816406
Test Episode 7 Reward: -115.99729919433594
Test Episode 8 Reward: -115.997299194

100%|██████████| 2000/2000 [03:14<00:00, 10.28it/s]


Epoch 401 Mean Reward: -38.43157106781006


100%|██████████| 2000/2000 [03:27<00:00,  9.63it/s]


Epoch 402 Mean Reward: -31.64501976776123


100%|██████████| 2000/2000 [03:38<00:00,  9.15it/s]


Epoch 403 Mean Reward: -36.47940184020996


100%|██████████| 2000/2000 [03:31<00:00,  9.46it/s]


Epoch 404 Mean Reward: -30.82661959075928


100%|██████████| 2000/2000 [03:29<00:00,  9.56it/s]


Epoch 405 Mean Reward: -35.295952323913575


100%|██████████| 2000/2000 [03:31<00:00,  9.45it/s]


Epoch 406 Mean Reward: -34.88125381469727


100%|██████████| 2000/2000 [03:41<00:00,  9.03it/s]


Epoch 407 Mean Reward: -36.52744918060303


100%|██████████| 2000/2000 [03:31<00:00,  9.46it/s]


Epoch 408 Mean Reward: -31.36520429992676


100%|██████████| 2000/2000 [03:29<00:00,  9.55it/s]


Epoch 409 Mean Reward: -33.25043767547607


100%|██████████| 2000/2000 [03:32<00:00,  9.39it/s]


Epoch 410 Mean Reward: -34.33245909118652


100%|██████████| 2000/2000 [03:22<00:00,  9.88it/s]


Epoch 411 Mean Reward: -37.656940505981446


100%|██████████| 2000/2000 [03:26<00:00,  9.70it/s]


Epoch 412 Mean Reward: -38.57378840637207


100%|██████████| 2000/2000 [03:32<00:00,  9.41it/s]


Epoch 413 Mean Reward: -39.68615838623047


100%|██████████| 2000/2000 [03:26<00:00,  9.68it/s]


Epoch 414 Mean Reward: -37.88016465759277


100%|██████████| 2000/2000 [03:32<00:00,  9.40it/s]


Epoch 415 Mean Reward: -38.1417692565918


100%|██████████| 2000/2000 [03:37<00:00,  9.21it/s]


Epoch 416 Mean Reward: -32.19286315155029


100%|██████████| 2000/2000 [03:32<00:00,  9.41it/s]


Epoch 417 Mean Reward: -41.36688726043701


100%|██████████| 2000/2000 [03:20<00:00,  9.97it/s]


Epoch 418 Mean Reward: -35.29239236450195


100%|██████████| 2000/2000 [03:42<00:00,  9.00it/s]


Epoch 419 Mean Reward: -31.65098236846924


100%|██████████| 2000/2000 [03:39<00:00,  9.11it/s]


Epoch 420 Mean Reward: -31.17984655761719
Epoch 420 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 420 test with depth buffer:
Test Episode 1 Reward: -108.16288757324219
Test Episode 2 Reward: 87.38067626953125
Test Episode 3 Reward: -115.99713134765625
Test Episode 4 Reward: -115.99713134765625
Test Episode 5 Reward: -115.97563171386719
Test Episode 6 Reward: -115.97563171386719
Test Episode 7 Reward: -115.99935913085938
Test Episode 8 Reward: -115.97563171386719
Test Episode 9 Reward: -115.97563171386719
Test Episode 10 Reward: -115.97563171386719
Average Test Reward (with depth buffer:) -94.86539916992187
Epoch 420 test without depth buffer:
Test Episode 1 Reward: -115.99073791503906
Test Episode 2 Reward: -115.98709106445312
Test Episode 3 Reward: -104.62141418457031
Test Episode 4 Reward: -115.98019409179688
Test Episode 5 Reward: -104.62141418457031
Test Episode 6 Reward: -104.62141418457031
Test Episode 7 Reward: -115.93348693847656
Test Episode 8 Reward: -104.621414184

100%|██████████| 2000/2000 [03:46<00:00,  8.83it/s]


Epoch 421 Mean Reward: -43.9837864151001


100%|██████████| 2000/2000 [03:28<00:00,  9.61it/s]


Epoch 422 Mean Reward: -38.7937311706543


100%|██████████| 2000/2000 [03:31<00:00,  9.45it/s]


Epoch 423 Mean Reward: -32.879744972229005


100%|██████████| 2000/2000 [03:36<00:00,  9.22it/s]


Epoch 424 Mean Reward: -36.45558522796631


100%|██████████| 2000/2000 [03:32<00:00,  9.39it/s]


Epoch 425 Mean Reward: -39.039396530151365


100%|██████████| 2000/2000 [03:28<00:00,  9.60it/s]


Epoch 426 Mean Reward: -41.952655921936035


100%|██████████| 2000/2000 [03:29<00:00,  9.53it/s]


Epoch 427 Mean Reward: -40.636328071594235


100%|██████████| 2000/2000 [03:26<00:00,  9.69it/s]


Epoch 428 Mean Reward: -41.2952410736084


100%|██████████| 2000/2000 [03:33<00:00,  9.35it/s]


Epoch 429 Mean Reward: -33.86927709197998


100%|██████████| 2000/2000 [03:35<00:00,  9.27it/s]


Epoch 430 Mean Reward: -34.98720901489258


100%|██████████| 2000/2000 [03:30<00:00,  9.52it/s]


Epoch 431 Mean Reward: -33.95327301025391


100%|██████████| 2000/2000 [03:33<00:00,  9.38it/s]


Epoch 432 Mean Reward: -40.20089447784424


100%|██████████| 2000/2000 [03:25<00:00,  9.71it/s]


Epoch 433 Mean Reward: -38.46532740020752


100%|██████████| 2000/2000 [03:27<00:00,  9.63it/s]


Epoch 434 Mean Reward: -31.819330421447752


100%|██████████| 2000/2000 [03:30<00:00,  9.51it/s]


Epoch 435 Mean Reward: -36.428234924316406


100%|██████████| 2000/2000 [03:35<00:00,  9.30it/s]


Epoch 436 Mean Reward: -31.085716674804686


100%|██████████| 2000/2000 [03:38<00:00,  9.17it/s]


Epoch 437 Mean Reward: -27.56602680206299


100%|██████████| 2000/2000 [03:37<00:00,  9.19it/s]


Epoch 438 Mean Reward: -26.846944564819335


100%|██████████| 2000/2000 [03:33<00:00,  9.39it/s]


Epoch 439 Mean Reward: -28.288770011901857


100%|██████████| 2000/2000 [03:42<00:00,  8.98it/s]


Epoch 440 Mean Reward: -40.65008977508545
Epoch 440 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 440 test with depth buffer:
Test Episode 1 Reward: -62.182708740234375
Test Episode 2 Reward: -86.7100830078125
Test Episode 3 Reward: -115.99943542480469
Test Episode 4 Reward: -62.182708740234375
Test Episode 5 Reward: -102.49546813964844
Test Episode 6 Reward: -53.44017028808594
Test Episode 7 Reward: -62.182708740234375
Test Episode 8 Reward: -25.3310546875
Test Episode 9 Reward: -62.182708740234375
Test Episode 10 Reward: -62.182708740234375
Average Test Reward (with depth buffer:) -69.48897552490234
Epoch 440 test without depth buffer:
Test Episode 1 Reward: -104.77995300292969
Test Episode 2 Reward: -102.58537292480469
Test Episode 3 Reward: -104.77995300292969
Test Episode 4 Reward: -72.48788452148438
Test Episode 5 Reward: -114.0947265625
Test Episode 6 Reward: -82.07557678222656
Test Episode 7 Reward: -59.52455139160156
Test Episode 8 Reward: -104.77995300292969
Test Ep

100%|██████████| 2000/2000 [03:30<00:00,  9.51it/s]


Epoch 441 Mean Reward: -33.80430165863037


100%|██████████| 2000/2000 [03:41<00:00,  9.04it/s]


Epoch 442 Mean Reward: -33.677018318176266


100%|██████████| 2000/2000 [04:00<00:00,  8.33it/s]


Epoch 443 Mean Reward: -38.25958854675293


100%|██████████| 2000/2000 [03:41<00:00,  9.02it/s]


Epoch 444 Mean Reward: -35.957056358337404


100%|██████████| 2000/2000 [03:34<00:00,  9.30it/s]


Epoch 445 Mean Reward: -30.42596706390381


100%|██████████| 2000/2000 [03:34<00:00,  9.30it/s]


Epoch 446 Mean Reward: -32.598663864135744


100%|██████████| 2000/2000 [04:06<00:00,  8.13it/s]


Epoch 447 Mean Reward: -29.21130947113037


100%|██████████| 2000/2000 [04:01<00:00,  8.28it/s]


Epoch 448 Mean Reward: -26.97224927520752


100%|██████████| 2000/2000 [03:37<00:00,  9.21it/s]


Epoch 449 Mean Reward: -31.135830894470214


100%|██████████| 2000/2000 [03:40<00:00,  9.06it/s]


Epoch 450 Mean Reward: -30.322738319396972


100%|██████████| 2000/2000 [03:30<00:00,  9.50it/s]


Epoch 451 Mean Reward: -41.94399678039551


100%|██████████| 2000/2000 [03:35<00:00,  9.30it/s]


Epoch 452 Mean Reward: -37.93806616210937


100%|██████████| 2000/2000 [03:39<00:00,  9.10it/s]


Epoch 453 Mean Reward: -37.07570001220703


100%|██████████| 2000/2000 [03:38<00:00,  9.17it/s]


Epoch 454 Mean Reward: -39.94145267486572


100%|██████████| 2000/2000 [03:33<00:00,  9.36it/s]


Epoch 455 Mean Reward: -35.368869300842285


100%|██████████| 2000/2000 [03:47<00:00,  8.79it/s]


Epoch 456 Mean Reward: -39.94254058074951


100%|██████████| 2000/2000 [03:32<00:00,  9.42it/s]


Epoch 457 Mean Reward: -41.813249565124515


100%|██████████| 2000/2000 [03:39<00:00,  9.10it/s]


Epoch 458 Mean Reward: -34.074758781433104


100%|██████████| 2000/2000 [03:41<00:00,  9.02it/s]


Epoch 459 Mean Reward: -33.29758042907715


100%|██████████| 2000/2000 [03:51<00:00,  8.63it/s]


Epoch 460 Mean Reward: -37.68725361633301
Epoch 460 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 460 test with depth buffer:
Test Episode 1 Reward: -113.73068237304688
Test Episode 2 Reward: -113.73068237304688
Test Episode 3 Reward: -115.9979248046875
Test Episode 4 Reward: -115.99856567382812
Test Episode 5 Reward: -115.99752807617188
Test Episode 6 Reward: -113.73068237304688
Test Episode 7 Reward: -115.99856567382812
Test Episode 8 Reward: -113.73068237304688
Test Episode 9 Reward: -115.90237426757812
Test Episode 10 Reward: -113.73068237304688
Average Test Reward (with depth buffer:) -114.8548370361328
Epoch 460 test without depth buffer:
Test Episode 1 Reward: -108.64530944824219
Test Episode 2 Reward: 4.618927001953125
Test Episode 3 Reward: -108.64530944824219
Test Episode 4 Reward: -66.6949462890625
Test Episode 5 Reward: -108.64530944824219
Test Episode 6 Reward: -115.88919067382812
Test Episode 7 Reward: -108.64530944824219
Test Episode 8 Reward: -79.9883880615234

100%|██████████| 2000/2000 [04:13<00:00,  7.89it/s]


Epoch 461 Mean Reward: -31.523199371337892


100%|██████████| 2000/2000 [04:12<00:00,  7.92it/s]


Epoch 462 Mean Reward: -36.08829192352295


100%|██████████| 2000/2000 [04:04<00:00,  8.19it/s]


Epoch 463 Mean Reward: -34.44896685028076


100%|██████████| 2000/2000 [04:05<00:00,  8.16it/s]


Epoch 464 Mean Reward: -34.892443336486814


100%|██████████| 2000/2000 [04:06<00:00,  8.12it/s]


Epoch 465 Mean Reward: -34.694820442199706


100%|██████████| 2000/2000 [04:06<00:00,  8.11it/s]


Epoch 466 Mean Reward: -36.991110374450685


100%|██████████| 2000/2000 [04:10<00:00,  7.98it/s]


Epoch 467 Mean Reward: -33.80912889099121


100%|██████████| 2000/2000 [04:13<00:00,  7.88it/s]


Epoch 468 Mean Reward: -32.13774709320068


100%|██████████| 2000/2000 [04:03<00:00,  8.20it/s]


Epoch 469 Mean Reward: -31.254660972595214


100%|██████████| 2000/2000 [04:18<00:00,  7.74it/s]


Epoch 470 Mean Reward: -30.489385932922364


100%|██████████| 2000/2000 [04:29<00:00,  7.43it/s]


Epoch 471 Mean Reward: -15.31807420349121


100%|██████████| 2000/2000 [04:27<00:00,  7.48it/s]


Epoch 472 Mean Reward: -20.523843841552733


100%|██████████| 2000/2000 [04:19<00:00,  7.70it/s]


Epoch 473 Mean Reward: -18.39831561279297


100%|██████████| 2000/2000 [04:29<00:00,  7.42it/s]


Epoch 474 Mean Reward: -19.857079010009766


100%|██████████| 2000/2000 [04:27<00:00,  7.48it/s]


Epoch 475 Mean Reward: -19.690598274230958


100%|██████████| 2000/2000 [04:33<00:00,  7.31it/s]


Epoch 476 Mean Reward: -20.17117123413086


100%|██████████| 2000/2000 [04:27<00:00,  7.47it/s]


Epoch 477 Mean Reward: -21.0938237991333


100%|██████████| 2000/2000 [04:43<00:00,  7.05it/s]


Epoch 478 Mean Reward: -16.32651732635498


100%|██████████| 2000/2000 [04:24<00:00,  7.56it/s]


Epoch 479 Mean Reward: -18.309625373840333


100%|██████████| 2000/2000 [04:24<00:00,  7.57it/s]


Epoch 480 Mean Reward: -21.67524546813965
Epoch 480 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 480 test with depth buffer:
Test Episode 1 Reward: -115.976318359375
Test Episode 2 Reward: -113.43977355957031
Test Episode 3 Reward: -113.43977355957031
Test Episode 4 Reward: -115.95480346679688
Test Episode 5 Reward: -113.43977355957031
Test Episode 6 Reward: -113.43977355957031
Test Episode 7 Reward: -113.43977355957031
Test Episode 8 Reward: -113.43977355957031
Test Episode 9 Reward: -115.99784851074219
Test Episode 10 Reward: -112.34967041015625
Average Test Reward (with depth buffer:) -114.09172821044922
Epoch 480 test without depth buffer:
Test Episode 1 Reward: -115.97911071777344
Test Episode 2 Reward: -115.97819519042969
Test Episode 3 Reward: -115.99856567382812
Test Episode 4 Reward: -115.97576904296875
Test Episode 5 Reward: -115.97616577148438
Test Episode 6 Reward: -115.99856567382812
Test Episode 7 Reward: -108.42454528808594
Test Episode 8 Reward: -115.98213195

100%|██████████| 2000/2000 [04:14<00:00,  7.85it/s]


Epoch 481 Mean Reward: -27.873634185791015


100%|██████████| 2000/2000 [04:34<00:00,  7.28it/s]


Epoch 482 Mean Reward: -31.961331916809083


100%|██████████| 2000/2000 [04:20<00:00,  7.68it/s]


Epoch 483 Mean Reward: -31.13677869415283


100%|██████████| 2000/2000 [04:41<00:00,  7.09it/s]


Epoch 484 Mean Reward: -25.686170516967774


100%|██████████| 2000/2000 [07:45<00:00,  4.30it/s]


Epoch 485 Mean Reward: -26.974840591430663


100%|██████████| 2000/2000 [10:24<00:00,  3.20it/s]


Epoch 486 Mean Reward: -27.488094406127928


100%|██████████| 2000/2000 [09:57<00:00,  3.34it/s]


Epoch 487 Mean Reward: -32.795298385620114


100%|██████████| 2000/2000 [10:02<00:00,  3.32it/s]


Epoch 488 Mean Reward: -25.513371459960936


100%|██████████| 2000/2000 [10:19<00:00,  3.23it/s]


Epoch 489 Mean Reward: -25.87875463104248


100%|██████████| 2000/2000 [10:20<00:00,  3.22it/s]


Epoch 490 Mean Reward: -30.44100170135498


100%|██████████| 2000/2000 [11:31<00:00,  2.89it/s]


Epoch 491 Mean Reward: -21.31141162109375


100%|██████████| 2000/2000 [06:02<00:00,  5.52it/s]


Epoch 492 Mean Reward: -23.187867729187012


100%|██████████| 2000/2000 [10:45<00:00,  3.10it/s]


Epoch 493 Mean Reward: -23.55420066833496


100%|██████████| 2000/2000 [11:55<00:00,  2.80it/s]


Epoch 494 Mean Reward: -15.573209487915038


100%|██████████| 2000/2000 [10:42<00:00,  3.11it/s]


Epoch 495 Mean Reward: -15.528522064208984


100%|██████████| 2000/2000 [12:07<00:00,  2.75it/s]


Epoch 496 Mean Reward: -15.891640716552734


100%|██████████| 2000/2000 [10:49<00:00,  3.08it/s]


Epoch 497 Mean Reward: -11.74090885925293


100%|██████████| 2000/2000 [09:49<00:00,  3.39it/s]


Epoch 498 Mean Reward: -13.175672821044921


100%|██████████| 2000/2000 [05:51<00:00,  5.70it/s]


Epoch 499 Mean Reward: -13.202534294128418


100%|██████████| 2000/2000 [05:21<00:00,  6.21it/s]


Epoch 500 Mean Reward: -20.067922904968263
Epoch 500 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 500 test with depth buffer:
Test Episode 1 Reward: -108.69290161132812
Test Episode 2 Reward: -115.99874877929688
Test Episode 3 Reward: -115.99874877929688
Test Episode 4 Reward: -115.99874877929688
Test Episode 5 Reward: -115.99874877929688
Test Episode 6 Reward: -62.259307861328125
Test Episode 7 Reward: -115.99874877929688
Test Episode 8 Reward: -115.99874877929688
Test Episode 9 Reward: -115.99574279785156
Test Episode 10 Reward: -115.99874877929688
Average Test Reward (with depth buffer:) -109.8939193725586
Epoch 500 test without depth buffer:
Test Episode 1 Reward: -115.9468994140625
Test Episode 2 Reward: -115.9468994140625
Test Episode 3 Reward: -40.38981628417969
Test Episode 4 Reward: -101.36973571777344
Test Episode 5 Reward: -115.9468994140625
Test Episode 6 Reward: -115.9468994140625
Test Episode 7 Reward: -115.9468994140625
Test Episode 8 Reward: -115.946899414062

100%|██████████| 2000/2000 [05:13<00:00,  6.37it/s]


Epoch 501 Mean Reward: -21.1052010345459


100%|██████████| 2000/2000 [04:53<00:00,  6.81it/s]


Epoch 502 Mean Reward: -25.92750121307373


100%|██████████| 2000/2000 [04:55<00:00,  6.76it/s]


Epoch 503 Mean Reward: -26.37253056335449


100%|██████████| 2000/2000 [05:28<00:00,  6.09it/s]


Epoch 504 Mean Reward: -17.795205375671387


100%|██████████| 2000/2000 [05:34<00:00,  5.99it/s]


Epoch 505 Mean Reward: -30.516867820739748


100%|██████████| 2000/2000 [05:54<00:00,  5.64it/s]


Epoch 506 Mean Reward: -26.1610167388916


100%|██████████| 2000/2000 [05:14<00:00,  6.36it/s]


Epoch 507 Mean Reward: -16.66488104248047


100%|██████████| 2000/2000 [04:52<00:00,  6.84it/s]


Epoch 508 Mean Reward: -20.049792175292968


100%|██████████| 2000/2000 [05:25<00:00,  6.14it/s]


Epoch 509 Mean Reward: -27.117468994140626


100%|██████████| 2000/2000 [05:15<00:00,  6.33it/s]


Epoch 510 Mean Reward: -24.678454681396484


100%|██████████| 2000/2000 [05:41<00:00,  5.85it/s]


Epoch 511 Mean Reward: -29.06347875213623


100%|██████████| 2000/2000 [05:37<00:00,  5.92it/s]


Epoch 512 Mean Reward: -23.57263973236084


100%|██████████| 2000/2000 [05:44<00:00,  5.80it/s]


Epoch 513 Mean Reward: -22.770856925964356


100%|██████████| 2000/2000 [05:32<00:00,  6.02it/s]


Epoch 514 Mean Reward: -22.813245513916016


100%|██████████| 2000/2000 [05:42<00:00,  5.85it/s]


Epoch 515 Mean Reward: -21.279313186645506


100%|██████████| 2000/2000 [05:41<00:00,  5.86it/s]


Epoch 516 Mean Reward: -9.294007583618164


100%|██████████| 2000/2000 [10:59<00:00,  3.03it/s]


Epoch 517 Mean Reward: -21.064762756347655


100%|██████████| 2000/2000 [05:43<00:00,  5.83it/s]


Epoch 518 Mean Reward: -22.914851440429686


100%|██████████| 2000/2000 [05:20<00:00,  6.23it/s]


Epoch 519 Mean Reward: -25.968849655151367


100%|██████████| 2000/2000 [05:32<00:00,  6.01it/s]


Epoch 520 Mean Reward: -17.747914749145508
Epoch 520 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 520 test with depth buffer:
Test Episode 1 Reward: -115.65211486816406
Test Episode 2 Reward: -115.97601318359375
Test Episode 3 Reward: -72.42137145996094
Test Episode 4 Reward: -36.54450988769531
Test Episode 5 Reward: -115.97601318359375
Test Episode 6 Reward: -115.23477172851562
Test Episode 7 Reward: -115.97601318359375
Test Episode 8 Reward: -115.89714050292969
Test Episode 9 Reward: 32.405731201171875
Test Episode 10 Reward: -115.97601318359375
Average Test Reward (with depth buffer:) -88.72482299804688
Epoch 520 test without depth buffer:
Test Episode 1 Reward: -111.61332702636719
Test Episode 2 Reward: -111.61332702636719
Test Episode 3 Reward: 0.58917236328125
Test Episode 4 Reward: -111.61332702636719
Test Episode 5 Reward: -111.61332702636719
Test Episode 6 Reward: -41.727935791015625
Test Episode 7 Reward: -111.61332702636719
Test Episode 8 Reward: -111.613327026367

100%|██████████| 2000/2000 [04:51<00:00,  6.87it/s]


Epoch 521 Mean Reward: -28.1321135559082


100%|██████████| 2000/2000 [04:51<00:00,  6.86it/s]


Epoch 522 Mean Reward: -28.499124786376953


100%|██████████| 2000/2000 [08:22<00:00,  3.98it/s]


Epoch 523 Mean Reward: -23.593517318725585


100%|██████████| 2000/2000 [09:29<00:00,  3.51it/s]


Epoch 524 Mean Reward: -34.58235327911377


100%|██████████| 2000/2000 [04:57<00:00,  6.72it/s]


Epoch 525 Mean Reward: -35.61415897369385


100%|██████████| 2000/2000 [05:01<00:00,  6.64it/s]


Epoch 526 Mean Reward: -22.797484573364258


100%|██████████| 2000/2000 [04:45<00:00,  7.00it/s]


Epoch 527 Mean Reward: -27.144828895568846


100%|██████████| 2000/2000 [04:48<00:00,  6.93it/s]


Epoch 528 Mean Reward: -32.65018836212158


100%|██████████| 2000/2000 [04:54<00:00,  6.79it/s]


Epoch 529 Mean Reward: -28.083258239746094


100%|██████████| 2000/2000 [05:03<00:00,  6.60it/s]


Epoch 530 Mean Reward: -28.7046224899292


100%|██████████| 2000/2000 [05:36<00:00,  5.95it/s]


Epoch 531 Mean Reward: -29.157278274536132


100%|██████████| 2000/2000 [05:39<00:00,  5.89it/s]


Epoch 532 Mean Reward: -26.104815406799318


100%|██████████| 2000/2000 [05:28<00:00,  6.10it/s]


Epoch 533 Mean Reward: -25.042320823669435


100%|██████████| 2000/2000 [05:22<00:00,  6.21it/s]


Epoch 534 Mean Reward: -24.590625450134276


100%|██████████| 2000/2000 [05:29<00:00,  6.06it/s]


Epoch 535 Mean Reward: -27.84092008972168


100%|██████████| 2000/2000 [05:14<00:00,  6.36it/s]


Epoch 536 Mean Reward: -30.953601867675783


100%|██████████| 2000/2000 [06:07<00:00,  5.44it/s]


Epoch 537 Mean Reward: -28.59558233642578


100%|██████████| 2000/2000 [05:39<00:00,  5.89it/s]


Epoch 538 Mean Reward: -18.420840812683107


100%|██████████| 2000/2000 [05:50<00:00,  5.71it/s]


Epoch 539 Mean Reward: -27.38341638946533


100%|██████████| 2000/2000 [05:16<00:00,  6.32it/s]


Epoch 540 Mean Reward: -27.311406433105468
Epoch 540 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 540 test with depth buffer:
Test Episode 1 Reward: -115.99861145019531
Test Episode 2 Reward: -115.99784851074219
Test Episode 3 Reward: -115.99716186523438
Test Episode 4 Reward: -114.69575500488281
Test Episode 5 Reward: -115.01969909667969
Test Episode 6 Reward: -115.99934387207031
Test Episode 7 Reward: -115.99934387207031
Test Episode 8 Reward: -115.99934387207031
Test Episode 9 Reward: -115.99934387207031
Test Episode 10 Reward: -115.99644470214844
Average Test Reward (with depth buffer:) -115.77028961181641
Epoch 540 test without depth buffer:
Test Episode 1 Reward: -115.99716186523438
Test Episode 2 Reward: -115.99716186523438
Test Episode 3 Reward: -115.91152954101562
Test Episode 4 Reward: -115.99716186523438
Test Episode 5 Reward: -115.99699401855469
Test Episode 6 Reward: -115.99931335449219
Test Episode 7 Reward: -115.9786376953125
Test Episode 8 Reward: -115.997161

100%|██████████| 2000/2000 [05:48<00:00,  5.74it/s]


Epoch 541 Mean Reward: -7.7143071746826175


100%|██████████| 2000/2000 [09:53<00:00,  3.37it/s]


Epoch 542 Mean Reward: -18.074112129211425


100%|██████████| 2000/2000 [09:45<00:00,  3.41it/s]


Epoch 543 Mean Reward: -12.653181671142578


100%|██████████| 2000/2000 [08:41<00:00,  3.84it/s]


Epoch 544 Mean Reward: -21.232218711853026


100%|██████████| 2000/2000 [12:40<00:00,  2.63it/s]


Epoch 545 Mean Reward: -21.267363868713378


100%|██████████| 2000/2000 [08:24<00:00,  3.97it/s]


Epoch 546 Mean Reward: -12.923787803649903


100%|██████████| 2000/2000 [05:38<00:00,  5.91it/s]


Epoch 547 Mean Reward: -19.85526290130615


100%|██████████| 2000/2000 [05:30<00:00,  6.04it/s]


Epoch 548 Mean Reward: -20.153049278259278


100%|██████████| 2000/2000 [05:28<00:00,  6.08it/s]


Epoch 549 Mean Reward: -17.444094596862794


100%|██████████| 2000/2000 [05:39<00:00,  5.89it/s]


Epoch 550 Mean Reward: -16.96741591644287


100%|██████████| 2000/2000 [05:08<00:00,  6.47it/s]


Epoch 551 Mean Reward: -25.58140195465088


100%|██████████| 2000/2000 [04:38<00:00,  7.18it/s]


Epoch 552 Mean Reward: -32.68248871612549


100%|██████████| 2000/2000 [04:50<00:00,  6.88it/s]


Epoch 553 Mean Reward: -37.825676940917965


100%|██████████| 2000/2000 [04:44<00:00,  7.03it/s]


Epoch 554 Mean Reward: -36.05006876373291


100%|██████████| 2000/2000 [04:33<00:00,  7.31it/s]


Epoch 555 Mean Reward: -31.759014083862304


100%|██████████| 2000/2000 [04:46<00:00,  6.97it/s]


Epoch 556 Mean Reward: -33.02496496582031


100%|██████████| 2000/2000 [04:48<00:00,  6.94it/s]


Epoch 557 Mean Reward: -36.45277153015137


100%|██████████| 2000/2000 [04:47<00:00,  6.95it/s]


Epoch 558 Mean Reward: -31.87604592895508


100%|██████████| 2000/2000 [04:47<00:00,  6.96it/s]


Epoch 559 Mean Reward: -37.022403244018555


100%|██████████| 2000/2000 [04:52<00:00,  6.83it/s]


Epoch 560 Mean Reward: -29.616390899658203
Epoch 560 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 560 test with depth buffer:
Test Episode 1 Reward: -107.37135314941406
Test Episode 2 Reward: 112.11744689941406
Test Episode 3 Reward: 112.11744689941406
Test Episode 4 Reward: 130.87709045410156
Test Episode 5 Reward: 250.6220245361328
Test Episode 6 Reward: -96.40419006347656
Test Episode 7 Reward: 112.11744689941406
Test Episode 8 Reward: -105.68655395507812
Test Episode 9 Reward: 19.5244140625
Test Episode 10 Reward: 33.05741882324219
Average Test Reward (with depth buffer:) 46.097119140625
Epoch 560 test without depth buffer:
Test Episode 1 Reward: -105.22201538085938
Test Episode 2 Reward: -41.36119079589844
Test Episode 3 Reward: -99.51533508300781
Test Episode 4 Reward: -105.22201538085938
Test Episode 5 Reward: -105.22201538085938
Test Episode 6 Reward: -105.22201538085938
Test Episode 7 Reward: -105.22201538085938
Test Episode 8 Reward: -58.794708251953125
Test Episod

100%|██████████| 2000/2000 [05:54<00:00,  5.64it/s]


Epoch 561 Mean Reward: -31.427516883850096


100%|██████████| 2000/2000 [06:24<00:00,  5.20it/s]


Epoch 562 Mean Reward: -37.22157648468018


100%|██████████| 2000/2000 [05:48<00:00,  5.73it/s]


Epoch 563 Mean Reward: -27.571703407287597


100%|██████████| 2000/2000 [05:47<00:00,  5.76it/s]


Epoch 564 Mean Reward: -23.851926597595217


100%|██████████| 2000/2000 [06:05<00:00,  5.47it/s]


Epoch 565 Mean Reward: -32.27171341705322


100%|██████████| 2000/2000 [05:32<00:00,  6.02it/s]


Epoch 566 Mean Reward: -25.87473346710205


100%|██████████| 2000/2000 [05:51<00:00,  5.68it/s]


Epoch 567 Mean Reward: -24.825918556213377


100%|██████████| 2000/2000 [05:52<00:00,  5.67it/s]


Epoch 568 Mean Reward: -31.57141284942627


100%|██████████| 2000/2000 [05:55<00:00,  5.62it/s]


Epoch 569 Mean Reward: -32.85192486572266


100%|██████████| 2000/2000 [06:22<00:00,  5.23it/s]


Epoch 570 Mean Reward: -24.39536393737793


100%|██████████| 2000/2000 [07:03<00:00,  4.73it/s]


Epoch 571 Mean Reward: -19.248069679260254


100%|██████████| 2000/2000 [07:07<00:00,  4.67it/s]


Epoch 572 Mean Reward: -10.619606300354004


100%|██████████| 2000/2000 [06:40<00:00,  4.99it/s]


Epoch 573 Mean Reward: -18.50229581451416


100%|██████████| 2000/2000 [07:00<00:00,  4.76it/s]


Epoch 574 Mean Reward: -4.969194480895996


100%|██████████| 2000/2000 [06:35<00:00,  5.06it/s]


Epoch 575 Mean Reward: -9.269871421813965


100%|██████████| 2000/2000 [06:44<00:00,  4.94it/s]


Epoch 576 Mean Reward: -22.093858268737794


100%|██████████| 2000/2000 [06:45<00:00,  4.94it/s]


Epoch 577 Mean Reward: -11.57086270904541


100%|██████████| 2000/2000 [06:43<00:00,  4.95it/s]


Epoch 578 Mean Reward: -25.903778030395507


100%|██████████| 2000/2000 [06:50<00:00,  4.87it/s]


Epoch 579 Mean Reward: -11.913596824645996


100%|██████████| 2000/2000 [06:35<00:00,  5.05it/s]


Epoch 580 Mean Reward: -12.76005728149414
Epoch 580 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 580 test with depth buffer:
Test Episode 1 Reward: -87.83280944824219
Test Episode 2 Reward: -87.83280944824219
Test Episode 3 Reward: -87.83280944824219
Test Episode 4 Reward: 21.973846435546875
Test Episode 5 Reward: -87.83280944824219
Test Episode 6 Reward: -115.7344970703125
Test Episode 7 Reward: 196.5331573486328
Test Episode 8 Reward: -87.83280944824219
Test Episode 9 Reward: -87.83280944824219
Test Episode 10 Reward: -115.99441528320312
Average Test Reward (with depth buffer:) -54.02187652587891
Epoch 580 test without depth buffer:
Test Episode 1 Reward: -114.70518493652344
Test Episode 2 Reward: -115.95518493652344
Test Episode 3 Reward: -50.374359130859375
Test Episode 4 Reward: -114.70518493652344
Test Episode 5 Reward: -115.99784851074219
Test Episode 6 Reward: -107.14659118652344
Test Episode 7 Reward: -114.70518493652344
Test Episode 8 Reward: -76.72348022460938
Tes

100%|██████████| 2000/2000 [06:58<00:00,  4.78it/s]


Epoch 581 Mean Reward: -21.901015563964844


100%|██████████| 2000/2000 [06:33<00:00,  5.08it/s]


Epoch 582 Mean Reward: -26.140925994873047


100%|██████████| 2000/2000 [06:44<00:00,  4.94it/s]


Epoch 583 Mean Reward: -15.2725708694458


100%|██████████| 2000/2000 [06:40<00:00,  4.99it/s]


Epoch 584 Mean Reward: -31.84737351989746


100%|██████████| 2000/2000 [06:10<00:00,  5.40it/s]


Epoch 585 Mean Reward: -28.439908531188966


100%|██████████| 2000/2000 [06:11<00:00,  5.38it/s]


Epoch 586 Mean Reward: -26.469877799987792


100%|██████████| 2000/2000 [06:33<00:00,  5.09it/s]


Epoch 587 Mean Reward: -23.744424194335938


100%|██████████| 2000/2000 [06:49<00:00,  4.88it/s]


Epoch 588 Mean Reward: -30.954875953674318


100%|██████████| 2000/2000 [06:21<00:00,  5.25it/s]


Epoch 589 Mean Reward: -19.74197612762451


100%|██████████| 2000/2000 [06:37<00:00,  5.03it/s]


Epoch 590 Mean Reward: -22.741231391906737


100%|██████████| 2000/2000 [07:00<00:00,  4.75it/s]


Epoch 591 Mean Reward: -22.624493507385253


100%|██████████| 2000/2000 [07:06<00:00,  4.68it/s]


Epoch 592 Mean Reward: -23.306741325378418


100%|██████████| 2000/2000 [07:02<00:00,  4.73it/s]


Epoch 593 Mean Reward: -31.374283576965333


100%|██████████| 2000/2000 [07:05<00:00,  4.70it/s]


Epoch 594 Mean Reward: -23.077870414733887


100%|██████████| 2000/2000 [07:09<00:00,  4.66it/s]


Epoch 595 Mean Reward: -25.508467514038085


100%|██████████| 2000/2000 [07:32<00:00,  4.42it/s]


Epoch 596 Mean Reward: -17.140939796447753


100%|██████████| 2000/2000 [07:07<00:00,  4.68it/s]


Epoch 597 Mean Reward: -26.348237548828124


100%|██████████| 2000/2000 [07:30<00:00,  4.43it/s]


Epoch 598 Mean Reward: -28.26548416900635


100%|██████████| 2000/2000 [07:28<00:00,  4.45it/s]


Epoch 599 Mean Reward: -17.510422302246095


100%|██████████| 2000/2000 [07:31<00:00,  4.43it/s]


Epoch 600 Mean Reward: -29.271932739257814
Epoch 600 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 600 test with depth buffer:
Test Episode 1 Reward: -115.96055603027344
Test Episode 2 Reward: -115.99794006347656
Test Episode 3 Reward: -63.83341979980469
Test Episode 4 Reward: -63.83341979980469
Test Episode 5 Reward: -115.99786376953125
Test Episode 6 Reward: -115.36758422851562
Test Episode 7 Reward: -115.99430847167969
Test Episode 8 Reward: -115.97885131835938
Test Episode 9 Reward: -115.99784851074219
Test Episode 10 Reward: -115.99787902832031
Average Test Reward (with depth buffer:) -105.49596710205078
Epoch 600 test without depth buffer:
Test Episode 1 Reward: -115.97599792480469
Test Episode 2 Reward: -111.90060424804688
Test Episode 3 Reward: -75.63905334472656
Test Episode 4 Reward: -99.62202453613281
Test Episode 5 Reward: -114.048095703125
Test Episode 6 Reward: -115.92611694335938
Test Episode 7 Reward: -115.99856567382812
Test Episode 8 Reward: -115.92611694335

100%|██████████| 2000/2000 [07:45<00:00,  4.29it/s]


Epoch 601 Mean Reward: -14.061149459838868


100%|██████████| 2000/2000 [07:16<00:00,  4.58it/s]


Epoch 602 Mean Reward: -34.22644275665283


100%|██████████| 2000/2000 [07:23<00:00,  4.51it/s]


Epoch 603 Mean Reward: -24.696617149353028


100%|██████████| 2000/2000 [07:05<00:00,  4.70it/s]


Epoch 604 Mean Reward: -30.185918006896973


100%|██████████| 2000/2000 [07:08<00:00,  4.67it/s]


Epoch 605 Mean Reward: -22.862389778137207


100%|██████████| 2000/2000 [07:19<00:00,  4.55it/s]


Epoch 606 Mean Reward: -25.198102615356444


100%|██████████| 2000/2000 [07:26<00:00,  4.48it/s]


Epoch 607 Mean Reward: -23.653157218933107


100%|██████████| 2000/2000 [07:49<00:00,  4.26it/s]


Epoch 608 Mean Reward: -27.69972886657715


100%|██████████| 2000/2000 [07:07<00:00,  4.68it/s]


Epoch 609 Mean Reward: -30.091037460327147


100%|██████████| 2000/2000 [07:36<00:00,  4.39it/s]


Epoch 610 Mean Reward: -36.11789861297608


100%|██████████| 2000/2000 [08:26<00:00,  3.95it/s]


Epoch 611 Mean Reward: -20.57448336791992


100%|██████████| 2000/2000 [08:49<00:00,  3.78it/s]


Epoch 612 Mean Reward: -27.002954963684083


100%|██████████| 2000/2000 [08:27<00:00,  3.94it/s]


Epoch 613 Mean Reward: -16.88640087890625


100%|██████████| 2000/2000 [08:48<00:00,  3.78it/s]


Epoch 614 Mean Reward: -21.226140586853028


100%|██████████| 2000/2000 [09:07<00:00,  3.65it/s]


Epoch 615 Mean Reward: -20.881309745788574


100%|██████████| 2000/2000 [08:31<00:00,  3.91it/s]


Epoch 616 Mean Reward: -19.19382907104492


100%|██████████| 2000/2000 [08:12<00:00,  4.06it/s]


Epoch 617 Mean Reward: -18.805766036987304


100%|██████████| 2000/2000 [08:17<00:00,  4.02it/s]


Epoch 618 Mean Reward: -24.758430198669434


100%|██████████| 2000/2000 [08:35<00:00,  3.88it/s]


Epoch 619 Mean Reward: -15.557927230834961


100%|██████████| 2000/2000 [08:13<00:00,  4.05it/s]


Epoch 620 Mean Reward: -21.747771141052247
Epoch 620 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 620 test with depth buffer:
Test Episode 1 Reward: -115.99092102050781
Test Episode 2 Reward: -115.99092102050781
Test Episode 3 Reward: -115.99092102050781
Test Episode 4 Reward: -115.99363708496094
Test Episode 5 Reward: -115.99858093261719
Test Episode 6 Reward: -115.94114685058594
Test Episode 7 Reward: -115.99092102050781
Test Episode 8 Reward: -115.97599792480469
Test Episode 9 Reward: -115.99092102050781
Test Episode 10 Reward: -115.91966247558594
Average Test Reward (with depth buffer:) -115.97836303710938
Epoch 620 test without depth buffer:
Test Episode 1 Reward: -115.97872924804688
Test Episode 2 Reward: -96.34049987792969
Test Episode 3 Reward: -115.97563171386719
Test Episode 4 Reward: -96.34049987792969
Test Episode 5 Reward: 111.14576721191406
Test Episode 6 Reward: -96.34049987792969
Test Episode 7 Reward: -96.34049987792969
Test Episode 8 Reward: -96.34049987792

100%|██████████| 2000/2000 [08:44<00:00,  3.81it/s]


Epoch 621 Mean Reward: -8.102358680725098


100%|██████████| 2000/2000 [08:31<00:00,  3.91it/s]


Epoch 622 Mean Reward: -7.948390625


100%|██████████| 2000/2000 [07:50<00:00,  4.25it/s]


Epoch 623 Mean Reward: -6.505032524108887


100%|██████████| 2000/2000 [08:10<00:00,  4.08it/s]


Epoch 624 Mean Reward: 2.591687973022461


100%|██████████| 2000/2000 [08:03<00:00,  4.13it/s]


Epoch 625 Mean Reward: -17.10397013092041


100%|██████████| 2000/2000 [07:57<00:00,  4.19it/s]


Epoch 626 Mean Reward: -10.671664710998535


100%|██████████| 2000/2000 [07:58<00:00,  4.18it/s]


Epoch 627 Mean Reward: -8.649547973632812


100%|██████████| 2000/2000 [08:05<00:00,  4.12it/s]


Epoch 628 Mean Reward: -4.907215103149414


100%|██████████| 2000/2000 [08:26<00:00,  3.95it/s]


Epoch 629 Mean Reward: -15.835559410095215


100%|██████████| 2000/2000 [08:29<00:00,  3.92it/s]


Epoch 630 Mean Reward: -11.239941329956055


100%|██████████| 2000/2000 [08:04<00:00,  4.13it/s]


Epoch 631 Mean Reward: -10.749901489257812


100%|██████████| 2000/2000 [08:26<00:00,  3.95it/s]


Epoch 632 Mean Reward: -3.3273730773925783


100%|██████████| 2000/2000 [08:23<00:00,  3.98it/s]


Epoch 633 Mean Reward: -11.135578750610351


100%|██████████| 2000/2000 [08:12<00:00,  4.06it/s]


Epoch 634 Mean Reward: -5.8196666717529295


100%|██████████| 2000/2000 [08:25<00:00,  3.96it/s]


Epoch 635 Mean Reward: -13.346652114868164


100%|██████████| 2000/2000 [08:24<00:00,  3.96it/s]


Epoch 636 Mean Reward: 3.529417091369629


100%|██████████| 2000/2000 [08:25<00:00,  3.96it/s]


Epoch 637 Mean Reward: -5.223247047424317


100%|██████████| 2000/2000 [08:24<00:00,  3.97it/s]


Epoch 638 Mean Reward: -7.661084228515625


100%|██████████| 2000/2000 [08:21<00:00,  3.99it/s]


Epoch 639 Mean Reward: -1.038700065612793


100%|██████████| 2000/2000 [08:30<00:00,  3.92it/s]


Epoch 640 Mean Reward: 0.9357165832519532
Epoch 640 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 640 test with depth buffer:
Test Episode 1 Reward: -115.99794006347656
Test Episode 2 Reward: -112.43791198730469
Test Episode 3 Reward: -115.99119567871094
Test Episode 4 Reward: -115.81806945800781
Test Episode 5 Reward: -115.99813842773438
Test Episode 6 Reward: -115.9979248046875
Test Episode 7 Reward: -112.43791198730469
Test Episode 8 Reward: 40.50836181640625
Test Episode 9 Reward: -115.93948364257812
Test Episode 10 Reward: -88.72373962402344
Average Test Reward (with depth buffer:) -96.88339538574219
Epoch 640 test without depth buffer:
Test Episode 1 Reward: -52.2601318359375
Test Episode 2 Reward: -115.59890747070312
Test Episode 3 Reward: -115.59890747070312
Test Episode 4 Reward: -115.59890747070312
Test Episode 5 Reward: -115.59890747070312
Test Episode 6 Reward: -115.59890747070312
Test Episode 7 Reward: -52.51817321777344
Test Episode 8 Reward: -115.99928283691406

100%|██████████| 2000/2000 [08:46<00:00,  3.80it/s]


Epoch 641 Mean Reward: -16.51516005706787


100%|██████████| 2000/2000 [09:10<00:00,  3.63it/s]


Epoch 642 Mean Reward: -6.655869102478027


100%|██████████| 2000/2000 [09:30<00:00,  3.51it/s]


Epoch 643 Mean Reward: -12.295326698303223


100%|██████████| 2000/2000 [09:23<00:00,  3.55it/s]


Epoch 644 Mean Reward: -18.653095878601075


100%|██████████| 2000/2000 [08:57<00:00,  3.72it/s]


Epoch 645 Mean Reward: -10.757683784484863


100%|██████████| 2000/2000 [09:15<00:00,  3.60it/s]


Epoch 646 Mean Reward: -21.987171752929687


100%|██████████| 2000/2000 [09:03<00:00,  3.68it/s]


Epoch 647 Mean Reward: -8.049201156616212


100%|██████████| 2000/2000 [09:53<00:00,  3.37it/s]


Epoch 648 Mean Reward: -27.404122650146483


100%|██████████| 2000/2000 [09:13<00:00,  3.61it/s]


Epoch 649 Mean Reward: -14.293848579406738


100%|██████████| 2000/2000 [09:22<00:00,  3.56it/s]


Epoch 650 Mean Reward: -8.949256401062012


100%|██████████| 2000/2000 [09:30<00:00,  3.50it/s]


Epoch 651 Mean Reward: -7.668739974975586


100%|██████████| 2000/2000 [10:03<00:00,  3.32it/s]


Epoch 652 Mean Reward: -14.26384375


100%|██████████| 2000/2000 [09:38<00:00,  3.46it/s]


Epoch 653 Mean Reward: -12.766297668457032


100%|██████████| 2000/2000 [09:26<00:00,  3.53it/s]


Epoch 654 Mean Reward: -5.51850749206543


100%|██████████| 2000/2000 [09:34<00:00,  3.48it/s]


Epoch 655 Mean Reward: -13.636462203979493


100%|██████████| 2000/2000 [10:05<00:00,  3.30it/s]


Epoch 656 Mean Reward: -13.362224807739258


100%|██████████| 2000/2000 [09:48<00:00,  3.40it/s]


Epoch 657 Mean Reward: -7.800055435180664


100%|██████████| 2000/2000 [10:04<00:00,  3.31it/s]


Epoch 658 Mean Reward: -3.087134864807129


100%|██████████| 2000/2000 [10:33<00:00,  3.16it/s]


Epoch 659 Mean Reward: -8.902830146789551


100%|██████████| 2000/2000 [09:48<00:00,  3.40it/s]


Epoch 660 Mean Reward: -12.825188941955567
Epoch 660 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 660 test with depth buffer:
Test Episode 1 Reward: -115.99813842773438
Test Episode 2 Reward: -115.99813842773438
Test Episode 3 Reward: -115.99934387207031
Test Episode 4 Reward: -115.93583679199219
Test Episode 5 Reward: -115.99864196777344
Test Episode 6 Reward: -115.83575439453125
Test Episode 7 Reward: -115.99862670898438
Test Episode 8 Reward: -115.99795532226562
Test Episode 9 Reward: -115.98898315429688
Test Episode 10 Reward: -115.99813842773438
Average Test Reward (with depth buffer:) -115.97495574951172
Epoch 660 test without depth buffer:
Test Episode 1 Reward: -115.98005676269531
Test Episode 2 Reward: -115.98530578613281
Test Episode 3 Reward: -115.99824523925781
Test Episode 4 Reward: -115.98094177246094
Test Episode 5 Reward: -68.5010986328125
Test Episode 6 Reward: -115.99040222167969
Test Episode 7 Reward: -106.76052856445312
Test Episode 8 Reward: -115.9957122

100%|██████████| 2000/2000 [09:16<00:00,  3.59it/s]


Epoch 661 Mean Reward: -6.970952560424805


100%|██████████| 2000/2000 [09:48<00:00,  3.40it/s]


Epoch 662 Mean Reward: -25.489435424804686


100%|██████████| 2000/2000 [09:00<00:00,  3.70it/s]


Epoch 663 Mean Reward: -17.98829779815674


100%|██████████| 2000/2000 [08:47<00:00,  3.79it/s]


Epoch 664 Mean Reward: -18.577844268798827


100%|██████████| 2000/2000 [09:20<00:00,  3.57it/s]


Epoch 665 Mean Reward: -20.641888633728026


100%|██████████| 2000/2000 [08:53<00:00,  3.75it/s]


Epoch 666 Mean Reward: -28.6017395401001


100%|██████████| 2000/2000 [09:36<00:00,  3.47it/s]


Epoch 667 Mean Reward: -31.255288787841796


100%|██████████| 2000/2000 [09:37<00:00,  3.46it/s]


Epoch 668 Mean Reward: -13.107696166992188


100%|██████████| 2000/2000 [10:06<00:00,  3.30it/s]


Epoch 669 Mean Reward: -22.179699775695802


100%|██████████| 2000/2000 [09:20<00:00,  3.57it/s]


Epoch 670 Mean Reward: -14.463779586791992


100%|██████████| 2000/2000 [10:12<00:00,  3.27it/s]


Epoch 671 Mean Reward: -33.50402543640137


100%|██████████| 2000/2000 [10:01<00:00,  3.32it/s]


Epoch 672 Mean Reward: -25.554723770141603


100%|██████████| 2000/2000 [10:58<00:00,  3.04it/s]


Epoch 673 Mean Reward: -54.89013272857666


100%|██████████| 2000/2000 [10:42<00:00,  3.11it/s]


Epoch 674 Mean Reward: -36.06250629425049


100%|██████████| 2000/2000 [18:48<00:00,  1.77it/s]


Epoch 675 Mean Reward: -39.12291702270508


100%|██████████| 2000/2000 [25:44<00:00,  1.29it/s]


Epoch 676 Mean Reward: -32.78951109313965


100%|██████████| 2000/2000 [24:36<00:00,  1.35it/s]


Epoch 677 Mean Reward: -44.52436025238037


100%|██████████| 2000/2000 [22:10<00:00,  1.50it/s]


Epoch 678 Mean Reward: -36.567862365722654


100%|██████████| 2000/2000 [22:08<00:00,  1.51it/s]


Epoch 679 Mean Reward: -48.147163818359374


100%|██████████| 2000/2000 [23:17<00:00,  1.43it/s]


Epoch 680 Mean Reward: -57.28933960723877
Epoch 680 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 680 test with depth buffer:
Test Episode 1 Reward: -115.99714660644531
Test Episode 2 Reward: -115.99714660644531
Test Episode 3 Reward: -115.99714660644531
Test Episode 4 Reward: -115.99714660644531
Test Episode 5 Reward: -115.99714660644531
Test Episode 6 Reward: -115.9781494140625
Test Episode 7 Reward: -115.99714660644531
Test Episode 8 Reward: -115.28787231445312
Test Episode 9 Reward: -115.99714660644531
Test Episode 10 Reward: -115.97709655761719
Average Test Reward (with depth buffer:) -115.922314453125
Epoch 680 test without depth buffer:
Test Episode 1 Reward: -113.36573791503906
Test Episode 2 Reward: -113.36573791503906
Test Episode 3 Reward: -106.64779663085938
Test Episode 4 Reward: -115.89132690429688
Test Episode 5 Reward: -115.99644470214844
Test Episode 6 Reward: -110.55894470214844
Test Episode 7 Reward: -88.99050903320312
Test Episode 8 Reward: -115.9903869628

100%|██████████| 2000/2000 [11:03<00:00,  3.01it/s]


Epoch 681 Mean Reward: -33.682544090270994


100%|██████████| 2000/2000 [11:04<00:00,  3.01it/s]


Epoch 682 Mean Reward: -28.21117153930664


100%|██████████| 2000/2000 [11:01<00:00,  3.02it/s]


Epoch 683 Mean Reward: -24.670774543762207


100%|██████████| 2000/2000 [13:04<00:00,  2.55it/s]


Epoch 684 Mean Reward: -26.261703926086426


100%|██████████| 2000/2000 [12:56<00:00,  2.58it/s]


Epoch 685 Mean Reward: -39.06342402648926


100%|██████████| 2000/2000 [11:44<00:00,  2.84it/s]


Epoch 686 Mean Reward: -53.20532769012451


100%|██████████| 2000/2000 [10:36<00:00,  3.14it/s]


Epoch 687 Mean Reward: -51.08886731719971


100%|██████████| 2000/2000 [10:07<00:00,  3.29it/s]


Epoch 688 Mean Reward: -30.363683372497558


100%|██████████| 2000/2000 [10:54<00:00,  3.05it/s]


Epoch 689 Mean Reward: -42.451775650024416


100%|██████████| 2000/2000 [10:46<00:00,  3.10it/s]


Epoch 690 Mean Reward: -40.997785934448245


100%|██████████| 2000/2000 [11:02<00:00,  3.02it/s]


Epoch 691 Mean Reward: -34.91390914154053


100%|██████████| 2000/2000 [14:01<00:00,  2.38it/s]


Epoch 692 Mean Reward: -34.00491593170166


100%|██████████| 2000/2000 [12:56<00:00,  2.58it/s]


Epoch 693 Mean Reward: -51.891307342529295


100%|██████████| 2000/2000 [10:54<00:00,  3.05it/s]


Epoch 694 Mean Reward: -48.270130645751955


100%|██████████| 2000/2000 [11:27<00:00,  2.91it/s]


Epoch 695 Mean Reward: -50.11734259033203


100%|██████████| 2000/2000 [13:58<00:00,  2.39it/s]


Epoch 696 Mean Reward: -41.349265907287595


100%|██████████| 2000/2000 [10:39<00:00,  3.13it/s]


Epoch 697 Mean Reward: -43.901524795532225


100%|██████████| 2000/2000 [10:41<00:00,  3.12it/s]


Epoch 698 Mean Reward: -47.854216255187985


100%|██████████| 2000/2000 [10:15<00:00,  3.25it/s]


Epoch 699 Mean Reward: -35.796328346252444


100%|██████████| 2000/2000 [20:22<00:00,  1.64it/s]


Epoch 700 Mean Reward: -35.626420135498044
Epoch 700 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 700 test with depth buffer:
Test Episode 1 Reward: -115.99163818359375
Test Episode 2 Reward: -115.99163818359375
Test Episode 3 Reward: -115.99163818359375
Test Episode 4 Reward: -115.97836303710938
Test Episode 5 Reward: -115.98023986816406
Test Episode 6 Reward: -115.99163818359375
Test Episode 7 Reward: -115.99163818359375
Test Episode 8 Reward: -115.99859619140625
Test Episode 9 Reward: -115.99163818359375
Test Episode 10 Reward: -115.9769287109375
Average Test Reward (with depth buffer:) -115.98839569091797
Epoch 700 test without depth buffer:
Test Episode 1 Reward: -115.98295593261719
Test Episode 2 Reward: -111.61497497558594
Test Episode 3 Reward: -115.98898315429688
Test Episode 4 Reward: -115.98295593261719
Test Episode 5 Reward: -115.02093505859375
Test Episode 6 Reward: -115.98295593261719
Test Episode 7 Reward: -115.97853088378906
Test Episode 8 Reward: -81.7741241

100%|██████████| 2000/2000 [13:21<00:00,  2.50it/s]


Epoch 701 Mean Reward: -44.643205703735354


100%|██████████| 2000/2000 [09:51<00:00,  3.38it/s]


Epoch 702 Mean Reward: -19.363418251037597


100%|██████████| 2000/2000 [12:59<00:00,  2.57it/s]


Epoch 703 Mean Reward: -34.34923129272461


100%|██████████| 2000/2000 [10:16<00:00,  3.24it/s]


Epoch 704 Mean Reward: -30.765112335205078


100%|██████████| 2000/2000 [10:38<00:00,  3.13it/s]


Epoch 705 Mean Reward: -34.775920555114745


100%|██████████| 2000/2000 [09:26<00:00,  3.53it/s]


Epoch 706 Mean Reward: -23.10088591003418


100%|██████████| 2000/2000 [10:33<00:00,  3.16it/s]


Epoch 707 Mean Reward: -24.322781188964843


100%|██████████| 2000/2000 [10:51<00:00,  3.07it/s]


Epoch 708 Mean Reward: -34.13313599395752


100%|██████████| 2000/2000 [10:30<00:00,  3.17it/s]


Epoch 709 Mean Reward: -11.995031272888184


100%|██████████| 2000/2000 [10:42<00:00,  3.11it/s]


Epoch 710 Mean Reward: -31.68059558105469


100%|██████████| 2000/2000 [11:52<00:00,  2.81it/s]


Epoch 711 Mean Reward: -42.54398519897461


100%|██████████| 2000/2000 [12:07<00:00,  2.75it/s]


Epoch 712 Mean Reward: -48.23879759216309


100%|██████████| 2000/2000 [11:47<00:00,  2.83it/s]


Epoch 713 Mean Reward: -42.63678240203858


100%|██████████| 2000/2000 [11:56<00:00,  2.79it/s]


Epoch 714 Mean Reward: -38.64177394866943


100%|██████████| 2000/2000 [11:18<00:00,  2.95it/s]


Epoch 715 Mean Reward: -43.02235707855225


100%|██████████| 2000/2000 [11:29<00:00,  2.90it/s]


Epoch 716 Mean Reward: -42.44579554748535


100%|██████████| 2000/2000 [11:30<00:00,  2.90it/s]


Epoch 717 Mean Reward: -23.7350178527832


100%|██████████| 2000/2000 [11:22<00:00,  2.93it/s]


Epoch 718 Mean Reward: -53.56666312408447


100%|██████████| 2000/2000 [11:40<00:00,  2.86it/s]


Epoch 719 Mean Reward: -34.46010736083984


100%|██████████| 2000/2000 [11:35<00:00,  2.87it/s]


Epoch 720 Mean Reward: -38.03135709381103
Epoch 720 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 720 test with depth buffer:
Test Episode 1 Reward: -115.99166870117188
Test Episode 2 Reward: -113.41349792480469
Test Episode 3 Reward: -113.41349792480469
Test Episode 4 Reward: -115.99935913085938
Test Episode 5 Reward: -115.56820678710938
Test Episode 6 Reward: -115.99858093261719
Test Episode 7 Reward: -113.41349792480469
Test Episode 8 Reward: -115.82647705078125
Test Episode 9 Reward: -113.41349792480469
Test Episode 10 Reward: -115.9993896484375
Average Test Reward (with depth buffer:) -114.90376739501953
Epoch 720 test without depth buffer:
Test Episode 1 Reward: -115.97801208496094
Test Episode 2 Reward: -115.9796142578125
Test Episode 3 Reward: -115.97569274902344
Test Episode 4 Reward: -115.99501037597656
Test Episode 5 Reward: -115.99259948730469
Test Episode 6 Reward: -115.9796142578125
Test Episode 7 Reward: -115.9796142578125
Test Episode 8 Reward: -114.9468688964

100%|██████████| 2000/2000 [13:31<00:00,  2.47it/s]


Epoch 721 Mean Reward: -48.10835633087158


100%|██████████| 2000/2000 [13:30<00:00,  2.47it/s]


Epoch 722 Mean Reward: -57.42065301513672


100%|██████████| 2000/2000 [12:50<00:00,  2.60it/s]


Epoch 723 Mean Reward: -30.442440017700196


100%|██████████| 2000/2000 [13:21<00:00,  2.49it/s]


Epoch 724 Mean Reward: -45.978985816955564


100%|██████████| 2000/2000 [13:02<00:00,  2.56it/s]


Epoch 725 Mean Reward: -43.34428973388672


100%|██████████| 2000/2000 [13:12<00:00,  2.52it/s]


Epoch 726 Mean Reward: -44.851543090820314


100%|██████████| 2000/2000 [13:24<00:00,  2.49it/s]


Epoch 727 Mean Reward: -50.48967370605469


100%|██████████| 2000/2000 [14:04<00:00,  2.37it/s]


Epoch 728 Mean Reward: -45.31905266571045


100%|██████████| 2000/2000 [14:09<00:00,  2.35it/s]


Epoch 729 Mean Reward: -51.483812034606935


100%|██████████| 2000/2000 [13:24<00:00,  2.48it/s]


Epoch 730 Mean Reward: -52.4908616104126


100%|██████████| 2000/2000 [12:47<00:00,  2.61it/s]


Epoch 731 Mean Reward: -35.800778007507326


100%|██████████| 2000/2000 [13:20<00:00,  2.50it/s]


Epoch 732 Mean Reward: -48.68484605407715


100%|██████████| 2000/2000 [11:52<00:00,  2.81it/s]


Epoch 733 Mean Reward: -35.93817172241211


100%|██████████| 2000/2000 [12:25<00:00,  2.68it/s]


Epoch 734 Mean Reward: -32.19082038116455


100%|██████████| 2000/2000 [12:28<00:00,  2.67it/s]


Epoch 735 Mean Reward: -21.54393748474121


100%|██████████| 2000/2000 [12:15<00:00,  2.72it/s]


Epoch 736 Mean Reward: -42.09173331451416


100%|██████████| 2000/2000 [12:15<00:00,  2.72it/s]


Epoch 737 Mean Reward: -29.64660915374756


100%|██████████| 2000/2000 [12:21<00:00,  2.70it/s]


Epoch 738 Mean Reward: -29.556450439453126


100%|██████████| 2000/2000 [11:43<00:00,  2.84it/s]


Epoch 739 Mean Reward: -3.4641528015136718


100%|██████████| 2000/2000 [11:42<00:00,  2.85it/s]


Epoch 740 Mean Reward: -3.3135249557495117
Epoch 740 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 740 test with depth buffer:
Test Episode 1 Reward: -96.89295959472656
Test Episode 2 Reward: -96.89295959472656
Test Episode 3 Reward: -79.08297729492188
Test Episode 4 Reward: -22.3876953125
Test Episode 5 Reward: -104.22125244140625
Test Episode 6 Reward: -96.89295959472656
Test Episode 7 Reward: -20.21307373046875
Test Episode 8 Reward: -96.89295959472656
Test Episode 9 Reward: -96.89295959472656
Test Episode 10 Reward: -96.89295959472656
Average Test Reward (with depth buffer:) -80.72627563476563
Epoch 740 test without depth buffer:
Test Episode 1 Reward: -115.97564697265625
Test Episode 2 Reward: -114.72431945800781
Test Episode 3 Reward: -115.97817993164062
Test Episode 4 Reward: -115.97817993164062
Test Episode 5 Reward: -115.99925231933594
Test Episode 6 Reward: -115.97817993164062
Test Episode 7 Reward: -115.97817993164062
Test Episode 8 Reward: -115.97817993164062
Test

100%|██████████| 2000/2000 [11:42<00:00,  2.85it/s]


Epoch 741 Mean Reward: -20.392927719116212


100%|██████████| 2000/2000 [17:55<00:00,  1.86it/s]


Epoch 742 Mean Reward: -37.87980696105957


100%|██████████| 2000/2000 [19:17<00:00,  1.73it/s]


Epoch 743 Mean Reward: -30.29063307952881


100%|██████████| 2000/2000 [10:50<00:00,  3.08it/s]


Epoch 744 Mean Reward: -12.797115928649902


100%|██████████| 2000/2000 [11:02<00:00,  3.02it/s]


Epoch 745 Mean Reward: -17.385441040039062


100%|██████████| 2000/2000 [11:02<00:00,  3.02it/s]


Epoch 746 Mean Reward: -1.0256227340698243


100%|██████████| 2000/2000 [10:18<00:00,  3.23it/s]


Epoch 747 Mean Reward: -16.431918991088867


100%|██████████| 2000/2000 [10:38<00:00,  3.13it/s]


Epoch 748 Mean Reward: -16.738638938903808


100%|██████████| 2000/2000 [10:50<00:00,  3.08it/s]


Epoch 749 Mean Reward: -19.565614067077636


100%|██████████| 2000/2000 [10:31<00:00,  3.17it/s]


Epoch 750 Mean Reward: -38.201765632629396


100%|██████████| 2000/2000 [12:14<00:00,  2.72it/s]


Epoch 751 Mean Reward: -57.83257931518555


100%|██████████| 2000/2000 [11:41<00:00,  2.85it/s]


Epoch 752 Mean Reward: -56.15983968353272


100%|██████████| 2000/2000 [10:50<00:00,  3.07it/s]


Epoch 753 Mean Reward: -32.71646997833252


100%|██████████| 2000/2000 [11:13<00:00,  2.97it/s]


Epoch 754 Mean Reward: -45.59731706237793


100%|██████████| 2000/2000 [11:07<00:00,  3.00it/s]


Epoch 755 Mean Reward: -43.30981259918213


100%|██████████| 2000/2000 [11:10<00:00,  2.98it/s]


Epoch 756 Mean Reward: -49.93185069274902


100%|██████████| 2000/2000 [10:57<00:00,  3.04it/s]


Epoch 757 Mean Reward: -51.34954048919678


100%|██████████| 2000/2000 [11:54<00:00,  2.80it/s]


Epoch 758 Mean Reward: -40.83267673492432


100%|██████████| 2000/2000 [12:16<00:00,  2.72it/s]


Epoch 759 Mean Reward: -29.967863876342772


100%|██████████| 2000/2000 [11:24<00:00,  2.92it/s]


Epoch 760 Mean Reward: -62.38746867370605
Epoch 760 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 760 test with depth buffer:
Test Episode 1 Reward: -103.20834350585938
Test Episode 2 Reward: -115.99858093261719
Test Episode 3 Reward: -115.99858093261719
Test Episode 4 Reward: -115.99858093261719
Test Episode 5 Reward: -115.99858093261719
Test Episode 6 Reward: -115.99858093261719
Test Episode 7 Reward: -115.99858093261719
Test Episode 8 Reward: -115.83531188964844
Test Episode 9 Reward: -115.99858093261719
Test Episode 10 Reward: -115.93740844726562
Average Test Reward (with depth buffer:) -114.69711303710938
Epoch 760 test without depth buffer:
Test Episode 1 Reward: -115.9957275390625
Test Episode 2 Reward: -101.34579467773438
Test Episode 3 Reward: -101.34579467773438
Test Episode 4 Reward: -110.20396423339844
Test Episode 5 Reward: -115.88352966308594
Test Episode 6 Reward: -101.34579467773438
Test Episode 7 Reward: -115.80021667480469
Test Episode 8 Reward: -114.4032135

100%|██████████| 2000/2000 [11:02<00:00,  3.02it/s]


Epoch 761 Mean Reward: -50.11657884216309


100%|██████████| 2000/2000 [10:56<00:00,  3.05it/s]


Epoch 762 Mean Reward: -30.97782154083252


100%|██████████| 2000/2000 [10:07<00:00,  3.29it/s]


Epoch 763 Mean Reward: -54.55990482330322


100%|██████████| 2000/2000 [10:13<00:00,  3.26it/s]


Epoch 764 Mean Reward: -58.960647087097165


100%|██████████| 2000/2000 [10:09<00:00,  3.28it/s]


Epoch 765 Mean Reward: -55.71756838989258


100%|██████████| 2000/2000 [09:37<00:00,  3.47it/s]


Epoch 766 Mean Reward: -60.165120780944825


100%|██████████| 2000/2000 [09:54<00:00,  3.36it/s]


Epoch 767 Mean Reward: -49.41376145172119


100%|██████████| 2000/2000 [09:33<00:00,  3.49it/s]


Epoch 768 Mean Reward: -60.97206776428223


100%|██████████| 2000/2000 [10:00<00:00,  3.33it/s]


Epoch 769 Mean Reward: -71.91215118408203


100%|██████████| 2000/2000 [09:53<00:00,  3.37it/s]


Epoch 770 Mean Reward: -71.81192533111572


100%|██████████| 2000/2000 [10:51<00:00,  3.07it/s]


Epoch 771 Mean Reward: -67.70002166748047


100%|██████████| 2000/2000 [10:43<00:00,  3.11it/s]


Epoch 772 Mean Reward: -63.992627571105956


100%|██████████| 2000/2000 [11:12<00:00,  2.97it/s]


Epoch 773 Mean Reward: -59.56561919403076


100%|██████████| 2000/2000 [11:08<00:00,  2.99it/s]


Epoch 774 Mean Reward: -68.10856203460693


100%|██████████| 2000/2000 [11:09<00:00,  2.99it/s]


Epoch 775 Mean Reward: -65.7479390335083


100%|██████████| 2000/2000 [10:24<00:00,  3.20it/s]


Epoch 776 Mean Reward: -59.26793618774414


100%|██████████| 2000/2000 [11:09<00:00,  2.99it/s]


Epoch 777 Mean Reward: -68.78692948150635


100%|██████████| 2000/2000 [10:56<00:00,  3.05it/s]


Epoch 778 Mean Reward: -59.775965629577634


100%|██████████| 2000/2000 [10:47<00:00,  3.09it/s]


Epoch 779 Mean Reward: -70.58516298675536


100%|██████████| 2000/2000 [10:39<00:00,  3.13it/s]


Epoch 780 Mean Reward: -66.22523876953124
Epoch 780 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 780 test with depth buffer:
Test Episode 1 Reward: -115.99610900878906
Test Episode 2 Reward: -115.99026489257812
Test Episode 3 Reward: -115.99432373046875
Test Episode 4 Reward: -115.99610900878906
Test Episode 5 Reward: -115.99519348144531
Test Episode 6 Reward: -115.99610900878906
Test Episode 7 Reward: -115.99610900878906
Test Episode 8 Reward: -115.99610900878906
Test Episode 9 Reward: -115.99610900878906
Test Episode 10 Reward: -115.99610900878906
Average Test Reward (with depth buffer:) -115.99525451660156
Epoch 780 test without depth buffer:
Test Episode 1 Reward: -115.98890686035156
Test Episode 2 Reward: -115.98368835449219
Test Episode 3 Reward: -115.98368835449219
Test Episode 4 Reward: -115.98960876464844
Test Episode 5 Reward: -115.98368835449219
Test Episode 6 Reward: -88.09933471679688
Test Episode 7 Reward: -115.98931884765625
Test Episode 8 Reward: -115.9836883

100%|██████████| 2000/2000 [11:51<00:00,  2.81it/s]


Epoch 781 Mean Reward: -53.78012596130371


100%|██████████| 2000/2000 [08:27<00:00,  3.94it/s]


Epoch 782 Mean Reward: -64.10131370544434


100%|██████████| 2000/2000 [07:56<00:00,  4.20it/s]


Epoch 783 Mean Reward: -67.74681202697754


100%|██████████| 2000/2000 [08:36<00:00,  3.87it/s]


Epoch 784 Mean Reward: -68.63411178588868


100%|██████████| 2000/2000 [08:15<00:00,  4.03it/s]


Epoch 785 Mean Reward: -74.55933180999756


100%|██████████| 2000/2000 [09:41<00:00,  3.44it/s]


Epoch 786 Mean Reward: -63.5928794631958


100%|██████████| 2000/2000 [08:41<00:00,  3.84it/s]


Epoch 787 Mean Reward: -71.28302139282226


100%|██████████| 2000/2000 [08:06<00:00,  4.11it/s]


Epoch 788 Mean Reward: -74.2202608718872


100%|██████████| 2000/2000 [08:19<00:00,  4.00it/s]


Epoch 789 Mean Reward: -76.64887561035157


100%|██████████| 2000/2000 [11:16<00:00,  2.96it/s]


Epoch 790 Mean Reward: -76.11675702667236


100%|██████████| 2000/2000 [20:38<00:00,  1.61it/s]


Epoch 791 Mean Reward: -53.846071144104


100%|██████████| 2000/2000 [13:49<00:00,  2.41it/s]


Epoch 792 Mean Reward: -56.16392221832275


100%|██████████| 2000/2000 [13:30<00:00,  2.47it/s]


Epoch 793 Mean Reward: -51.33667993927002


100%|██████████| 2000/2000 [12:41<00:00,  2.63it/s]


Epoch 794 Mean Reward: -49.566622360229495


100%|██████████| 2000/2000 [24:00<00:00,  1.39it/s]


Epoch 795 Mean Reward: -44.98227840423584


100%|██████████| 2000/2000 [26:16<00:00,  1.27it/s]


Epoch 796 Mean Reward: -46.08407411956787


100%|██████████| 2000/2000 [22:18<00:00,  1.49it/s]


Epoch 797 Mean Reward: -54.69232089996338


100%|██████████| 2000/2000 [15:56<00:00,  2.09it/s]


Epoch 798 Mean Reward: -36.44768576812744


100%|██████████| 2000/2000 [12:14<00:00,  2.72it/s]


Epoch 799 Mean Reward: -57.536512367248534


100%|██████████| 2000/2000 [12:05<00:00,  2.76it/s]


Epoch 800 Mean Reward: -54.70565338134766
Epoch 800 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 800 test with depth buffer:
Test Episode 1 Reward: -115.99856567382812
Test Episode 2 Reward: -110.75105285644531
Test Episode 3 Reward: -110.17874145507812
Test Episode 4 Reward: -109.16941833496094
Test Episode 5 Reward: -70.90591430664062
Test Episode 6 Reward: -115.97561645507812
Test Episode 7 Reward: -109.16941833496094
Test Episode 8 Reward: -109.16941833496094
Test Episode 9 Reward: -115.09323120117188
Test Episode 10 Reward: -109.16941833496094
Average Test Reward (with depth buffer:) -107.55807952880859
Epoch 800 test without depth buffer:
Test Episode 1 Reward: -101.87973022460938
Test Episode 2 Reward: -89.53477478027344
Test Episode 3 Reward: -89.53477478027344
Test Episode 4 Reward: -89.53477478027344
Test Episode 5 Reward: -103.35087585449219
Test Episode 6 Reward: -115.99821472167969
Test Episode 7 Reward: -105.72991943359375
Test Episode 8 Reward: -115.99609375
T

In [6]:
#Get a list of checkpoints saved during training

ckpts = tf.train.get_checkpoint_state('checkpoints').all_model_checkpoint_paths

#Test the trained model from a certain checkpoint by only choosing actions with a greedy strategy

for i in range(len(ckpts)):
    test_reward = test_agent(DQN, num_episodes=20,
                             training=False,
                             load_model=True,
                             depth=True,
                             model_dir=ckpts[i])
    print('Average Test Reward (with depth buffer):', test_reward)

    test_reward = test_agent(DQN, num_episodes=20,
                             training=False,
                             load_model=True,
                             depth=False,
                             model_dir=ckpts[i])
    print('Average Test Reward (without depth buffer):', test_reward)


Loading model from checkpoints\deadly_corridor.ckpt-420
INFO:tensorflow:Restoring parameters from checkpoints\deadly_corridor.ckpt-420
Test Episode 1 Reward: -115.99806213378906
Test Episode 2 Reward: -115.99806213378906
Test Episode 3 Reward: -115.99806213378906
Test Episode 4 Reward: -115.99806213378906
Test Episode 5 Reward: -115.99713134765625
Test Episode 6 Reward: 14.74700927734375
Test Episode 7 Reward: -102.05650329589844
Test Episode 8 Reward: -115.99806213378906
Test Episode 9 Reward: -115.97842407226562
Test Episode 10 Reward: -115.99806213378906
Test Episode 11 Reward: -115.99806213378906
Test Episode 12 Reward: -115.99806213378906
Test Episode 13 Reward: -115.99571228027344
Test Episode 14 Reward: -115.99806213378906
Test Episode 15 Reward: -115.99856567382812
Test Episode 16 Reward: -115.99806213378906
Test Episode 17 Reward: 277.2981872558594
Test Episode 18 Reward: -115.99784851074219
Test Episode 19 Reward: -115.99642944335938
Test Episode 20 Reward: -115.9980621337890

Test Episode 16 Reward: -115.72340393066406
Test Episode 17 Reward: -53.816864013671875
Test Episode 18 Reward: -115.97564697265625
Test Episode 19 Reward: -53.816864013671875
Test Episode 20 Reward: -115.99501037597656
Average Test Reward (without depth buffer): -81.49195785522461
Loading model from checkpoints\deadly_corridor.ckpt-500
INFO:tensorflow:Restoring parameters from checkpoints\deadly_corridor.ckpt-500
Test Episode 1 Reward: -115.6474609375
Test Episode 2 Reward: -115.84262084960938
Test Episode 3 Reward: -115.99574279785156
Test Episode 4 Reward: -115.17738342285156
Test Episode 5 Reward: -115.99432373046875
Test Episode 6 Reward: -115.99432373046875
Test Episode 7 Reward: -115.99432373046875
Test Episode 8 Reward: -115.99774169921875
Test Episode 9 Reward: -115.91317749023438
Test Episode 10 Reward: -115.99864196777344
Test Episode 11 Reward: -115.99432373046875
Test Episode 12 Reward: -115.99432373046875
Test Episode 13 Reward: -115.99432373046875
Test Episode 14 Reward:

Test Episode 10 Reward: -115.01789855957031
Test Episode 11 Reward: -65.34127807617188
Test Episode 12 Reward: 225.2841796875
Test Episode 13 Reward: 163.730712890625
Test Episode 14 Reward: -91.11805725097656
Test Episode 15 Reward: -91.11805725097656
Test Episode 16 Reward: -80.94488525390625
Test Episode 17 Reward: -37.03651428222656
Test Episode 18 Reward: -91.11805725097656
Test Episode 19 Reward: -94.33491516113281
Test Episode 20 Reward: 50.174560546875
Average Test Reward (without depth buffer): -37.73258285522461
Loading model from checkpoints\deadly_corridor.ckpt-580
INFO:tensorflow:Restoring parameters from checkpoints\deadly_corridor.ckpt-580
Test Episode 1 Reward: -115.99642944335938
Test Episode 2 Reward: -113.44442749023438
Test Episode 3 Reward: -115.99809265136719
Test Episode 4 Reward: -115.99642944335938
Test Episode 5 Reward: -115.99642944335938
Test Episode 6 Reward: -115.95332336425781
Test Episode 7 Reward: -115.99642944335938
Test Episode 8 Reward: -115.99642944

Test Episode 2 Reward: -115.99815368652344
Test Episode 3 Reward: -115.97972106933594
Test Episode 4 Reward: -115.98426818847656
Test Episode 5 Reward: -115.99119567871094
Test Episode 6 Reward: -115.99815368652344
Test Episode 7 Reward: -115.99815368652344
Test Episode 8 Reward: -115.99916076660156
Test Episode 9 Reward: -115.99815368652344
Test Episode 10 Reward: -115.99815368652344
Test Episode 11 Reward: -115.99815368652344
Test Episode 12 Reward: -115.99815368652344
Test Episode 13 Reward: -115.99815368652344
Test Episode 14 Reward: -114.97669982910156
Test Episode 15 Reward: -115.9766845703125
Test Episode 16 Reward: -115.99815368652344
Test Episode 17 Reward: -115.99815368652344
Test Episode 18 Reward: -115.84794616699219
Test Episode 19 Reward: -103.24835205078125
Test Episode 20 Reward: -115.99815368652344
Average Test Reward (without depth buffer): -115.2990936279297
Loading model from checkpoints\deadly_corridor.ckpt-660
INFO:tensorflow:Restoring parameters from checkpoints\

Test Episode 19 Reward: -115.97813415527344
Test Episode 20 Reward: -115.99856567382812
Average Test Reward (with depth buffer): -114.04085311889648
Loading model from checkpoints\deadly_corridor.ckpt-720
INFO:tensorflow:Restoring parameters from checkpoints\deadly_corridor.ckpt-720
Test Episode 1 Reward: -109.87702941894531
Test Episode 2 Reward: -109.87702941894531
Test Episode 3 Reward: -109.87702941894531
Test Episode 4 Reward: -115.99571228027344
Test Episode 5 Reward: -109.87702941894531
Test Episode 6 Reward: -109.87702941894531
Test Episode 7 Reward: -115.99977111816406
Test Episode 8 Reward: -115.99786376953125
Test Episode 9 Reward: -109.87702941894531
Test Episode 10 Reward: -109.87702941894531
Test Episode 11 Reward: -93.2083740234375
Test Episode 12 Reward: -109.87702941894531
Test Episode 13 Reward: -109.87702941894531
Test Episode 14 Reward: -115.97560119628906
Test Episode 15 Reward: -115.97564697265625
Test Episode 16 Reward: -109.87702941894531
Test Episode 17 Reward:

Test Episode 12 Reward: -115.998779296875
Test Episode 13 Reward: -115.16998291015625
Test Episode 14 Reward: -102.97532653808594
Test Episode 15 Reward: -115.99856567382812
Test Episode 16 Reward: -115.9791259765625
Test Episode 17 Reward: -115.92585754394531
Test Episode 18 Reward: -102.97532653808594
Test Episode 19 Reward: -115.99876403808594
Test Episode 20 Reward: -115.98017883300781
Average Test Reward (with depth buffer): -110.74221343994141
Loading model from checkpoints\deadly_corridor.ckpt-800
INFO:tensorflow:Restoring parameters from checkpoints\deadly_corridor.ckpt-800
Test Episode 1 Reward: -115.99688720703125
Test Episode 2 Reward: -29.837112426757812
Test Episode 3 Reward: -29.837112426757812
Test Episode 4 Reward: -29.837112426757812
Test Episode 5 Reward: -115.99935913085938
Test Episode 6 Reward: -29.837112426757812
Test Episode 7 Reward: -29.837112426757812
Test Episode 8 Reward: -106.13905334472656
Test Episode 9 Reward: -115.98345947265625
Test Episode 10 Reward: 