In [1]:
import importlib.util
import time

import tensorflow as tf
import numpy as np

from skimage.transform import rescale
from tqdm import trange
from IPython.display import HTML

#Import the vizdoom package as "vd" since it can't be installed normally on Windows

vd_location = 'C:/Anaconda3/envs/doom/Lib/site-packages/vizdoom/vizdoom.pyd'
vizdoom = importlib.util.spec_from_file_location('vizdoom',
                                                 vd_location)
vd = importlib.util.module_from_spec(vizdoom)
vizdoom.loader.exec_module(vd)


In [2]:
#Specify the game scenario and the screen format/resolution

game = vd.DoomGame()
game.set_screen_format(vd.ScreenFormat.BGR24)
game.set_depth_buffer_enabled(True)
game.set_screen_resolution(vd.ScreenResolution.RES_160X120)
game.load_config('deadly_corridor.cfg')

down_sample_ratio = 0.5
width = int(game.get_screen_width()*down_sample_ratio)
height = int(game.get_screen_height()*down_sample_ratio)
channels = game.get_screen_channels() + int(game.is_depth_buffer_enabled())

#Specify the available actions in the scenario

available_actions = game.get_available_buttons()
actions = [list(ohe) for ohe in list(np.identity(len(available_actions)))]
num_actions = len(available_actions)

#Specify the Q-network learning parameters

frame_delay = 12
buffer_size = 100000
epochs = 500
steps_per_epoch = 2000
learning_rate = 0.005
gamma = 0
start_epsilon = 1.0
end_epsilon = 0.1
batch_size = 100
load_model = False
save_model = True
model_dir = './checkpoints/deadly_corridor.ckpt'


In [3]:
#Create a buffer object that holds a set of training experiences (state-action-reward tuples)

class Buffer():
    def __init__(self, size=1000):
        self.buffer = list()
        self.length = len(self.buffer)
        self.size = size
        
#Add a new experience to the buffer (remove the oldest experience if the buffer is already full)
        
    def add_experience(self, experience):
        if self.length + 1 >= self.size:
            self.buffer[0:(self.length + 1) - self.size] = []
        
        self.buffer.append(experience)
        self.length = len(self.buffer)
            
#Return a batch of experience arrays randomly sampled from the buffer
            
    def sample_buffer(self, sample_size):
        sample = np.random.randint(self.length, size=sample_size)
        s1 = np.concatenate([self.buffer[idx][0] for idx in sample], axis=0)
        a = np.array([self.buffer[idx][1] for idx in sample])
        r = np.array([self.buffer[idx][2] for idx in sample])
        s2 = np.concatenate([self.buffer[idx][3] for idx in sample], axis=0)
        terminal = np.array([self.buffer[idx][4] for idx in sample], dtype=np.int32)
        
        return s1, a, r, s2, terminal

#Downsample and normalize an image array representing the game state at a given time stamp

def preprocess(image, down_sample_ratio=1):
    if float(down_sample_ratio) != 1.0:
        image = rescale(image=image, scale=down_sample_ratio, mode='reflect')
    image = image.astype(np.float32)
    image = np.expand_dims(image, axis=0)

    return image

#Test the agent using a currently training or previously trained model

def test_agent(model, num_episodes, load_model, depth, training=True, session=None, model_dir=None):
    if load_model == True:
        sess = tf.Session()
        print('Loading model from', model_dir)
        tf.train.Saver().restore(sess, model_dir)
        
#Require an existing session if a pretrained model isn't provided
        
    elif load_model == False:
        sess = session

    game.set_sound_enabled(True)
    episode_rewards = list()
    
#Avoid reinitializing the game if this was already done by the training process
    
    if training == False:
        game.init()

    for i in range(num_episodes):
        game.new_episode()
    
        while not game.is_episode_finished():
            state = game.get_state()
            
            if depth == False:
                depth_buffer = np.zeros(state.screen_buffer[:, :, :1].shape)
            else:
                depth_buffer = np.expand_dims(state.depth_buffer, axis=2)
                
            buffer = np.concatenate((state.screen_buffer, depth_buffer), axis=2)
            state1 = preprocess(buffer, down_sample_ratio)
            action = model.choose_action(sess, state1)[0]
            reward = game.make_action(actions[action])
            
#Add a delay between each time step so that the episodes occur at normal speed

            time.sleep(0.02)
        
        episode_rewards.append(game.get_total_reward())
        print('Test Episode {} Reward: {}'.format(i + 1, game.get_total_reward()))
        time.sleep(1)
    
#Avoid ending the game so that the training process can continue
    
    if training == False:
        game.close()
    
    return np.mean(episode_rewards)


In [4]:
#Create a Q-network to estimate values and choose actions for a given state

class Q_network():
    def __init__(self, network_name, height, width, channels, learning_rate=0.001):
        self.learning_rate = learning_rate
        self.s_t = tf.placeholder(tf.float32,
                                  shape=[None, height, width, channels],
                                  name=network_name + '_state'
                                 )
        self.a_t = tf.placeholder(tf.int32,
                                  shape=[None],
                                  name=network_name + '_action'
                                 )
        self.Q_target = tf.placeholder(tf.float32,
                                       shape=[None, num_actions],
                                       name=network_name + '_Q_target'
                                      )

        self.input_layer = tf.reshape(self.s_t,
                                      [-1, height, width, channels],
                                      name=network_name + '_input_layer'
                                     )
        self.conv1 = tf.layers.conv2d(inputs=self.input_layer,
                                      filters=32,
                                      kernel_size=[8, 8],
                                      strides=[4, 4],
                                      padding='valid',
                                      activation=tf.nn.relu,
                                      name=network_name + '_conv1_layer'
                                     )
        self.conv2 = tf.layers.conv2d(inputs=self.conv1,
                                      filters=64,
                                      kernel_size=[4, 4],
                                      strides=[2, 2],
                                      padding='valid',
                                      activation=tf.nn.relu,
                                      name=network_name + '_conv2_layer'
                                     )
        self.flatten = tf.reshape(self.conv2,
                                  [-1, 6*8*64],
                                  name=network_name + '_flatten'
                                 )
        self.dense = tf.layers.dense(inputs=self.flatten,
                                      units=512,
                                      activation=tf.nn.relu,
                                      name=network_name + '_dense1_layer'
                                    )
        self.Q_values = tf.layers.dense(inputs=self.dense,
                                        units=len(actions),
                                        activation=None,
                                        name=network_name + '_output_layer'
                                       )        
    
        self.best_action = tf.argmax(self.Q_values, 1)
        self.loss = tf.losses.mean_squared_error(self.Q_values,
                                                 self.Q_target)
        self.adam = tf.train.AdamOptimizer(learning_rate=self.learning_rate,
                                           name=network_name + '_adam'
                                          )
        self.train = self.adam.minimize(self.loss)
        
    def update_lr(self, epoch):
        self.learning_rate = 0.98*self.learning_rate

    def calculate_loss(self, session, s, q):
        L, _ = session.run([self.loss, self.train],
                           feed_dict={self.s_t: s,
                                      self.Q_target: q})
    
        return L

#Return the array of Q-values and the best action associated with a given state

    def get_Q_values(self, session, s):
        Q = session.run(self.Q_values,
                        feed_dict={self.s_t: s})

        return Q
    
    def choose_action(self, session, s):
        a = session.run(self.best_action,
                        feed_dict={self.s_t: s})
    
        return a
    
#Create a list of variable update operations

def update_graph(variables):
    update_ops = list()
    
#Assign weight values from the network created first to the one created second
    
    for idx, variable in enumerate(variables[:len(variables)//2]):
        op = variable.assign(variables[idx + len(variables)//2].value())
        update_ops.append(op)
    
    return update_ops

#Update the target network parameters to match those of the online network

def update_target(ops, session):
    for op in update_ops:
        session.run(op)


In [5]:
#For each time step, collect the following data:
#The current game state
#The action that was taken taken
#The reward obtained from the chosen action
#The next game state (store the first game state if the previous action ends the episode)
#A variable indicating whether the episode is over yet


tf.reset_default_graph()

#Instantiate the target network before the online network so that it's updated correctly

target_net = Q_network(network_name='target',
                       learning_rate=learning_rate,
                       height=height,
                       width=width,
                       channels=channels)
DQN = Q_network(network_name='online',
                learning_rate=learning_rate,
                height=height,
                width=width,
                channels=channels)

exp_buffer = Buffer(size=buffer_size)
session = tf.Session()
saver = tf.train.Saver(max_to_keep=10, reshape=True)
weights = tf.trainable_variables()

update_ops = update_graph(weights)

if load_model == True:
    print('Loading model from', model_dir)
    tf.train.Saver().restore(session, model_dir)
    
elif load_model == False:
    session.run(tf.global_variables_initializer())

game.set_sound_enabled(False)
game.init()

t = 0
epoch_rank = list()

#Accumulate experiences in the buffer using an epsilon-greedy strategy with three training phases

for epoch in range(epochs):
    epoch_rewards = list()
    
    for step in trange(steps_per_epoch, leave=True):
        experience = list()
        game.new_episode()
        
        while not game.is_episode_finished():
            state = game.get_state()
            
#Substitute an array of zeros for the depth buffer if that setting is disabled

            if game.is_depth_buffer_enabled() == False:
                depth_buffer = np.zeros(state.screen_buffer[:, :, :1].shape)
            else:
                depth_buffer = np.expand_dims(state.depth_buffer, axis=2)
            
            state1 = preprocess(np.concatenate((state.screen_buffer,
                                                depth_buffer),
                                                axis=2),
                                               down_sample_ratio)
            
#Explore the environment by choosing random actions with 100% probability for the first phase of training

            if epoch < 0.3*epochs:
                action = np.random.randint(num_actions)
            
#Increase the probability of greedily choosing an action by a constant amount at each epoch in the second phase
            
            elif epoch < 0.9*epochs:
                epsilon = start_epsilon - (epoch + 1 - 0.2*epochs)*(start_epsilon-end_epsilon)/(0.7*epochs)
            
                if np.random.uniform(0, 1) <= epsilon:
                    action = np.random.randint(num_actions)
                else:
                    action = DQN.choose_action(session, state1)[0]

#Select a random action with 10% probability in the final phase of training
                
            else:
                if np.random.uniform(0, 1) <= end_epsilon:
                    action = np.random.randint(num_actions)
                else:
                    action = DQN.choose_action(session, state1)[0]

            reward = game.make_action(actions[action], frame_delay)
            done = game.is_episode_finished()
            
            if done == False:
                state = game.get_state()
                state2 = preprocess(np.concatenate((state.screen_buffer,
                                                    depth_buffer),
                                                    axis=2),
                                                    down_sample_ratio)
            elif done == True:
                state2 = state1
        
#Add the experience obtained from each time step to the buffer

            t += 1
            exp_buffer.add_experience((state1, action, reward, state2, done))
        
#Sample a minibatch from the buffer if there are enough experiences in the buffer

        if exp_buffer.length > batch_size:
            s1, a, r, s2, terminal = exp_buffer.sample_buffer(batch_size)
            
#Get the target values from the target Q-network
            
            target_Q = np.max(target_net.get_Q_values(session, s2), axis=1)
            
#Train the online Q-network by using a minibatch to update the action-value function
            
            Q2 = DQN.get_Q_values(session, s1)
            Q2[np.arange(batch_size), a] = r + gamma*(1 - terminal)*target_Q
            DQN.calculate_loss(session, s1, Q2)
            
        epoch_rewards.append(game.get_total_reward())
        
#Increase the discount factor at each epoch until it reaches 0.99
    
    if gamma < 0.99:
        gamma = 1-.98*(1-gamma)
    elif gamma >= 0.99:
        gamma = 0.99
        
#Decrease the learning rate at each epoch

    DQN.update_lr(epoch)
    target_net.update_lr(epoch)
    
    print('Epoch {} Mean Reward: {}'.format(epoch + 1, np.mean(epoch_rewards)))
    
#Save the model, update the target network, and test the agent for 10 episodes every 20 epochs
    
    if (epoch + 1) % 20 == 0 and epoch > 0:
        if save_model == True:
            checkpoint = model_dir + '-' + str(epoch + 1)
            print('Epoch {} Model saved to {}'.format(epoch + 1, model_dir))
            saver.save(session, model_dir, global_step=epoch + 1)
            
        update_target(update_ops, session)

#Test the agent both with and without the depth buffer given
        
        print('Epoch {} test with depth buffer:'.format(epoch + 1))
        test_reward_depth = test_agent(DQN, num_episodes=10,
                                       training=True,
                                       load_model=False,
                                       depth=True,
                                       session=session,
                                       model_dir=model_dir)
        print('Average Test Reward (with depth buffer:)', test_reward_depth)
        
        print('Epoch {} test without depth buffer:'.format(epoch + 1))
        test_reward = test_agent(DQN, num_episodes=10,
                                 training=True,
                                 load_model=False,
                                 depth=False,
                                 session=session,
                                 model_dir=model_dir)
        print('Average Test Reward (without depth buffer):', test_reward)
        
        epoch_rank.append((test_reward, epoch + 1))
        
#Return a sorted list of epoch checkpoints based on average test episode reward
        
print(sorted(epoch_rank, reverse=True))
print('{} time steps experienced during training'.format(t))
game.close()
    

100%|██████████| 2000/2000 [03:17<00:00, 10.12it/s]


Epoch 1 Mean Reward: -70.6726957321167


100%|██████████| 2000/2000 [05:46<00:00,  5.78it/s]


Epoch 2 Mean Reward: -70.51831574249267


100%|██████████| 2000/2000 [07:05<00:00,  4.70it/s]


Epoch 3 Mean Reward: -69.86438262176513


100%|██████████| 2000/2000 [07:29<00:00,  4.45it/s]


Epoch 4 Mean Reward: -70.020674949646


100%|██████████| 2000/2000 [08:48<00:00,  3.79it/s]


Epoch 5 Mean Reward: -71.2573369140625


100%|██████████| 2000/2000 [09:08<00:00,  3.65it/s]


Epoch 6 Mean Reward: -66.79126322937012


100%|██████████| 2000/2000 [10:32<00:00,  3.16it/s]


Epoch 7 Mean Reward: -67.11312698364257


100%|██████████| 2000/2000 [10:46<00:00,  3.10it/s]


Epoch 8 Mean Reward: -69.02893545532227


100%|██████████| 2000/2000 [09:29<00:00,  3.51it/s]


Epoch 9 Mean Reward: -68.82013133239747


100%|██████████| 2000/2000 [10:10<00:00,  3.28it/s]


Epoch 10 Mean Reward: -68.38439338684083


100%|██████████| 2000/2000 [09:06<00:00,  3.66it/s]


Epoch 11 Mean Reward: -68.5786576385498


100%|██████████| 2000/2000 [09:05<00:00,  3.67it/s]


Epoch 12 Mean Reward: -70.42083762359619


100%|██████████| 2000/2000 [09:09<00:00,  3.64it/s]


Epoch 13 Mean Reward: -69.89468951416016


100%|██████████| 2000/2000 [08:54<00:00,  3.74it/s]


Epoch 14 Mean Reward: -70.43417224121093


100%|██████████| 2000/2000 [08:20<00:00,  3.99it/s]


Epoch 15 Mean Reward: -68.55143242645264


100%|██████████| 2000/2000 [08:25<00:00,  3.96it/s]


Epoch 16 Mean Reward: -68.19747480010986


100%|██████████| 2000/2000 [08:20<00:00,  3.99it/s]


Epoch 17 Mean Reward: -68.63987706756592


100%|██████████| 2000/2000 [08:33<00:00,  3.90it/s]


Epoch 18 Mean Reward: -68.02025023651123


100%|██████████| 2000/2000 [08:32<00:00,  3.90it/s]


Epoch 19 Mean Reward: -67.95362667846679


100%|██████████| 2000/2000 [09:50<00:00,  3.39it/s]


Epoch 20 Mean Reward: -66.40541453552247
Epoch 20 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 20 test with depth buffer:
Test Episode 1 Reward: 72.81971740722656
Test Episode 2 Reward: 80.98606872558594
Test Episode 3 Reward: 80.98606872558594
Test Episode 4 Reward: 93.06916809082031
Test Episode 5 Reward: 38.58656311035156
Test Episode 6 Reward: 27.776565551757812
Test Episode 7 Reward: 138.99075317382812
Test Episode 8 Reward: 80.98606872558594
Test Episode 9 Reward: 80.98606872558594
Test Episode 10 Reward: 58.32145690917969
Average Test Reward (with depth buffer:) 75.35084991455078
Epoch 20 test without depth buffer:
Test Episode 1 Reward: -8.509994506835938
Test Episode 2 Reward: -8.509994506835938
Test Episode 3 Reward: -3.74169921875
Test Episode 4 Reward: -40.842315673828125
Test Episode 5 Reward: -56.33369445800781
Test Episode 6 Reward: -8.509994506835938
Test Episode 7 Reward: -8.509994506835938
Test Episode 8 Reward: -8.509994506835938
Test Episode 9 Reward: -8.

100%|██████████| 2000/2000 [08:40<00:00,  3.85it/s]


Epoch 21 Mean Reward: -68.15007958221436


100%|██████████| 2000/2000 [08:17<00:00,  4.02it/s]


Epoch 22 Mean Reward: -69.43667860412597


100%|██████████| 2000/2000 [08:34<00:00,  3.89it/s]


Epoch 23 Mean Reward: -68.82995029449462


100%|██████████| 2000/2000 [07:56<00:00,  4.20it/s]


Epoch 24 Mean Reward: -69.04939730072022


100%|██████████| 2000/2000 [08:03<00:00,  4.14it/s]


Epoch 25 Mean Reward: -66.49851142883301


100%|██████████| 2000/2000 [07:54<00:00,  4.21it/s]


Epoch 26 Mean Reward: -68.87528173828125


100%|██████████| 2000/2000 [07:44<00:00,  4.31it/s]


Epoch 27 Mean Reward: -68.64050068664551


100%|██████████| 2000/2000 [07:54<00:00,  4.21it/s]


Epoch 28 Mean Reward: -69.44159510040284


100%|██████████| 2000/2000 [07:15<00:00,  4.60it/s]


Epoch 29 Mean Reward: -69.31050761413574


100%|██████████| 2000/2000 [07:18<00:00,  4.56it/s]


Epoch 30 Mean Reward: -66.89913606262208


100%|██████████| 2000/2000 [07:02<00:00,  4.74it/s]


Epoch 31 Mean Reward: -67.21548526000977


100%|██████████| 2000/2000 [07:33<00:00,  4.41it/s]


Epoch 32 Mean Reward: -70.91343824005126


100%|██████████| 2000/2000 [07:15<00:00,  4.59it/s]


Epoch 33 Mean Reward: -68.74758985137939


100%|██████████| 2000/2000 [07:18<00:00,  4.56it/s]


Epoch 34 Mean Reward: -68.23819803619385


100%|██████████| 2000/2000 [08:11<00:00,  4.07it/s]


Epoch 35 Mean Reward: -66.80223712921142


100%|██████████| 2000/2000 [08:26<00:00,  3.95it/s]


Epoch 36 Mean Reward: -68.08989212036133


100%|██████████| 2000/2000 [07:45<00:00,  4.30it/s]


Epoch 37 Mean Reward: -67.62672456359863


100%|██████████| 2000/2000 [07:10<00:00,  4.64it/s]


Epoch 38 Mean Reward: -67.56650213623047


100%|██████████| 2000/2000 [06:43<00:00,  4.95it/s]


Epoch 39 Mean Reward: -70.07744355773926


100%|██████████| 2000/2000 [06:47<00:00,  4.91it/s]


Epoch 40 Mean Reward: -69.00820126342774
Epoch 40 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 40 test with depth buffer:
Test Episode 1 Reward: 131.26002502441406
Test Episode 2 Reward: 389.7648620605469
Test Episode 3 Reward: 143.1270751953125
Test Episode 4 Reward: 95.76017761230469
Test Episode 5 Reward: 214.9745635986328
Test Episode 6 Reward: 214.9745635986328
Test Episode 7 Reward: 214.9745635986328
Test Episode 8 Reward: 113.3040771484375
Test Episode 9 Reward: 214.9745635986328
Test Episode 10 Reward: 214.9745635986328
Average Test Reward (with depth buffer:) 194.80890350341798
Epoch 40 test without depth buffer:
Test Episode 1 Reward: 51.66612243652344
Test Episode 2 Reward: 53.724151611328125
Test Episode 3 Reward: -15.699172973632812
Test Episode 4 Reward: 51.66612243652344
Test Episode 5 Reward: -69.23631286621094
Test Episode 6 Reward: 215.8292694091797
Test Episode 7 Reward: 50.9044189453125
Test Episode 8 Reward: 51.66612243652344
Test Episode 9 Reward: 58.12

100%|██████████| 2000/2000 [07:08<00:00,  4.67it/s]


Epoch 41 Mean Reward: -68.06334611511231


100%|██████████| 2000/2000 [07:05<00:00,  4.70it/s]


Epoch 42 Mean Reward: -68.39065028381347


100%|██████████| 2000/2000 [07:27<00:00,  4.47it/s]


Epoch 43 Mean Reward: -69.71428273010254


100%|██████████| 2000/2000 [07:42<00:00,  4.33it/s]


Epoch 44 Mean Reward: -66.97016018676757


100%|██████████| 2000/2000 [07:25<00:00,  4.49it/s]


Epoch 45 Mean Reward: -65.73708443450927


100%|██████████| 2000/2000 [07:24<00:00,  4.49it/s]


Epoch 46 Mean Reward: -66.82600158691406


100%|██████████| 2000/2000 [07:22<00:00,  4.52it/s]


Epoch 47 Mean Reward: -70.09662278747558


100%|██████████| 2000/2000 [07:28<00:00,  4.46it/s]


Epoch 48 Mean Reward: -68.10399378967286


100%|██████████| 2000/2000 [07:11<00:00,  4.64it/s]


Epoch 49 Mean Reward: -70.26858459472656


100%|██████████| 2000/2000 [07:11<00:00,  4.63it/s]


Epoch 50 Mean Reward: -68.49530377197266


100%|██████████| 2000/2000 [06:32<00:00,  5.09it/s]


Epoch 51 Mean Reward: -69.56222290039062


100%|██████████| 2000/2000 [06:03<00:00,  5.50it/s]


Epoch 52 Mean Reward: -68.90293083953857


100%|██████████| 2000/2000 [05:55<00:00,  5.63it/s]


Epoch 53 Mean Reward: -68.12421102905273


100%|██████████| 2000/2000 [05:35<00:00,  5.95it/s]


Epoch 54 Mean Reward: -68.76908132171631


100%|██████████| 2000/2000 [05:47<00:00,  5.76it/s]


Epoch 55 Mean Reward: -69.62763373565674


100%|██████████| 2000/2000 [06:01<00:00,  5.53it/s]


Epoch 56 Mean Reward: -66.24878966522216


100%|██████████| 2000/2000 [05:49<00:00,  5.72it/s]


Epoch 57 Mean Reward: -68.43659350585938


100%|██████████| 2000/2000 [05:54<00:00,  5.64it/s]


Epoch 58 Mean Reward: -67.78063638305665


100%|██████████| 2000/2000 [06:02<00:00,  5.52it/s]


Epoch 59 Mean Reward: -68.57988148498535


100%|██████████| 2000/2000 [06:25<00:00,  5.19it/s]


Epoch 60 Mean Reward: -70.5367495880127
Epoch 60 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 60 test with depth buffer:
Test Episode 1 Reward: 112.24687194824219
Test Episode 2 Reward: 88.91380310058594
Test Episode 3 Reward: 88.91380310058594
Test Episode 4 Reward: 88.91380310058594
Test Episode 5 Reward: 88.91380310058594
Test Episode 6 Reward: 87.94873046875
Test Episode 7 Reward: 117.29924011230469
Test Episode 8 Reward: 370.2001190185547
Test Episode 9 Reward: 360.17041015625
Test Episode 10 Reward: 355.3059844970703
Average Test Reward (with depth buffer:) 175.88265686035157
Epoch 60 test without depth buffer:
Test Episode 1 Reward: -16.601882934570312
Test Episode 2 Reward: 89.10957336425781
Test Episode 3 Reward: 53.10284423828125
Test Episode 4 Reward: -19.413223266601562
Test Episode 5 Reward: 53.10284423828125
Test Episode 6 Reward: 53.10284423828125
Test Episode 7 Reward: -18.851913452148438
Test Episode 8 Reward: -93.05158996582031
Test Episode 9 Reward: 53.102

100%|██████████| 2000/2000 [06:02<00:00,  5.52it/s]


Epoch 61 Mean Reward: -69.34668611907959


100%|██████████| 2000/2000 [05:55<00:00,  5.62it/s]


Epoch 62 Mean Reward: -68.11359796142578


100%|██████████| 2000/2000 [05:41<00:00,  5.86it/s]


Epoch 63 Mean Reward: -67.78433435058594


100%|██████████| 2000/2000 [05:11<00:00,  6.41it/s]


Epoch 64 Mean Reward: -66.13114820861816


100%|██████████| 2000/2000 [05:32<00:00,  6.01it/s]


Epoch 65 Mean Reward: -68.22951669311523


100%|██████████| 2000/2000 [05:11<00:00,  6.42it/s]


Epoch 66 Mean Reward: -68.3294031677246


100%|██████████| 2000/2000 [04:56<00:00,  6.74it/s]


Epoch 67 Mean Reward: -68.60973760986329


100%|██████████| 2000/2000 [05:01<00:00,  6.63it/s]


Epoch 68 Mean Reward: -69.02881126403808


100%|██████████| 2000/2000 [04:26<00:00,  7.51it/s]


Epoch 69 Mean Reward: -70.52694003295899


100%|██████████| 2000/2000 [04:18<00:00,  7.73it/s]


Epoch 70 Mean Reward: -67.78952525329589


100%|██████████| 2000/2000 [04:23<00:00,  7.60it/s]


Epoch 71 Mean Reward: -66.63819975280762


100%|██████████| 2000/2000 [04:38<00:00,  7.19it/s]


Epoch 72 Mean Reward: -67.92739408874512


100%|██████████| 2000/2000 [04:18<00:00,  7.73it/s]


Epoch 73 Mean Reward: -68.67475106811523


100%|██████████| 2000/2000 [04:35<00:00,  7.25it/s]


Epoch 74 Mean Reward: -68.63835050964356


100%|██████████| 2000/2000 [04:53<00:00,  6.81it/s]


Epoch 75 Mean Reward: -69.75699773406983


100%|██████████| 2000/2000 [04:45<00:00,  7.01it/s]


Epoch 76 Mean Reward: -68.9784730682373


100%|██████████| 2000/2000 [04:39<00:00,  7.15it/s]


Epoch 77 Mean Reward: -69.38364220428467


100%|██████████| 2000/2000 [04:48<00:00,  6.92it/s]


Epoch 78 Mean Reward: -69.72841886138916


100%|██████████| 2000/2000 [04:20<00:00,  7.68it/s]


Epoch 79 Mean Reward: -69.24781019592285


100%|██████████| 2000/2000 [04:21<00:00,  7.64it/s]


Epoch 80 Mean Reward: -68.56046757507325
Epoch 80 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 80 test with depth buffer:
Test Episode 1 Reward: 217.76788330078125
Test Episode 2 Reward: 217.76788330078125
Test Episode 3 Reward: 79.38572692871094
Test Episode 4 Reward: 13.647445678710938
Test Episode 5 Reward: 105.68707275390625
Test Episode 6 Reward: 217.76788330078125
Test Episode 7 Reward: 172.5204315185547
Test Episode 8 Reward: 369.8952941894531
Test Episode 9 Reward: 294.5778350830078
Test Episode 10 Reward: 217.76788330078125
Average Test Reward (with depth buffer:) 190.67853393554688
Epoch 80 test without depth buffer:
Test Episode 1 Reward: -1.6921234130859375
Test Episode 2 Reward: 84.44912719726562
Test Episode 3 Reward: -1.6921234130859375
Test Episode 4 Reward: 42.67054748535156
Test Episode 5 Reward: -1.6921234130859375
Test Episode 6 Reward: -1.6921234130859375
Test Episode 7 Reward: -1.6921234130859375
Test Episode 8 Reward: -1.6921234130859375
Test Episode 9

100%|██████████| 2000/2000 [04:26<00:00,  7.49it/s]


Epoch 81 Mean Reward: -65.07257484436035


100%|██████████| 2000/2000 [04:02<00:00,  8.25it/s]


Epoch 82 Mean Reward: -67.09698007965088


100%|██████████| 2000/2000 [03:47<00:00,  8.80it/s]


Epoch 83 Mean Reward: -68.06963017272949


100%|██████████| 2000/2000 [03:42<00:00,  8.99it/s]


Epoch 84 Mean Reward: -68.03128240203857


100%|██████████| 2000/2000 [03:44<00:00,  8.93it/s]


Epoch 85 Mean Reward: -67.29884914398194


100%|██████████| 2000/2000 [03:34<00:00,  9.31it/s]


Epoch 86 Mean Reward: -71.21670111846923


100%|██████████| 2000/2000 [03:40<00:00,  9.06it/s]


Epoch 87 Mean Reward: -67.8625856552124


100%|██████████| 2000/2000 [03:41<00:00,  9.04it/s]


Epoch 88 Mean Reward: -69.1391785736084


100%|██████████| 2000/2000 [03:51<00:00,  8.63it/s]


Epoch 89 Mean Reward: -68.13406923675537


100%|██████████| 2000/2000 [03:46<00:00,  8.82it/s]


Epoch 90 Mean Reward: -67.97279904174805


100%|██████████| 2000/2000 [03:39<00:00,  9.12it/s]


Epoch 91 Mean Reward: -68.32521184539794


100%|██████████| 2000/2000 [03:38<00:00,  9.14it/s]


Epoch 92 Mean Reward: -69.62936206817626


100%|██████████| 2000/2000 [03:48<00:00,  8.75it/s]


Epoch 93 Mean Reward: -69.27470381164551


100%|██████████| 2000/2000 [03:40<00:00,  9.08it/s]


Epoch 94 Mean Reward: -69.58446295928955


100%|██████████| 2000/2000 [03:42<00:00,  9.00it/s]


Epoch 95 Mean Reward: -68.64348641204835


100%|██████████| 2000/2000 [03:59<00:00,  8.34it/s]


Epoch 96 Mean Reward: -68.59017743682861


100%|██████████| 2000/2000 [04:29<00:00,  7.42it/s]


Epoch 97 Mean Reward: -68.94331882476807


100%|██████████| 2000/2000 [04:30<00:00,  7.40it/s]


Epoch 98 Mean Reward: -68.52298240661621


100%|██████████| 2000/2000 [04:20<00:00,  7.69it/s]


Epoch 99 Mean Reward: -70.28805194091797


100%|██████████| 2000/2000 [04:30<00:00,  7.40it/s]


Epoch 100 Mean Reward: -66.98542767333984
Epoch 100 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 100 test with depth buffer:
Test Episode 1 Reward: 63.26948547363281
Test Episode 2 Reward: 74.81126403808594
Test Episode 3 Reward: 74.81126403808594
Test Episode 4 Reward: 74.81126403808594
Test Episode 5 Reward: 224.06375122070312
Test Episode 6 Reward: 74.81126403808594
Test Episode 7 Reward: 52.862945556640625
Test Episode 8 Reward: 74.81126403808594
Test Episode 9 Reward: 74.81126403808594
Test Episode 10 Reward: 59.23204040527344
Average Test Reward (with depth buffer:) 84.82958068847657
Epoch 100 test without depth buffer:
Test Episode 1 Reward: 186.7234344482422
Test Episode 2 Reward: 7.160430908203125
Test Episode 3 Reward: -6.4378814697265625
Test Episode 4 Reward: 201.58114624023438
Test Episode 5 Reward: 186.7234344482422
Test Episode 6 Reward: -17.00762939453125
Test Episode 7 Reward: -22.245590209960938
Test Episode 8 Reward: 671.1471099853516
Test Episode 9 Reward

100%|██████████| 2000/2000 [04:19<00:00,  7.71it/s]


Epoch 101 Mean Reward: -66.24993885040283


100%|██████████| 2000/2000 [04:26<00:00,  7.50it/s]


Epoch 102 Mean Reward: -66.9095492401123


100%|██████████| 2000/2000 [04:14<00:00,  7.87it/s]


Epoch 103 Mean Reward: -70.03994078063965


100%|██████████| 2000/2000 [04:25<00:00,  7.53it/s]


Epoch 104 Mean Reward: -69.0269677734375


100%|██████████| 2000/2000 [04:04<00:00,  8.18it/s]


Epoch 105 Mean Reward: -71.7968417892456


100%|██████████| 2000/2000 [04:08<00:00,  8.05it/s]


Epoch 106 Mean Reward: -68.56792385864257


100%|██████████| 2000/2000 [04:15<00:00,  7.84it/s]


Epoch 107 Mean Reward: -67.9547851486206


100%|██████████| 2000/2000 [04:09<00:00,  8.01it/s]


Epoch 108 Mean Reward: -66.61466916656494


100%|██████████| 2000/2000 [04:11<00:00,  7.96it/s]


Epoch 109 Mean Reward: -67.21061940002441


100%|██████████| 2000/2000 [04:37<00:00,  7.21it/s]


Epoch 110 Mean Reward: -70.3744309387207


100%|██████████| 2000/2000 [04:27<00:00,  7.48it/s]


Epoch 111 Mean Reward: -67.81602911376953


100%|██████████| 2000/2000 [04:14<00:00,  7.87it/s]


Epoch 112 Mean Reward: -67.84620623779297


100%|██████████| 2000/2000 [03:59<00:00,  8.36it/s]


Epoch 113 Mean Reward: -69.6560033416748


100%|██████████| 2000/2000 [04:14<00:00,  7.86it/s]


Epoch 114 Mean Reward: -70.39423426818847


100%|██████████| 2000/2000 [04:00<00:00,  8.30it/s]


Epoch 115 Mean Reward: -67.48122008514405


100%|██████████| 2000/2000 [03:53<00:00,  8.58it/s]


Epoch 116 Mean Reward: -68.7779677658081


100%|██████████| 2000/2000 [03:47<00:00,  8.81it/s]


Epoch 117 Mean Reward: -68.48710897064208


100%|██████████| 2000/2000 [03:47<00:00,  8.78it/s]


Epoch 118 Mean Reward: -68.49768998718261


100%|██████████| 2000/2000 [03:38<00:00,  9.14it/s]


Epoch 119 Mean Reward: -67.68123135375977


100%|██████████| 2000/2000 [03:38<00:00,  9.15it/s]


Epoch 120 Mean Reward: -69.61175968170166
Epoch 120 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 120 test with depth buffer:
Test Episode 1 Reward: 371.0050811767578
Test Episode 2 Reward: 214.73204040527344
Test Episode 3 Reward: 503.50746154785156
Test Episode 4 Reward: 214.73204040527344
Test Episode 5 Reward: 673.5232238769531
Test Episode 6 Reward: 25.202438354492188
Test Episode 7 Reward: 214.73204040527344
Test Episode 8 Reward: 810.4558868408203
Test Episode 9 Reward: 304.0842742919922
Test Episode 10 Reward: 331.60894775390625
Average Test Reward (with depth buffer:) 366.35834350585935
Epoch 120 test without depth buffer:
Test Episode 1 Reward: 3.3960418701171875
Test Episode 2 Reward: 94.23623657226562
Test Episode 3 Reward: 94.23623657226562
Test Episode 4 Reward: 94.23623657226562
Test Episode 5 Reward: 94.23623657226562
Test Episode 6 Reward: 94.23623657226562
Test Episode 7 Reward: 45.68034362792969
Test Episode 8 Reward: 13.14520263671875
Test Episode 9 Reward

100%|██████████| 2000/2000 [03:49<00:00,  8.73it/s]


Epoch 121 Mean Reward: -68.13826595306396


100%|██████████| 2000/2000 [03:46<00:00,  8.84it/s]


Epoch 122 Mean Reward: -68.81862616729737


100%|██████████| 2000/2000 [03:45<00:00,  8.87it/s]


Epoch 123 Mean Reward: -66.86200967407227


100%|██████████| 2000/2000 [04:29<00:00,  7.42it/s]


Epoch 124 Mean Reward: -67.3159875869751


100%|██████████| 2000/2000 [04:00<00:00,  8.31it/s]


Epoch 125 Mean Reward: -68.64147190856933


100%|██████████| 2000/2000 [05:17<00:00,  6.29it/s]


Epoch 126 Mean Reward: -68.73470854187012


100%|██████████| 2000/2000 [05:21<00:00,  6.22it/s]


Epoch 127 Mean Reward: -66.77578329467774


100%|██████████| 2000/2000 [05:04<00:00,  6.58it/s]


Epoch 128 Mean Reward: -68.20147066497803


100%|██████████| 2000/2000 [05:04<00:00,  6.56it/s]


Epoch 129 Mean Reward: -69.88412196350097


100%|██████████| 2000/2000 [04:38<00:00,  7.18it/s]


Epoch 130 Mean Reward: -69.46361573791503


100%|██████████| 2000/2000 [03:16<00:00, 10.19it/s]


Epoch 131 Mean Reward: -68.97342418670654


100%|██████████| 2000/2000 [03:50<00:00,  8.68it/s]


Epoch 132 Mean Reward: -67.84795471191406


100%|██████████| 2000/2000 [04:39<00:00,  7.15it/s]


Epoch 133 Mean Reward: -66.96455383300781


100%|██████████| 2000/2000 [05:00<00:00,  6.65it/s]


Epoch 134 Mean Reward: -68.50965341186523


100%|██████████| 2000/2000 [05:08<00:00,  6.49it/s]


Epoch 135 Mean Reward: -67.6869645767212


100%|██████████| 2000/2000 [03:52<00:00,  8.60it/s]


Epoch 136 Mean Reward: -69.04148820495605


100%|██████████| 2000/2000 [05:03<00:00,  6.58it/s]


Epoch 137 Mean Reward: -69.192563331604


100%|██████████| 2000/2000 [04:50<00:00,  6.89it/s]


Epoch 138 Mean Reward: -69.2476470565796


100%|██████████| 2000/2000 [04:20<00:00,  7.68it/s]


Epoch 139 Mean Reward: -69.81557570648194


100%|██████████| 2000/2000 [04:14<00:00,  7.86it/s]


Epoch 140 Mean Reward: -68.12921504974365
Epoch 140 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 140 test with depth buffer:
Test Episode 1 Reward: 89.51968383789062
Test Episode 2 Reward: 89.51968383789062
Test Episode 3 Reward: 80.11625671386719
Test Episode 4 Reward: 50.90382385253906
Test Episode 5 Reward: 89.51968383789062
Test Episode 6 Reward: 89.51968383789062
Test Episode 7 Reward: 105.89729309082031
Test Episode 8 Reward: 441.5984649658203
Test Episode 9 Reward: 89.51968383789062
Test Episode 10 Reward: 89.51968383789062
Average Test Reward (with depth buffer:) 121.56339416503906
Epoch 140 test without depth buffer:
Test Episode 1 Reward: 192.67703247070312
Test Episode 2 Reward: 69.44046020507812
Test Episode 3 Reward: 273.7863006591797
Test Episode 4 Reward: 17.582778930664062
Test Episode 5 Reward: 192.67703247070312
Test Episode 6 Reward: 192.67703247070312
Test Episode 7 Reward: 192.67703247070312
Test Episode 8 Reward: -4.985198974609375
Test Episode 9 Reward

100%|██████████| 2000/2000 [04:19<00:00,  7.70it/s]


Epoch 141 Mean Reward: -69.63718578338623


100%|██████████| 2000/2000 [04:17<00:00,  7.78it/s]


Epoch 142 Mean Reward: -68.53458796691895


100%|██████████| 2000/2000 [04:16<00:00,  7.79it/s]


Epoch 143 Mean Reward: -68.17916548919678


100%|██████████| 2000/2000 [04:22<00:00,  7.63it/s]


Epoch 144 Mean Reward: -68.58955062103271


100%|██████████| 2000/2000 [04:24<00:00,  7.56it/s]


Epoch 145 Mean Reward: -69.44238436126709


100%|██████████| 2000/2000 [03:28<00:00,  9.61it/s]


Epoch 146 Mean Reward: -66.9170004196167


100%|██████████| 2000/2000 [03:48<00:00,  8.75it/s]


Epoch 147 Mean Reward: -69.02110375213623


100%|██████████| 2000/2000 [04:17<00:00,  7.77it/s]


Epoch 148 Mean Reward: -68.30039226531983


100%|██████████| 2000/2000 [03:20<00:00,  9.98it/s]


Epoch 149 Mean Reward: -69.61234588623047


100%|██████████| 2000/2000 [03:11<00:00, 10.45it/s]


Epoch 150 Mean Reward: -69.2189310684204


100%|██████████| 2000/2000 [03:08<00:00, 10.63it/s]


Epoch 151 Mean Reward: -37.807291244506835


100%|██████████| 2000/2000 [03:21<00:00,  9.93it/s]


Epoch 152 Mean Reward: -42.11442250823975


100%|██████████| 2000/2000 [03:24<00:00,  9.76it/s]


Epoch 153 Mean Reward: -38.50582051849365


100%|██████████| 2000/2000 [03:14<00:00, 10.31it/s]


Epoch 154 Mean Reward: -38.78905511474609


100%|██████████| 2000/2000 [03:15<00:00, 10.25it/s]


Epoch 155 Mean Reward: -38.851295265197756


100%|██████████| 2000/2000 [03:21<00:00,  9.92it/s]


Epoch 156 Mean Reward: -35.30804943084717


100%|██████████| 2000/2000 [03:20<00:00,  9.99it/s]


Epoch 157 Mean Reward: -35.611233116149904


100%|██████████| 2000/2000 [03:20<00:00,  9.95it/s]


Epoch 158 Mean Reward: -35.55910134887695


100%|██████████| 2000/2000 [03:25<00:00,  9.74it/s]


Epoch 159 Mean Reward: -37.918340156555175


100%|██████████| 2000/2000 [03:11<00:00, 10.47it/s]


Epoch 160 Mean Reward: -33.8279142074585
Epoch 160 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 160 test with depth buffer:
Test Episode 1 Reward: 71.63063049316406
Test Episode 2 Reward: 71.63063049316406
Test Episode 3 Reward: 64.2850341796875
Test Episode 4 Reward: 366.63433837890625
Test Episode 5 Reward: 71.63063049316406
Test Episode 6 Reward: 396.05320739746094
Test Episode 7 Reward: 357.04522705078125
Test Episode 8 Reward: 87.20166015625
Test Episode 9 Reward: 39.20991516113281
Test Episode 10 Reward: 71.63063049316406
Average Test Reward (with depth buffer:) 159.6951904296875
Epoch 160 test without depth buffer:
Test Episode 1 Reward: 80.19996643066406
Test Episode 2 Reward: 199.57960510253906
Test Episode 3 Reward: 243.65679931640625
Test Episode 4 Reward: 288.17955017089844
Test Episode 5 Reward: -12.0894775390625
Test Episode 6 Reward: 199.57960510253906
Test Episode 7 Reward: 199.57960510253906
Test Episode 8 Reward: 350.77549743652344
Test Episode 9 Reward: 22

100%|██████████| 2000/2000 [04:57<00:00,  6.72it/s]


Epoch 161 Mean Reward: -34.52717741394043


100%|██████████| 2000/2000 [04:53<00:00,  6.80it/s]


Epoch 162 Mean Reward: -35.02463423156738


100%|██████████| 2000/2000 [04:10<00:00,  7.98it/s]


Epoch 163 Mean Reward: -34.23823212432861


100%|██████████| 2000/2000 [03:10<00:00, 10.53it/s]


Epoch 164 Mean Reward: -35.098093444824215


100%|██████████| 2000/2000 [03:14<00:00, 10.28it/s]


Epoch 165 Mean Reward: -34.35420431518555


100%|██████████| 2000/2000 [03:14<00:00, 10.30it/s]


Epoch 166 Mean Reward: -30.56830850982666


100%|██████████| 2000/2000 [03:06<00:00, 10.72it/s]


Epoch 167 Mean Reward: -33.57990663909912


100%|██████████| 2000/2000 [03:10<00:00, 10.48it/s]


Epoch 168 Mean Reward: -31.646459915161135


100%|██████████| 2000/2000 [03:11<00:00, 10.46it/s]


Epoch 169 Mean Reward: -30.385175041198732


100%|██████████| 2000/2000 [03:11<00:00, 10.43it/s]


Epoch 170 Mean Reward: -32.18021639251709


100%|██████████| 2000/2000 [03:18<00:00, 10.06it/s]


Epoch 171 Mean Reward: -27.667896865844728


100%|██████████| 2000/2000 [03:09<00:00, 10.56it/s]


Epoch 172 Mean Reward: -27.685209465026855


100%|██████████| 2000/2000 [03:05<00:00, 10.77it/s]


Epoch 173 Mean Reward: -29.92675695800781


100%|██████████| 2000/2000 [03:08<00:00, 10.62it/s]


Epoch 174 Mean Reward: -26.748276153564454


100%|██████████| 2000/2000 [03:18<00:00, 10.07it/s]


Epoch 175 Mean Reward: -26.207492698669434


100%|██████████| 2000/2000 [03:20<00:00,  9.96it/s]


Epoch 176 Mean Reward: -25.967171546936036


100%|██████████| 2000/2000 [03:04<00:00, 10.83it/s]


Epoch 177 Mean Reward: -26.438924255371095


100%|██████████| 2000/2000 [03:06<00:00, 10.72it/s]


Epoch 178 Mean Reward: -24.673794403076172


100%|██████████| 2000/2000 [03:07<00:00, 10.67it/s]


Epoch 179 Mean Reward: -26.068201095581056


100%|██████████| 2000/2000 [03:14<00:00, 10.27it/s]


Epoch 180 Mean Reward: -25.319764503479004
Epoch 180 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 180 test with depth buffer:
Test Episode 1 Reward: 234.20077514648438
Test Episode 2 Reward: 395.9478759765625
Test Episode 3 Reward: 74.40867614746094
Test Episode 4 Reward: 624.2905120849609
Test Episode 5 Reward: 74.40867614746094
Test Episode 6 Reward: 74.40867614746094
Test Episode 7 Reward: 70.44480895996094
Test Episode 8 Reward: 79.60302734375
Test Episode 9 Reward: 90.70770263671875
Test Episode 10 Reward: 74.40867614746094
Average Test Reward (with depth buffer:) 179.28294067382814
Epoch 180 test without depth buffer:
Test Episode 1 Reward: -28.946701049804688
Test Episode 2 Reward: -24.13189697265625
Test Episode 3 Reward: 88.95295715332031
Test Episode 4 Reward: -2.5756072998046875
Test Episode 5 Reward: 88.95295715332031
Test Episode 6 Reward: -24.392547607421875
Test Episode 7 Reward: 88.95295715332031
Test Episode 8 Reward: 88.95295715332031
Test Episode 9 Reward:

100%|██████████| 2000/2000 [03:04<00:00, 10.84it/s]


Epoch 181 Mean Reward: -22.72278823852539


100%|██████████| 2000/2000 [03:04<00:00, 10.82it/s]


Epoch 182 Mean Reward: -23.197012962341308


100%|██████████| 2000/2000 [03:06<00:00, 10.70it/s]


Epoch 183 Mean Reward: -17.782442504882813


100%|██████████| 2000/2000 [03:09<00:00, 10.55it/s]


Epoch 184 Mean Reward: -16.37018643951416


100%|██████████| 2000/2000 [03:20<00:00,  9.95it/s]


Epoch 185 Mean Reward: -21.386449569702147


100%|██████████| 2000/2000 [03:07<00:00, 10.64it/s]


Epoch 186 Mean Reward: -16.89681398010254


100%|██████████| 2000/2000 [03:09<00:00, 10.54it/s]


Epoch 187 Mean Reward: -16.363919410705567


100%|██████████| 2000/2000 [03:13<00:00, 10.32it/s]


Epoch 188 Mean Reward: -15.817669281005859


100%|██████████| 2000/2000 [03:10<00:00, 10.50it/s]


Epoch 189 Mean Reward: -18.88062657928467


100%|██████████| 2000/2000 [03:20<00:00,  9.98it/s]


Epoch 190 Mean Reward: -17.485081687927245


100%|██████████| 2000/2000 [03:09<00:00, 10.57it/s]


Epoch 191 Mean Reward: -13.354599906921386


100%|██████████| 2000/2000 [03:10<00:00, 10.48it/s]


Epoch 192 Mean Reward: -17.196246505737303


100%|██████████| 2000/2000 [03:09<00:00, 10.57it/s]


Epoch 193 Mean Reward: -13.307705070495606


100%|██████████| 2000/2000 [03:19<00:00, 10.05it/s]


Epoch 194 Mean Reward: -16.323297691345214


100%|██████████| 2000/2000 [03:14<00:00, 10.27it/s]


Epoch 195 Mean Reward: -10.631712226867675


100%|██████████| 2000/2000 [03:14<00:00, 10.29it/s]


Epoch 196 Mean Reward: -10.155248718261719


100%|██████████| 2000/2000 [03:10<00:00, 10.49it/s]


Epoch 197 Mean Reward: -11.006304718017578


100%|██████████| 2000/2000 [03:13<00:00, 10.32it/s]


Epoch 198 Mean Reward: -8.653588623046875


100%|██████████| 2000/2000 [03:28<00:00,  9.58it/s]


Epoch 199 Mean Reward: -11.382747634887695


100%|██████████| 2000/2000 [03:13<00:00, 10.33it/s]


Epoch 200 Mean Reward: -7.814986778259278
Epoch 200 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 200 test with depth buffer:
Test Episode 1 Reward: 84.48188781738281
Test Episode 2 Reward: 121.51127624511719
Test Episode 3 Reward: -31.862197875976562
Test Episode 4 Reward: 328.6807403564453
Test Episode 5 Reward: 47.827850341796875
Test Episode 6 Reward: 62.10963439941406
Test Episode 7 Reward: 377.3649597167969
Test Episode 8 Reward: 62.10963439941406
Test Episode 9 Reward: 44.32102966308594
Test Episode 10 Reward: 304.3035430908203
Average Test Reward (with depth buffer:) 140.0848358154297
Epoch 200 test without depth buffer:
Test Episode 1 Reward: -20.582977294921875
Test Episode 2 Reward: 93.46310424804688
Test Episode 3 Reward: 35.932403564453125
Test Episode 4 Reward: 57.771270751953125
Test Episode 5 Reward: 93.46310424804688
Test Episode 6 Reward: 201.21612548828125
Test Episode 7 Reward: 42.42491149902344
Test Episode 8 Reward: 91.9161376953125
Test Episode 9 Reward

100%|██████████| 2000/2000 [03:07<00:00, 10.64it/s]


Epoch 201 Mean Reward: -12.644883453369141


100%|██████████| 2000/2000 [03:09<00:00, 10.53it/s]


Epoch 202 Mean Reward: -13.40502796936035


100%|██████████| 2000/2000 [03:19<00:00, 10.04it/s]


Epoch 203 Mean Reward: -13.263702201843262


100%|██████████| 2000/2000 [03:10<00:00, 10.50it/s]


Epoch 204 Mean Reward: -9.09608006286621


100%|██████████| 2000/2000 [03:14<00:00, 10.30it/s]


Epoch 205 Mean Reward: -7.266375343322754


100%|██████████| 2000/2000 [03:09<00:00, 10.56it/s]


Epoch 206 Mean Reward: -11.912394065856933


100%|██████████| 2000/2000 [03:11<00:00, 10.43it/s]


Epoch 207 Mean Reward: -13.99516497039795


100%|██████████| 2000/2000 [03:21<00:00,  9.94it/s]


Epoch 208 Mean Reward: -7.1057994842529295


100%|██████████| 2000/2000 [03:08<00:00, 10.59it/s]


Epoch 209 Mean Reward: -9.408944274902344


100%|██████████| 2000/2000 [03:07<00:00, 10.65it/s]


Epoch 210 Mean Reward: -7.555684280395508


100%|██████████| 2000/2000 [03:08<00:00, 10.61it/s]


Epoch 211 Mean Reward: -8.244783226013183


100%|██████████| 2000/2000 [03:20<00:00,  9.98it/s]


Epoch 212 Mean Reward: -6.578235603332519


100%|██████████| 2000/2000 [03:12<00:00, 10.40it/s]


Epoch 213 Mean Reward: -6.403080848693848


100%|██████████| 2000/2000 [03:08<00:00, 10.61it/s]


Epoch 214 Mean Reward: -7.3867171783447265


100%|██████████| 2000/2000 [03:05<00:00, 10.76it/s]


Epoch 215 Mean Reward: -6.988951118469238


100%|██████████| 2000/2000 [03:09<00:00, 10.54it/s]


Epoch 216 Mean Reward: -8.029226432800293


100%|██████████| 2000/2000 [03:21<00:00,  9.94it/s]


Epoch 217 Mean Reward: -3.4264641036987307


100%|██████████| 2000/2000 [03:10<00:00, 10.50it/s]


Epoch 218 Mean Reward: -4.271890975952148


100%|██████████| 2000/2000 [03:13<00:00, 10.32it/s]


Epoch 219 Mean Reward: -4.428306510925293


100%|██████████| 2000/2000 [03:12<00:00, 10.42it/s]


Epoch 220 Mean Reward: -2.475607650756836
Epoch 220 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 220 test with depth buffer:
Test Episode 1 Reward: 428.2397003173828
Test Episode 2 Reward: 63.87666320800781
Test Episode 3 Reward: 88.00544738769531
Test Episode 4 Reward: -90.70904541015625
Test Episode 5 Reward: 102.2950439453125
Test Episode 6 Reward: 428.2397003173828
Test Episode 7 Reward: 52.12669372558594
Test Episode 8 Reward: 428.2397003173828
Test Episode 9 Reward: 58.51191711425781
Test Episode 10 Reward: 428.2397003173828
Average Test Reward (with depth buffer:) 198.70655212402343
Epoch 220 test without depth buffer:
Test Episode 1 Reward: 376.86656188964844
Test Episode 2 Reward: 29.856292724609375
Test Episode 3 Reward: 210.71099853515625
Test Episode 4 Reward: 91.31448364257812
Test Episode 5 Reward: 72.14265441894531
Test Episode 6 Reward: 72.14265441894531
Test Episode 7 Reward: 82.625
Test Episode 8 Reward: 156.19418334960938
Test Episode 9 Reward: 399.2708892

100%|██████████| 2000/2000 [03:28<00:00,  9.59it/s]


Epoch 221 Mean Reward: 4.456618232727051


100%|██████████| 2000/2000 [03:56<00:00,  8.46it/s]


Epoch 222 Mean Reward: 3.371605537414551


100%|██████████| 2000/2000 [03:41<00:00,  9.04it/s]


Epoch 223 Mean Reward: 1.203969596862793


100%|██████████| 2000/2000 [03:39<00:00,  9.11it/s]


Epoch 224 Mean Reward: 2.228691635131836


100%|██████████| 2000/2000 [03:48<00:00,  8.77it/s]


Epoch 225 Mean Reward: 7.616601371765137


100%|██████████| 2000/2000 [03:42<00:00,  8.97it/s]


Epoch 226 Mean Reward: 4.095522018432617


100%|██████████| 2000/2000 [03:31<00:00,  9.47it/s]


Epoch 227 Mean Reward: 6.065513282775879


100%|██████████| 2000/2000 [03:42<00:00,  8.97it/s]


Epoch 228 Mean Reward: 2.359781898498535


100%|██████████| 2000/2000 [03:52<00:00,  8.59it/s]


Epoch 229 Mean Reward: 8.700476440429687


100%|██████████| 2000/2000 [03:25<00:00,  9.74it/s]


Epoch 230 Mean Reward: 7.076815147399902


100%|██████████| 2000/2000 [03:29<00:00,  9.53it/s]


Epoch 231 Mean Reward: 9.274790245056153


100%|██████████| 2000/2000 [03:34<00:00,  9.34it/s]


Epoch 232 Mean Reward: 8.233482688903809


100%|██████████| 2000/2000 [03:31<00:00,  9.44it/s]


Epoch 233 Mean Reward: 7.310339263916015


100%|██████████| 2000/2000 [03:35<00:00,  9.30it/s]


Epoch 234 Mean Reward: 16.1729102935791


100%|██████████| 2000/2000 [03:33<00:00,  9.36it/s]


Epoch 235 Mean Reward: 12.580828125


100%|██████████| 2000/2000 [03:24<00:00,  9.78it/s]


Epoch 236 Mean Reward: 11.427288681030273


100%|██████████| 2000/2000 [03:32<00:00,  9.42it/s]


Epoch 237 Mean Reward: 15.768045433044433


100%|██████████| 2000/2000 [03:50<00:00,  8.69it/s]


Epoch 238 Mean Reward: 17.325841979980467


100%|██████████| 2000/2000 [03:34<00:00,  9.32it/s]


Epoch 239 Mean Reward: 10.837749198913574


100%|██████████| 2000/2000 [03:33<00:00,  9.37it/s]


Epoch 240 Mean Reward: 21.427005157470703
Epoch 240 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 240 test with depth buffer:
Test Episode 1 Reward: 104.16316223144531
Test Episode 2 Reward: 98.75975036621094
Test Episode 3 Reward: 345.22401428222656
Test Episode 4 Reward: -1.5041351318359375
Test Episode 5 Reward: 233.2348175048828
Test Episode 6 Reward: 43.441650390625
Test Episode 7 Reward: 233.2348175048828
Test Episode 8 Reward: 233.2348175048828
Test Episode 9 Reward: 233.2348175048828
Test Episode 10 Reward: 60.591461181640625
Average Test Reward (with depth buffer:) 158.36151733398438
Epoch 240 test without depth buffer:
Test Episode 1 Reward: 76.30754089355469
Test Episode 2 Reward: 388.1917724609375
Test Episode 3 Reward: 132.97357177734375
Test Episode 4 Reward: 709.479248046875
Test Episode 5 Reward: 76.30754089355469
Test Episode 6 Reward: 62.181549072265625
Test Episode 7 Reward: 13.211532592773438
Test Episode 8 Reward: 76.30754089355469
Test Episode 9 Reward: 

100%|██████████| 2000/2000 [03:32<00:00,  9.40it/s]


Epoch 241 Mean Reward: 17.78023394012451


100%|██████████| 2000/2000 [03:32<00:00,  9.39it/s]


Epoch 242 Mean Reward: 15.876893882751466


100%|██████████| 2000/2000 [03:26<00:00,  9.67it/s]


Epoch 243 Mean Reward: 18.000329116821288


100%|██████████| 2000/2000 [03:21<00:00,  9.93it/s]


Epoch 244 Mean Reward: 18.793498046875


100%|██████████| 2000/2000 [03:21<00:00,  9.94it/s]


Epoch 245 Mean Reward: 17.077294006347657


100%|██████████| 2000/2000 [03:38<00:00,  9.14it/s]


Epoch 246 Mean Reward: 18.941045150756835


100%|██████████| 2000/2000 [03:26<00:00,  9.69it/s]


Epoch 247 Mean Reward: 22.10968797302246


100%|██████████| 2000/2000 [03:29<00:00,  9.55it/s]


Epoch 248 Mean Reward: 20.98971043395996


100%|██████████| 2000/2000 [03:23<00:00,  9.83it/s]


Epoch 249 Mean Reward: 19.385680267333985


100%|██████████| 2000/2000 [03:24<00:00,  9.80it/s]


Epoch 250 Mean Reward: 20.566187644958497


100%|██████████| 2000/2000 [03:29<00:00,  9.55it/s]


Epoch 251 Mean Reward: 21.64088335418701


100%|██████████| 2000/2000 [03:18<00:00, 10.07it/s]


Epoch 252 Mean Reward: 19.587110496520996


100%|██████████| 2000/2000 [03:25<00:00,  9.72it/s]


Epoch 253 Mean Reward: 21.433568710327147


100%|██████████| 2000/2000 [03:22<00:00,  9.88it/s]


Epoch 254 Mean Reward: 25.712979705810547


100%|██████████| 2000/2000 [03:35<00:00,  9.30it/s]


Epoch 255 Mean Reward: 23.0222225189209


100%|██████████| 2000/2000 [03:30<00:00,  9.52it/s]


Epoch 256 Mean Reward: 26.220800300598146


100%|██████████| 2000/2000 [03:22<00:00,  9.88it/s]


Epoch 257 Mean Reward: 23.80430339050293


100%|██████████| 2000/2000 [03:32<00:00,  9.40it/s]


Epoch 258 Mean Reward: 21.795226623535157


100%|██████████| 2000/2000 [03:48<00:00,  8.73it/s]


Epoch 259 Mean Reward: 25.559774436950683


100%|██████████| 2000/2000 [03:47<00:00,  8.79it/s]


Epoch 260 Mean Reward: 29.537395622253417
Epoch 260 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 260 test with depth buffer:
Test Episode 1 Reward: 80.87774658203125
Test Episode 2 Reward: 80.87774658203125
Test Episode 3 Reward: 245.14308166503906
Test Episode 4 Reward: 80.87774658203125
Test Episode 5 Reward: 239.70639038085938
Test Episode 6 Reward: 26.85833740234375
Test Episode 7 Reward: 80.87774658203125
Test Episode 8 Reward: 92.22035217285156
Test Episode 9 Reward: 80.87774658203125
Test Episode 10 Reward: 80.87774658203125
Average Test Reward (with depth buffer:) 108.91946411132812
Epoch 260 test without depth buffer:
Test Episode 1 Reward: 84.63902282714844
Test Episode 2 Reward: 121.61738586425781
Test Episode 3 Reward: -33.8350830078125
Test Episode 4 Reward: 84.63902282714844
Test Episode 5 Reward: -19.355438232421875
Test Episode 6 Reward: 84.63902282714844
Test Episode 7 Reward: 1.5228729248046875
Test Episode 8 Reward: 84.63902282714844
Test Episode 9 Reward:

100%|██████████| 2000/2000 [04:15<00:00,  7.82it/s]


Epoch 261 Mean Reward: 40.32209045410156


100%|██████████| 2000/2000 [04:44<00:00,  7.03it/s]


Epoch 262 Mean Reward: 42.24078400421143


100%|██████████| 2000/2000 [04:08<00:00,  8.04it/s]


Epoch 263 Mean Reward: 37.96009894561767


100%|██████████| 2000/2000 [03:56<00:00,  8.47it/s]


Epoch 264 Mean Reward: 43.47130576324463


100%|██████████| 2000/2000 [03:48<00:00,  8.77it/s]


Epoch 265 Mean Reward: 43.00994481658935


100%|██████████| 2000/2000 [03:50<00:00,  8.67it/s]


Epoch 266 Mean Reward: 40.19143669891358


100%|██████████| 2000/2000 [03:41<00:00,  9.01it/s]


Epoch 267 Mean Reward: 44.03171622467041


100%|██████████| 2000/2000 [03:33<00:00,  9.35it/s]


Epoch 268 Mean Reward: 45.31509564971924


100%|██████████| 2000/2000 [03:34<00:00,  9.30it/s]


Epoch 269 Mean Reward: 44.60329553222656


100%|██████████| 2000/2000 [04:01<00:00,  8.27it/s]


Epoch 270 Mean Reward: 46.01549824523926


100%|██████████| 2000/2000 [04:23<00:00,  7.58it/s]


Epoch 271 Mean Reward: 48.99216209411621


100%|██████████| 2000/2000 [04:32<00:00,  7.33it/s]


Epoch 272 Mean Reward: 49.65750865936279


100%|██████████| 2000/2000 [03:53<00:00,  8.57it/s]


Epoch 273 Mean Reward: 48.15708117675781


100%|██████████| 2000/2000 [04:28<00:00,  7.45it/s]


Epoch 274 Mean Reward: 56.39644205474853


100%|██████████| 2000/2000 [04:27<00:00,  7.49it/s]


Epoch 275 Mean Reward: 55.68580814361572


100%|██████████| 2000/2000 [04:36<00:00,  7.24it/s]


Epoch 276 Mean Reward: 52.30475664520264


100%|██████████| 2000/2000 [04:38<00:00,  7.17it/s]


Epoch 277 Mean Reward: 49.3870311126709


100%|██████████| 2000/2000 [04:20<00:00,  7.67it/s]


Epoch 278 Mean Reward: 52.02206255340576


100%|██████████| 2000/2000 [04:10<00:00,  7.97it/s]


Epoch 279 Mean Reward: 52.391371032714844


100%|██████████| 2000/2000 [04:21<00:00,  7.64it/s]


Epoch 280 Mean Reward: 55.833202033996585
Epoch 280 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 280 test with depth buffer:
Test Episode 1 Reward: 277.47984313964844
Test Episode 2 Reward: 83.65512084960938
Test Episode 3 Reward: 72.67413330078125
Test Episode 4 Reward: 83.65512084960938
Test Episode 5 Reward: 83.65512084960938
Test Episode 6 Reward: 26.652938842773438
Test Episode 7 Reward: 153.27210998535156
Test Episode 8 Reward: 83.65512084960938
Test Episode 9 Reward: 83.65512084960938
Test Episode 10 Reward: 378.97047424316406
Average Test Reward (with depth buffer:) 132.73251037597657
Epoch 280 test without depth buffer:
Test Episode 1 Reward: 2.67333984375
Test Episode 2 Reward: 31.24249267578125
Test Episode 3 Reward: 73.87840270996094
Test Episode 4 Reward: -2.59124755859375
Test Episode 5 Reward: 73.87840270996094
Test Episode 6 Reward: 6.381805419921875
Test Episode 7 Reward: -108.28924560546875
Test Episode 8 Reward: -20.260940551757812
Test Episode 9 Reward: -

100%|██████████| 2000/2000 [03:50<00:00,  8.69it/s]


Epoch 281 Mean Reward: 47.64074616241455


100%|██████████| 2000/2000 [03:56<00:00,  8.46it/s]


Epoch 282 Mean Reward: 49.160585708618164


100%|██████████| 2000/2000 [03:53<00:00,  8.58it/s]


Epoch 283 Mean Reward: 42.24325288391113


100%|██████████| 2000/2000 [03:31<00:00,  9.44it/s]


Epoch 284 Mean Reward: 47.04495363616943


100%|██████████| 2000/2000 [03:30<00:00,  9.51it/s]


Epoch 285 Mean Reward: 44.82258130645752


100%|██████████| 2000/2000 [03:35<00:00,  9.27it/s]


Epoch 286 Mean Reward: 47.64557189178467


100%|██████████| 2000/2000 [03:35<00:00,  9.29it/s]


Epoch 287 Mean Reward: 46.19556507873535


100%|██████████| 2000/2000 [03:47<00:00,  8.79it/s]


Epoch 288 Mean Reward: 53.30713958740235


100%|██████████| 2000/2000 [03:37<00:00,  9.19it/s]


Epoch 289 Mean Reward: 49.39231694793701


100%|██████████| 2000/2000 [03:36<00:00,  9.24it/s]


Epoch 290 Mean Reward: 47.94070113372803


100%|██████████| 2000/2000 [03:41<00:00,  9.05it/s]


Epoch 291 Mean Reward: 44.720292449951174


100%|██████████| 2000/2000 [03:35<00:00,  9.26it/s]


Epoch 292 Mean Reward: 47.14492527008056


100%|██████████| 2000/2000 [04:00<00:00,  8.33it/s]


Epoch 293 Mean Reward: 49.617118515014646


100%|██████████| 2000/2000 [03:38<00:00,  9.15it/s]


Epoch 294 Mean Reward: 57.6248235244751


100%|██████████| 2000/2000 [03:40<00:00,  9.08it/s]


Epoch 295 Mean Reward: 52.760722839355466


100%|██████████| 2000/2000 [03:35<00:00,  9.29it/s]


Epoch 296 Mean Reward: 51.413773727416995


100%|██████████| 2000/2000 [03:38<00:00,  9.14it/s]


Epoch 297 Mean Reward: 52.991032470703125


100%|██████████| 2000/2000 [03:34<00:00,  9.33it/s]


Epoch 298 Mean Reward: 44.92164047241211


100%|██████████| 2000/2000 [03:31<00:00,  9.48it/s]


Epoch 299 Mean Reward: 51.84340991210937


100%|██████████| 2000/2000 [03:26<00:00,  9.68it/s]


Epoch 300 Mean Reward: 56.16067129516602
Epoch 300 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 300 test with depth buffer:
Test Episode 1 Reward: 224.91285705566406
Test Episode 2 Reward: 224.91285705566406
Test Episode 3 Reward: 53.71064758300781
Test Episode 4 Reward: 51.72865295410156
Test Episode 5 Reward: 224.91285705566406
Test Episode 6 Reward: 74.72377014160156
Test Episode 7 Reward: 224.91285705566406
Test Episode 8 Reward: 224.91285705566406
Test Episode 9 Reward: 326.5917663574219
Test Episode 10 Reward: 123.62283325195312
Average Test Reward (with depth buffer:) 175.49419555664062
Epoch 300 test without depth buffer:
Test Episode 1 Reward: -115.97560119628906
Test Episode 2 Reward: -110.61453247070312
Test Episode 3 Reward: -110.61453247070312
Test Episode 4 Reward: -12.8076171875
Test Episode 5 Reward: -110.61453247070312
Test Episode 6 Reward: -110.61453247070312
Test Episode 7 Reward: -81.71260070800781
Test Episode 8 Reward: -110.61453247070312
Test Episode 

100%|██████████| 2000/2000 [04:14<00:00,  7.85it/s]


Epoch 301 Mean Reward: 85.22508257293701


100%|██████████| 2000/2000 [04:04<00:00,  8.19it/s]


Epoch 302 Mean Reward: 89.91992276000977


100%|██████████| 2000/2000 [04:13<00:00,  7.89it/s]


Epoch 303 Mean Reward: 89.83518335723878


100%|██████████| 2000/2000 [04:34<00:00,  7.30it/s]


Epoch 304 Mean Reward: 85.01217718505859


100%|██████████| 2000/2000 [04:57<00:00,  6.73it/s]


Epoch 305 Mean Reward: 90.67815763092041


100%|██████████| 2000/2000 [04:57<00:00,  6.73it/s]


Epoch 306 Mean Reward: 91.80811845397949


100%|██████████| 2000/2000 [04:34<00:00,  7.28it/s]


Epoch 307 Mean Reward: 88.42850523376465


100%|██████████| 2000/2000 [04:15<00:00,  7.82it/s]


Epoch 308 Mean Reward: 97.36176181793213


100%|██████████| 2000/2000 [04:23<00:00,  7.58it/s]


Epoch 309 Mean Reward: 101.77210119628906


100%|██████████| 2000/2000 [04:13<00:00,  7.89it/s]


Epoch 310 Mean Reward: 98.11013513183593


100%|██████████| 2000/2000 [04:18<00:00,  7.75it/s]


Epoch 311 Mean Reward: 104.6629192199707


100%|██████████| 2000/2000 [04:09<00:00,  8.01it/s]


Epoch 312 Mean Reward: 103.85390403747559


100%|██████████| 2000/2000 [04:17<00:00,  7.77it/s]


Epoch 313 Mean Reward: 100.738696144104


100%|██████████| 2000/2000 [04:14<00:00,  7.86it/s]


Epoch 314 Mean Reward: 101.1011960144043


100%|██████████| 2000/2000 [04:14<00:00,  7.85it/s]


Epoch 315 Mean Reward: 99.56591411590576


100%|██████████| 2000/2000 [04:12<00:00,  7.94it/s]


Epoch 316 Mean Reward: 96.1621996307373


100%|██████████| 2000/2000 [04:15<00:00,  7.82it/s]


Epoch 317 Mean Reward: 102.64564514160156


100%|██████████| 2000/2000 [04:26<00:00,  7.51it/s]


Epoch 318 Mean Reward: 101.92589575195312


100%|██████████| 2000/2000 [06:06<00:00,  5.46it/s]


Epoch 319 Mean Reward: 107.09043517303466


100%|██████████| 2000/2000 [05:56<00:00,  5.62it/s]


Epoch 320 Mean Reward: 116.6895436782837
Epoch 320 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 320 test with depth buffer:
Test Episode 1 Reward: 321.9363250732422
Test Episode 2 Reward: 399.62913513183594
Test Episode 3 Reward: 399.62913513183594
Test Episode 4 Reward: 399.62913513183594
Test Episode 5 Reward: 399.62913513183594
Test Episode 6 Reward: 399.62913513183594
Test Episode 7 Reward: 399.62913513183594
Test Episode 8 Reward: 399.62913513183594
Test Episode 9 Reward: 399.62913513183594
Test Episode 10 Reward: 386.8042907714844
Average Test Reward (with depth buffer:) 390.5773696899414
Epoch 320 test without depth buffer:
Test Episode 1 Reward: -115.9755859375
Test Episode 2 Reward: -115.9755859375
Test Episode 3 Reward: -39.439208984375
Test Episode 4 Reward: 102.70014953613281
Test Episode 5 Reward: -115.97560119628906
Test Episode 6 Reward: -115.9755859375
Test Episode 7 Reward: -115.97560119628906
Test Episode 8 Reward: 102.70014953613281
Test Episode 9 Reward: 

100%|██████████| 2000/2000 [05:01<00:00,  6.64it/s]


Epoch 321 Mean Reward: 108.97370596313476


100%|██████████| 2000/2000 [04:41<00:00,  7.11it/s]


Epoch 322 Mean Reward: 107.09613739776611


100%|██████████| 2000/2000 [05:17<00:00,  6.30it/s]


Epoch 323 Mean Reward: 103.7785200958252


100%|██████████| 2000/2000 [06:15<00:00,  5.33it/s]


Epoch 324 Mean Reward: 103.24172427368164


100%|██████████| 2000/2000 [06:12<00:00,  5.37it/s]


Epoch 325 Mean Reward: 101.6063316116333


100%|██████████| 2000/2000 [05:58<00:00,  5.59it/s]


Epoch 326 Mean Reward: 108.16939249420166


100%|██████████| 2000/2000 [04:49<00:00,  6.92it/s]


Epoch 327 Mean Reward: 110.08704844665527


100%|██████████| 2000/2000 [04:34<00:00,  7.28it/s]


Epoch 328 Mean Reward: 110.6302961807251


100%|██████████| 2000/2000 [04:52<00:00,  6.84it/s]


Epoch 329 Mean Reward: 117.59803810882569


100%|██████████| 2000/2000 [04:38<00:00,  7.19it/s]


Epoch 330 Mean Reward: 110.63538381195069


100%|██████████| 2000/2000 [06:14<00:00,  5.34it/s]


Epoch 331 Mean Reward: 114.7548909072876


100%|██████████| 2000/2000 [06:12<00:00,  5.36it/s]


Epoch 332 Mean Reward: 109.95171468353271


100%|██████████| 2000/2000 [06:10<00:00,  5.40it/s]


Epoch 333 Mean Reward: 117.62579669952393


100%|██████████| 2000/2000 [06:00<00:00,  5.55it/s]


Epoch 334 Mean Reward: 111.04845152282715


100%|██████████| 2000/2000 [06:07<00:00,  5.45it/s]


Epoch 335 Mean Reward: 107.04039499664307


100%|██████████| 2000/2000 [06:07<00:00,  5.44it/s]


Epoch 336 Mean Reward: 115.5618345413208


100%|██████████| 2000/2000 [06:07<00:00,  5.44it/s]


Epoch 337 Mean Reward: 125.75936149597167


100%|██████████| 2000/2000 [06:06<00:00,  5.46it/s]


Epoch 338 Mean Reward: 122.51615086364747


100%|██████████| 2000/2000 [06:04<00:00,  5.49it/s]


Epoch 339 Mean Reward: 116.31948334503174


100%|██████████| 2000/2000 [06:09<00:00,  5.41it/s]


Epoch 340 Mean Reward: 120.63451750946045
Epoch 340 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 340 test with depth buffer:
Test Episode 1 Reward: 76.88340759277344
Test Episode 2 Reward: 273.2422790527344
Test Episode 3 Reward: 76.88340759277344
Test Episode 4 Reward: 76.88340759277344
Test Episode 5 Reward: 76.88340759277344
Test Episode 6 Reward: 76.88340759277344
Test Episode 7 Reward: 346.3415985107422
Test Episode 8 Reward: 317.39739990234375
Test Episode 9 Reward: 51.41595458984375
Test Episode 10 Reward: 223.13323974609375
Average Test Reward (with depth buffer:) 159.5947509765625
Epoch 340 test without depth buffer:
Test Episode 1 Reward: -100.642578125
Test Episode 2 Reward: 5.2840728759765625
Test Episode 3 Reward: 16.301773071289062
Test Episode 4 Reward: 16.301773071289062
Test Episode 5 Reward: 52.63063049316406
Test Episode 6 Reward: 16.301773071289062
Test Episode 7 Reward: 16.301773071289062
Test Episode 8 Reward: -115.99931335449219
Test Episode 9 Reward: 

100%|██████████| 2000/2000 [06:18<00:00,  5.29it/s]


Epoch 341 Mean Reward: 121.82157682037354


100%|██████████| 2000/2000 [06:11<00:00,  5.38it/s]


Epoch 342 Mean Reward: 117.03504583740235


100%|██████████| 2000/2000 [06:22<00:00,  5.23it/s]


Epoch 343 Mean Reward: 127.57323470306396


100%|██████████| 2000/2000 [06:19<00:00,  5.27it/s]


Epoch 344 Mean Reward: 129.0434914855957


100%|██████████| 2000/2000 [06:11<00:00,  5.38it/s]


Epoch 345 Mean Reward: 127.27465502166748


100%|██████████| 2000/2000 [06:13<00:00,  5.36it/s]


Epoch 346 Mean Reward: 130.47539501953125


100%|██████████| 2000/2000 [06:16<00:00,  5.32it/s]


Epoch 347 Mean Reward: 133.0067321243286


100%|██████████| 2000/2000 [06:20<00:00,  5.25it/s]


Epoch 348 Mean Reward: 132.97488903808593


100%|██████████| 2000/2000 [06:17<00:00,  5.30it/s]


Epoch 349 Mean Reward: 132.9234998779297


100%|██████████| 2000/2000 [06:12<00:00,  5.37it/s]


Epoch 350 Mean Reward: 129.8633843612671


100%|██████████| 2000/2000 [05:32<00:00,  6.02it/s]


Epoch 351 Mean Reward: 132.50254399871827


100%|██████████| 2000/2000 [05:37<00:00,  5.93it/s]


Epoch 352 Mean Reward: 140.6840560836792


100%|██████████| 2000/2000 [05:53<00:00,  5.66it/s]


Epoch 353 Mean Reward: 137.48980919647218


100%|██████████| 2000/2000 [05:53<00:00,  5.65it/s]


Epoch 354 Mean Reward: 144.86560386657715


100%|██████████| 2000/2000 [05:42<00:00,  5.84it/s]


Epoch 355 Mean Reward: 139.40632384490968


100%|██████████| 2000/2000 [06:04<00:00,  5.49it/s]


Epoch 356 Mean Reward: 146.65833757019044


100%|██████████| 2000/2000 [06:22<00:00,  5.23it/s]


Epoch 357 Mean Reward: 139.37429696655275


100%|██████████| 2000/2000 [06:32<00:00,  5.10it/s]


Epoch 358 Mean Reward: 146.88651272583007


100%|██████████| 2000/2000 [06:13<00:00,  5.36it/s]


Epoch 359 Mean Reward: 146.88663528442382


100%|██████████| 2000/2000 [05:45<00:00,  5.79it/s]


Epoch 360 Mean Reward: 145.0196722793579
Epoch 360 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 360 test with depth buffer:
Test Episode 1 Reward: 71.83000183105469
Test Episode 2 Reward: 102.04827880859375
Test Episode 3 Reward: 128.0494842529297
Test Episode 4 Reward: 325.2063903808594
Test Episode 5 Reward: 117.03398132324219
Test Episode 6 Reward: 325.2063903808594
Test Episode 7 Reward: 325.2063903808594
Test Episode 8 Reward: 354.76438903808594
Test Episode 9 Reward: 93.57060241699219
Test Episode 10 Reward: 325.2063903808594
Average Test Reward (with depth buffer:) 216.8122299194336
Epoch 360 test without depth buffer:
Test Episode 1 Reward: -13.27783203125
Test Episode 2 Reward: -24.0748291015625
Test Episode 3 Reward: -114.1640625
Test Episode 4 Reward: -115.30491638183594
Test Episode 5 Reward: -115.30491638183594
Test Episode 6 Reward: -95.58580017089844
Test Episode 7 Reward: -115.30491638183594
Test Episode 8 Reward: 14.261795043945312
Test Episode 9 Reward: -64

100%|██████████| 2000/2000 [04:16<00:00,  7.81it/s]


Epoch 361 Mean Reward: 109.56300029754638


100%|██████████| 2000/2000 [04:27<00:00,  7.48it/s]


Epoch 362 Mean Reward: 105.61775039672851


100%|██████████| 2000/2000 [04:29<00:00,  7.42it/s]


Epoch 363 Mean Reward: 98.55517678833007


100%|██████████| 2000/2000 [04:46<00:00,  6.98it/s]


Epoch 364 Mean Reward: 103.51182569885253


100%|██████████| 2000/2000 [04:42<00:00,  7.08it/s]


Epoch 365 Mean Reward: 108.39324112701416


100%|██████████| 2000/2000 [04:49<00:00,  6.92it/s]


Epoch 366 Mean Reward: 106.41683373260499


100%|██████████| 2000/2000 [05:35<00:00,  5.96it/s]


Epoch 367 Mean Reward: 110.8763288269043


100%|██████████| 2000/2000 [05:16<00:00,  6.31it/s]


Epoch 368 Mean Reward: 106.19490810394286


100%|██████████| 2000/2000 [05:26<00:00,  6.13it/s]


Epoch 369 Mean Reward: 103.1904362411499


100%|██████████| 2000/2000 [05:35<00:00,  5.96it/s]


Epoch 370 Mean Reward: 102.036423828125


100%|██████████| 2000/2000 [05:40<00:00,  5.87it/s]


Epoch 371 Mean Reward: 106.64133271026611


100%|██████████| 2000/2000 [04:59<00:00,  6.67it/s]


Epoch 372 Mean Reward: 107.78072205352784


100%|██████████| 2000/2000 [03:57<00:00,  8.40it/s]


Epoch 373 Mean Reward: 103.30235385131836


100%|██████████| 2000/2000 [04:01<00:00,  8.27it/s]


Epoch 374 Mean Reward: 108.28353668212891


100%|██████████| 2000/2000 [04:12<00:00,  7.93it/s]


Epoch 375 Mean Reward: 103.54664724731445


100%|██████████| 2000/2000 [04:16<00:00,  7.79it/s]


Epoch 376 Mean Reward: 101.87326228332519


100%|██████████| 2000/2000 [04:24<00:00,  7.56it/s]


Epoch 377 Mean Reward: 102.39781853485107


100%|██████████| 2000/2000 [04:09<00:00,  8.02it/s]


Epoch 378 Mean Reward: 108.12974435424805


100%|██████████| 2000/2000 [03:59<00:00,  8.35it/s]


Epoch 379 Mean Reward: 105.2824100265503


100%|██████████| 2000/2000 [04:10<00:00,  7.98it/s]


Epoch 380 Mean Reward: 109.59372408294678
Epoch 380 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 380 test with depth buffer:
Test Episode 1 Reward: 330.9148712158203
Test Episode 2 Reward: 61.7451171875
Test Episode 3 Reward: 330.9148712158203
Test Episode 4 Reward: 62.9080810546875
Test Episode 5 Reward: 330.9148712158203
Test Episode 6 Reward: 330.9148712158203
Test Episode 7 Reward: 109.94676208496094
Test Episode 8 Reward: 330.9148712158203
Test Episode 9 Reward: 6.147186279296875
Test Episode 10 Reward: 330.9148712158203
Average Test Reward (with depth buffer:) 222.6236373901367
Epoch 380 test without depth buffer:
Test Episode 1 Reward: 14.014892578125
Test Episode 2 Reward: 14.014892578125
Test Episode 3 Reward: -74.21896362304688
Test Episode 4 Reward: -112.08851623535156
Test Episode 5 Reward: 14.014892578125
Test Episode 6 Reward: -115.97560119628906
Test Episode 7 Reward: -114.44065856933594
Test Episode 8 Reward: 14.014892578125
Test Episode 9 Reward: -39.3201599

100%|██████████| 2000/2000 [04:16<00:00,  7.79it/s]


Epoch 381 Mean Reward: 120.81322468566894


100%|██████████| 2000/2000 [04:16<00:00,  7.80it/s]


Epoch 382 Mean Reward: 126.10660121154785


100%|██████████| 2000/2000 [04:14<00:00,  7.85it/s]


Epoch 383 Mean Reward: 119.3448646697998


100%|██████████| 2000/2000 [03:54<00:00,  8.53it/s]


Epoch 384 Mean Reward: 120.66359126281738


100%|██████████| 2000/2000 [03:56<00:00,  8.45it/s]


Epoch 385 Mean Reward: 120.78988246917724


100%|██████████| 2000/2000 [03:56<00:00,  8.46it/s]


Epoch 386 Mean Reward: 127.29492971038819


100%|██████████| 2000/2000 [03:58<00:00,  8.38it/s]


Epoch 387 Mean Reward: 132.46432695007323


100%|██████████| 2000/2000 [03:56<00:00,  8.47it/s]


Epoch 388 Mean Reward: 125.82857310485839


100%|██████████| 2000/2000 [03:36<00:00,  9.23it/s]


Epoch 389 Mean Reward: 131.04165710449217


100%|██████████| 2000/2000 [03:21<00:00,  9.92it/s]


Epoch 390 Mean Reward: 130.30966762542724


100%|██████████| 2000/2000 [03:26<00:00,  9.67it/s]


Epoch 391 Mean Reward: 125.22501863861083


100%|██████████| 2000/2000 [03:24<00:00,  9.80it/s]


Epoch 392 Mean Reward: 114.21752848052978


100%|██████████| 2000/2000 [03:27<00:00,  9.62it/s]


Epoch 393 Mean Reward: 109.00232424926757


100%|██████████| 2000/2000 [03:18<00:00, 10.09it/s]


Epoch 394 Mean Reward: 121.11483530426025


100%|██████████| 2000/2000 [03:25<00:00,  9.71it/s]


Epoch 395 Mean Reward: 112.73470995330811


100%|██████████| 2000/2000 [03:15<00:00, 10.21it/s]


Epoch 396 Mean Reward: 118.88951300811767


100%|██████████| 2000/2000 [03:13<00:00, 10.33it/s]


Epoch 397 Mean Reward: 118.64820202636719


100%|██████████| 2000/2000 [03:12<00:00, 10.40it/s]


Epoch 398 Mean Reward: 118.52625988006592


100%|██████████| 2000/2000 [03:16<00:00, 10.19it/s]


Epoch 399 Mean Reward: 118.52555654144287


100%|██████████| 2000/2000 [03:34<00:00,  9.34it/s]


Epoch 400 Mean Reward: 120.67630950164795
Epoch 400 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 400 test with depth buffer:
Test Episode 1 Reward: 230.4676971435547
Test Episode 2 Reward: 39.101470947265625
Test Episode 3 Reward: 230.4676971435547
Test Episode 4 Reward: 78.0565185546875
Test Episode 5 Reward: 48.535675048828125
Test Episode 6 Reward: 190.71868896484375
Test Episode 7 Reward: 230.4676971435547
Test Episode 8 Reward: 184.17462158203125
Test Episode 9 Reward: 39.37127685546875
Test Episode 10 Reward: 315.62408447265625
Average Test Reward (with depth buffer:) 158.69854278564452
Epoch 400 test without depth buffer:
Test Episode 1 Reward: 146.37899780273438
Test Episode 2 Reward: 146.37899780273438
Test Episode 3 Reward: 146.37899780273438
Test Episode 4 Reward: 146.37899780273438
Test Episode 5 Reward: 146.37899780273438
Test Episode 6 Reward: 146.37899780273438
Test Episode 7 Reward: -49.23741149902344
Test Episode 8 Reward: 15.106475830078125
Test Episode 9 R

100%|██████████| 2000/2000 [03:45<00:00,  8.86it/s]


Epoch 401 Mean Reward: 188.98291171264648


100%|██████████| 2000/2000 [04:08<00:00,  8.04it/s]


Epoch 402 Mean Reward: 202.72968379974364


100%|██████████| 2000/2000 [04:28<00:00,  7.44it/s]


Epoch 403 Mean Reward: 197.3552294692993


100%|██████████| 2000/2000 [04:26<00:00,  7.51it/s]


Epoch 404 Mean Reward: 201.17459451293945


100%|██████████| 2000/2000 [04:22<00:00,  7.63it/s]


Epoch 405 Mean Reward: 157.80392762756347


100%|██████████| 2000/2000 [04:25<00:00,  7.53it/s]


Epoch 406 Mean Reward: 132.69338326263428


100%|██████████| 2000/2000 [03:55<00:00,  8.48it/s]


Epoch 407 Mean Reward: 126.3586493988037


100%|██████████| 2000/2000 [04:01<00:00,  8.27it/s]


Epoch 408 Mean Reward: 131.62144577026368


100%|██████████| 2000/2000 [04:27<00:00,  7.48it/s]


Epoch 409 Mean Reward: 134.39294246673583


100%|██████████| 2000/2000 [04:40<00:00,  7.13it/s]


Epoch 410 Mean Reward: 136.10956282043458


100%|██████████| 2000/2000 [04:39<00:00,  7.15it/s]


Epoch 411 Mean Reward: 138.08599674224854


100%|██████████| 2000/2000 [04:42<00:00,  7.08it/s]


Epoch 412 Mean Reward: 122.72485120391846


100%|██████████| 2000/2000 [04:28<00:00,  7.44it/s]


Epoch 413 Mean Reward: 131.7107396774292


100%|██████████| 2000/2000 [03:52<00:00,  8.60it/s]


Epoch 414 Mean Reward: 128.9006145248413


100%|██████████| 2000/2000 [03:59<00:00,  8.34it/s]


Epoch 415 Mean Reward: 126.8234570236206


100%|██████████| 2000/2000 [04:09<00:00,  8.02it/s]


Epoch 416 Mean Reward: 127.81184666442871


100%|██████████| 2000/2000 [04:16<00:00,  7.78it/s]


Epoch 417 Mean Reward: 126.06398658752441


100%|██████████| 2000/2000 [04:28<00:00,  7.44it/s]


Epoch 418 Mean Reward: 126.94542720794678


100%|██████████| 2000/2000 [04:47<00:00,  6.95it/s]


Epoch 419 Mean Reward: 133.0962554550171


100%|██████████| 2000/2000 [04:35<00:00,  7.26it/s]


Epoch 420 Mean Reward: 129.57971974945067
Epoch 420 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 420 test with depth buffer:
Test Episode 1 Reward: 220.99729919433594
Test Episode 2 Reward: 57.461700439453125
Test Episode 3 Reward: 59.137725830078125
Test Episode 4 Reward: 220.99729919433594
Test Episode 5 Reward: 220.99729919433594
Test Episode 6 Reward: 220.99729919433594
Test Episode 7 Reward: 220.99729919433594
Test Episode 8 Reward: 291.7442169189453
Test Episode 9 Reward: 220.99729919433594
Test Episode 10 Reward: 220.99729919433594
Average Test Reward (with depth buffer:) 195.5324737548828
Epoch 420 test without depth buffer:
Test Episode 1 Reward: 71.25238037109375
Test Episode 2 Reward: 71.25238037109375
Test Episode 3 Reward: 71.25238037109375
Test Episode 4 Reward: 71.25238037109375
Test Episode 5 Reward: -36.62825012207031
Test Episode 6 Reward: 71.25238037109375
Test Episode 7 Reward: -96.30059814453125
Test Episode 8 Reward: 71.25238037109375
Test Episode 9 Rew

100%|██████████| 2000/2000 [04:33<00:00,  7.31it/s]


Epoch 421 Mean Reward: 140.43998348236084


100%|██████████| 2000/2000 [04:12<00:00,  7.91it/s]


Epoch 422 Mean Reward: 146.95059426116944


100%|██████████| 2000/2000 [04:07<00:00,  8.08it/s]


Epoch 423 Mean Reward: 144.63135276794435


100%|██████████| 2000/2000 [04:15<00:00,  7.84it/s]


Epoch 424 Mean Reward: 141.27029625701906


100%|██████████| 2000/2000 [04:35<00:00,  7.26it/s]


Epoch 425 Mean Reward: 142.61892018890381


100%|██████████| 2000/2000 [04:18<00:00,  7.74it/s]


Epoch 426 Mean Reward: 137.47589066314697


100%|██████████| 2000/2000 [04:10<00:00,  7.97it/s]


Epoch 427 Mean Reward: 138.76637197875976


100%|██████████| 2000/2000 [04:12<00:00,  7.92it/s]


Epoch 428 Mean Reward: 142.60559299468994


100%|██████████| 2000/2000 [03:55<00:00,  8.48it/s]


Epoch 429 Mean Reward: 151.89183979797363


100%|██████████| 2000/2000 [04:17<00:00,  7.77it/s]


Epoch 430 Mean Reward: 138.82693186187745


100%|██████████| 2000/2000 [04:12<00:00,  7.92it/s]


Epoch 431 Mean Reward: 137.38811978149414


100%|██████████| 2000/2000 [04:10<00:00,  7.98it/s]


Epoch 432 Mean Reward: 140.45441792297362


100%|██████████| 2000/2000 [04:23<00:00,  7.60it/s]


Epoch 433 Mean Reward: 139.46878047180175


100%|██████████| 2000/2000 [04:09<00:00,  8.02it/s]


Epoch 434 Mean Reward: 139.25511241149903


100%|██████████| 2000/2000 [03:31<00:00,  9.44it/s]


Epoch 435 Mean Reward: 141.54225049591065


100%|██████████| 2000/2000 [03:27<00:00,  9.64it/s]


Epoch 436 Mean Reward: 138.72349869537354


100%|██████████| 2000/2000 [03:39<00:00,  9.10it/s]


Epoch 437 Mean Reward: 150.8924453277588


100%|██████████| 2000/2000 [03:40<00:00,  9.06it/s]


Epoch 438 Mean Reward: 141.02788063812255


100%|██████████| 2000/2000 [03:32<00:00,  9.42it/s]


Epoch 439 Mean Reward: 144.19415236663818


100%|██████████| 2000/2000 [03:35<00:00,  9.29it/s]


Epoch 440 Mean Reward: 150.2662470932007
Epoch 440 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 440 test with depth buffer:
Test Episode 1 Reward: 392.36163330078125
Test Episode 2 Reward: 28.48651123046875
Test Episode 3 Reward: 330.3659973144531
Test Episode 4 Reward: 146.8408966064453
Test Episode 5 Reward: 10.127700805664062
Test Episode 6 Reward: 199.5989990234375
Test Episode 7 Reward: 785.5270233154297
Test Episode 8 Reward: 146.8408966064453
Test Episode 9 Reward: 146.8408966064453
Test Episode 10 Reward: 146.8408966064453
Average Test Reward (with depth buffer:) 233.38314514160157
Epoch 440 test without depth buffer:
Test Episode 1 Reward: 167.5055694580078
Test Episode 2 Reward: -115.97560119628906
Test Episode 3 Reward: -48.33135986328125
Test Episode 4 Reward: -26.143234252929688
Test Episode 5 Reward: 167.5055694580078
Test Episode 6 Reward: -106.46983337402344
Test Episode 7 Reward: 28.748260498046875
Test Episode 8 Reward: -57.38578796386719
Test Episode 9 Rew

100%|██████████| 2000/2000 [03:38<00:00,  9.13it/s]


Epoch 441 Mean Reward: 171.47490045166015


100%|██████████| 2000/2000 [03:47<00:00,  8.80it/s]


Epoch 442 Mean Reward: 158.53546378326416


100%|██████████| 2000/2000 [03:43<00:00,  8.94it/s]


Epoch 443 Mean Reward: 162.76526387786865


100%|██████████| 2000/2000 [03:56<00:00,  8.44it/s]


Epoch 444 Mean Reward: 154.4415418395996


100%|██████████| 2000/2000 [03:42<00:00,  8.98it/s]


Epoch 445 Mean Reward: 160.64886058807372


100%|██████████| 2000/2000 [03:33<00:00,  9.36it/s]


Epoch 446 Mean Reward: 160.99341397094727


100%|██████████| 2000/2000 [03:50<00:00,  8.67it/s]


Epoch 447 Mean Reward: 158.38548892211915


100%|██████████| 2000/2000 [03:42<00:00,  9.00it/s]


Epoch 448 Mean Reward: 157.5506347961426


100%|██████████| 2000/2000 [03:44<00:00,  8.92it/s]


Epoch 449 Mean Reward: 161.23064198303223


100%|██████████| 2000/2000 [03:45<00:00,  8.88it/s]


Epoch 450 Mean Reward: 166.37650592803956


100%|██████████| 2000/2000 [03:42<00:00,  8.99it/s]


Epoch 451 Mean Reward: 163.4976171722412


100%|██████████| 2000/2000 [03:41<00:00,  9.01it/s]


Epoch 452 Mean Reward: 163.54146050262452


100%|██████████| 2000/2000 [04:00<00:00,  8.31it/s]


Epoch 453 Mean Reward: 165.15505577087401


100%|██████████| 2000/2000 [05:06<00:00,  6.53it/s]


Epoch 454 Mean Reward: 162.78206597137452


100%|██████████| 2000/2000 [05:08<00:00,  6.48it/s]


Epoch 455 Mean Reward: 163.83826733398436


100%|██████████| 2000/2000 [04:58<00:00,  6.69it/s]


Epoch 456 Mean Reward: 158.24405393218993


100%|██████████| 2000/2000 [05:10<00:00,  6.43it/s]


Epoch 457 Mean Reward: 167.39129109191896


100%|██████████| 2000/2000 [05:39<00:00,  5.88it/s]


Epoch 458 Mean Reward: 158.04620252990722


100%|██████████| 2000/2000 [04:54<00:00,  6.79it/s]


Epoch 459 Mean Reward: 158.1230592803955


100%|██████████| 2000/2000 [05:21<00:00,  6.23it/s]


Epoch 460 Mean Reward: 152.06195067596437
Epoch 460 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 460 test with depth buffer:
Test Episode 1 Reward: 204.6424102783203
Test Episode 2 Reward: 233.96188354492188
Test Episode 3 Reward: 204.6424102783203
Test Episode 4 Reward: 204.6424102783203
Test Episode 5 Reward: 86.57640075683594
Test Episode 6 Reward: 526.4897613525391
Test Episode 7 Reward: 325.3266143798828
Test Episode 8 Reward: 204.6424102783203
Test Episode 9 Reward: -3.985931396484375
Test Episode 10 Reward: 197.0859375
Average Test Reward (with depth buffer:) 218.40243072509764
Epoch 460 test without depth buffer:
Test Episode 1 Reward: -5.24322509765625
Test Episode 2 Reward: 60.21067810058594
Test Episode 3 Reward: 60.21067810058594
Test Episode 4 Reward: 60.21067810058594
Test Episode 5 Reward: -28.995864868164062
Test Episode 6 Reward: -12.904525756835938
Test Episode 7 Reward: 82.95559692382812
Test Episode 8 Reward: -29.127655029296875
Test Episode 9 Reward: 60.

100%|██████████| 2000/2000 [06:00<00:00,  5.55it/s]


Epoch 461 Mean Reward: 181.3991336517334


100%|██████████| 2000/2000 [05:36<00:00,  5.95it/s]


Epoch 462 Mean Reward: 180.75693808746337


100%|██████████| 2000/2000 [04:58<00:00,  6.71it/s]


Epoch 463 Mean Reward: 205.15494017791747


100%|██████████| 2000/2000 [06:31<00:00,  5.11it/s]


Epoch 464 Mean Reward: 205.40297608184815


100%|██████████| 2000/2000 [06:40<00:00,  5.00it/s]


Epoch 465 Mean Reward: 206.04073625946046


100%|██████████| 2000/2000 [06:32<00:00,  5.09it/s]


Epoch 466 Mean Reward: 203.50895612335205


100%|██████████| 2000/2000 [06:37<00:00,  5.03it/s]


Epoch 467 Mean Reward: 194.5121971282959


100%|██████████| 2000/2000 [07:00<00:00,  4.75it/s]


Epoch 468 Mean Reward: 201.23681468200684


100%|██████████| 2000/2000 [06:54<00:00,  4.83it/s]


Epoch 469 Mean Reward: 199.1051918029785


100%|██████████| 2000/2000 [06:17<00:00,  5.30it/s]


Epoch 470 Mean Reward: 194.59912390899657


100%|██████████| 2000/2000 [06:56<00:00,  4.81it/s]


Epoch 471 Mean Reward: 201.1266866531372


100%|██████████| 2000/2000 [06:51<00:00,  4.86it/s]


Epoch 472 Mean Reward: 193.89519258117676


100%|██████████| 2000/2000 [06:57<00:00,  4.79it/s]


Epoch 473 Mean Reward: 194.06575832366943


100%|██████████| 2000/2000 [06:20<00:00,  5.26it/s]


Epoch 474 Mean Reward: 188.4479384765625


100%|██████████| 2000/2000 [06:24<00:00,  5.21it/s]


Epoch 475 Mean Reward: 189.58665503692626


100%|██████████| 2000/2000 [06:54<00:00,  4.83it/s]


Epoch 476 Mean Reward: 193.712243019104


100%|██████████| 2000/2000 [06:30<00:00,  5.12it/s]


Epoch 477 Mean Reward: 186.17045600891115


100%|██████████| 2000/2000 [04:46<00:00,  6.99it/s]


Epoch 478 Mean Reward: 185.4019094696045


100%|██████████| 2000/2000 [06:45<00:00,  4.93it/s]


Epoch 479 Mean Reward: 186.25963897705077


100%|██████████| 2000/2000 [06:47<00:00,  4.90it/s]


Epoch 480 Mean Reward: 182.33358988189698
Epoch 480 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 480 test with depth buffer:
Test Episode 1 Reward: 266.4479522705078
Test Episode 2 Reward: 63.249053955078125
Test Episode 3 Reward: -3.0710906982421875
Test Episode 4 Reward: 177.27801513671875
Test Episode 5 Reward: 63.249053955078125
Test Episode 6 Reward: 183.06358337402344
Test Episode 7 Reward: 49.545989990234375
Test Episode 8 Reward: 63.249053955078125
Test Episode 9 Reward: 63.249053955078125
Test Episode 10 Reward: 25.243087768554688
Average Test Reward (with depth buffer:) 95.15037536621094
Epoch 480 test without depth buffer:
Test Episode 1 Reward: -24.21405029296875
Test Episode 2 Reward: 16.279830932617188
Test Episode 3 Reward: -15.401626586914062
Test Episode 4 Reward: -49.52919006347656
Test Episode 5 Reward: -111.78193664550781
Test Episode 6 Reward: -15.401626586914062
Test Episode 7 Reward: -15.401626586914062
Test Episode 8 Reward: -20.82281494140625
Test Ep

100%|██████████| 2000/2000 [06:26<00:00,  5.17it/s]


Epoch 481 Mean Reward: 234.34987270355225


100%|██████████| 2000/2000 [05:10<00:00,  6.44it/s]


Epoch 482 Mean Reward: 197.21194980621337


100%|██████████| 2000/2000 [04:07<00:00,  8.09it/s]


Epoch 483 Mean Reward: 202.54789405822754


100%|██████████| 2000/2000 [03:53<00:00,  8.58it/s]


Epoch 484 Mean Reward: 200.10680795288087


100%|██████████| 2000/2000 [03:59<00:00,  8.35it/s]


Epoch 485 Mean Reward: 199.02578115844727


100%|██████████| 2000/2000 [03:47<00:00,  8.78it/s]


Epoch 486 Mean Reward: 200.03372108459473


100%|██████████| 2000/2000 [04:14<00:00,  7.87it/s]


Epoch 487 Mean Reward: 198.56029455566406


100%|██████████| 2000/2000 [04:20<00:00,  7.68it/s]


Epoch 488 Mean Reward: 206.2145910949707


100%|██████████| 2000/2000 [04:20<00:00,  7.68it/s]


Epoch 489 Mean Reward: 201.79913710784913


100%|██████████| 2000/2000 [04:19<00:00,  7.70it/s]


Epoch 490 Mean Reward: 194.46882694244385


100%|██████████| 2000/2000 [04:19<00:00,  7.69it/s]


Epoch 491 Mean Reward: 185.80819874572754


100%|██████████| 2000/2000 [04:13<00:00,  7.90it/s]


Epoch 492 Mean Reward: 182.13214486694335


100%|██████████| 2000/2000 [04:17<00:00,  7.76it/s]


Epoch 493 Mean Reward: 185.65811697387696


100%|██████████| 2000/2000 [04:12<00:00,  7.93it/s]


Epoch 494 Mean Reward: 179.9872971420288


100%|██████████| 2000/2000 [03:58<00:00,  8.40it/s]


Epoch 495 Mean Reward: 186.2368005142212


100%|██████████| 2000/2000 [03:55<00:00,  8.49it/s]


Epoch 496 Mean Reward: 187.88341676330566


100%|██████████| 2000/2000 [04:03<00:00,  8.21it/s]


Epoch 497 Mean Reward: 187.83116569519044


100%|██████████| 2000/2000 [04:02<00:00,  8.25it/s]


Epoch 498 Mean Reward: 187.10977541351318


100%|██████████| 2000/2000 [03:45<00:00,  8.88it/s]


Epoch 499 Mean Reward: 188.77832635498046


100%|██████████| 2000/2000 [03:48<00:00,  8.76it/s]


Epoch 500 Mean Reward: 185.93992027282715
Epoch 500 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 500 test with depth buffer:
Test Episode 1 Reward: 186.5520477294922
Test Episode 2 Reward: 186.5520477294922
Test Episode 3 Reward: 21.284866333007812
Test Episode 4 Reward: 186.5520477294922
Test Episode 5 Reward: 74.497314453125
Test Episode 6 Reward: 176.04962158203125
Test Episode 7 Reward: 271.9722595214844
Test Episode 8 Reward: -49.95011901855469
Test Episode 9 Reward: -22.617034912109375
Test Episode 10 Reward: 4.05523681640625
Average Test Reward (with depth buffer:) 103.49482879638671
Epoch 500 test without depth buffer:
Test Episode 1 Reward: -63.79725646972656
Test Episode 2 Reward: -15.32965087890625
Test Episode 3 Reward: -94.22477722167969
Test Episode 4 Reward: -24.660858154296875
Test Episode 5 Reward: -24.660858154296875
Test Episode 6 Reward: -24.660858154296875
Test Episode 7 Reward: -98.6231689453125
Test Episode 8 Reward: -44.21051025390625
Test Episode 9 R

In [7]:
#Get a list of checkpoints saved during training

ckpts = tf.train.get_checkpoint_state('checkpoints').all_model_checkpoint_paths

#Test the trained model from a certain checkpoint by only choosing actions with a greedy strategy

test_reward = test_agent(DQN, num_episodes=20,
                         training=False,
                         load_model=True,
                         depth=True,
                         model_dir=ckpts[-1])
print('Average Test Reward (with depth buffer):', test_reward)

test_reward = test_agent(DQN, num_episodes=20,
                         training=False,
                         load_model=True,
                         depth=False,
                         model_dir=ckpts[-1])
print('Average Test Reward (without depth buffer):', test_reward)


Loading model from checkpoints\deadly_corridor.ckpt-500
INFO:tensorflow:Restoring parameters from checkpoints\deadly_corridor.ckpt-500
Test Episode 1 Reward: 230.37501525878906
Test Episode 2 Reward: -24.505050659179688
Test Episode 3 Reward: 172.47891235351562
Test Episode 4 Reward: 22.823379516601562
Test Episode 5 Reward: 15.305938720703125
Test Episode 6 Reward: 172.47891235351562
Test Episode 7 Reward: 385.2270050048828
Test Episode 8 Reward: 172.47891235351562
Test Episode 9 Reward: 172.47891235351562
Test Episode 10 Reward: 172.47891235351562
Test Episode 11 Reward: -5.651702880859375
Test Episode 12 Reward: 133.2265625
Test Episode 13 Reward: 172.47891235351562
Test Episode 14 Reward: 32.4940185546875
Test Episode 15 Reward: 172.47891235351562
Test Episode 16 Reward: 172.47891235351562
Test Episode 17 Reward: 172.47891235351562
Test Episode 18 Reward: -31.488800048828125
Test Episode 19 Reward: 172.47891235351562
Test Episode 20 Reward: 24.7908935546875
Average Test Reward (wit