In [1]:
import importlib.util
import time

import tensorflow as tf
import numpy as np

from skimage.transform import rescale
from tqdm import trange
from IPython.display import HTML

#Import the vizdoom package as "vd" since it can't be installed normally on Windows

vd_location = 'C:/Anaconda3/envs/doom/Lib/site-packages/vizdoom/vizdoom.pyd'
vizdoom = importlib.util.spec_from_file_location('vizdoom',
                                                 vd_location)
vd = importlib.util.module_from_spec(vizdoom)
vizdoom.loader.exec_module(vd)


In [2]:
#Specify the game scenario and the screen format/resolution

game = vd.DoomGame()
game.set_screen_format(vd.ScreenFormat.GRAY8)
game.set_depth_buffer_enabled(True)
game.set_screen_resolution(vd.ScreenResolution.RES_160X120)
game.load_config('deadly_corridor.cfg')

down_sample_ratio = 1
width = int(game.get_screen_width()*down_sample_ratio)
height = int(game.get_screen_height()*down_sample_ratio)
channels = game.get_screen_channels() + int(game.is_depth_buffer_enabled())

#Specify the available actions in the scenario

available_actions = game.get_available_buttons()
actions = [list(ohe) for ohe in list(np.identity(len(available_actions)))]
num_actions = len(available_actions)

#Specify the Q-network learning parameters

frame_delay = 12
buffer_size = 75000
epochs = 400
steps_per_epoch = 2000
learning_rate = 0.0025
gamma = 0
start_epsilon = 1.0
end_epsilon = 0.1
batch_size = 100
load_model = False
save_model = True
model_dir = './checkpoints/deadly_corridor.ckpt'


In [3]:
#Create a buffer object that holds a set of training experiences (state-action-reward tuples)

class Buffer():
    def __init__(self, size=1000):
        self.buffer = list()
        self.length = len(self.buffer)
        self.size = size
        
#Add a new experience to the buffer (remove the oldest experience if the buffer is already full)
        
    def add_experience(self, experience):
        if self.length + 1 >= self.size:
            self.buffer[0:(self.length + 1) - self.size] = []
        
        self.buffer.append(experience)
        self.length = len(self.buffer)
            
#Return a batch of experience arrays randomly sampled from the buffer
            
    def sample_buffer(self, sample_size):
        sample = np.random.randint(self.length, size=sample_size)
        s1 = np.concatenate([self.buffer[idx][0] for idx in sample], axis=0)
        a = np.array([self.buffer[idx][1] for idx in sample])
        r = np.array([self.buffer[idx][2] for idx in sample])
        s2 = np.concatenate([self.buffer[idx][3] for idx in sample], axis=0)
        terminal = np.array([self.buffer[idx][4] for idx in sample], dtype=np.int32)
        
        return s1, a, r, s2, terminal

#Downsample and normalize an image array representing the game state at a given time stamp

def preprocess(image, down_sample_ratio=1):
    if float(down_sample_ratio) != 1.0:
        image = rescale(image=image, scale=down_sample_ratio, mode='reflect')
    image = image.astype(np.float32)
    image = np.expand_dims(image, axis=0)

    return image

#Test the agent using a currently training or previously trained model

def test_agent(model, num_episodes, load_model, depth, training=True, session=None, model_dir=None):
    if load_model == True:
        sess = tf.Session()
        print('Loading model from', model_dir)
        tf.train.Saver().restore(sess, model_dir)
        
#Require an existing session if a pretrained model isn't provided
        
    elif load_model == False:
        sess = session

    game.set_sound_enabled(True)
    episode_rewards = list()
    
#Avoid reinitializing the game if this was already done by the training process
    
    if training == False:
        game.init()

    for i in range(num_episodes):
        game.new_episode()
    
        while not game.is_episode_finished():
            state = game.get_state()
            
            if depth == False:
                depth_buffer = np.zeros(state.screen_buffer.shape)
            else:
                depth_buffer = np.expand_dims(state.depth_buffer, axis=2)
                
            state_buffer = np.stack((state.screen_buffer,
                                     state.depth_buffer), axis=-1)
            state1 = preprocess(state_buffer, down_sample_ratio)
            action = model.choose_action(sess, state1)[0]
            reward = game.make_action(actions[action])
            
#Add a delay between each time step so that the episodes occur at normal speed

            time.sleep(0.02)
        
        episode_rewards.append(game.get_total_reward())
        print('Test Episode {} Reward: {}'.format(i + 1, game.get_total_reward()))
        time.sleep(1)
    
#Avoid ending the game so that the training process can continue
    
    if training == False:
        game.close()
    
    return np.mean(episode_rewards)


In [4]:
#Create a Q-network to estimate values and choose actions for a given state

class Q_network():
    def __init__(self, network_name, height, width, channels, learning_rate=0.001):
        self.learning_rate = learning_rate
        self.s_t = tf.placeholder(tf.float32,
                                  shape=[None, height, width, channels],
                                  name=network_name + '_state'
                                 )
        self.a_t = tf.placeholder(tf.int32,
                                  shape=[None],
                                  name=network_name + '_action'
                                 )
        self.Q_target = tf.placeholder(tf.float32,
                                       shape=[None, num_actions],
                                       name=network_name + '_Q_target'
                                      )

        self.input_layer = tf.reshape(self.s_t,
                                      [-1, height, width, channels],
                                      name=network_name + '_input_layer'
                                     )
        self.conv1 = tf.layers.conv2d(inputs=self.input_layer,
                                      filters=32,
                                      kernel_size=[8, 8],
                                      strides=[4, 4],
                                      padding='valid',
                                      activation=tf.nn.relu,
                                      name=network_name + '_conv1_layer'
                                     )
        self.conv2 = tf.layers.conv2d(inputs=self.conv1,
                                      filters=64,
                                      kernel_size=[4, 4],
                                      strides=[2, 2],
                                      padding='valid',
                                      activation=tf.nn.relu,
                                      name=network_name + '_conv2_layer'
                                     )
        self.flatten = tf.layers.flatten(self.conv2,
                                         name=network_name + '_flatten'
                                        )
        self.dense = tf.layers.dense(inputs=self.flatten,
                                      units=512,
                                      activation=tf.nn.relu,
                                      name=network_name + '_dense1_layer'
                                    )
        self.Q_values = tf.layers.dense(inputs=self.dense,
                                        units=len(actions),
                                        activation=None,
                                        name=network_name + '_output_layer'
                                       )        
    
        self.best_action = tf.argmax(self.Q_values, 1)
        self.loss = tf.losses.mean_squared_error(self.Q_values,
                                                 self.Q_target)
        self.adam = tf.train.AdamOptimizer(learning_rate=self.learning_rate,
                                           name=network_name + '_adam'
                                          )
        self.train = self.adam.minimize(self.loss)
        
    def update_lr(self):
        self.learning_rate = 0.98*self.learning_rate
        
        return self.learning_rate

    def calculate_loss(self, session, s, q):
        L, _ = session.run([self.loss, self.train],
                           feed_dict={self.s_t: s,
                                      self.Q_target: q})
    
        return L

#Return the array of Q-values and the best action associated with a given state

    def get_Q_values(self, session, s):
        Q = session.run(self.Q_values,
                        feed_dict={self.s_t: s})

        return Q
    
    def choose_action(self, session, s):
        a = session.run(self.best_action,
                        feed_dict={self.s_t: s})
    
        return a
    
#Create a list of variable update operations

def update_graph(variables):
    update_ops = list()
    
#Assign weight values from the network created first to the one created second
    
    for idx, variable in enumerate(variables[:len(variables)//2]):
        op = variable.assign(variables[idx + len(variables)//2].value())
        update_ops.append(op)
    
    return update_ops

#Update the target network parameters to match those of the online network

def update_target(ops, session):
    for op in update_ops:
        session.run(op)


In [5]:
#For each time step, collect the following data:
#The current game state
#The action that was taken taken
#The reward obtained from the chosen action
#The next game state (store the first game state if the previous action ends the episode)
#A variable indicating whether the episode is over yet


tf.reset_default_graph()

#Instantiate the target network before the online network so that it's updated correctly

target_net = Q_network(network_name='target',
                       learning_rate=learning_rate,
                       height=height,
                       width=width,
                       channels=channels)
DQN = Q_network(network_name='online',
                learning_rate=learning_rate,
                height=height,
                width=width,
                channels=channels)

exp_buffer = Buffer(size=buffer_size)
session = tf.Session()
saver = tf.train.Saver(max_to_keep=10, reshape=True)
weights = tf.trainable_variables()

update_ops = update_graph(weights)

if load_model == True:
    print('Loading model from', model_dir)
    tf.train.Saver().restore(session, model_dir)
    
elif load_model == False:
    session.run(tf.global_variables_initializer())

game.set_sound_enabled(False)
game.init()

t = 0
epoch_rank = list()

#Accumulate experiences in the buffer using an epsilon-greedy strategy with three training phases

for epoch in range(epochs):
    epoch_rewards = list()
    
    for step in trange(steps_per_epoch, leave=True):
        experience = list()
        game.new_episode()
        
        while not game.is_episode_finished():
            state = game.get_state()
            
#Substitute an array of zeros for the depth buffer if that setting is disabled

            if game.is_depth_buffer_enabled() == False:
                depth_buffer = np.zeros(state.screen_buffer.shape)
            else:
                depth_buffer = state.depth_buffer
            
            state1_buffer = np.stack((state.screen_buffer, depth_buffer), axis=-1)
            state1 = preprocess(state1_buffer, down_sample_ratio)
            
#Explore the environment by choosing random actions with 100% probability for the first phase of training

            if epoch < 0.3*epochs:
                action = np.random.randint(num_actions)
            
#Increase the probability of greedily choosing an action by a constant amount at each epoch in the second phase
            
            elif epoch < 0.9*epochs:
                epsilon = start_epsilon - (epoch + 1 - 0.2*epochs)*(start_epsilon-end_epsilon)/(0.7*epochs)
            
                if np.random.uniform(0, 1) <= epsilon:
                    action = np.random.randint(num_actions)
                else:
                    action = DQN.choose_action(session, state1)[0]

#Select a random action with 10% probability in the final phase of training
                
            else:
                if np.random.uniform(0, 1) <= end_epsilon:
                    action = np.random.randint(num_actions)
                else:
                    action = DQN.choose_action(session, state1)[0]

            reward = game.make_action(actions[action], frame_delay)
            done = game.is_episode_finished()
            
            if done == False:
                state = game.get_state()
                state2_buffer = np.stack((state.screen_buffer, state.depth_buffer), axis=-1)
                state2 = preprocess(state2_buffer, down_sample_ratio)
                
            elif done == True:
                state2 = state1
        
#Add the experience obtained from each time step to the buffer

            t += 1
            exp_buffer.add_experience((state1, action, reward, state2, done))
        
#Sample a minibatch from the buffer if there are enough experiences in the buffer

        if exp_buffer.length > batch_size:
            s1, a, r, s2, terminal = exp_buffer.sample_buffer(batch_size)
            
#Get the target values from the target Q-network
            
            target_Q = np.max(target_net.get_Q_values(session, s2), axis=1)
            
#Train the online Q-network by using a minibatch to update the action-value function
            
            Q2 = DQN.get_Q_values(session, s1)
            Q2[np.arange(batch_size), a] = r + gamma*(1 - terminal)*target_Q
            DQN.calculate_loss(session, s1, Q2)
            
        epoch_rewards.append(game.get_total_reward())
        
#Increase the discount factor at each epoch until it reaches 0.99
    
    if gamma < 0.99:
        gamma = 1-.98*(1-gamma)
    elif gamma >= 0.99:
        gamma = 0.99
        
#Decrease the learning rate at each epoch

    DQN.update_lr()
    target_net.update_lr()
    
    print('Epoch {} Mean Reward: {}'.format(epoch + 1, np.mean(epoch_rewards)))
    
#Update the target network every 10 epochs
    
    if (epoch + 1) % 10 == 0 and epoch > 0:
        update_target(update_ops, session)
        
#Save the model and test the agent for 10 episodes every 20 epochs
    
    if (epoch + 1) % 20 == 0 and epoch > 0:
        if save_model == True:
            checkpoint = model_dir + '-' + str(epoch + 1)
            print('Epoch {} Model saved to {}'.format(epoch + 1, model_dir))
            saver.save(session, model_dir, global_step=epoch + 1)
            
        update_target(update_ops, session)

#Test the agent both with and without the depth buffer given
        
        print('Epoch {} test with depth buffer:'.format(epoch + 1))
        test_reward_depth = test_agent(DQN, num_episodes=10,
                                       training=True,
                                       load_model=False,
                                       depth=True,
                                       session=session,
                                       model_dir=model_dir)
        print('Average Test Reward (with depth buffer:)', test_reward_depth)
        
        print('Epoch {} test without depth buffer:'.format(epoch + 1))
        test_reward = test_agent(DQN, num_episodes=10,
                                 training=True,
                                 load_model=False,
                                 depth=False,
                                 session=session,
                                 model_dir=model_dir)
        print('Average Test Reward (without depth buffer):', test_reward)
        
        epoch_rank.append((test_reward, epoch + 1))
        
#Return a sorted list of epoch checkpoints based on average test episode reward
        
print(sorted(epoch_rank, reverse=True))
print('{} time steps experienced during training'.format(t))
game.close()
    

100%|██████████| 2000/2000 [01:41<00:00, 19.64it/s]


Epoch 1 Mean Reward: -70.34874752044678


100%|██████████| 2000/2000 [01:43<00:00, 19.25it/s]


Epoch 2 Mean Reward: -67.4187717590332


100%|██████████| 2000/2000 [01:29<00:00, 22.37it/s]


Epoch 3 Mean Reward: -69.43725765228271


100%|██████████| 2000/2000 [01:30<00:00, 21.98it/s]


Epoch 4 Mean Reward: -66.14991775512695


100%|██████████| 2000/2000 [01:33<00:00, 21.29it/s]


Epoch 5 Mean Reward: -67.90185329437256


100%|██████████| 2000/2000 [01:34<00:00, 21.09it/s]


Epoch 6 Mean Reward: -68.64238536834716


100%|██████████| 2000/2000 [01:31<00:00, 21.83it/s]


Epoch 7 Mean Reward: -70.16232398223877


100%|██████████| 2000/2000 [01:31<00:00, 21.82it/s]


Epoch 8 Mean Reward: -68.70191046905518


100%|██████████| 2000/2000 [01:30<00:00, 22.02it/s]


Epoch 9 Mean Reward: -67.38696146392822


100%|██████████| 2000/2000 [01:30<00:00, 22.11it/s]


Epoch 10 Mean Reward: -69.65390396118164


100%|██████████| 2000/2000 [01:31<00:00, 21.89it/s]


Epoch 11 Mean Reward: -67.4368422012329


100%|██████████| 2000/2000 [01:30<00:00, 22.02it/s]


Epoch 12 Mean Reward: -69.12417576599121


100%|██████████| 2000/2000 [01:30<00:00, 22.00it/s]


Epoch 13 Mean Reward: -67.70249571228027


100%|██████████| 2000/2000 [01:32<00:00, 21.53it/s]


Epoch 14 Mean Reward: -68.53885604095458


100%|██████████| 2000/2000 [01:32<00:00, 21.73it/s]


Epoch 15 Mean Reward: -67.73534006500245


100%|██████████| 2000/2000 [01:31<00:00, 21.89it/s]


Epoch 16 Mean Reward: -68.61665476989747


100%|██████████| 2000/2000 [01:31<00:00, 21.83it/s]


Epoch 17 Mean Reward: -69.7095958480835


100%|██████████| 2000/2000 [01:30<00:00, 22.03it/s]


Epoch 18 Mean Reward: -71.72577774047852


100%|██████████| 2000/2000 [01:32<00:00, 21.54it/s]


Epoch 19 Mean Reward: -70.91366878509521


100%|██████████| 2000/2000 [01:31<00:00, 21.77it/s]


Epoch 20 Mean Reward: -67.24080211639404
Epoch 20 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 20 test with depth buffer:
Test Episode 1 Reward: 56.37648010253906
Test Episode 2 Reward: 340.6106719970703
Test Episode 3 Reward: 431.45721435546875
Test Episode 4 Reward: 56.37648010253906
Test Episode 5 Reward: 56.37648010253906
Test Episode 6 Reward: 56.37648010253906
Test Episode 7 Reward: 219.1642608642578
Test Episode 8 Reward: 340.0702667236328
Test Episode 9 Reward: 56.37648010253906
Test Episode 10 Reward: 56.37648010253906
Average Test Reward (with depth buffer:) 166.9561294555664
Epoch 20 test without depth buffer:
Test Episode 1 Reward: 320.401611328125
Test Episode 2 Reward: 11.590087890625
Test Episode 3 Reward: 354.45166015625
Test Episode 4 Reward: 56.37648010253906
Test Episode 5 Reward: 56.37648010253906
Test Episode 6 Reward: 56.37648010253906
Test Episode 7 Reward: 69.28022766113281
Test Episode 8 Reward: 56.37648010253906
Test Episode 9 Reward: 93.97186279296

100%|██████████| 2000/2000 [01:31<00:00, 21.78it/s]


Epoch 21 Mean Reward: -67.79414495849609


100%|██████████| 2000/2000 [01:31<00:00, 21.97it/s]


Epoch 22 Mean Reward: -66.51500511169434


100%|██████████| 2000/2000 [01:32<00:00, 21.55it/s]


Epoch 23 Mean Reward: -69.05826104736329


100%|██████████| 2000/2000 [01:31<00:00, 21.90it/s]


Epoch 24 Mean Reward: -69.51912084197998


100%|██████████| 2000/2000 [01:31<00:00, 21.80it/s]


Epoch 25 Mean Reward: -67.95758199310303


100%|██████████| 2000/2000 [01:35<00:00, 20.90it/s]


Epoch 26 Mean Reward: -68.66849095153809


100%|██████████| 2000/2000 [01:32<00:00, 21.72it/s]


Epoch 27 Mean Reward: -70.78637317657471


100%|██████████| 2000/2000 [01:32<00:00, 21.64it/s]


Epoch 28 Mean Reward: -70.54002215576172


100%|██████████| 2000/2000 [01:32<00:00, 21.66it/s]


Epoch 29 Mean Reward: -68.11146450042725


100%|██████████| 2000/2000 [01:32<00:00, 21.55it/s]


Epoch 30 Mean Reward: -66.8806706161499


100%|██████████| 2000/2000 [01:32<00:00, 21.68it/s]


Epoch 31 Mean Reward: -67.82415312194824


100%|██████████| 2000/2000 [01:32<00:00, 21.60it/s]


Epoch 32 Mean Reward: -67.41665068054199


100%|██████████| 2000/2000 [01:33<00:00, 21.30it/s]


Epoch 33 Mean Reward: -69.93697465515137


100%|██████████| 2000/2000 [01:33<00:00, 21.49it/s]


Epoch 34 Mean Reward: -68.32547748565673


100%|██████████| 2000/2000 [01:32<00:00, 21.53it/s]


Epoch 35 Mean Reward: -67.24427024078369


100%|██████████| 2000/2000 [01:34<00:00, 21.21it/s]


Epoch 36 Mean Reward: -69.8491867904663


100%|██████████| 2000/2000 [01:32<00:00, 21.61it/s]


Epoch 37 Mean Reward: -68.25079510498047


100%|██████████| 2000/2000 [01:32<00:00, 21.63it/s]


Epoch 38 Mean Reward: -66.13093434143066


100%|██████████| 2000/2000 [01:32<00:00, 21.53it/s]


Epoch 39 Mean Reward: -69.37560523986816


100%|██████████| 2000/2000 [01:32<00:00, 21.57it/s]


Epoch 40 Mean Reward: -68.68682375335693
Epoch 40 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 40 test with depth buffer:
Test Episode 1 Reward: 58.35874938964844
Test Episode 2 Reward: 212.0392303466797
Test Episode 3 Reward: 58.35874938964844
Test Episode 4 Reward: 254.87594604492188
Test Episode 5 Reward: 58.35874938964844
Test Episode 6 Reward: 58.35874938964844
Test Episode 7 Reward: 58.35874938964844
Test Episode 8 Reward: 90.97235107421875
Test Episode 9 Reward: 58.35874938964844
Test Episode 10 Reward: 58.35874938964844
Average Test Reward (with depth buffer:) 96.63987731933594
Epoch 40 test without depth buffer:
Test Episode 1 Reward: 49.74522399902344
Test Episode 2 Reward: 58.35874938964844
Test Episode 3 Reward: 58.04679870605469
Test Episode 4 Reward: 58.35874938964844
Test Episode 5 Reward: 88.34764099121094
Test Episode 6 Reward: 58.35874938964844
Test Episode 7 Reward: 58.35874938964844
Test Episode 8 Reward: 120.12472534179688
Test Episode 9 Reward: 58.35874

100%|██████████| 2000/2000 [01:33<00:00, 21.43it/s]


Epoch 41 Mean Reward: -65.83411849212646


100%|██████████| 2000/2000 [01:33<00:00, 21.39it/s]


Epoch 42 Mean Reward: -67.99034001159669


100%|██████████| 2000/2000 [01:32<00:00, 21.54it/s]


Epoch 43 Mean Reward: -68.6001408920288


100%|██████████| 2000/2000 [01:33<00:00, 21.50it/s]


Epoch 44 Mean Reward: -67.80897841644287


100%|██████████| 2000/2000 [01:32<00:00, 21.58it/s]


Epoch 45 Mean Reward: -69.38006540679932


100%|██████████| 2000/2000 [01:34<00:00, 21.06it/s]


Epoch 46 Mean Reward: -69.9004210281372


100%|██████████| 2000/2000 [01:33<00:00, 21.36it/s]


Epoch 47 Mean Reward: -68.13990602111816


100%|██████████| 2000/2000 [01:32<00:00, 21.53it/s]


Epoch 48 Mean Reward: -66.44766372680664


100%|██████████| 2000/2000 [01:33<00:00, 21.50it/s]


Epoch 49 Mean Reward: -70.1572696685791


100%|██████████| 2000/2000 [01:33<00:00, 21.45it/s]


Epoch 50 Mean Reward: -68.68098606872559


100%|██████████| 2000/2000 [01:32<00:00, 21.58it/s]


Epoch 51 Mean Reward: -69.32613759613037


100%|██████████| 2000/2000 [01:33<00:00, 21.46it/s]


Epoch 52 Mean Reward: -67.49723764801026


100%|██████████| 2000/2000 [01:33<00:00, 21.36it/s]


Epoch 53 Mean Reward: -71.39681579589843


100%|██████████| 2000/2000 [01:33<00:00, 21.43it/s]


Epoch 54 Mean Reward: -69.94489008331298


100%|██████████| 2000/2000 [01:34<00:00, 21.27it/s]


Epoch 55 Mean Reward: -68.29179901885986


100%|██████████| 2000/2000 [01:33<00:00, 21.39it/s]


Epoch 56 Mean Reward: -71.25022718048096


100%|██████████| 2000/2000 [01:33<00:00, 21.43it/s]


Epoch 57 Mean Reward: -68.58976413726806


100%|██████████| 2000/2000 [01:33<00:00, 21.30it/s]


Epoch 58 Mean Reward: -67.19042333984375


100%|██████████| 2000/2000 [01:33<00:00, 21.43it/s]


Epoch 59 Mean Reward: -68.54463328552247


100%|██████████| 2000/2000 [01:32<00:00, 21.59it/s]


Epoch 60 Mean Reward: -69.82765422821045
Epoch 60 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 60 test with depth buffer:
Test Episode 1 Reward: 88.91380310058594
Test Episode 2 Reward: 130.90528869628906
Test Episode 3 Reward: 93.87403869628906
Test Episode 4 Reward: 88.91380310058594
Test Episode 5 Reward: 88.91380310058594
Test Episode 6 Reward: 88.91380310058594
Test Episode 7 Reward: 393.7066192626953
Test Episode 8 Reward: 88.91380310058594
Test Episode 9 Reward: 88.91380310058594
Test Episode 10 Reward: 88.91380310058594
Average Test Reward (with depth buffer:) 124.0882568359375
Epoch 60 test without depth buffer:
Test Episode 1 Reward: 88.91380310058594
Test Episode 2 Reward: 88.91380310058594
Test Episode 3 Reward: 88.91380310058594
Test Episode 4 Reward: 73.68702697753906
Test Episode 5 Reward: -65.12493896484375
Test Episode 6 Reward: 85.22138977050781
Test Episode 7 Reward: 48.193267822265625
Test Episode 8 Reward: 88.91380310058594
Test Episode 9 Reward: 300.823

100%|██████████| 2000/2000 [01:34<00:00, 21.10it/s]


Epoch 61 Mean Reward: -68.12740185546875


100%|██████████| 2000/2000 [01:34<00:00, 21.25it/s]


Epoch 62 Mean Reward: -69.20437313842774


100%|██████████| 2000/2000 [01:33<00:00, 21.50it/s]


Epoch 63 Mean Reward: -69.57416462707519


100%|██████████| 2000/2000 [01:33<00:00, 21.43it/s]


Epoch 64 Mean Reward: -68.48863570404053


100%|██████████| 2000/2000 [01:34<00:00, 21.23it/s]


Epoch 65 Mean Reward: -69.33805627441406


100%|██████████| 2000/2000 [01:35<00:00, 20.98it/s]


Epoch 66 Mean Reward: -69.34291608428956


100%|██████████| 2000/2000 [01:34<00:00, 21.27it/s]


Epoch 67 Mean Reward: -68.92253354644775


100%|██████████| 2000/2000 [01:33<00:00, 21.42it/s]


Epoch 68 Mean Reward: -68.71460629272461


100%|██████████| 2000/2000 [01:33<00:00, 21.33it/s]


Epoch 69 Mean Reward: -68.7995673828125


100%|██████████| 2000/2000 [01:34<00:00, 21.26it/s]


Epoch 70 Mean Reward: -68.92998101043702


100%|██████████| 2000/2000 [01:32<00:00, 21.56it/s]


Epoch 71 Mean Reward: -69.80855863952637


100%|██████████| 2000/2000 [01:33<00:00, 21.44it/s]


Epoch 72 Mean Reward: -68.66602241516114


100%|██████████| 2000/2000 [01:33<00:00, 21.49it/s]


Epoch 73 Mean Reward: -66.9089952545166


100%|██████████| 2000/2000 [01:34<00:00, 21.26it/s]


Epoch 74 Mean Reward: -66.82508786010742


100%|██████████| 2000/2000 [01:33<00:00, 21.45it/s]


Epoch 75 Mean Reward: -67.54893774414063


100%|██████████| 2000/2000 [01:33<00:00, 21.42it/s]


Epoch 76 Mean Reward: -67.03399741363525


100%|██████████| 2000/2000 [01:33<00:00, 21.43it/s]


Epoch 77 Mean Reward: -68.66481011199951


100%|██████████| 2000/2000 [01:32<00:00, 21.60it/s]


Epoch 78 Mean Reward: -67.97682423400879


100%|██████████| 2000/2000 [01:33<00:00, 21.35it/s]


Epoch 79 Mean Reward: -66.23069107818604


100%|██████████| 2000/2000 [01:32<00:00, 21.53it/s]


Epoch 80 Mean Reward: -67.81382174682618
Epoch 80 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 80 test with depth buffer:
Test Episode 1 Reward: 217.7736053466797
Test Episode 2 Reward: 217.7736053466797
Test Episode 3 Reward: 97.01417541503906
Test Episode 4 Reward: 217.7736053466797
Test Episode 5 Reward: 248.4740753173828
Test Episode 6 Reward: 217.7736053466797
Test Episode 7 Reward: 217.7736053466797
Test Episode 8 Reward: 296.5083770751953
Test Episode 9 Reward: 28.148269653320312
Test Episode 10 Reward: 47.87763977050781
Average Test Reward (with depth buffer:) 180.68905639648438
Epoch 80 test without depth buffer:
Test Episode 1 Reward: 87.19203186035156
Test Episode 2 Reward: 56.976898193359375
Test Episode 3 Reward: 217.7736053466797
Test Episode 4 Reward: 95.91175842285156
Test Episode 5 Reward: 217.7736053466797
Test Episode 6 Reward: 36.10255432128906
Test Episode 7 Reward: 217.7736053466797
Test Episode 8 Reward: 581.033203125
Test Episode 9 Reward: -106.762344

100%|██████████| 2000/2000 [01:33<00:00, 21.42it/s]


Epoch 81 Mean Reward: -68.97850661468506


100%|██████████| 2000/2000 [01:33<00:00, 21.48it/s]


Epoch 82 Mean Reward: -67.9606916809082


100%|██████████| 2000/2000 [01:32<00:00, 21.58it/s]


Epoch 83 Mean Reward: -69.09042449188233


100%|██████████| 2000/2000 [01:32<00:00, 21.55it/s]


Epoch 84 Mean Reward: -68.42303899383545


100%|██████████| 2000/2000 [01:34<00:00, 21.18it/s]


Epoch 85 Mean Reward: -68.87920096588135


100%|██████████| 2000/2000 [01:34<00:00, 21.25it/s]


Epoch 86 Mean Reward: -69.96277238464356


100%|██████████| 2000/2000 [01:33<00:00, 21.50it/s]


Epoch 87 Mean Reward: -67.62289102172852


100%|██████████| 2000/2000 [01:33<00:00, 21.45it/s]


Epoch 88 Mean Reward: -70.38387143707276


100%|██████████| 2000/2000 [01:33<00:00, 21.33it/s]


Epoch 89 Mean Reward: -68.81719494628906


100%|██████████| 2000/2000 [01:33<00:00, 21.42it/s]


Epoch 90 Mean Reward: -67.78642777252198


100%|██████████| 2000/2000 [01:33<00:00, 21.48it/s]


Epoch 91 Mean Reward: -66.703804145813


100%|██████████| 2000/2000 [01:31<00:00, 21.76it/s]


Epoch 92 Mean Reward: -68.06045174407959


100%|██████████| 2000/2000 [01:32<00:00, 21.71it/s]


Epoch 93 Mean Reward: -69.79361511993409


100%|██████████| 2000/2000 [01:33<00:00, 21.36it/s]


Epoch 94 Mean Reward: -65.67250185394288


100%|██████████| 2000/2000 [01:32<00:00, 21.54it/s]


Epoch 95 Mean Reward: -69.39829199981689


100%|██████████| 2000/2000 [01:33<00:00, 21.44it/s]


Epoch 96 Mean Reward: -70.16462031555176


100%|██████████| 2000/2000 [01:33<00:00, 21.50it/s]


Epoch 97 Mean Reward: -66.75827429962158


100%|██████████| 2000/2000 [01:33<00:00, 21.49it/s]


Epoch 98 Mean Reward: -69.28629666900635


100%|██████████| 2000/2000 [01:33<00:00, 21.30it/s]


Epoch 99 Mean Reward: -68.36756390380859


100%|██████████| 2000/2000 [01:31<00:00, 21.74it/s]


Epoch 100 Mean Reward: -68.99905783081054
Epoch 100 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 100 test with depth buffer:
Test Episode 1 Reward: 88.65116882324219
Test Episode 2 Reward: 76.08125305175781
Test Episode 3 Reward: 404.2420349121094
Test Episode 4 Reward: 480.6712188720703
Test Episode 5 Reward: 84.73899841308594
Test Episode 6 Reward: 319.81787109375
Test Episode 7 Reward: 86.77743530273438
Test Episode 8 Reward: 85.89578247070312
Test Episode 9 Reward: 88.65116882324219
Test Episode 10 Reward: 88.65116882324219
Average Test Reward (with depth buffer:) 180.41781005859374
Epoch 100 test without depth buffer:
Test Episode 1 Reward: 88.65116882324219
Test Episode 2 Reward: 88.65116882324219
Test Episode 3 Reward: 54.059051513671875
Test Episode 4 Reward: 88.65116882324219
Test Episode 5 Reward: 88.65116882324219
Test Episode 6 Reward: 232.1947479248047
Test Episode 7 Reward: 166.80409240722656
Test Episode 8 Reward: 88.65116882324219
Test Episode 9 Reward: 264.1

100%|██████████| 2000/2000 [01:32<00:00, 21.62it/s]


Epoch 101 Mean Reward: -69.60581312561035


100%|██████████| 2000/2000 [01:32<00:00, 21.54it/s]


Epoch 102 Mean Reward: -69.72260597991944


100%|██████████| 2000/2000 [01:33<00:00, 21.48it/s]


Epoch 103 Mean Reward: -68.61436899566651


100%|██████████| 2000/2000 [01:33<00:00, 21.35it/s]


Epoch 104 Mean Reward: -68.24337113952637


100%|██████████| 2000/2000 [01:35<00:00, 20.92it/s]


Epoch 105 Mean Reward: -66.77332872772217


100%|██████████| 2000/2000 [01:35<00:00, 21.01it/s]


Epoch 106 Mean Reward: -67.95546886444092


100%|██████████| 2000/2000 [01:34<00:00, 21.16it/s]


Epoch 107 Mean Reward: -68.56788566589356


100%|██████████| 2000/2000 [01:34<00:00, 21.09it/s]


Epoch 108 Mean Reward: -66.74919634246827


100%|██████████| 2000/2000 [01:34<00:00, 21.23it/s]


Epoch 109 Mean Reward: -67.69603684997558


100%|██████████| 2000/2000 [01:33<00:00, 21.29it/s]


Epoch 110 Mean Reward: -68.27539896392823


100%|██████████| 2000/2000 [01:34<00:00, 21.24it/s]


Epoch 111 Mean Reward: -67.99006552886964


100%|██████████| 2000/2000 [01:33<00:00, 21.30it/s]


Epoch 112 Mean Reward: -67.8942767715454


100%|██████████| 2000/2000 [01:34<00:00, 21.24it/s]


Epoch 113 Mean Reward: -67.25885551452637


100%|██████████| 2000/2000 [01:33<00:00, 21.47it/s]


Epoch 114 Mean Reward: -69.39894556427002


100%|██████████| 2000/2000 [01:34<00:00, 21.25it/s]


Epoch 115 Mean Reward: -69.0376591720581


100%|██████████| 2000/2000 [01:33<00:00, 21.28it/s]


Epoch 116 Mean Reward: -70.11197819519043


100%|██████████| 2000/2000 [01:34<00:00, 21.10it/s]


Epoch 117 Mean Reward: -69.57334688568115


100%|██████████| 2000/2000 [01:33<00:00, 21.47it/s]


Epoch 118 Mean Reward: -67.23060260009765


100%|██████████| 2000/2000 [01:33<00:00, 21.32it/s]


Epoch 119 Mean Reward: -67.88136544036865


100%|██████████| 2000/2000 [01:33<00:00, 21.38it/s]


Epoch 120 Mean Reward: -68.08198585510254
Epoch 120 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 120 test with depth buffer:
Test Episode 1 Reward: 32.43013000488281
Test Episode 2 Reward: 259.66912841796875
Test Episode 3 Reward: 258.4878387451172
Test Episode 4 Reward: 10.647796630859375
Test Episode 5 Reward: 259.66912841796875
Test Episode 6 Reward: 21.78814697265625
Test Episode 7 Reward: 23.253738403320312
Test Episode 8 Reward: 70.97895812988281
Test Episode 9 Reward: 20.107955932617188
Test Episode 10 Reward: 259.66912841796875
Average Test Reward (with depth buffer:) 121.67019500732422
Epoch 120 test without depth buffer:
Test Episode 1 Reward: 259.66912841796875
Test Episode 2 Reward: 259.66912841796875
Test Episode 3 Reward: 259.66912841796875
Test Episode 4 Reward: 71.88082885742188
Test Episode 5 Reward: 61.020263671875
Test Episode 6 Reward: 164.28890991210938
Test Episode 7 Reward: 201.14404296875
Test Episode 8 Reward: 135.99325561523438
Test Episode 9 Reward

100%|██████████| 2000/2000 [01:40<00:00, 19.81it/s]


Epoch 121 Mean Reward: -45.17892928314209


100%|██████████| 2000/2000 [01:42<00:00, 19.59it/s]


Epoch 122 Mean Reward: -42.179578956604004


100%|██████████| 2000/2000 [01:42<00:00, 19.55it/s]


Epoch 123 Mean Reward: -40.67956286621094


100%|██████████| 2000/2000 [01:43<00:00, 19.31it/s]


Epoch 124 Mean Reward: -39.871496131896976


100%|██████████| 2000/2000 [01:44<00:00, 19.21it/s]


Epoch 125 Mean Reward: -42.13929042053223


100%|██████████| 2000/2000 [01:42<00:00, 19.44it/s]


Epoch 126 Mean Reward: -42.00896394348145


100%|██████████| 2000/2000 [01:42<00:00, 19.56it/s]


Epoch 127 Mean Reward: -41.97876577758789


100%|██████████| 2000/2000 [01:43<00:00, 19.40it/s]


Epoch 128 Mean Reward: -39.98066576385498


100%|██████████| 2000/2000 [01:42<00:00, 19.51it/s]


Epoch 129 Mean Reward: -40.42211328125


100%|██████████| 2000/2000 [01:43<00:00, 19.34it/s]


Epoch 130 Mean Reward: -42.111637321472166


100%|██████████| 2000/2000 [01:42<00:00, 19.56it/s]


Epoch 131 Mean Reward: -37.17881745910645


100%|██████████| 2000/2000 [01:43<00:00, 19.29it/s]


Epoch 132 Mean Reward: -36.33873993682862


100%|██████████| 2000/2000 [01:44<00:00, 19.08it/s]


Epoch 133 Mean Reward: -37.515600959777835


100%|██████████| 2000/2000 [01:46<00:00, 18.83it/s]


Epoch 134 Mean Reward: -31.495340011596678


100%|██████████| 2000/2000 [01:46<00:00, 18.86it/s]


Epoch 135 Mean Reward: -33.784493392944334


100%|██████████| 2000/2000 [01:45<00:00, 18.96it/s]


Epoch 136 Mean Reward: -35.6378141784668


100%|██████████| 2000/2000 [01:45<00:00, 18.98it/s]


Epoch 137 Mean Reward: -33.6070689086914


100%|██████████| 2000/2000 [01:46<00:00, 18.86it/s]


Epoch 138 Mean Reward: -31.212234870910645


100%|██████████| 2000/2000 [01:45<00:00, 18.88it/s]


Epoch 139 Mean Reward: -30.75373931121826


100%|██████████| 2000/2000 [01:47<00:00, 18.67it/s]


Epoch 140 Mean Reward: -30.439051460266114
Epoch 140 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 140 test with depth buffer:
Test Episode 1 Reward: 413.2279052734375
Test Episode 2 Reward: 25.310989379882812
Test Episode 3 Reward: 380.7069854736328
Test Episode 4 Reward: 118.28367614746094
Test Episode 5 Reward: 76.90895080566406
Test Episode 6 Reward: 385.6875457763672
Test Episode 7 Reward: 369.8135681152344
Test Episode 8 Reward: 380.7069854736328
Test Episode 9 Reward: 380.7069854736328
Test Episode 10 Reward: 380.7069854736328
Average Test Reward (with depth buffer:) 291.2060577392578
Epoch 140 test without depth buffer:
Test Episode 1 Reward: 245.9138641357422
Test Episode 2 Reward: 380.7069854736328
Test Episode 3 Reward: 380.7069854736328
Test Episode 4 Reward: 380.7069854736328
Test Episode 5 Reward: 380.7069854736328
Test Episode 6 Reward: 75.03727722167969
Test Episode 7 Reward: 19.570846557617188
Test Episode 8 Reward: 380.7069854736328
Test Episode 9 Reward: 38

100%|██████████| 2000/2000 [01:46<00:00, 18.78it/s]


Epoch 141 Mean Reward: -30.313592742919923


100%|██████████| 2000/2000 [01:49<00:00, 18.24it/s]


Epoch 142 Mean Reward: -27.28662398529053


100%|██████████| 2000/2000 [01:49<00:00, 18.23it/s]


Epoch 143 Mean Reward: -24.98141435241699


100%|██████████| 2000/2000 [01:48<00:00, 18.51it/s]


Epoch 144 Mean Reward: -30.177642166137694


100%|██████████| 2000/2000 [01:49<00:00, 18.27it/s]


Epoch 145 Mean Reward: -27.107558372497557


100%|██████████| 2000/2000 [01:48<00:00, 18.39it/s]


Epoch 146 Mean Reward: -25.939580139160157


100%|██████████| 2000/2000 [01:50<00:00, 18.03it/s]


Epoch 147 Mean Reward: -22.848143928527833


100%|██████████| 2000/2000 [01:50<00:00, 18.17it/s]


Epoch 148 Mean Reward: -27.852208534240724


100%|██████████| 2000/2000 [01:50<00:00, 18.16it/s]


Epoch 149 Mean Reward: -23.935205711364745


100%|██████████| 2000/2000 [01:49<00:00, 18.23it/s]


Epoch 150 Mean Reward: -25.690724159240723


100%|██████████| 2000/2000 [01:56<00:00, 17.23it/s]


Epoch 151 Mean Reward: -16.67528224182129


100%|██████████| 2000/2000 [01:53<00:00, 17.68it/s]


Epoch 152 Mean Reward: -21.987857398986815


100%|██████████| 2000/2000 [01:55<00:00, 17.29it/s]


Epoch 153 Mean Reward: -19.25977456665039


100%|██████████| 2000/2000 [01:57<00:00, 16.95it/s]


Epoch 154 Mean Reward: -15.302540588378907


100%|██████████| 2000/2000 [01:56<00:00, 17.15it/s]


Epoch 155 Mean Reward: -18.717269790649414


100%|██████████| 2000/2000 [01:58<00:00, 16.92it/s]


Epoch 156 Mean Reward: -13.727827507019043


100%|██████████| 2000/2000 [01:57<00:00, 17.03it/s]


Epoch 157 Mean Reward: -18.224271644592285


100%|██████████| 2000/2000 [01:58<00:00, 16.94it/s]


Epoch 158 Mean Reward: -17.35374983215332


100%|██████████| 2000/2000 [01:57<00:00, 17.07it/s]


Epoch 159 Mean Reward: -16.99369529724121


100%|██████████| 2000/2000 [02:00<00:00, 16.60it/s]


Epoch 160 Mean Reward: -15.79117226409912
Epoch 160 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 160 test with depth buffer:
Test Episode 1 Reward: 280.1833801269531
Test Episode 2 Reward: 258.6802215576172
Test Episode 3 Reward: 323.9911804199219
Test Episode 4 Reward: 33.07237243652344
Test Episode 5 Reward: 259.43260192871094
Test Episode 6 Reward: -15.057373046875
Test Episode 7 Reward: 258.6802215576172
Test Episode 8 Reward: 258.6802215576172
Test Episode 9 Reward: 258.6802215576172
Test Episode 10 Reward: 316.79356384277344
Average Test Reward (with depth buffer:) 223.31366119384765
Epoch 160 test without depth buffer:
Test Episode 1 Reward: 271.7798614501953
Test Episode 2 Reward: 258.6802215576172
Test Episode 3 Reward: 258.6802215576172
Test Episode 4 Reward: 258.6802215576172
Test Episode 5 Reward: 263.3233337402344
Test Episode 6 Reward: 34.953338623046875
Test Episode 7 Reward: 377.1927947998047
Test Episode 8 Reward: 258.6802215576172
Test Episode 9 Reward: 384

100%|██████████| 2000/2000 [01:55<00:00, 17.28it/s]


Epoch 161 Mean Reward: -11.808778343200684


100%|██████████| 2000/2000 [01:57<00:00, 17.01it/s]


Epoch 162 Mean Reward: -9.353274291992188


100%|██████████| 2000/2000 [01:57<00:00, 16.99it/s]


Epoch 163 Mean Reward: -9.084580596923828


100%|██████████| 2000/2000 [01:58<00:00, 16.85it/s]


Epoch 164 Mean Reward: -10.58220979309082


100%|██████████| 2000/2000 [01:57<00:00, 16.97it/s]


Epoch 165 Mean Reward: -7.531370353698731


100%|██████████| 2000/2000 [01:56<00:00, 17.16it/s]


Epoch 166 Mean Reward: -10.048766166687011


100%|██████████| 2000/2000 [01:59<00:00, 16.77it/s]


Epoch 167 Mean Reward: -9.846193031311035


100%|██████████| 2000/2000 [01:59<00:00, 16.77it/s]


Epoch 168 Mean Reward: -7.915852661132813


100%|██████████| 2000/2000 [01:57<00:00, 17.08it/s]


Epoch 169 Mean Reward: -5.541982284545899


100%|██████████| 2000/2000 [01:58<00:00, 16.89it/s]


Epoch 170 Mean Reward: -8.413456581115723


100%|██████████| 2000/2000 [02:04<00:00, 16.01it/s]


Epoch 171 Mean Reward: 3.877496292114258


100%|██████████| 2000/2000 [02:03<00:00, 16.17it/s]


Epoch 172 Mean Reward: -2.2311595764160157


100%|██████████| 2000/2000 [02:03<00:00, 16.23it/s]


Epoch 173 Mean Reward: 0.5518692169189453


100%|██████████| 2000/2000 [02:05<00:00, 15.92it/s]


Epoch 174 Mean Reward: 3.4926895675659178


100%|██████████| 2000/2000 [02:05<00:00, 15.93it/s]


Epoch 175 Mean Reward: 3.2111590728759767


100%|██████████| 2000/2000 [02:07<00:00, 15.64it/s]


Epoch 176 Mean Reward: 8.18549799346924


100%|██████████| 2000/2000 [02:05<00:00, 15.95it/s]


Epoch 177 Mean Reward: 5.560013290405274


100%|██████████| 2000/2000 [02:07<00:00, 15.65it/s]


Epoch 178 Mean Reward: 10.56062883758545


100%|██████████| 2000/2000 [02:10<00:00, 15.33it/s]


Epoch 179 Mean Reward: 13.385245300292969


100%|██████████| 2000/2000 [02:09<00:00, 15.49it/s]


Epoch 180 Mean Reward: 9.093200523376465
Epoch 180 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 180 test with depth buffer:
Test Episode 1 Reward: 130.17608642578125
Test Episode 2 Reward: 130.17608642578125
Test Episode 3 Reward: 189.9620361328125
Test Episode 4 Reward: 130.17608642578125
Test Episode 5 Reward: 130.17608642578125
Test Episode 6 Reward: 130.17608642578125
Test Episode 7 Reward: 130.17608642578125
Test Episode 8 Reward: 41.252044677734375
Test Episode 9 Reward: 146.59690856933594
Test Episode 10 Reward: 130.17608642578125
Average Test Reward (with depth buffer:) 128.90435943603515
Epoch 180 test without depth buffer:
Test Episode 1 Reward: 130.17608642578125
Test Episode 2 Reward: 22.973983764648438
Test Episode 3 Reward: 426.55039978027344
Test Episode 4 Reward: 100.04322814941406
Test Episode 5 Reward: 130.17608642578125
Test Episode 6 Reward: 130.17608642578125
Test Episode 7 Reward: 68.07354736328125
Test Episode 8 Reward: 408.03228759765625
Test Episode 

100%|██████████| 2000/2000 [02:00<00:00, 16.53it/s]


Epoch 181 Mean Reward: 8.996007774353027


100%|██████████| 2000/2000 [02:01<00:00, 16.51it/s]


Epoch 182 Mean Reward: 8.178471588134766


100%|██████████| 2000/2000 [02:03<00:00, 16.16it/s]


Epoch 183 Mean Reward: 9.471781211853028


100%|██████████| 2000/2000 [02:02<00:00, 16.38it/s]


Epoch 184 Mean Reward: 12.281071533203125


100%|██████████| 2000/2000 [02:02<00:00, 16.32it/s]


Epoch 185 Mean Reward: 9.415890426635743


100%|██████████| 2000/2000 [02:01<00:00, 16.51it/s]


Epoch 186 Mean Reward: 10.472322471618652


100%|██████████| 2000/2000 [02:03<00:00, 16.22it/s]


Epoch 187 Mean Reward: 14.864421524047852


100%|██████████| 2000/2000 [02:02<00:00, 16.34it/s]


Epoch 188 Mean Reward: 12.314096710205078


100%|██████████| 2000/2000 [02:02<00:00, 16.37it/s]


Epoch 189 Mean Reward: 9.964631370544433


100%|██████████| 2000/2000 [02:06<00:00, 15.77it/s]


Epoch 190 Mean Reward: 17.44499742126465


100%|██████████| 2000/2000 [02:09<00:00, 15.41it/s]


Epoch 191 Mean Reward: 23.217673919677733


100%|██████████| 2000/2000 [02:09<00:00, 15.48it/s]


Epoch 192 Mean Reward: 22.71118434906006


100%|██████████| 2000/2000 [02:09<00:00, 15.43it/s]


Epoch 193 Mean Reward: 25.957880126953125


100%|██████████| 2000/2000 [02:09<00:00, 15.39it/s]


Epoch 194 Mean Reward: 25.523921058654786


100%|██████████| 2000/2000 [02:10<00:00, 15.27it/s]


Epoch 195 Mean Reward: 23.266487747192382


100%|██████████| 2000/2000 [02:12<00:00, 15.06it/s]


Epoch 196 Mean Reward: 28.600313041687013


100%|██████████| 2000/2000 [02:13<00:00, 14.99it/s]


Epoch 197 Mean Reward: 24.856379203796386


100%|██████████| 2000/2000 [02:11<00:00, 15.23it/s]


Epoch 198 Mean Reward: 34.29289977264404


100%|██████████| 2000/2000 [02:12<00:00, 15.08it/s]


Epoch 199 Mean Reward: 28.564089797973633


100%|██████████| 2000/2000 [02:13<00:00, 14.96it/s]


Epoch 200 Mean Reward: 28.575148391723634
Epoch 200 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 200 test with depth buffer:
Test Episode 1 Reward: 306.77642822265625
Test Episode 2 Reward: 10.853408813476562
Test Episode 3 Reward: 277.99974060058594
Test Episode 4 Reward: 443.2786865234375
Test Episode 5 Reward: 133.7250213623047
Test Episode 6 Reward: 156.96771240234375
Test Episode 7 Reward: 133.7250213623047
Test Episode 8 Reward: 156.96771240234375
Test Episode 9 Reward: 133.7250213623047
Test Episode 10 Reward: 238.95799255371094
Average Test Reward (with depth buffer:) 199.29767456054688
Epoch 200 test without depth buffer:
Test Episode 1 Reward: 133.7250213623047
Test Episode 2 Reward: 156.96771240234375
Test Episode 3 Reward: 136.7875518798828
Test Episode 4 Reward: 133.7250213623047
Test Episode 5 Reward: 156.96771240234375
Test Episode 6 Reward: 133.7250213623047
Test Episode 7 Reward: 231.47900390625
Test Episode 8 Reward: 133.7250213623047
Test Episode 9 Reward:

100%|██████████| 2000/2000 [02:13<00:00, 14.95it/s]


Epoch 201 Mean Reward: 25.478078132629395


100%|██████████| 2000/2000 [02:12<00:00, 15.04it/s]


Epoch 202 Mean Reward: 32.47440849304199


100%|██████████| 2000/2000 [02:14<00:00, 14.88it/s]


Epoch 203 Mean Reward: 29.463364120483398


100%|██████████| 2000/2000 [02:18<00:00, 14.42it/s]


Epoch 204 Mean Reward: 35.833485610961915


100%|██████████| 2000/2000 [02:14<00:00, 14.88it/s]


Epoch 205 Mean Reward: 29.61880785369873


100%|██████████| 2000/2000 [02:19<00:00, 14.33it/s]


Epoch 206 Mean Reward: 35.723410362243655


100%|██████████| 2000/2000 [02:14<00:00, 14.85it/s]


Epoch 207 Mean Reward: 31.118179794311523


100%|██████████| 2000/2000 [02:18<00:00, 14.44it/s]


Epoch 208 Mean Reward: 33.8306092300415


100%|██████████| 2000/2000 [02:16<00:00, 14.66it/s]


Epoch 209 Mean Reward: 35.74551454162598


100%|██████████| 2000/2000 [02:18<00:00, 14.41it/s]


Epoch 210 Mean Reward: 38.53521937561035


100%|██████████| 2000/2000 [02:21<00:00, 14.11it/s]


Epoch 211 Mean Reward: 41.07358591461182


100%|██████████| 2000/2000 [02:19<00:00, 14.36it/s]


Epoch 212 Mean Reward: 40.1942342376709


100%|██████████| 2000/2000 [02:20<00:00, 14.26it/s]


Epoch 213 Mean Reward: 34.52341402435303


100%|██████████| 2000/2000 [02:21<00:00, 14.11it/s]


Epoch 214 Mean Reward: 43.27102294921875


100%|██████████| 2000/2000 [02:19<00:00, 14.30it/s]


Epoch 215 Mean Reward: 34.19339405822754


100%|██████████| 2000/2000 [02:22<00:00, 14.03it/s]


Epoch 216 Mean Reward: 45.98355874633789


100%|██████████| 2000/2000 [02:22<00:00, 14.05it/s]


Epoch 217 Mean Reward: 42.03844412231445


100%|██████████| 2000/2000 [02:21<00:00, 14.15it/s]


Epoch 218 Mean Reward: 42.85301890563965


100%|██████████| 2000/2000 [02:22<00:00, 14.08it/s]


Epoch 219 Mean Reward: 40.481744163513184


100%|██████████| 2000/2000 [02:21<00:00, 14.14it/s]


Epoch 220 Mean Reward: 47.0248423614502
Epoch 220 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 220 test with depth buffer:
Test Episode 1 Reward: 48.52375793457031
Test Episode 2 Reward: 183.2485809326172
Test Episode 3 Reward: 82.59262084960938
Test Episode 4 Reward: 48.52375793457031
Test Episode 5 Reward: 48.52375793457031
Test Episode 6 Reward: 48.52375793457031
Test Episode 7 Reward: 48.52375793457031
Test Episode 8 Reward: 22.297027587890625
Test Episode 9 Reward: 48.52375793457031
Test Episode 10 Reward: 71.32145690917969
Average Test Reward (with depth buffer:) 65.06022338867187
Epoch 220 test without depth buffer:
Test Episode 1 Reward: 56.596282958984375
Test Episode 2 Reward: 48.52375793457031
Test Episode 3 Reward: 48.52375793457031
Test Episode 4 Reward: 48.52375793457031
Test Episode 5 Reward: 48.52375793457031
Test Episode 6 Reward: 48.52375793457031
Test Episode 7 Reward: 48.52375793457031
Test Episode 8 Reward: 26.080535888671875
Test Episode 9 Reward: 7.939

100%|██████████| 2000/2000 [02:27<00:00, 13.60it/s]


Epoch 221 Mean Reward: 44.778049194335935


100%|██████████| 2000/2000 [02:28<00:00, 13.49it/s]


Epoch 222 Mean Reward: 44.399228439331054


100%|██████████| 2000/2000 [02:29<00:00, 13.36it/s]


Epoch 223 Mean Reward: 43.74475285339356


100%|██████████| 2000/2000 [02:35<00:00, 12.89it/s]


Epoch 224 Mean Reward: 50.67623879241943


100%|██████████| 2000/2000 [02:38<00:00, 12.64it/s]


Epoch 225 Mean Reward: 44.61308326721191


100%|██████████| 2000/2000 [02:31<00:00, 13.16it/s]


Epoch 226 Mean Reward: 42.83998554992676


100%|██████████| 2000/2000 [02:40<00:00, 12.49it/s]


Epoch 227 Mean Reward: 44.00843941497803


100%|██████████| 2000/2000 [02:35<00:00, 12.86it/s]


Epoch 228 Mean Reward: 56.94821244812012


100%|██████████| 2000/2000 [02:35<00:00, 12.88it/s]


Epoch 229 Mean Reward: 54.97723221588135


100%|██████████| 2000/2000 [02:35<00:00, 12.88it/s]


Epoch 230 Mean Reward: 58.51562985229492


100%|██████████| 2000/2000 [02:34<00:00, 12.93it/s]


Epoch 231 Mean Reward: 60.108532348632814


100%|██████████| 2000/2000 [02:36<00:00, 12.81it/s]


Epoch 232 Mean Reward: 60.228512344360354


100%|██████████| 2000/2000 [02:35<00:00, 12.89it/s]


Epoch 233 Mean Reward: 55.192639205932615


100%|██████████| 2000/2000 [02:34<00:00, 12.93it/s]


Epoch 234 Mean Reward: 62.45078492736816


100%|██████████| 2000/2000 [02:36<00:00, 12.74it/s]


Epoch 235 Mean Reward: 62.45689215087891


100%|██████████| 2000/2000 [02:38<00:00, 12.62it/s]


Epoch 236 Mean Reward: 67.67366219329834


100%|██████████| 2000/2000 [02:36<00:00, 12.78it/s]


Epoch 237 Mean Reward: 66.28682481384277


100%|██████████| 2000/2000 [02:37<00:00, 12.71it/s]


Epoch 238 Mean Reward: 63.18488204956055


100%|██████████| 2000/2000 [02:37<00:00, 12.67it/s]


Epoch 239 Mean Reward: 67.57845053863525


100%|██████████| 2000/2000 [02:35<00:00, 12.85it/s]


Epoch 240 Mean Reward: 70.27776667785645
Epoch 240 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 240 test with depth buffer:
Test Episode 1 Reward: 60.74528503417969
Test Episode 2 Reward: 185.92840576171875
Test Episode 3 Reward: 89.62374877929688
Test Episode 4 Reward: 20.327163696289062
Test Episode 5 Reward: 25.715927124023438
Test Episode 6 Reward: 109.80323791503906
Test Episode 7 Reward: 89.62374877929688
Test Episode 8 Reward: 185.92840576171875
Test Episode 9 Reward: 104.06108093261719
Test Episode 10 Reward: 390.5578918457031
Average Test Reward (with depth buffer:) 126.23148956298829
Epoch 240 test without depth buffer:
Test Episode 1 Reward: 51.00334167480469
Test Episode 2 Reward: 30.715560913085938
Test Episode 3 Reward: 98.7515869140625
Test Episode 4 Reward: 423.2014923095703
Test Episode 5 Reward: 185.92840576171875
Test Episode 6 Reward: 89.62374877929688
Test Episode 7 Reward: 262.8193359375
Test Episode 8 Reward: 322.9217987060547
Test Episode 9 Reward: 21

100%|██████████| 2000/2000 [02:45<00:00, 12.05it/s]


Epoch 241 Mean Reward: 59.76071945953369


100%|██████████| 2000/2000 [02:51<00:00, 11.65it/s]


Epoch 242 Mean Reward: 69.78472085571289


100%|██████████| 2000/2000 [02:47<00:00, 11.96it/s]


Epoch 243 Mean Reward: 73.58975247955323


100%|██████████| 2000/2000 [02:45<00:00, 12.12it/s]


Epoch 244 Mean Reward: 74.78664707183837


100%|██████████| 2000/2000 [02:45<00:00, 12.06it/s]


Epoch 245 Mean Reward: 74.59403035736084


100%|██████████| 2000/2000 [02:49<00:00, 11.80it/s]


Epoch 246 Mean Reward: 85.20434690093994


100%|██████████| 2000/2000 [02:48<00:00, 11.85it/s]


Epoch 247 Mean Reward: 82.52746807861328


100%|██████████| 2000/2000 [02:50<00:00, 11.75it/s]


Epoch 248 Mean Reward: 79.68474964904784


100%|██████████| 2000/2000 [02:47<00:00, 11.91it/s]


Epoch 249 Mean Reward: 81.78948561096192


100%|██████████| 2000/2000 [02:51<00:00, 11.68it/s]


Epoch 250 Mean Reward: 80.80656266784668


100%|██████████| 2000/2000 [02:49<00:00, 11.77it/s]


Epoch 251 Mean Reward: 93.9274394607544


100%|██████████| 2000/2000 [02:52<00:00, 11.62it/s]


Epoch 252 Mean Reward: 105.60606782531738


100%|██████████| 2000/2000 [02:56<00:00, 11.32it/s]


Epoch 253 Mean Reward: 95.2896097869873


100%|██████████| 2000/2000 [02:57<00:00, 11.28it/s]


Epoch 254 Mean Reward: 99.35004048919677


100%|██████████| 2000/2000 [02:55<00:00, 11.40it/s]


Epoch 255 Mean Reward: 105.22174159240723


100%|██████████| 2000/2000 [02:52<00:00, 11.58it/s]


Epoch 256 Mean Reward: 105.01460943603516


100%|██████████| 2000/2000 [02:55<00:00, 11.42it/s]


Epoch 257 Mean Reward: 109.20135961151124


100%|██████████| 2000/2000 [03:01<00:00, 11.02it/s]


Epoch 258 Mean Reward: 116.47044395446777


100%|██████████| 2000/2000 [02:59<00:00, 11.17it/s]


Epoch 259 Mean Reward: 113.37981177520751


100%|██████████| 2000/2000 [03:01<00:00, 11.01it/s]


Epoch 260 Mean Reward: 112.19115814208985
Epoch 260 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 260 test with depth buffer:
Test Episode 1 Reward: 77.04808044433594
Test Episode 2 Reward: 7.959228515625
Test Episode 3 Reward: 8.2008056640625
Test Episode 4 Reward: 8.2008056640625
Test Episode 5 Reward: 8.2008056640625
Test Episode 6 Reward: 8.2008056640625
Test Episode 7 Reward: 1.3792724609375
Test Episode 8 Reward: 8.716293334960938
Test Episode 9 Reward: 231.80169677734375
Test Episode 10 Reward: 8.2008056640625
Average Test Reward (with depth buffer:) 36.79085998535156
Epoch 260 test without depth buffer:
Test Episode 1 Reward: 21.457916259765625
Test Episode 2 Reward: -39.210845947265625
Test Episode 3 Reward: 8.2008056640625
Test Episode 4 Reward: 188.63902282714844
Test Episode 5 Reward: -44.662933349609375
Test Episode 6 Reward: 58.46205139160156
Test Episode 7 Reward: 8.2008056640625
Test Episode 8 Reward: 8.2008056640625
Test Episode 9 Reward: 303.31492614746094
T

100%|██████████| 2000/2000 [03:12<00:00, 10.41it/s]


Epoch 261 Mean Reward: 112.36041110229492


100%|██████████| 2000/2000 [03:14<00:00, 10.30it/s]


Epoch 262 Mean Reward: 119.23727365112305


100%|██████████| 2000/2000 [03:12<00:00, 10.37it/s]


Epoch 263 Mean Reward: 122.88149341583252


100%|██████████| 2000/2000 [03:02<00:00, 10.96it/s]


Epoch 264 Mean Reward: 121.99081611633301


100%|██████████| 2000/2000 [03:08<00:00, 10.63it/s]


Epoch 265 Mean Reward: 125.2781968383789


100%|██████████| 2000/2000 [03:03<00:00, 10.90it/s]


Epoch 266 Mean Reward: 119.99701973724365


100%|██████████| 2000/2000 [03:14<00:00, 10.30it/s]


Epoch 267 Mean Reward: 134.47288803100585


100%|██████████| 2000/2000 [03:11<00:00, 10.43it/s]


Epoch 268 Mean Reward: 125.08574313354492


100%|██████████| 2000/2000 [03:08<00:00, 10.62it/s]


Epoch 269 Mean Reward: 132.0020233078003


100%|██████████| 2000/2000 [03:08<00:00, 10.58it/s]


Epoch 270 Mean Reward: 134.08578984832764


100%|██████████| 2000/2000 [03:32<00:00,  9.40it/s]


Epoch 271 Mean Reward: 131.84483157348632


100%|██████████| 2000/2000 [03:29<00:00,  9.56it/s]


Epoch 272 Mean Reward: 135.1561423110962


100%|██████████| 2000/2000 [03:29<00:00,  9.53it/s]


Epoch 273 Mean Reward: 134.48822129821778


100%|██████████| 2000/2000 [03:31<00:00,  9.44it/s]


Epoch 274 Mean Reward: 136.41219158172606


100%|██████████| 2000/2000 [03:34<00:00,  9.35it/s]


Epoch 275 Mean Reward: 136.10003343963623


100%|██████████| 2000/2000 [03:42<00:00,  8.98it/s]


Epoch 276 Mean Reward: 141.18230869293214


100%|██████████| 2000/2000 [03:35<00:00,  9.28it/s]


Epoch 277 Mean Reward: 150.88431479644777


100%|██████████| 2000/2000 [03:33<00:00,  9.35it/s]


Epoch 278 Mean Reward: 152.20751837921142


100%|██████████| 2000/2000 [03:38<00:00,  9.16it/s]


Epoch 279 Mean Reward: 150.67533879089356


100%|██████████| 2000/2000 [03:36<00:00,  9.24it/s]


Epoch 280 Mean Reward: 153.30126638793945
Epoch 280 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 280 test with depth buffer:
Test Episode 1 Reward: -73.72346496582031
Test Episode 2 Reward: -9.830093383789062
Test Episode 3 Reward: -28.709243774414062
Test Episode 4 Reward: -108.48500061035156
Test Episode 5 Reward: -108.48500061035156
Test Episode 6 Reward: 2.0232086181640625
Test Episode 7 Reward: 140.6801300048828
Test Episode 8 Reward: 20.32794189453125
Test Episode 9 Reward: -108.48500061035156
Test Episode 10 Reward: -9.100875854492188
Average Test Reward (with depth buffer:) -28.37873992919922
Epoch 280 test without depth buffer:
Test Episode 1 Reward: 1.32867431640625
Test Episode 2 Reward: 308.9603271484375
Test Episode 3 Reward: 18.083877563476562
Test Episode 4 Reward: 30.575225830078125
Test Episode 5 Reward: -108.48500061035156
Test Episode 6 Reward: -66.41473388671875
Test Episode 7 Reward: -108.48500061035156
Test Episode 8 Reward: -108.48500061035156
Test Epi

100%|██████████| 2000/2000 [04:03<00:00,  8.21it/s]


Epoch 281 Mean Reward: 152.64188959503173


100%|██████████| 2000/2000 [03:49<00:00,  8.70it/s]


Epoch 282 Mean Reward: 157.20715754699708


100%|██████████| 2000/2000 [03:57<00:00,  8.41it/s]


Epoch 283 Mean Reward: 157.75033568573


100%|██████████| 2000/2000 [03:58<00:00,  8.37it/s]


Epoch 284 Mean Reward: 150.63441331481934


100%|██████████| 2000/2000 [03:53<00:00,  8.57it/s]


Epoch 285 Mean Reward: 165.7161930847168


100%|██████████| 2000/2000 [03:54<00:00,  8.54it/s]


Epoch 286 Mean Reward: 167.78084758758544


100%|██████████| 2000/2000 [04:05<00:00,  8.16it/s]


Epoch 287 Mean Reward: 173.0338378601074


100%|██████████| 2000/2000 [03:52<00:00,  8.60it/s]


Epoch 288 Mean Reward: 183.69666847991942


100%|██████████| 2000/2000 [03:51<00:00,  8.66it/s]


Epoch 289 Mean Reward: 171.5874099960327


100%|██████████| 2000/2000 [03:52<00:00,  8.59it/s]


Epoch 290 Mean Reward: 181.5703915863037


100%|██████████| 2000/2000 [04:03<00:00,  8.20it/s]


Epoch 291 Mean Reward: 170.30424026489257


100%|██████████| 2000/2000 [04:02<00:00,  8.25it/s]


Epoch 292 Mean Reward: 175.27880367279053


100%|██████████| 2000/2000 [03:57<00:00,  8.42it/s]


Epoch 293 Mean Reward: 183.8558252029419


100%|██████████| 2000/2000 [04:03<00:00,  8.21it/s]


Epoch 294 Mean Reward: 176.48657317352294


100%|██████████| 2000/2000 [04:10<00:00,  8.00it/s]


Epoch 295 Mean Reward: 181.82426343536378


100%|██████████| 2000/2000 [04:07<00:00,  8.07it/s]


Epoch 296 Mean Reward: 182.94013941192628


100%|██████████| 2000/2000 [04:17<00:00,  7.77it/s]


Epoch 297 Mean Reward: 190.5928024368286


100%|██████████| 2000/2000 [04:07<00:00,  8.08it/s]


Epoch 298 Mean Reward: 183.9883235321045


100%|██████████| 2000/2000 [04:18<00:00,  7.73it/s]


Epoch 299 Mean Reward: 189.7735328903198


100%|██████████| 2000/2000 [04:02<00:00,  8.25it/s]


Epoch 300 Mean Reward: 192.48953087615968
Epoch 300 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 300 test with depth buffer:
Test Episode 1 Reward: 50.106597900390625
Test Episode 2 Reward: 281.8015441894531
Test Episode 3 Reward: 139.3054656982422
Test Episode 4 Reward: -11.980697631835938
Test Episode 5 Reward: -0.0035858154296875
Test Episode 6 Reward: 50.106597900390625
Test Episode 7 Reward: 90.417236328125
Test Episode 8 Reward: 50.106597900390625
Test Episode 9 Reward: -73.46220397949219
Test Episode 10 Reward: -34.477508544921875
Average Test Reward (with depth buffer:) 54.19200439453125
Epoch 300 test without depth buffer:
Test Episode 1 Reward: 50.106597900390625
Test Episode 2 Reward: 23.446014404296875
Test Episode 3 Reward: -2.502685546875
Test Episode 4 Reward: -92.45570373535156
Test Episode 5 Reward: 50.106597900390625
Test Episode 6 Reward: 174.682373046875
Test Episode 7 Reward: 90.417236328125
Test Episode 8 Reward: 50.106597900390625
Test Episode 9 Reward

100%|██████████| 2000/2000 [04:42<00:00,  7.09it/s]


Epoch 301 Mean Reward: 191.5942024383545


100%|██████████| 2000/2000 [04:22<00:00,  7.61it/s]


Epoch 302 Mean Reward: 198.60395361328125


100%|██████████| 2000/2000 [04:05<00:00,  8.15it/s]


Epoch 303 Mean Reward: 198.17406287384034


100%|██████████| 2000/2000 [04:21<00:00,  7.65it/s]


Epoch 304 Mean Reward: 209.2569066619873


100%|██████████| 2000/2000 [04:31<00:00,  7.38it/s]


Epoch 305 Mean Reward: 210.18471981811524


100%|██████████| 2000/2000 [04:30<00:00,  7.40it/s]


Epoch 306 Mean Reward: 212.77745557403566


100%|██████████| 2000/2000 [04:20<00:00,  7.68it/s]


Epoch 307 Mean Reward: 214.28733435058595


100%|██████████| 2000/2000 [04:11<00:00,  7.95it/s]


Epoch 308 Mean Reward: 217.72444358062745


100%|██████████| 2000/2000 [04:18<00:00,  7.72it/s]


Epoch 309 Mean Reward: 210.08014912414552


100%|██████████| 2000/2000 [04:08<00:00,  8.03it/s]


Epoch 310 Mean Reward: 209.82934545135498


100%|██████████| 2000/2000 [04:25<00:00,  7.52it/s]


Epoch 311 Mean Reward: 215.6838595352173


100%|██████████| 2000/2000 [04:39<00:00,  7.16it/s]


Epoch 312 Mean Reward: 216.33595304870605


100%|██████████| 2000/2000 [04:24<00:00,  7.56it/s]


Epoch 313 Mean Reward: 209.99936808776854


100%|██████████| 2000/2000 [04:28<00:00,  7.46it/s]


Epoch 314 Mean Reward: 222.54791800689696


100%|██████████| 2000/2000 [04:41<00:00,  7.12it/s]


Epoch 315 Mean Reward: 220.78966319274903


100%|██████████| 2000/2000 [04:38<00:00,  7.19it/s]


Epoch 316 Mean Reward: 227.08886241149904


100%|██████████| 2000/2000 [04:29<00:00,  7.41it/s]


Epoch 317 Mean Reward: 214.65735762023925


100%|██████████| 2000/2000 [04:21<00:00,  7.65it/s]


Epoch 318 Mean Reward: 231.01363320922852


100%|██████████| 2000/2000 [04:37<00:00,  7.22it/s]


Epoch 319 Mean Reward: 220.3244140396118


100%|██████████| 2000/2000 [04:25<00:00,  7.54it/s]


Epoch 320 Mean Reward: 233.20636780548097
Epoch 320 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 320 test with depth buffer:
Test Episode 1 Reward: -42.00175476074219
Test Episode 2 Reward: 20.187606811523438
Test Episode 3 Reward: -57.01763916015625
Test Episode 4 Reward: -105.70201110839844
Test Episode 5 Reward: -105.70201110839844
Test Episode 6 Reward: 0.279083251953125
Test Episode 7 Reward: -105.70201110839844
Test Episode 8 Reward: -103.57316589355469
Test Episode 9 Reward: -105.70201110839844
Test Episode 10 Reward: 2.5382537841796875
Average Test Reward (with depth buffer:) -60.239566040039065
Epoch 320 test without depth buffer:
Test Episode 1 Reward: -105.70201110839844
Test Episode 2 Reward: -66.78001403808594
Test Episode 3 Reward: -56.43353271484375
Test Episode 4 Reward: -23.577255249023438
Test Episode 5 Reward: -24.016983032226562
Test Episode 6 Reward: -105.70201110839844
Test Episode 7 Reward: -105.70201110839844
Test Episode 8 Reward: -105.70201110839844

100%|██████████| 2000/2000 [05:11<00:00,  6.41it/s]


Epoch 321 Mean Reward: 260.7745969390869


100%|██████████| 2000/2000 [05:59<00:00,  5.57it/s]


Epoch 322 Mean Reward: 268.33933066558836


100%|██████████| 2000/2000 [05:22<00:00,  6.20it/s]


Epoch 323 Mean Reward: 272.54308726501466


100%|██████████| 2000/2000 [05:35<00:00,  5.95it/s]


Epoch 324 Mean Reward: 278.6316449737549


100%|██████████| 2000/2000 [05:40<00:00,  5.87it/s]


Epoch 325 Mean Reward: 267.6443008956909


100%|██████████| 2000/2000 [05:34<00:00,  5.98it/s]


Epoch 326 Mean Reward: 272.4386986541748


100%|██████████| 2000/2000 [05:20<00:00,  6.24it/s]


Epoch 327 Mean Reward: 297.63652362823484


100%|██████████| 2000/2000 [05:26<00:00,  6.12it/s]


Epoch 328 Mean Reward: 300.349482673645


100%|██████████| 2000/2000 [05:34<00:00,  5.99it/s]


Epoch 329 Mean Reward: 292.85134813690183


100%|██████████| 2000/2000 [05:25<00:00,  6.15it/s]


Epoch 330 Mean Reward: 286.5981970901489


100%|██████████| 2000/2000 [04:22<00:00,  7.63it/s]


Epoch 331 Mean Reward: 297.9699295501709


100%|██████████| 2000/2000 [04:22<00:00,  7.62it/s]


Epoch 332 Mean Reward: 311.1087338562012


100%|██████████| 2000/2000 [04:42<00:00,  7.09it/s]


Epoch 333 Mean Reward: 319.90411862182617


100%|██████████| 2000/2000 [04:27<00:00,  7.47it/s]


Epoch 334 Mean Reward: 322.0646431655884


100%|██████████| 2000/2000 [04:27<00:00,  7.48it/s]


Epoch 335 Mean Reward: 310.83122234344484


100%|██████████| 2000/2000 [04:43<00:00,  7.05it/s]


Epoch 336 Mean Reward: 325.28082598114014


100%|██████████| 2000/2000 [04:58<00:00,  6.70it/s]


Epoch 337 Mean Reward: 309.41619437408445


100%|██████████| 2000/2000 [04:37<00:00,  7.21it/s]


Epoch 338 Mean Reward: 320.6342912445068


100%|██████████| 2000/2000 [04:39<00:00,  7.15it/s]


Epoch 339 Mean Reward: 337.1896271514893


100%|██████████| 2000/2000 [04:36<00:00,  7.24it/s]


Epoch 340 Mean Reward: 342.93823021698
Epoch 340 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 340 test with depth buffer:
Test Episode 1 Reward: -115.26446533203125
Test Episode 2 Reward: -74.37162780761719
Test Episode 3 Reward: 79.63093566894531
Test Episode 4 Reward: -19.44482421875
Test Episode 5 Reward: -115.26446533203125
Test Episode 6 Reward: -115.26446533203125
Test Episode 7 Reward: -91.74058532714844
Test Episode 8 Reward: -115.26446533203125
Test Episode 9 Reward: -66.50827026367188
Test Episode 10 Reward: -115.26446533203125
Average Test Reward (with depth buffer:) -74.87566986083985
Epoch 340 test without depth buffer:
Test Episode 1 Reward: -83.60694885253906
Test Episode 2 Reward: -115.26446533203125
Test Episode 3 Reward: -115.26446533203125
Test Episode 4 Reward: -64.26301574707031
Test Episode 5 Reward: -101.08415222167969
Test Episode 6 Reward: -24.529052734375
Test Episode 7 Reward: -43.21907043457031
Test Episode 8 Reward: -115.26446533203125
Test Episo

100%|██████████| 2000/2000 [05:22<00:00,  6.19it/s]


Epoch 341 Mean Reward: 329.04646672058107


100%|██████████| 2000/2000 [05:08<00:00,  6.48it/s]


Epoch 342 Mean Reward: 346.149676902771


100%|██████████| 2000/2000 [04:54<00:00,  6.80it/s]


Epoch 343 Mean Reward: 336.977972366333


100%|██████████| 2000/2000 [05:05<00:00,  6.55it/s]


Epoch 344 Mean Reward: 333.4018515625


100%|██████████| 2000/2000 [05:15<00:00,  6.33it/s]


Epoch 345 Mean Reward: 344.59877908325194


100%|██████████| 2000/2000 [05:12<00:00,  6.39it/s]


Epoch 346 Mean Reward: 352.7318739013672


100%|██████████| 2000/2000 [05:02<00:00,  6.60it/s]


Epoch 347 Mean Reward: 349.63240772247315


100%|██████████| 2000/2000 [05:20<00:00,  6.25it/s]


Epoch 348 Mean Reward: 364.73531233215334


100%|██████████| 2000/2000 [05:19<00:00,  6.27it/s]


Epoch 349 Mean Reward: 367.68285874938965


100%|██████████| 2000/2000 [05:24<00:00,  6.17it/s]


Epoch 350 Mean Reward: 360.2324045715332


100%|██████████| 2000/2000 [05:55<00:00,  5.63it/s]


Epoch 351 Mean Reward: 343.52030402374265


100%|██████████| 2000/2000 [05:53<00:00,  5.65it/s]


Epoch 352 Mean Reward: 319.2782982940674


100%|██████████| 2000/2000 [05:39<00:00,  5.89it/s]


Epoch 353 Mean Reward: 335.4130317687988


100%|██████████| 2000/2000 [05:40<00:00,  5.87it/s]


Epoch 354 Mean Reward: 349.012688369751


100%|██████████| 2000/2000 [06:05<00:00,  5.48it/s]


Epoch 355 Mean Reward: 333.0995009994507


100%|██████████| 2000/2000 [06:13<00:00,  5.36it/s]


Epoch 356 Mean Reward: 340.10396362304687


100%|██████████| 2000/2000 [05:40<00:00,  5.88it/s]


Epoch 357 Mean Reward: 350.64435329437254


100%|██████████| 2000/2000 [05:46<00:00,  5.78it/s]


Epoch 358 Mean Reward: 351.2923265533447


100%|██████████| 2000/2000 [05:36<00:00,  5.94it/s]


Epoch 359 Mean Reward: 360.04456399536133


100%|██████████| 2000/2000 [06:12<00:00,  5.37it/s]


Epoch 360 Mean Reward: 352.63786170196533
Epoch 360 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 360 test with depth buffer:
Test Episode 1 Reward: -27.394241333007812
Test Episode 2 Reward: -2.69854736328125
Test Episode 3 Reward: 25.057830810546875
Test Episode 4 Reward: 25.057830810546875
Test Episode 5 Reward: 25.057830810546875
Test Episode 6 Reward: 25.057830810546875
Test Episode 7 Reward: 25.057830810546875
Test Episode 8 Reward: -97.31742858886719
Test Episode 9 Reward: -75.00848388671875
Test Episode 10 Reward: 25.057830810546875
Average Test Reward (with depth buffer:) -5.207171630859375
Epoch 360 test without depth buffer:
Test Episode 1 Reward: 210.15692138671875
Test Episode 2 Reward: -85.92189025878906
Test Episode 3 Reward: 25.057830810546875
Test Episode 4 Reward: 25.057830810546875
Test Episode 5 Reward: 79.10647583007812
Test Episode 6 Reward: 25.057830810546875
Test Episode 7 Reward: -115.20893859863281
Test Episode 8 Reward: 25.057830810546875
Test Episo

100%|██████████| 2000/2000 [06:08<00:00,  5.43it/s]


Epoch 361 Mean Reward: 383.7752922592163


100%|██████████| 2000/2000 [05:36<00:00,  5.94it/s]


Epoch 362 Mean Reward: 394.6653423309326


100%|██████████| 2000/2000 [05:50<00:00,  5.71it/s]


Epoch 363 Mean Reward: 384.7087325592041


100%|██████████| 2000/2000 [05:27<00:00,  6.11it/s]


Epoch 364 Mean Reward: 394.56705102539064


100%|██████████| 2000/2000 [05:26<00:00,  6.12it/s]


Epoch 365 Mean Reward: 397.77999002075194


100%|██████████| 2000/2000 [05:30<00:00,  6.05it/s]


Epoch 366 Mean Reward: 402.6998564147949


100%|██████████| 2000/2000 [05:25<00:00,  6.14it/s]


Epoch 367 Mean Reward: 376.96733481597903


100%|██████████| 2000/2000 [05:09<00:00,  6.47it/s]


Epoch 368 Mean Reward: 398.87308407592775


100%|██████████| 2000/2000 [05:07<00:00,  6.50it/s]


Epoch 369 Mean Reward: 394.099279296875


100%|██████████| 2000/2000 [04:56<00:00,  6.75it/s]


Epoch 370 Mean Reward: 408.5110432281494


100%|██████████| 2000/2000 [04:58<00:00,  6.69it/s]


Epoch 371 Mean Reward: 428.97869525146484


100%|██████████| 2000/2000 [05:11<00:00,  6.42it/s]


Epoch 372 Mean Reward: 450.5892828063965


100%|██████████| 2000/2000 [05:20<00:00,  6.25it/s]


Epoch 373 Mean Reward: 434.0620110244751


100%|██████████| 2000/2000 [05:17<00:00,  6.31it/s]


Epoch 374 Mean Reward: 400.9292630081177


100%|██████████| 2000/2000 [05:35<00:00,  5.96it/s]


Epoch 375 Mean Reward: 412.02961184692384


100%|██████████| 2000/2000 [05:35<00:00,  5.95it/s]


Epoch 376 Mean Reward: 421.495641708374


100%|██████████| 2000/2000 [05:37<00:00,  5.93it/s]


Epoch 377 Mean Reward: 415.56701176452634


100%|██████████| 2000/2000 [05:49<00:00,  5.73it/s]


Epoch 378 Mean Reward: 410.7796134719849


100%|██████████| 2000/2000 [05:23<00:00,  6.19it/s]


Epoch 379 Mean Reward: 412.6111278305054


100%|██████████| 2000/2000 [05:33<00:00,  6.00it/s]


Epoch 380 Mean Reward: 407.1249964828491
Epoch 380 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 380 test with depth buffer:
Test Episode 1 Reward: -115.99932861328125
Test Episode 2 Reward: -96.61578369140625
Test Episode 3 Reward: -96.61578369140625
Test Episode 4 Reward: 23.865310668945312
Test Episode 5 Reward: -96.61578369140625
Test Episode 6 Reward: 27.077377319335938
Test Episode 7 Reward: 40.39909362792969
Test Episode 8 Reward: -96.61578369140625
Test Episode 9 Reward: -94.20524597167969
Test Episode 10 Reward: -96.61578369140625
Average Test Reward (with depth buffer:) -60.19417114257813
Epoch 380 test without depth buffer:
Test Episode 1 Reward: -96.61578369140625
Test Episode 2 Reward: -96.61578369140625
Test Episode 3 Reward: -78.73976135253906
Test Episode 4 Reward: -96.61578369140625
Test Episode 5 Reward: -96.61578369140625
Test Episode 6 Reward: 81.71662902832031
Test Episode 7 Reward: -71.70115661621094
Test Episode 8 Reward: -96.61578369140625
Test Episode

100%|██████████| 2000/2000 [06:04<00:00,  5.49it/s]


Epoch 381 Mean Reward: 404.5763961715698


100%|██████████| 2000/2000 [05:48<00:00,  5.74it/s]


Epoch 382 Mean Reward: 411.5529667510986


100%|██████████| 2000/2000 [06:11<00:00,  5.39it/s]


Epoch 383 Mean Reward: 398.99005250549317


100%|██████████| 2000/2000 [05:27<00:00,  6.11it/s]


Epoch 384 Mean Reward: 400.8290301818848


100%|██████████| 2000/2000 [06:12<00:00,  5.37it/s]


Epoch 385 Mean Reward: 410.3765215377808


100%|██████████| 2000/2000 [06:25<00:00,  5.18it/s]


Epoch 386 Mean Reward: 396.36954135131833


100%|██████████| 2000/2000 [06:12<00:00,  5.37it/s]


Epoch 387 Mean Reward: 405.5828838195801


100%|██████████| 2000/2000 [05:40<00:00,  5.87it/s]


Epoch 388 Mean Reward: 409.5725094146729


100%|██████████| 2000/2000 [05:59<00:00,  5.56it/s]


Epoch 389 Mean Reward: 408.16689453125


100%|██████████| 2000/2000 [06:31<00:00,  5.11it/s]


Epoch 390 Mean Reward: 377.88397077178956


100%|██████████| 2000/2000 [06:25<00:00,  5.19it/s]


Epoch 391 Mean Reward: 348.5024795379639


100%|██████████| 2000/2000 [06:22<00:00,  5.23it/s]


Epoch 392 Mean Reward: 369.17087092590333


100%|██████████| 2000/2000 [06:32<00:00,  5.09it/s]


Epoch 393 Mean Reward: 367.70892375946045


100%|██████████| 2000/2000 [06:19<00:00,  5.27it/s]


Epoch 394 Mean Reward: 372.2078285446167


100%|██████████| 2000/2000 [06:21<00:00,  5.25it/s]


Epoch 395 Mean Reward: 365.18879167938235


100%|██████████| 2000/2000 [06:17<00:00,  5.30it/s]


Epoch 396 Mean Reward: 362.34161380004883


100%|██████████| 2000/2000 [06:28<00:00,  5.15it/s]


Epoch 397 Mean Reward: 369.3445495376587


100%|██████████| 2000/2000 [07:12<00:00,  4.62it/s]


Epoch 398 Mean Reward: 380.8345314254761


100%|██████████| 2000/2000 [07:43<00:00,  4.31it/s]


Epoch 399 Mean Reward: 372.17416942596435


100%|██████████| 2000/2000 [07:50<00:00,  4.25it/s]


Epoch 400 Mean Reward: 363.19639587402344
Epoch 400 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 400 test with depth buffer:
Test Episode 1 Reward: -98.86990356445312
Test Episode 2 Reward: -115.99299621582031
Test Episode 3 Reward: -115.99299621582031
Test Episode 4 Reward: -115.99299621582031
Test Episode 5 Reward: -115.99299621582031
Test Episode 6 Reward: -115.99299621582031
Test Episode 7 Reward: -54.17417907714844
Test Episode 8 Reward: -32.19956970214844
Test Episode 9 Reward: -64.91552734375
Test Episode 10 Reward: -90.23526000976562
Average Test Reward (with depth buffer:) -92.03594207763672
Epoch 400 test without depth buffer:
Test Episode 1 Reward: -60.7001953125
Test Episode 2 Reward: -115.99299621582031
Test Episode 3 Reward: -115.99299621582031
Test Episode 4 Reward: -115.99771118164062
Test Episode 5 Reward: -70.07719421386719
Test Episode 6 Reward: -115.99299621582031
Test Episode 7 Reward: -63.11714172363281
Test Episode 8 Reward: -115.99299621582031
Test Ep

In [9]:
#Get a list of checkpoints saved during training

ckpts = tf.train.get_checkpoint_state('checkpoints').all_model_checkpoint_paths

#Test the trained model from a certain checkpoint by only choosing actions with a greedy strategy

for i in range(len(ckpts)):
    test_reward = test_agent(DQN, num_episodes=20,
                             training=False,
                             load_model=True,
                             depth=True,
                             model_dir=ckpts[i])
    print('Average Test Reward (with depth buffer):', test_reward)

    test_reward = test_agent(DQN, num_episodes=20,
                             training=False,
                             load_model=True,
                             depth=False,
                             model_dir=ckpts[i])
    print('Average Test Reward (without depth buffer):', test_reward)


Loading model from checkpoints\deadly_corridor.ckpt-280
INFO:tensorflow:Restoring parameters from checkpoints\deadly_corridor.ckpt-280
Test Episode 1 Reward: 86.26907348632812
Test Episode 2 Reward: 86.26907348632812
Test Episode 3 Reward: -18.653610229492188
Test Episode 4 Reward: 86.26907348632812
Test Episode 5 Reward: 86.26907348632812
Test Episode 6 Reward: 86.26907348632812
Test Episode 7 Reward: -7.503387451171875
Test Episode 8 Reward: 17.1351318359375
Test Episode 9 Reward: -34.23899841308594
Test Episode 10 Reward: 40.25080871582031
Test Episode 11 Reward: 86.26907348632812
Test Episode 12 Reward: 86.26907348632812
Test Episode 13 Reward: 3.072601318359375
Test Episode 14 Reward: 86.26907348632812
Test Episode 15 Reward: -70.02059936523438
Test Episode 16 Reward: -16.3201904296875
Test Episode 17 Reward: 86.26907348632812
Test Episode 18 Reward: 86.26907348632812
Test Episode 19 Reward: 86.26907348632812
Test Episode 20 Reward: -3.498138427734375
Average Test Reward (with dep

Test Episode 19 Reward: -60.37471008300781
Test Episode 20 Reward: -25.396347045898438
Average Test Reward (without depth buffer): -69.25540618896484
Loading model from checkpoints\deadly_corridor.ckpt-360
INFO:tensorflow:Restoring parameters from checkpoints\deadly_corridor.ckpt-360
Test Episode 1 Reward: 204.50531005859375
Test Episode 2 Reward: -115.98355102539062
Test Episode 3 Reward: -33.37709045410156
Test Episode 4 Reward: 72.03155517578125
Test Episode 5 Reward: -115.98355102539062
Test Episode 6 Reward: 28.599777221679688
Test Episode 7 Reward: -115.98355102539062
Test Episode 8 Reward: -87.50218200683594
Test Episode 9 Reward: -79.17860412597656
Test Episode 10 Reward: -115.98355102539062
Test Episode 11 Reward: -29.598464965820312
Test Episode 12 Reward: -79.91119384765625
Test Episode 13 Reward: -71.83348083496094
Test Episode 14 Reward: 167.47991943359375
Test Episode 15 Reward: 40.62907409667969
Test Episode 16 Reward: -115.94712829589844
Test Episode 17 Reward: -16.7998