In [1]:
import importlib.util
import time

import tensorflow as tf
import numpy as np

from skimage.transform import rescale
from tqdm import trange
from IPython.display import HTML

#Import the vizdoom package as "vd" since it can't be installed normally on Windows

vd_location = 'C:/Anaconda3/envs/doom/Lib/site-packages/vizdoom/vizdoom.pyd'
vizdoom = importlib.util.spec_from_file_location('vizdoom',
                                                 vd_location)
vd = importlib.util.module_from_spec(vizdoom)
vizdoom.loader.exec_module(vd)


In [2]:
#Specify the game scenario and the screen format/resolution

game = vd.DoomGame()
game.set_screen_format(vd.ScreenFormat.BGR24)
game.set_depth_buffer_enabled(False)
game.set_screen_resolution(vd.ScreenResolution.RES_160X120)
game.load_config('deadly_corridor.cfg')

down_sample_ratio = 0.5
width = int(game.get_screen_width()*down_sample_ratio)
height = int(game.get_screen_height()*down_sample_ratio)
channels = game.get_screen_channels() + 1

#Specify the available actions in the scenario

available_actions = game.get_available_buttons()
actions = [list(ohe) for ohe in list(np.identity(len(available_actions)))]
num_actions = len(available_actions)

#Specify the Q-network learning parameters

frame_delay = 12
buffer_size = 50000
epochs = 400
steps_per_epoch = 2000
learning_rate = 0.005
gamma = 0
start_epsilon = 1.0
end_epsilon = 0.1
batch_size = 100
load_model = False
save_model = True
model_dir = './checkpoints/deadly_corridor.ckpt'


In [3]:
#Create a buffer object that holds a set of training experiences (state-action-reward tuples)

class Buffer():
    def __init__(self, size=1000):
        self.buffer = list()
        self.length = len(self.buffer)
        self.size = size
        
#Add a new experience to the buffer (remove the oldest experience if the buffer is already full)
        
    def add_experience(self, experience):
        if self.length + 1 >= self.size:
            self.buffer[0:(self.length + 1) - self.size] = []
        
        self.buffer.append(experience)
        self.length = len(self.buffer)
            
#Return a batch of experience arrays randomly sampled from the buffer
            
    def sample_buffer(self, sample_size):
        sample = np.random.randint(self.length, size=sample_size)
        s1 = np.concatenate([self.buffer[idx][0] for idx in sample], axis=0)
        a = np.array([self.buffer[idx][1] for idx in sample])
        r = np.array([self.buffer[idx][2] for idx in sample])
        s2 = np.concatenate([self.buffer[idx][3] for idx in sample], axis=0)
        terminal = np.array([self.buffer[idx][4] for idx in sample], dtype=np.int32)
        
        return s1, a, r, s2, terminal

#Downsample and normalize an image array representing the game state at a given time stamp

def preprocess(image, down_sample_ratio=1):
    if down_sample_ratio != 1:
        image = rescale(image=image, scale=down_sample_ratio)
    image = image.astype(np.float32)
    image = np.expand_dims(image, axis=0)

    return image

#Test the agent using a currently training or previously trained model

def test_agent(model, num_episodes, load_model, training=True, session=None, model_dir=None):
    if load_model == True:
        sess = tf.Session()
        print('Loading model from', model_dir)
        tf.train.Saver().restore(sess, model_dir)
        
#Require an existing session if a pretrained model isn't provided
        
    elif load_model == False:
        sess = session

    game.set_sound_enabled(True)
    episode_rewards = list()
    
#Avoid reinitializing the game if this was already done by the training process
    
    if training == False:
        game.init()

    for i in range(num_episodes):
        game.new_episode()
    
        while not game.is_episode_finished():
            state = game.get_state()
            
            if game.is_depth_buffer_enabled() == False:
                depth_buffer = np.zeros(state.screen_buffer[:, :, :1].shape)
            else:
                depth_buffer = np.expand_dims(state.depth_buffer, axis=2)
                
            buffer = np.concatenate((state.screen_buffer, depth_buffer), axis=2)
            state1 = preprocess(buffer, down_sample_ratio)
            action = model.choose_action(sess, state1)[0]
            reward = game.make_action(actions[action])
            
#Add a delay between each time step so that the episodes occur at normal speed

            time.sleep(0.02)
        
        episode_rewards.append(game.get_total_reward())
        print('Test Episode {} Reward: {}'.format(i + 1, game.get_total_reward()))
        time.sleep(1)
    
#Avoid ending the game so that the training process can continue
    
    if training == False:
        game.close()
    
    return np.mean(episode_rewards)


In [4]:
#Create a Q-network to estimate values and choose actions for a given state

class Q_network():
    def __init__(self, network_name, height, width, channels, learning_rate=0.001):
        self.learning_rate = learning_rate
        self.s_t = tf.placeholder(tf.float32,
                                  shape=[None, height, width, channels],
                                  name=network_name + '_state'
                                 )
        self.a_t = tf.placeholder(tf.int32,
                                  shape=[None],
                                  name=network_name + '_action'
                                 )
        self.Q_target = tf.placeholder(tf.float32,
                                       shape=[None, num_actions],
                                       name=network_name + '_Q_target'
                                      )

        self.input_layer = tf.reshape(self.s_t,
                                      [-1, height, width, channels],
                                      name=network_name + '_input_layer'
                                     )
        self.conv1 = tf.layers.conv2d(inputs=self.input_layer,
                                      filters=32,
                                      kernel_size=[8, 8],
                                      strides=[4, 4],
                                      padding='valid',
                                      activation=tf.nn.relu,
                                      name=network_name + '_conv1_layer'
                                     )
        self.conv2 = tf.layers.conv2d(inputs=self.conv1,
                                      filters=64,
                                      kernel_size=[4, 4],
                                      strides=[2, 2],
                                      padding='valid',
                                      activation=tf.nn.relu,
                                      name=network_name + '_conv2_layer'
                                     )
        self.flatten = tf.reshape(self.conv2,
                                  [-1, 6*8*64],
                                  name=network_name + '_flatten'
                                 )
        self.dense = tf.layers.dense(inputs=self.flatten,
                                      units=512,
                                      activation=tf.nn.relu,
                                      name=network_name + '_dense1_layer'
                                    )
        self.Q_values = tf.layers.dense(inputs=self.dense,
                                        units=len(actions),
                                        activation=None,
                                        name=network_name + '_output_layer'
                                       )        
    
        self.best_action = tf.argmax(self.Q_values, 1)
        self.loss = tf.losses.mean_squared_error(self.Q_values,
                                                 self.Q_target)
        self.adam = tf.train.AdamOptimizer(learning_rate=self.learning_rate,
                                           name=network_name + '_adam'
                                          )
        self.train = self.adam.minimize(self.loss)
        
    def update_lr(self, epoch):
        self.learning_rate = 0.98*self.learning_rate

    def calculate_loss(self, session, s, q):
        L, _ = session.run([self.loss, self.train],
                           feed_dict={self.s_t: s,
                                      self.Q_target: q})
    
        return L

#Return the array of Q-values and the best action associated with a given state

    def get_Q_values(self, session, s):
        Q = session.run(self.Q_values,
                        feed_dict={self.s_t: s})

        return Q
    
    def choose_action(self, session, s):
        a = session.run(self.best_action,
                        feed_dict={self.s_t: s})
    
        return a
    
#Create a list of variable update operations

def update_graph(variables):
    update_ops = list()
    
#Assign weight values from the network created first to the one created second
    
    for idx, variable in enumerate(variables[:len(variables)//2]):
        op = variable.assign(variables[idx + len(variables)//2].value())
        update_ops.append(op)
    
    return update_ops

#Update the target network parameters to match those of the online network

def update_target(ops, session):
    for op in update_ops:
        session.run(op)


In [5]:
#For each time step, collect the following data:
#The current game state
#The action that was taken taken
#The reward obtained from the chosen action
#The next game state (store the first game state if the previous action ends the episode)
#A variable indicating whether the episode is over yet


tf.reset_default_graph()

#Instantiate the target network before the online network so that it's updated correctly

target_net = Q_network(network_name='target',
                       learning_rate=learning_rate,
                       height=height,
                       width=width,
                       channels=channels)
DQN = Q_network(network_name='online',
                learning_rate=learning_rate,
                height=height,
                width=width,
                channels=channels)

exp_buffer = Buffer(size=buffer_size)
session = tf.Session()
saver = tf.train.Saver(max_to_keep=10, reshape=True)
weights = tf.trainable_variables()

update_ops = update_graph(weights)

if load_model == True:
    print('Loading model from', model_dir)
    tf.train.Saver().restore(session, model_dir)
    
elif load_model == False:
    session.run(tf.global_variables_initializer())

game.set_sound_enabled(False)
game.init()

t = 0
epoch_rank = list()

#Accumulate experiences in the buffer using an epsilon-greedy strategy with three training phases

for epoch in range(epochs):
    epoch_rewards = list()
    
    for step in trange(steps_per_epoch, leave=True):
        experience = list()
        game.new_episode()
        
        while not game.is_episode_finished():
            state = game.get_state()
            
#Substitute an array of zeros for the depth buffer if that setting is disabled

            if game.is_depth_buffer_enabled() == False:
                depth_buffer = np.zeros(state.screen_buffer[:, :, :1].shape)
            else:
                depth_buffer = np.expand_dims(state.depth_buffer, axis=2)
            
            state1 = preprocess(np.concatenate((state.screen_buffer,
                                                depth_buffer),
                                                axis=2),
                                               down_sample_ratio)
            
#Explore the environment by choosing random actions with 100% probability for the first phase of training

            if epoch < 0.3*epochs:
                action = np.random.randint(num_actions)
            
#Increase the probability of greedily choosing an action by a constant amount at each epoch in the second phase
            
            elif epoch < 0.9*epochs:
                epsilon = start_epsilon - (epoch + 1 - 0.2*epochs)*(start_epsilon-end_epsilon)/(0.7*epochs)
            
                if np.random.uniform(0, 1) <= epsilon:
                    action = np.random.randint(num_actions)
                else:
                    action = DQN.choose_action(session, state1)[0]

#Select a random action with 10% probability in the final phase of training
                
            else:
                if np.random.uniform(0, 1) <= end_epsilon:
                    action = np.random.randint(num_actions)
                else:
                    action = DQN.choose_action(session, state1)[0]

            reward = game.make_action(actions[action], frame_delay)
            done = game.is_episode_finished()
            
            if done == False:
                state = game.get_state()
                state2 = preprocess(np.concatenate((state.screen_buffer,
                                                    depth_buffer),
                                                    axis=2),
                                                    down_sample_ratio)
            elif done == True:
                state2 = state1
        
#Add the experience obtained from each time step to the buffer

            t += 1
            exp_buffer.add_experience((state1, action, reward, state2, done))
        
#Sample a minibatch from the buffer if there are enough experiences in the buffer

        if exp_buffer.length > batch_size:
            s1, a, r, s2, terminal = exp_buffer.sample_buffer(batch_size)
            
#Get the target values from the target Q-network
            
            target_Q = np.max(target_net.get_Q_values(session, s2), axis=1)
            
#Train the online Q-network by using a minibatch to update the action-value function
            
            Q2 = DQN.get_Q_values(session, s1)
            Q2[np.arange(batch_size), a] = r + gamma*(1 - terminal)*target_Q
            DQN.calculate_loss(session, s1, Q2)
            
        epoch_rewards.append(game.get_total_reward())
        
#Increase the discount factor at each epoch until it reaches 0.99
    
    if gamma < 0.99:
        gamma = 1-.98*(1-gamma)
    elif gamma >= 0.99:
        gamma = 0.99
        
#Decrease the learning rate at each epoch

    DQN.update_lr(epoch)
    target_net.update_lr(epoch)
    
    print('Epoch {} Mean Reward: {}'.format(epoch + 1, np.mean(epoch_rewards)))
    
#Save the model, update the target network, and test the agent for 10 episodes every 20 epochs
    
    if (epoch + 1) % 20 == 0 and epoch > 0:
        if save_model == True:
            checkpoint = model_dir + '-' + str(epoch + 1)
            print('Epoch {} Model saved to {}'.format(epoch + 1, model_dir))
            saver.save(session, model_dir, global_step=epoch + 1)
            
        update_target(update_ops, session)

        print('Epoch {} test:'.format(epoch + 1))
        test_reward = test_agent(DQN, num_episodes=10,
                                 training=True,
                                 load_model=False,
                                 session=session,
                                 model_dir=model_dir)
        print('Average Test Reward:', test_reward)
        epoch_rank.append((test_reward, epoch + 1))
        
#Return a sorted list of epoch checkpoints based on average test episode reward
        
print(sorted(epoch_rank, reverse=True))
print('{} time steps experienced during training'.format(t))
game.close()
    

  warn("The default mode, 'constant', will be changed to 'reflect' in "
100%|██████████| 2000/2000 [02:45<00:00, 12.10it/s]


Epoch 1 Mean Reward: -67.87550456237793


100%|██████████| 2000/2000 [02:51<00:00, 11.68it/s]


Epoch 2 Mean Reward: -66.58156318664551


100%|██████████| 2000/2000 [03:10<00:00, 10.49it/s]


Epoch 3 Mean Reward: -68.79853920745849


100%|██████████| 2000/2000 [02:18<00:00, 14.48it/s]


Epoch 4 Mean Reward: -70.25203237915039


100%|██████████| 2000/2000 [02:27<00:00, 13.59it/s]


Epoch 5 Mean Reward: -67.21812374877929


100%|██████████| 2000/2000 [02:14<00:00, 14.88it/s]


Epoch 6 Mean Reward: -68.78456167602539


100%|██████████| 2000/2000 [02:15<00:00, 14.71it/s]


Epoch 7 Mean Reward: -68.54410758972168


100%|██████████| 2000/2000 [02:26<00:00, 13.67it/s]


Epoch 8 Mean Reward: -68.19214138031006


100%|██████████| 2000/2000 [02:19<00:00, 14.32it/s]


Epoch 9 Mean Reward: -68.30715334320068


100%|██████████| 2000/2000 [02:31<00:00, 13.18it/s]


Epoch 10 Mean Reward: -66.59388121032715
Epoch 10 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 10 test:
Test Episode 1 Reward: 94.95274353027344
Test Episode 2 Reward: 459.4606475830078
Test Episode 3 Reward: 94.95274353027344
Test Episode 4 Reward: 94.95274353027344
Test Episode 5 Reward: 94.95274353027344
Test Episode 6 Reward: 94.95274353027344
Test Episode 7 Reward: 94.95274353027344
Test Episode 8 Reward: 94.95274353027344
Test Episode 9 Reward: 94.95274353027344
Test Episode 10 Reward: 94.95274353027344
Average Test Reward: 131.40353393554688


100%|██████████| 2000/2000 [02:30<00:00, 13.25it/s]


Epoch 11 Mean Reward: -69.33564418029785


100%|██████████| 2000/2000 [02:18<00:00, 14.47it/s]


Epoch 12 Mean Reward: -70.73816717529297


100%|██████████| 2000/2000 [02:28<00:00, 13.45it/s]


Epoch 13 Mean Reward: -66.61880554199219


100%|██████████| 2000/2000 [02:20<00:00, 14.26it/s]


Epoch 14 Mean Reward: -66.88804089355469


100%|██████████| 2000/2000 [02:17<00:00, 14.56it/s]


Epoch 15 Mean Reward: -67.3063532485962


100%|██████████| 2000/2000 [02:27<00:00, 13.58it/s]


Epoch 16 Mean Reward: -65.75802836608887


100%|██████████| 2000/2000 [02:24<00:00, 13.80it/s]


Epoch 17 Mean Reward: -68.388800491333


100%|██████████| 2000/2000 [02:20<00:00, 14.21it/s]


Epoch 18 Mean Reward: -68.11395606994628


100%|██████████| 2000/2000 [02:17<00:00, 14.55it/s]


Epoch 19 Mean Reward: -68.38551134490967


100%|██████████| 2000/2000 [02:24<00:00, 13.80it/s]


Epoch 20 Mean Reward: -70.2096118774414
Epoch 20 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 20 test:
Test Episode 1 Reward: 111.15005493164062
Test Episode 2 Reward: 94.95274353027344
Test Episode 3 Reward: 744.9167175292969
Test Episode 4 Reward: 94.95274353027344
Test Episode 5 Reward: 123.77803039550781
Test Episode 6 Reward: 75.92106628417969
Test Episode 7 Reward: 467.2424621582031
Test Episode 8 Reward: 94.95274353027344
Test Episode 9 Reward: 95.74061584472656
Test Episode 10 Reward: 353.87196350097656
Average Test Reward: 225.74791412353517


100%|██████████| 2000/2000 [02:22<00:00, 14.00it/s]


Epoch 21 Mean Reward: -69.55482258605957


100%|██████████| 2000/2000 [02:34<00:00, 12.97it/s]


Epoch 22 Mean Reward: -68.76901966094971


100%|██████████| 2000/2000 [02:25<00:00, 13.71it/s]


Epoch 23 Mean Reward: -65.51713765716553


100%|██████████| 2000/2000 [02:22<00:00, 14.05it/s]


Epoch 24 Mean Reward: -68.8001689453125


100%|██████████| 2000/2000 [02:21<00:00, 14.13it/s]


Epoch 25 Mean Reward: -70.92369088745117


100%|██████████| 2000/2000 [02:24<00:00, 13.81it/s]


Epoch 26 Mean Reward: -70.39821453094483


100%|██████████| 2000/2000 [02:30<00:00, 13.28it/s]


Epoch 27 Mean Reward: -67.73843720245361


100%|██████████| 2000/2000 [02:33<00:00, 13.04it/s]


Epoch 28 Mean Reward: -68.76229727172851


100%|██████████| 2000/2000 [02:20<00:00, 14.21it/s]


Epoch 29 Mean Reward: -68.24926608276367


100%|██████████| 2000/2000 [02:19<00:00, 14.34it/s]


Epoch 30 Mean Reward: -67.13513795471191
Epoch 30 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 30 test:
Test Episode 1 Reward: 94.95274353027344
Test Episode 2 Reward: 94.95274353027344
Test Episode 3 Reward: 90.10466003417969
Test Episode 4 Reward: 94.95274353027344
Test Episode 5 Reward: 94.95274353027344
Test Episode 6 Reward: 94.95274353027344
Test Episode 7 Reward: 673.9265899658203
Test Episode 8 Reward: 113.08839416503906
Test Episode 9 Reward: 94.95274353027344
Test Episode 10 Reward: 94.95274353027344
Average Test Reward: 154.17888488769532


100%|██████████| 2000/2000 [02:24<00:00, 13.84it/s]


Epoch 31 Mean Reward: -68.59779259490966


100%|██████████| 2000/2000 [02:22<00:00, 14.01it/s]


Epoch 32 Mean Reward: -69.63888041687012


100%|██████████| 2000/2000 [02:23<00:00, 13.98it/s]


Epoch 33 Mean Reward: -69.77934391021728


100%|██████████| 2000/2000 [02:28<00:00, 13.48it/s]


Epoch 34 Mean Reward: -68.819165725708


100%|██████████| 2000/2000 [02:24<00:00, 13.82it/s]


Epoch 35 Mean Reward: -67.31697777557373


100%|██████████| 2000/2000 [02:37<00:00, 12.71it/s]


Epoch 36 Mean Reward: -64.84597813415527


100%|██████████| 2000/2000 [02:31<00:00, 13.18it/s]


Epoch 37 Mean Reward: -68.57865566253662


100%|██████████| 2000/2000 [02:26<00:00, 13.70it/s]


Epoch 38 Mean Reward: -69.28526348876953


100%|██████████| 2000/2000 [02:23<00:00, 13.94it/s]


Epoch 39 Mean Reward: -67.70714485168457


100%|██████████| 2000/2000 [02:31<00:00, 13.20it/s]


Epoch 40 Mean Reward: -70.49904663085937
Epoch 40 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 40 test:
Test Episode 1 Reward: 428.0621032714844
Test Episode 2 Reward: 60.562469482421875
Test Episode 3 Reward: 536.72802734375
Test Episode 4 Reward: 94.95274353027344
Test Episode 5 Reward: 94.95274353027344
Test Episode 6 Reward: 94.95274353027344
Test Episode 7 Reward: 421.5746154785156
Test Episode 8 Reward: 113.85456848144531
Test Episode 9 Reward: 70.27059936523438
Test Episode 10 Reward: 72.30764770507812
Average Test Reward: 198.821826171875


100%|██████████| 2000/2000 [02:19<00:00, 14.38it/s]


Epoch 41 Mean Reward: -69.39362239074707


100%|██████████| 2000/2000 [02:20<00:00, 14.26it/s]


Epoch 42 Mean Reward: -66.91982186889649


100%|██████████| 2000/2000 [02:25<00:00, 13.74it/s]


Epoch 43 Mean Reward: -68.15026795196533


100%|██████████| 2000/2000 [02:25<00:00, 13.75it/s]


Epoch 44 Mean Reward: -67.68542803192139


100%|██████████| 2000/2000 [02:19<00:00, 14.34it/s]


Epoch 45 Mean Reward: -70.0615637664795


100%|██████████| 2000/2000 [02:37<00:00, 12.70it/s]


Epoch 46 Mean Reward: -66.86710385131836


100%|██████████| 2000/2000 [02:20<00:00, 14.20it/s]


Epoch 47 Mean Reward: -70.38133525848389


100%|██████████| 2000/2000 [02:31<00:00, 13.22it/s]


Epoch 48 Mean Reward: -67.72954415893555


100%|██████████| 2000/2000 [02:25<00:00, 13.76it/s]


Epoch 49 Mean Reward: -70.04693647003174


100%|██████████| 2000/2000 [02:28<00:00, 13.49it/s]


Epoch 50 Mean Reward: -68.03389642333984
Epoch 50 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 50 test:
Test Episode 1 Reward: 121.45074462890625
Test Episode 2 Reward: 270.2367401123047
Test Episode 3 Reward: 135.0308837890625
Test Episode 4 Reward: 94.95274353027344
Test Episode 5 Reward: 159.30657958984375
Test Episode 6 Reward: 94.95274353027344
Test Episode 7 Reward: 96.62693786621094
Test Episode 8 Reward: 94.95274353027344
Test Episode 9 Reward: 94.95274353027344
Test Episode 10 Reward: 94.95274353027344
Average Test Reward: 125.74156036376954


100%|██████████| 2000/2000 [02:23<00:00, 13.93it/s]


Epoch 51 Mean Reward: -67.12328476715088


100%|██████████| 2000/2000 [02:33<00:00, 13.05it/s]


Epoch 52 Mean Reward: -67.48051639556884


100%|██████████| 2000/2000 [02:23<00:00, 13.96it/s]


Epoch 53 Mean Reward: -66.20750537872314


100%|██████████| 2000/2000 [02:32<00:00, 13.12it/s]


Epoch 54 Mean Reward: -72.0390571975708


100%|██████████| 2000/2000 [02:32<00:00, 13.14it/s]


Epoch 55 Mean Reward: -66.98786591339112


100%|██████████| 2000/2000 [02:39<00:00, 12.53it/s]


Epoch 56 Mean Reward: -66.10080979919434


100%|██████████| 2000/2000 [02:40<00:00, 12.42it/s]


Epoch 57 Mean Reward: -70.09001685333251


100%|██████████| 2000/2000 [03:35<00:00,  9.26it/s]


Epoch 58 Mean Reward: -66.54890669250489


100%|██████████| 2000/2000 [04:01<00:00,  8.27it/s]


Epoch 59 Mean Reward: -69.39437655639648


100%|██████████| 2000/2000 [04:05<00:00,  8.14it/s]


Epoch 60 Mean Reward: -68.69337914276123
Epoch 60 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 60 test:
Test Episode 1 Reward: 809.82861328125
Test Episode 2 Reward: 94.95274353027344
Test Episode 3 Reward: 94.95274353027344
Test Episode 4 Reward: 94.95274353027344
Test Episode 5 Reward: 59.30363464355469
Test Episode 6 Reward: 828.5237274169922
Test Episode 7 Reward: 82.25401306152344
Test Episode 8 Reward: 259.3737335205078
Test Episode 9 Reward: 355.4592742919922
Test Episode 10 Reward: 94.95274353027344
Average Test Reward: 277.4553970336914


100%|██████████| 2000/2000 [04:12<00:00,  7.93it/s]


Epoch 61 Mean Reward: -70.55399589538574


100%|██████████| 2000/2000 [04:30<00:00,  7.40it/s]


Epoch 62 Mean Reward: -69.63753493499756


100%|██████████| 2000/2000 [04:07<00:00,  8.09it/s]


Epoch 63 Mean Reward: -68.51692747497559


100%|██████████| 2000/2000 [04:07<00:00,  8.07it/s]


Epoch 64 Mean Reward: -69.79406842803955


100%|██████████| 2000/2000 [04:09<00:00,  8.02it/s]


Epoch 65 Mean Reward: -68.85151891326905


100%|██████████| 2000/2000 [03:47<00:00,  8.78it/s]


Epoch 66 Mean Reward: -65.2504569015503


100%|██████████| 2000/2000 [03:50<00:00,  8.68it/s]


Epoch 67 Mean Reward: -69.0858529434204


100%|██████████| 2000/2000 [03:51<00:00,  8.63it/s]


Epoch 68 Mean Reward: -69.44511869812011


100%|██████████| 2000/2000 [03:59<00:00,  8.34it/s]


Epoch 69 Mean Reward: -65.04563938140869


100%|██████████| 2000/2000 [03:46<00:00,  8.82it/s]


Epoch 70 Mean Reward: -69.38699968719483
Epoch 70 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 70 test:
Test Episode 1 Reward: 94.95274353027344
Test Episode 2 Reward: 100.42141723632812
Test Episode 3 Reward: 94.95274353027344
Test Episode 4 Reward: 94.95274353027344
Test Episode 5 Reward: 94.95274353027344
Test Episode 6 Reward: 94.95274353027344
Test Episode 7 Reward: 94.95274353027344
Test Episode 8 Reward: 353.7087097167969
Test Episode 9 Reward: 372.1200256347656
Test Episode 10 Reward: 94.95274353027344
Average Test Reward: 149.09193572998046


100%|██████████| 2000/2000 [03:51<00:00,  8.65it/s]


Epoch 71 Mean Reward: -68.08302321624755


100%|██████████| 2000/2000 [04:01<00:00,  8.27it/s]


Epoch 72 Mean Reward: -68.23130792999268


100%|██████████| 2000/2000 [04:14<00:00,  7.85it/s]


Epoch 73 Mean Reward: -68.59997923278809


100%|██████████| 2000/2000 [03:49<00:00,  8.70it/s]


Epoch 74 Mean Reward: -69.74730699920654


100%|██████████| 2000/2000 [03:53<00:00,  8.58it/s]


Epoch 75 Mean Reward: -67.82150989532471


100%|██████████| 2000/2000 [03:53<00:00,  8.57it/s]


Epoch 76 Mean Reward: -69.78921308135986


100%|██████████| 2000/2000 [03:42<00:00,  8.99it/s]


Epoch 77 Mean Reward: -70.91084601593018


100%|██████████| 2000/2000 [03:53<00:00,  8.58it/s]


Epoch 78 Mean Reward: -65.46253060150147


100%|██████████| 2000/2000 [03:58<00:00,  8.39it/s]


Epoch 79 Mean Reward: -67.00646781158447


100%|██████████| 2000/2000 [03:52<00:00,  8.59it/s]


Epoch 80 Mean Reward: -67.00631046295166
Epoch 80 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 80 test:
Test Episode 1 Reward: 393.8234405517578
Test Episode 2 Reward: 96.69816589355469
Test Episode 3 Reward: 94.95274353027344
Test Episode 4 Reward: 94.95274353027344
Test Episode 5 Reward: 85.3355712890625
Test Episode 6 Reward: 94.95274353027344
Test Episode 7 Reward: 94.95274353027344
Test Episode 8 Reward: 119.60940551757812
Test Episode 9 Reward: 511.02284240722656
Test Episode 10 Reward: 94.95274353027344
Average Test Reward: 168.12531433105468


100%|██████████| 2000/2000 [03:51<00:00,  8.64it/s]


Epoch 81 Mean Reward: -69.82136449432373


100%|██████████| 2000/2000 [03:40<00:00,  9.05it/s]


Epoch 82 Mean Reward: -68.94465606689454


100%|██████████| 2000/2000 [03:55<00:00,  8.49it/s]


Epoch 83 Mean Reward: -67.26946055603027


100%|██████████| 2000/2000 [03:48<00:00,  8.77it/s]


Epoch 84 Mean Reward: -68.48327179718018


100%|██████████| 2000/2000 [04:25<00:00,  7.52it/s]


Epoch 85 Mean Reward: -69.72733098602295


100%|██████████| 2000/2000 [04:05<00:00,  8.15it/s]


Epoch 86 Mean Reward: -67.43850717163086


100%|██████████| 2000/2000 [04:07<00:00,  8.10it/s]


Epoch 87 Mean Reward: -68.49454350280762


100%|██████████| 2000/2000 [04:06<00:00,  8.12it/s]


Epoch 88 Mean Reward: -68.34975713348389


100%|██████████| 2000/2000 [03:47<00:00,  8.78it/s]


Epoch 89 Mean Reward: -68.05256672668457


100%|██████████| 2000/2000 [04:04<00:00,  8.18it/s]


Epoch 90 Mean Reward: -70.31861235809326
Epoch 90 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 90 test:
Test Episode 1 Reward: 94.95274353027344
Test Episode 2 Reward: 94.95274353027344
Test Episode 3 Reward: 88.33134460449219
Test Episode 4 Reward: 94.95274353027344
Test Episode 5 Reward: 418.0141906738281
Test Episode 6 Reward: 94.95274353027344
Test Episode 7 Reward: 65.39042663574219
Test Episode 8 Reward: 96.23049926757812
Test Episode 9 Reward: 847.4380645751953
Test Episode 10 Reward: 94.95274353027344
Average Test Reward: 199.0168243408203


100%|██████████| 2000/2000 [03:52<00:00,  8.62it/s]


Epoch 91 Mean Reward: -66.6881096496582


100%|██████████| 2000/2000 [04:23<00:00,  7.59it/s]


Epoch 92 Mean Reward: -68.13646502685548


100%|██████████| 2000/2000 [04:14<00:00,  7.85it/s]


Epoch 93 Mean Reward: -68.27020141601562


100%|██████████| 2000/2000 [04:06<00:00,  8.12it/s]


Epoch 94 Mean Reward: -68.34605445098877


100%|██████████| 2000/2000 [04:14<00:00,  7.87it/s]


Epoch 95 Mean Reward: -66.87105669403076


100%|██████████| 2000/2000 [04:35<00:00,  7.25it/s]


Epoch 96 Mean Reward: -66.22005545043945


100%|██████████| 2000/2000 [04:18<00:00,  7.73it/s]


Epoch 97 Mean Reward: -68.85088806152343


100%|██████████| 2000/2000 [04:33<00:00,  7.30it/s]


Epoch 98 Mean Reward: -69.04489361572266


100%|██████████| 2000/2000 [04:23<00:00,  7.60it/s]


Epoch 99 Mean Reward: -68.4227057800293


100%|██████████| 2000/2000 [04:21<00:00,  7.65it/s]


Epoch 100 Mean Reward: -68.8252232208252
Epoch 100 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 100 test:
Test Episode 1 Reward: 444.36199951171875
Test Episode 2 Reward: 428.4467010498047
Test Episode 3 Reward: 81.13603210449219
Test Episode 4 Reward: 94.95274353027344
Test Episode 5 Reward: 64.61225891113281
Test Episode 6 Reward: 94.95274353027344
Test Episode 7 Reward: 108.12852478027344
Test Episode 8 Reward: 94.95274353027344
Test Episode 9 Reward: 94.95274353027344
Test Episode 10 Reward: 30.140289306640625
Average Test Reward: 153.66367797851564


100%|██████████| 2000/2000 [04:26<00:00,  7.49it/s]


Epoch 101 Mean Reward: -67.52702273559571


100%|██████████| 2000/2000 [04:33<00:00,  7.31it/s]


Epoch 102 Mean Reward: -65.17905912017822


100%|██████████| 2000/2000 [04:51<00:00,  6.85it/s]


Epoch 103 Mean Reward: -67.34822142028808


100%|██████████| 2000/2000 [04:36<00:00,  7.25it/s]


Epoch 104 Mean Reward: -71.17800452423096


100%|██████████| 2000/2000 [04:32<00:00,  7.33it/s]


Epoch 105 Mean Reward: -69.77503202819824


100%|██████████| 2000/2000 [04:40<00:00,  7.12it/s]


Epoch 106 Mean Reward: -68.78148062133789


100%|██████████| 2000/2000 [04:28<00:00,  7.44it/s]


Epoch 107 Mean Reward: -67.15315423583985


100%|██████████| 2000/2000 [04:48<00:00,  6.94it/s]


Epoch 108 Mean Reward: -68.3887738723755


100%|██████████| 2000/2000 [04:20<00:00,  7.68it/s]


Epoch 109 Mean Reward: -66.39526329040527


100%|██████████| 2000/2000 [04:18<00:00,  7.75it/s]


Epoch 110 Mean Reward: -67.87321942901612
Epoch 110 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 110 test:
Test Episode 1 Reward: 52.56672668457031
Test Episode 2 Reward: 94.95274353027344
Test Episode 3 Reward: 94.95274353027344
Test Episode 4 Reward: 114.64811706542969
Test Episode 5 Reward: 94.95274353027344
Test Episode 6 Reward: 94.95274353027344
Test Episode 7 Reward: 88.95040893554688
Test Episode 8 Reward: 94.95274353027344
Test Episode 9 Reward: 451.35560607910156
Test Episode 10 Reward: 67.23016357421875
Average Test Reward: 124.95147399902343


100%|██████████| 2000/2000 [04:32<00:00,  7.34it/s]


Epoch 111 Mean Reward: -67.94104640197754


100%|██████████| 2000/2000 [04:26<00:00,  7.50it/s]


Epoch 112 Mean Reward: -68.36093236541748


100%|██████████| 2000/2000 [04:45<00:00,  7.01it/s]


Epoch 113 Mean Reward: -67.26762572479248


100%|██████████| 2000/2000 [04:42<00:00,  7.08it/s]


Epoch 114 Mean Reward: -68.29135388946533


100%|██████████| 2000/2000 [04:47<00:00,  6.95it/s]


Epoch 115 Mean Reward: -68.57570476531983


100%|██████████| 2000/2000 [03:02<00:00, 10.97it/s]


Epoch 116 Mean Reward: -67.91532218933105


100%|██████████| 2000/2000 [04:12<00:00,  7.92it/s]


Epoch 117 Mean Reward: -70.41714559173585


100%|██████████| 2000/2000 [04:53<00:00,  6.82it/s]


Epoch 118 Mean Reward: -67.93140323638916


100%|██████████| 2000/2000 [04:40<00:00,  7.12it/s]


Epoch 119 Mean Reward: -69.96585289001465


100%|██████████| 2000/2000 [04:50<00:00,  6.88it/s]


Epoch 120 Mean Reward: -66.3946865158081
Epoch 120 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 120 test:
Test Episode 1 Reward: 94.95274353027344
Test Episode 2 Reward: 0.39288330078125
Test Episode 3 Reward: 87.39956665039062
Test Episode 4 Reward: 103.87570190429688
Test Episode 5 Reward: 421.2755432128906
Test Episode 6 Reward: 94.95274353027344
Test Episode 7 Reward: 94.95274353027344
Test Episode 8 Reward: 94.95274353027344
Test Episode 9 Reward: 346.0754852294922
Test Episode 10 Reward: 365.3287048339844
Average Test Reward: 170.41588592529297


100%|██████████| 2000/2000 [04:22<00:00,  7.61it/s]


Epoch 121 Mean Reward: -38.62432094573975


100%|██████████| 2000/2000 [04:14<00:00,  7.85it/s]


Epoch 122 Mean Reward: -38.875940818786624


100%|██████████| 2000/2000 [03:45<00:00,  8.86it/s]


Epoch 123 Mean Reward: -33.488598777770996


100%|██████████| 2000/2000 [03:38<00:00,  9.17it/s]


Epoch 124 Mean Reward: -38.446234725952145


100%|██████████| 2000/2000 [03:59<00:00,  8.35it/s]


Epoch 125 Mean Reward: -35.18322422790527


100%|██████████| 2000/2000 [03:48<00:00,  8.75it/s]


Epoch 126 Mean Reward: -35.16875326538086


100%|██████████| 2000/2000 [03:54<00:00,  8.52it/s]


Epoch 127 Mean Reward: -34.22038224029541


100%|██████████| 2000/2000 [03:56<00:00,  8.46it/s]


Epoch 128 Mean Reward: -33.18421743774414


100%|██████████| 2000/2000 [04:01<00:00,  8.27it/s]


Epoch 129 Mean Reward: -33.86891304016113


100%|██████████| 2000/2000 [03:55<00:00,  8.50it/s]


Epoch 130 Mean Reward: -30.860848205566406
Epoch 130 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 130 test:
Test Episode 1 Reward: 94.95274353027344
Test Episode 2 Reward: 94.95274353027344
Test Episode 3 Reward: 94.95274353027344
Test Episode 4 Reward: 405.8554992675781
Test Episode 5 Reward: 64.33917236328125
Test Episode 6 Reward: 94.95274353027344
Test Episode 7 Reward: 146.9215850830078
Test Episode 8 Reward: 34.00798034667969
Test Episode 9 Reward: 94.95274353027344
Test Episode 10 Reward: 94.95274353027344
Average Test Reward: 122.08406982421874


100%|██████████| 2000/2000 [03:49<00:00,  8.73it/s]


Epoch 131 Mean Reward: -30.23406787109375


100%|██████████| 2000/2000 [03:55<00:00,  8.50it/s]


Epoch 132 Mean Reward: -32.48482796478272


100%|██████████| 2000/2000 [03:57<00:00,  8.42it/s]


Epoch 133 Mean Reward: -27.54054284667969


100%|██████████| 2000/2000 [03:47<00:00,  8.80it/s]


Epoch 134 Mean Reward: -27.514947677612305


100%|██████████| 2000/2000 [03:58<00:00,  8.38it/s]


Epoch 135 Mean Reward: -27.28574349975586


100%|██████████| 2000/2000 [03:47<00:00,  8.78it/s]


Epoch 136 Mean Reward: -27.372865982055664


100%|██████████| 2000/2000 [03:59<00:00,  8.37it/s]


Epoch 137 Mean Reward: -22.327216926574707


100%|██████████| 2000/2000 [03:49<00:00,  8.71it/s]


Epoch 138 Mean Reward: -24.48906114959717


100%|██████████| 2000/2000 [03:47<00:00,  8.78it/s]


Epoch 139 Mean Reward: -24.14696085357666


100%|██████████| 2000/2000 [03:46<00:00,  8.82it/s]


Epoch 140 Mean Reward: -22.166249588012697
Epoch 140 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 140 test:
Test Episode 1 Reward: 94.95274353027344
Test Episode 2 Reward: 335.7994384765625
Test Episode 3 Reward: 94.95274353027344
Test Episode 4 Reward: 94.95274353027344
Test Episode 5 Reward: 423.88316345214844
Test Episode 6 Reward: 94.95274353027344
Test Episode 7 Reward: 94.95274353027344
Test Episode 8 Reward: 94.95274353027344
Test Episode 9 Reward: 94.95274353027344
Test Episode 10 Reward: 94.95274353027344
Average Test Reward: 151.93045501708986


100%|██████████| 2000/2000 [03:19<00:00, 10.04it/s]


Epoch 141 Mean Reward: -24.84781967163086


100%|██████████| 2000/2000 [02:35<00:00, 12.88it/s]


Epoch 142 Mean Reward: -21.009192024230956


100%|██████████| 2000/2000 [02:45<00:00, 12.07it/s]


Epoch 143 Mean Reward: -21.396182548522948


100%|██████████| 2000/2000 [02:40<00:00, 12.48it/s]


Epoch 144 Mean Reward: -21.551687362670897


100%|██████████| 2000/2000 [02:43<00:00, 12.20it/s]


Epoch 145 Mean Reward: -13.416276573181152


100%|██████████| 2000/2000 [02:41<00:00, 12.41it/s]


Epoch 146 Mean Reward: -22.68340012359619


100%|██████████| 2000/2000 [03:00<00:00, 11.11it/s]


Epoch 147 Mean Reward: -16.055504928588867


100%|██████████| 2000/2000 [02:35<00:00, 12.86it/s]


Epoch 148 Mean Reward: -15.289417770385743


100%|██████████| 2000/2000 [02:35<00:00, 12.85it/s]


Epoch 149 Mean Reward: -20.13731216430664


100%|██████████| 2000/2000 [02:45<00:00, 12.06it/s]


Epoch 150 Mean Reward: -11.509699028015136
Epoch 150 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 150 test:
Test Episode 1 Reward: 119.47285461425781
Test Episode 2 Reward: 94.95274353027344
Test Episode 3 Reward: 403.4650115966797
Test Episode 4 Reward: 72.18951416015625
Test Episode 5 Reward: 94.95274353027344
Test Episode 6 Reward: 138.7840118408203
Test Episode 7 Reward: 94.95274353027344
Test Episode 8 Reward: 94.95274353027344
Test Episode 9 Reward: 94.95274353027344
Test Episode 10 Reward: 94.95274353027344
Average Test Reward: 130.36278533935547


100%|██████████| 2000/2000 [03:09<00:00, 10.57it/s]


Epoch 151 Mean Reward: -14.918634254455567


100%|██████████| 2000/2000 [03:33<00:00,  9.35it/s]


Epoch 152 Mean Reward: -13.219747947692872


100%|██████████| 2000/2000 [03:09<00:00, 10.55it/s]


Epoch 153 Mean Reward: -12.62388427734375


100%|██████████| 2000/2000 [02:40<00:00, 12.48it/s]


Epoch 154 Mean Reward: -13.704833137512207


100%|██████████| 2000/2000 [02:44<00:00, 12.17it/s]


Epoch 155 Mean Reward: -10.229293197631836


100%|██████████| 2000/2000 [02:41<00:00, 12.38it/s]


Epoch 156 Mean Reward: -9.49588557434082


100%|██████████| 2000/2000 [02:45<00:00, 12.11it/s]


Epoch 157 Mean Reward: -8.063058662414551


100%|██████████| 2000/2000 [02:47<00:00, 11.97it/s]


Epoch 158 Mean Reward: -8.653043640136719


100%|██████████| 2000/2000 [02:51<00:00, 11.65it/s]


Epoch 159 Mean Reward: -5.61733364868164


100%|██████████| 2000/2000 [02:34<00:00, 12.97it/s]


Epoch 160 Mean Reward: -6.0091362533569335
Epoch 160 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 160 test:
Test Episode 1 Reward: 39.74714660644531
Test Episode 2 Reward: 94.95274353027344
Test Episode 3 Reward: 497.2102966308594
Test Episode 4 Reward: 382.9748840332031
Test Episode 5 Reward: 73.69210815429688
Test Episode 6 Reward: 25.948043823242188
Test Episode 7 Reward: 94.95274353027344
Test Episode 8 Reward: 86.28131103515625
Test Episode 9 Reward: 94.95274353027344
Test Episode 10 Reward: 94.95274353027344
Average Test Reward: 148.56647644042968


100%|██████████| 2000/2000 [02:53<00:00, 11.56it/s]


Epoch 161 Mean Reward: -5.6386573638916015


100%|██████████| 2000/2000 [02:41<00:00, 12.41it/s]


Epoch 162 Mean Reward: -4.156605690002442


100%|██████████| 2000/2000 [02:39<00:00, 12.58it/s]


Epoch 163 Mean Reward: -4.08741178894043


100%|██████████| 2000/2000 [02:38<00:00, 12.59it/s]


Epoch 164 Mean Reward: -2.7947683715820313


100%|██████████| 2000/2000 [02:41<00:00, 12.37it/s]


Epoch 165 Mean Reward: -3.704343147277832


100%|██████████| 2000/2000 [02:45<00:00, 12.12it/s]


Epoch 166 Mean Reward: -0.6692816467285156


100%|██████████| 2000/2000 [02:51<00:00, 11.64it/s]


Epoch 167 Mean Reward: 0.6873627777099609


100%|██████████| 2000/2000 [02:35<00:00, 12.87it/s]


Epoch 168 Mean Reward: -0.9485891342163086


100%|██████████| 2000/2000 [02:41<00:00, 12.42it/s]


Epoch 169 Mean Reward: -0.8173789672851562


100%|██████████| 2000/2000 [02:39<00:00, 12.55it/s]


Epoch 170 Mean Reward: 0.2966471252441406
Epoch 170 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 170 test:
Test Episode 1 Reward: 562.3467712402344
Test Episode 2 Reward: 94.95274353027344
Test Episode 3 Reward: 101.5794677734375
Test Episode 4 Reward: 333.9619140625
Test Episode 5 Reward: 94.95274353027344
Test Episode 6 Reward: 70.17048645019531
Test Episode 7 Reward: 94.95274353027344
Test Episode 8 Reward: 94.95274353027344
Test Episode 9 Reward: 94.95274353027344
Test Episode 10 Reward: 87.31512451171875
Average Test Reward: 163.0137481689453


100%|██████████| 2000/2000 [02:33<00:00, 13.02it/s]


Epoch 171 Mean Reward: 3.443192497253418


100%|██████████| 2000/2000 [02:42<00:00, 12.29it/s]


Epoch 172 Mean Reward: 6.234545303344727


100%|██████████| 2000/2000 [02:40<00:00, 12.48it/s]


Epoch 173 Mean Reward: 2.4040974578857424


100%|██████████| 2000/2000 [02:45<00:00, 12.05it/s]


Epoch 174 Mean Reward: 4.957713935852051


100%|██████████| 2000/2000 [02:54<00:00, 11.47it/s]


Epoch 175 Mean Reward: 7.897286613464355


100%|██████████| 2000/2000 [02:36<00:00, 12.79it/s]


Epoch 176 Mean Reward: 4.960542304992676


100%|██████████| 2000/2000 [02:40<00:00, 12.43it/s]


Epoch 177 Mean Reward: 6.421177986145019


100%|██████████| 2000/2000 [02:43<00:00, 12.20it/s]


Epoch 178 Mean Reward: 13.209124153137207


100%|██████████| 2000/2000 [02:41<00:00, 12.38it/s]


Epoch 179 Mean Reward: 9.337579887390136


100%|██████████| 2000/2000 [02:38<00:00, 12.62it/s]


Epoch 180 Mean Reward: 4.5559733200073245
Epoch 180 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 180 test:
Test Episode 1 Reward: 94.95274353027344
Test Episode 2 Reward: 94.07089233398438
Test Episode 3 Reward: 94.95274353027344
Test Episode 4 Reward: 99.68423461914062
Test Episode 5 Reward: 240.70114135742188
Test Episode 6 Reward: 103.87570190429688
Test Episode 7 Reward: 94.95274353027344
Test Episode 8 Reward: 94.95274353027344
Test Episode 9 Reward: 94.95274353027344
Test Episode 10 Reward: 94.95274353027344
Average Test Reward: 110.80484313964844


100%|██████████| 2000/2000 [02:24<00:00, 13.82it/s]


Epoch 181 Mean Reward: 13.303528587341308


100%|██████████| 2000/2000 [02:23<00:00, 13.95it/s]


Epoch 182 Mean Reward: 15.08782804107666


100%|██████████| 2000/2000 [02:23<00:00, 13.96it/s]


Epoch 183 Mean Reward: 10.007271606445313


100%|██████████| 2000/2000 [02:22<00:00, 14.05it/s]


Epoch 184 Mean Reward: 12.021650611877442


100%|██████████| 2000/2000 [02:19<00:00, 14.29it/s]


Epoch 185 Mean Reward: 15.32793325805664


100%|██████████| 2000/2000 [02:19<00:00, 14.30it/s]


Epoch 186 Mean Reward: 18.5091887512207


100%|██████████| 2000/2000 [02:28<00:00, 13.46it/s]


Epoch 187 Mean Reward: 20.506225311279298


100%|██████████| 2000/2000 [02:31<00:00, 13.16it/s]


Epoch 188 Mean Reward: 21.110941398620607


100%|██████████| 2000/2000 [03:00<00:00, 11.11it/s]


Epoch 189 Mean Reward: 17.407901557922365


100%|██████████| 2000/2000 [02:33<00:00, 12.99it/s]


Epoch 190 Mean Reward: 19.455303367614746
Epoch 190 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 190 test:
Test Episode 1 Reward: 94.95274353027344
Test Episode 2 Reward: 94.95274353027344
Test Episode 3 Reward: 94.95274353027344
Test Episode 4 Reward: 94.95274353027344
Test Episode 5 Reward: 227.44454956054688
Test Episode 6 Reward: 129.404052734375
Test Episode 7 Reward: 94.95274353027344
Test Episode 8 Reward: 94.95274353027344
Test Episode 9 Reward: 94.95274353027344
Test Episode 10 Reward: 94.95274353027344
Average Test Reward: 111.64705505371094


100%|██████████| 2000/2000 [02:35<00:00, 12.87it/s]


Epoch 191 Mean Reward: 21.188930442810058


100%|██████████| 2000/2000 [02:28<00:00, 13.45it/s]


Epoch 192 Mean Reward: 21.055794967651366


100%|██████████| 2000/2000 [02:25<00:00, 13.78it/s]


Epoch 193 Mean Reward: 26.381289596557618


100%|██████████| 2000/2000 [02:26<00:00, 13.70it/s]


Epoch 194 Mean Reward: 21.460615364074705


100%|██████████| 2000/2000 [02:25<00:00, 13.78it/s]


Epoch 195 Mean Reward: 21.798338096618654


100%|██████████| 2000/2000 [02:19<00:00, 14.33it/s]


Epoch 196 Mean Reward: 27.09383554840088


100%|██████████| 2000/2000 [02:20<00:00, 14.23it/s]


Epoch 197 Mean Reward: 27.172709831237793


100%|██████████| 2000/2000 [02:20<00:00, 14.21it/s]


Epoch 198 Mean Reward: 31.55754601287842


100%|██████████| 2000/2000 [02:22<00:00, 14.06it/s]


Epoch 199 Mean Reward: 32.08272999572754


100%|██████████| 2000/2000 [02:22<00:00, 14.00it/s]


Epoch 200 Mean Reward: 34.013041122436526
Epoch 200 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 200 test:
Test Episode 1 Reward: 94.95274353027344
Test Episode 2 Reward: 358.30120849609375
Test Episode 3 Reward: 94.95274353027344
Test Episode 4 Reward: 94.95274353027344
Test Episode 5 Reward: 355.19195556640625
Test Episode 6 Reward: 94.95274353027344
Test Episode 7 Reward: 75.97764587402344
Test Episode 8 Reward: 94.95274353027344
Test Episode 9 Reward: 312.53106689453125
Test Episode 10 Reward: 81.85357666015625
Average Test Reward: 165.8619171142578


100%|██████████| 2000/2000 [02:29<00:00, 13.39it/s]


Epoch 201 Mean Reward: 28.823400909423828


100%|██████████| 2000/2000 [02:30<00:00, 13.29it/s]


Epoch 202 Mean Reward: 29.21913020324707


100%|██████████| 2000/2000 [02:51<00:00, 11.66it/s]


Epoch 203 Mean Reward: 34.8981893157959


100%|██████████| 2000/2000 [03:18<00:00, 10.07it/s]


Epoch 204 Mean Reward: 32.434959716796875


100%|██████████| 2000/2000 [03:16<00:00, 10.16it/s]


Epoch 205 Mean Reward: 35.6346696395874


100%|██████████| 2000/2000 [03:19<00:00, 10.04it/s]


Epoch 206 Mean Reward: 36.16185885620117


100%|██████████| 2000/2000 [03:16<00:00, 10.18it/s]


Epoch 207 Mean Reward: 37.728590560913084


100%|██████████| 2000/2000 [03:18<00:00, 10.06it/s]


Epoch 208 Mean Reward: 37.78569792175293


100%|██████████| 2000/2000 [03:15<00:00, 10.25it/s]


Epoch 209 Mean Reward: 43.3010535736084


100%|██████████| 2000/2000 [02:25<00:00, 13.71it/s]


Epoch 210 Mean Reward: 42.90624325561524
Epoch 210 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 210 test:
Test Episode 1 Reward: 101.16981506347656
Test Episode 2 Reward: 94.95274353027344
Test Episode 3 Reward: 111.34027099609375
Test Episode 4 Reward: 94.95274353027344
Test Episode 5 Reward: 319.0486755371094
Test Episode 6 Reward: 94.95274353027344
Test Episode 7 Reward: 94.95274353027344
Test Episode 8 Reward: 94.95274353027344
Test Episode 9 Reward: 791.2355194091797
Test Episode 10 Reward: 94.95274353027344
Average Test Reward: 189.25107421875


100%|██████████| 2000/2000 [02:22<00:00, 13.99it/s]


Epoch 211 Mean Reward: 40.642341995239256


100%|██████████| 2000/2000 [02:27<00:00, 13.61it/s]


Epoch 212 Mean Reward: 39.144420028686525


100%|██████████| 2000/2000 [02:22<00:00, 14.00it/s]


Epoch 213 Mean Reward: 43.941723762512204


100%|██████████| 2000/2000 [02:20<00:00, 14.26it/s]


Epoch 214 Mean Reward: 41.526331672668455


100%|██████████| 2000/2000 [02:20<00:00, 14.24it/s]


Epoch 215 Mean Reward: 38.04714651489258


100%|██████████| 2000/2000 [02:22<00:00, 14.03it/s]


Epoch 216 Mean Reward: 42.5999623336792


100%|██████████| 2000/2000 [02:25<00:00, 13.71it/s]


Epoch 217 Mean Reward: 46.23389717102051


100%|██████████| 2000/2000 [02:22<00:00, 14.01it/s]


Epoch 218 Mean Reward: 48.195688430786134


100%|██████████| 2000/2000 [02:22<00:00, 14.07it/s]


Epoch 219 Mean Reward: 42.732639610290526


100%|██████████| 2000/2000 [02:22<00:00, 13.99it/s]


Epoch 220 Mean Reward: 47.39284512329102
Epoch 220 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 220 test:
Test Episode 1 Reward: -78.01362609863281
Test Episode 2 Reward: 28.536483764648438
Test Episode 3 Reward: 94.95274353027344
Test Episode 4 Reward: 247.46006774902344
Test Episode 5 Reward: 94.95274353027344
Test Episode 6 Reward: 94.95274353027344
Test Episode 7 Reward: 555.8242340087891
Test Episode 8 Reward: 74.81538391113281
Test Episode 9 Reward: 94.95274353027344
Test Episode 10 Reward: 19.711013793945312
Average Test Reward: 122.814453125


100%|██████████| 2000/2000 [02:23<00:00, 13.93it/s]


Epoch 221 Mean Reward: 47.93266780090332


100%|██████████| 2000/2000 [02:25<00:00, 13.72it/s]


Epoch 222 Mean Reward: 49.66429384613037


100%|██████████| 2000/2000 [02:30<00:00, 13.31it/s]


Epoch 223 Mean Reward: 53.02730345153809


100%|██████████| 2000/2000 [02:33<00:00, 13.02it/s]


Epoch 224 Mean Reward: 54.55223918151855


100%|██████████| 2000/2000 [02:28<00:00, 13.51it/s]


Epoch 225 Mean Reward: 53.35799602508545


100%|██████████| 2000/2000 [02:26<00:00, 13.68it/s]


Epoch 226 Mean Reward: 48.326781784057616


100%|██████████| 2000/2000 [02:24<00:00, 13.84it/s]


Epoch 227 Mean Reward: 53.99417127227783


100%|██████████| 2000/2000 [02:25<00:00, 13.77it/s]


Epoch 228 Mean Reward: 58.216708854675296


100%|██████████| 2000/2000 [02:22<00:00, 13.99it/s]


Epoch 229 Mean Reward: 56.108456520080566


100%|██████████| 2000/2000 [02:22<00:00, 14.06it/s]


Epoch 230 Mean Reward: 54.226688842773434
Epoch 230 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 230 test:
Test Episode 1 Reward: 167.27049255371094
Test Episode 2 Reward: 357.5937194824219
Test Episode 3 Reward: 64.87336730957031
Test Episode 4 Reward: 94.95274353027344
Test Episode 5 Reward: 117.34916687011719
Test Episode 6 Reward: 57.58769226074219
Test Episode 7 Reward: 94.95274353027344
Test Episode 8 Reward: 130.21182250976562
Test Episode 9 Reward: 94.95274353027344
Test Episode 10 Reward: 94.95274353027344
Average Test Reward: 127.46972351074218


100%|██████████| 2000/2000 [02:26<00:00, 13.68it/s]


Epoch 231 Mean Reward: 63.629612594604495


100%|██████████| 2000/2000 [02:24<00:00, 13.88it/s]


Epoch 232 Mean Reward: 60.31508169555664


100%|██████████| 2000/2000 [02:23<00:00, 13.97it/s]


Epoch 233 Mean Reward: 60.196221351623535


100%|██████████| 2000/2000 [02:24<00:00, 13.86it/s]


Epoch 234 Mean Reward: 59.769994384765624


100%|██████████| 2000/2000 [02:25<00:00, 13.77it/s]


Epoch 235 Mean Reward: 64.82163619995117


100%|██████████| 2000/2000 [02:28<00:00, 13.51it/s]


Epoch 236 Mean Reward: 59.22977375030518


100%|██████████| 2000/2000 [02:23<00:00, 13.91it/s]


Epoch 237 Mean Reward: 63.05951651763916


100%|██████████| 2000/2000 [02:29<00:00, 13.39it/s]


Epoch 238 Mean Reward: 67.8299345626831


100%|██████████| 2000/2000 [02:33<00:00, 13.01it/s]


Epoch 239 Mean Reward: 72.79512879180908


100%|██████████| 2000/2000 [02:50<00:00, 11.74it/s]


Epoch 240 Mean Reward: 70.72180371856689
Epoch 240 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 240 test:
Test Episode 1 Reward: 94.95274353027344
Test Episode 2 Reward: 94.95274353027344
Test Episode 3 Reward: 94.61981201171875
Test Episode 4 Reward: 94.95274353027344
Test Episode 5 Reward: 94.95274353027344
Test Episode 6 Reward: 383.28330993652344
Test Episode 7 Reward: 94.95274353027344
Test Episode 8 Reward: 94.95274353027344
Test Episode 9 Reward: 74.77481079101562
Test Episode 10 Reward: 94.95274353027344
Average Test Reward: 121.73471374511719


100%|██████████| 2000/2000 [02:40<00:00, 12.47it/s]


Epoch 241 Mean Reward: 67.2228712234497


100%|██████████| 2000/2000 [02:33<00:00, 13.07it/s]


Epoch 242 Mean Reward: 70.52582410430908


100%|██████████| 2000/2000 [02:38<00:00, 12.65it/s]


Epoch 243 Mean Reward: 64.9798208770752


100%|██████████| 2000/2000 [02:36<00:00, 12.79it/s]


Epoch 244 Mean Reward: 72.1608419265747


100%|██████████| 2000/2000 [02:34<00:00, 12.95it/s]


Epoch 245 Mean Reward: 75.603068359375


100%|██████████| 2000/2000 [02:36<00:00, 12.79it/s]


Epoch 246 Mean Reward: 74.54957714080811


100%|██████████| 2000/2000 [02:35<00:00, 12.88it/s]


Epoch 247 Mean Reward: 74.27691398620605


100%|██████████| 2000/2000 [02:32<00:00, 13.13it/s]


Epoch 248 Mean Reward: 79.02565460968017


100%|██████████| 2000/2000 [02:32<00:00, 13.09it/s]


Epoch 249 Mean Reward: 77.63297841644287


100%|██████████| 2000/2000 [02:34<00:00, 12.99it/s]


Epoch 250 Mean Reward: 76.79428048706055
Epoch 250 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 250 test:
Test Episode 1 Reward: 23.24420166015625
Test Episode 2 Reward: 91.40142822265625
Test Episode 3 Reward: 94.95274353027344
Test Episode 4 Reward: 472.8494110107422
Test Episode 5 Reward: 94.95274353027344
Test Episode 6 Reward: 361.553466796875
Test Episode 7 Reward: 110.23211669921875
Test Episode 8 Reward: 94.95274353027344
Test Episode 9 Reward: 59.768524169921875
Test Episode 10 Reward: 94.95274353027344
Average Test Reward: 149.8860122680664


100%|██████████| 2000/2000 [02:34<00:00, 12.94it/s]


Epoch 251 Mean Reward: 75.54071078491211


100%|██████████| 2000/2000 [02:25<00:00, 13.76it/s]


Epoch 252 Mean Reward: 79.55124577331543


100%|██████████| 2000/2000 [02:25<00:00, 13.76it/s]


Epoch 253 Mean Reward: 78.55428540802002


100%|██████████| 2000/2000 [02:30<00:00, 13.33it/s]


Epoch 254 Mean Reward: 79.63340370941162


100%|██████████| 2000/2000 [02:29<00:00, 13.41it/s]


Epoch 255 Mean Reward: 87.03408264160156


100%|██████████| 2000/2000 [02:21<00:00, 14.09it/s]


Epoch 256 Mean Reward: 81.07435569000243


100%|██████████| 2000/2000 [02:23<00:00, 13.91it/s]


Epoch 257 Mean Reward: 88.33829863739014


100%|██████████| 2000/2000 [02:24<00:00, 13.83it/s]


Epoch 258 Mean Reward: 84.69595584869384


100%|██████████| 2000/2000 [02:26<00:00, 13.64it/s]


Epoch 259 Mean Reward: 88.6002033843994


100%|██████████| 2000/2000 [02:24<00:00, 13.81it/s]


Epoch 260 Mean Reward: 88.85096588897706
Epoch 260 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 260 test:
Test Episode 1 Reward: 423.21791076660156
Test Episode 2 Reward: 544.3343811035156
Test Episode 3 Reward: 64.04562377929688
Test Episode 4 Reward: 94.95274353027344
Test Episode 5 Reward: 94.95274353027344
Test Episode 6 Reward: 94.95274353027344
Test Episode 7 Reward: 279.9362487792969
Test Episode 8 Reward: 94.95274353027344
Test Episode 9 Reward: 413.3643798828125
Test Episode 10 Reward: 94.95274353027344
Average Test Reward: 219.96622619628906


100%|██████████| 2000/2000 [02:23<00:00, 13.95it/s]


Epoch 261 Mean Reward: 88.5737897567749


100%|██████████| 2000/2000 [02:23<00:00, 13.93it/s]


Epoch 262 Mean Reward: 92.99667167663574


100%|██████████| 2000/2000 [02:32<00:00, 13.15it/s]


Epoch 263 Mean Reward: 85.46902200317383


100%|██████████| 2000/2000 [02:28<00:00, 13.42it/s]


Epoch 264 Mean Reward: 89.18258311462402


100%|██████████| 2000/2000 [02:24<00:00, 13.83it/s]


Epoch 265 Mean Reward: 97.46333115386963


100%|██████████| 2000/2000 [02:23<00:00, 13.93it/s]


Epoch 266 Mean Reward: 97.04025643157959


100%|██████████| 2000/2000 [02:23<00:00, 13.94it/s]


Epoch 267 Mean Reward: 93.4700132598877


100%|██████████| 2000/2000 [02:23<00:00, 13.97it/s]


Epoch 268 Mean Reward: 93.00535375213623


100%|██████████| 2000/2000 [02:21<00:00, 14.14it/s]


Epoch 269 Mean Reward: 95.21198562622071


100%|██████████| 2000/2000 [02:29<00:00, 13.42it/s]


Epoch 270 Mean Reward: 101.51630067443848
Epoch 270 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 270 test:
Test Episode 1 Reward: 83.36027526855469
Test Episode 2 Reward: 55.8548583984375
Test Episode 3 Reward: 381.83457946777344
Test Episode 4 Reward: 51.02587890625
Test Episode 5 Reward: 94.95274353027344
Test Episode 6 Reward: 379.12046813964844
Test Episode 7 Reward: 108.58279418945312
Test Episode 8 Reward: 94.95274353027344
Test Episode 9 Reward: 94.95274353027344
Test Episode 10 Reward: 350.9234313964844
Average Test Reward: 169.5560516357422


100%|██████████| 2000/2000 [02:27<00:00, 13.60it/s]


Epoch 271 Mean Reward: 99.98124350738526


100%|██████████| 2000/2000 [02:23<00:00, 13.95it/s]


Epoch 272 Mean Reward: 96.36980139160156


100%|██████████| 2000/2000 [02:23<00:00, 13.93it/s]


Epoch 273 Mean Reward: 101.54625790405274


100%|██████████| 2000/2000 [02:24<00:00, 13.82it/s]


Epoch 274 Mean Reward: 99.21434970855712


100%|██████████| 2000/2000 [02:22<00:00, 14.02it/s]


Epoch 275 Mean Reward: 94.93024606323242


100%|██████████| 2000/2000 [02:26<00:00, 13.63it/s]


Epoch 276 Mean Reward: 98.82743909454345


100%|██████████| 2000/2000 [02:30<00:00, 13.29it/s]


Epoch 277 Mean Reward: 98.89774664306641


100%|██████████| 2000/2000 [02:30<00:00, 13.29it/s]


Epoch 278 Mean Reward: 108.16316687011718


100%|██████████| 2000/2000 [02:36<00:00, 12.78it/s]


Epoch 279 Mean Reward: 104.72209861755371


100%|██████████| 2000/2000 [02:27<00:00, 13.59it/s]


Epoch 280 Mean Reward: 104.58754673004151
Epoch 280 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 280 test:
Test Episode 1 Reward: 436.63462829589844
Test Episode 2 Reward: 495.6855926513672
Test Episode 3 Reward: 94.95274353027344
Test Episode 4 Reward: 94.95274353027344
Test Episode 5 Reward: 259.6506805419922
Test Episode 6 Reward: 94.95274353027344
Test Episode 7 Reward: 360.44825744628906
Test Episode 8 Reward: 364.9497985839844
Test Episode 9 Reward: 117.172607421875
Test Episode 10 Reward: 267.8540802001953
Average Test Reward: 258.7253875732422


100%|██████████| 2000/2000 [02:26<00:00, 13.65it/s]


Epoch 281 Mean Reward: 103.16366205596924


100%|██████████| 2000/2000 [02:27<00:00, 13.59it/s]


Epoch 282 Mean Reward: 108.18365817260742


100%|██████████| 2000/2000 [02:23<00:00, 13.93it/s]


Epoch 283 Mean Reward: 108.36881188964844


100%|██████████| 2000/2000 [02:31<00:00, 13.24it/s]


Epoch 284 Mean Reward: 104.97644292449951


100%|██████████| 2000/2000 [02:23<00:00, 13.90it/s]


Epoch 285 Mean Reward: 109.59324501800538


100%|██████████| 2000/2000 [02:23<00:00, 13.97it/s]


Epoch 286 Mean Reward: 108.71477503204346


100%|██████████| 2000/2000 [02:25<00:00, 13.79it/s]


Epoch 287 Mean Reward: 110.38066069030762


100%|██████████| 2000/2000 [02:25<00:00, 13.74it/s]


Epoch 288 Mean Reward: 112.1109813079834


100%|██████████| 2000/2000 [02:24<00:00, 13.83it/s]


Epoch 289 Mean Reward: 109.61912941741943


100%|██████████| 2000/2000 [02:23<00:00, 13.89it/s]


Epoch 290 Mean Reward: 113.22713804626464
Epoch 290 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 290 test:
Test Episode 1 Reward: 104.83416748046875
Test Episode 2 Reward: 109.27589416503906
Test Episode 3 Reward: 409.41392517089844
Test Episode 4 Reward: 94.95274353027344
Test Episode 5 Reward: 406.53497314453125
Test Episode 6 Reward: 94.95274353027344
Test Episode 7 Reward: 452.6735076904297
Test Episode 8 Reward: 37.15325927734375
Test Episode 9 Reward: 66.25218200683594
Test Episode 10 Reward: 94.95274353027344
Average Test Reward: 187.09961395263673


100%|██████████| 2000/2000 [02:24<00:00, 13.84it/s]


Epoch 291 Mean Reward: 113.50881014251709


100%|██████████| 2000/2000 [02:28<00:00, 13.48it/s]


Epoch 292 Mean Reward: 112.26267636108399


100%|██████████| 2000/2000 [02:28<00:00, 13.46it/s]


Epoch 293 Mean Reward: 118.2152141647339


100%|██████████| 2000/2000 [02:28<00:00, 13.47it/s]


Epoch 294 Mean Reward: 117.30469587707519


100%|██████████| 2000/2000 [02:37<00:00, 12.69it/s]


Epoch 295 Mean Reward: 118.72471742248536


100%|██████████| 2000/2000 [03:01<00:00, 11.01it/s]


Epoch 296 Mean Reward: 119.94333255004882


100%|██████████| 2000/2000 [02:54<00:00, 11.44it/s]


Epoch 297 Mean Reward: 121.70556326293945


100%|██████████| 2000/2000 [02:31<00:00, 13.22it/s]


Epoch 298 Mean Reward: 117.96974687957764


100%|██████████| 2000/2000 [02:59<00:00, 11.15it/s]


Epoch 299 Mean Reward: 117.13966707611084


100%|██████████| 2000/2000 [02:58<00:00, 11.18it/s]


Epoch 300 Mean Reward: 123.49971556854248
Epoch 300 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 300 test:
Test Episode 1 Reward: 325.65887451171875
Test Episode 2 Reward: 71.63816833496094
Test Episode 3 Reward: 94.95274353027344
Test Episode 4 Reward: 94.95274353027344
Test Episode 5 Reward: 94.95274353027344
Test Episode 6 Reward: 94.95274353027344
Test Episode 7 Reward: 94.95274353027344
Test Episode 8 Reward: 87.32046508789062
Test Episode 9 Reward: 96.662353515625
Test Episode 10 Reward: 42.83306884765625
Average Test Reward: 109.88766479492188


100%|██████████| 2000/2000 [02:28<00:00, 13.46it/s]


Epoch 301 Mean Reward: 127.11848014831543


100%|██████████| 2000/2000 [02:31<00:00, 13.22it/s]


Epoch 302 Mean Reward: 125.79305637359619


100%|██████████| 2000/2000 [02:29<00:00, 13.39it/s]


Epoch 303 Mean Reward: 122.49240831756592


100%|██████████| 2000/2000 [02:34<00:00, 12.95it/s]


Epoch 304 Mean Reward: 121.51158947753906


100%|██████████| 2000/2000 [02:28<00:00, 13.46it/s]


Epoch 305 Mean Reward: 128.81298434448243


100%|██████████| 2000/2000 [02:27<00:00, 13.52it/s]


Epoch 306 Mean Reward: 123.07154546356202


100%|██████████| 2000/2000 [02:28<00:00, 13.46it/s]


Epoch 307 Mean Reward: 122.2884895477295


100%|██████████| 2000/2000 [02:31<00:00, 13.24it/s]


Epoch 308 Mean Reward: 133.91953103637695


100%|██████████| 2000/2000 [02:23<00:00, 13.94it/s]


Epoch 309 Mean Reward: 130.37705339050294


100%|██████████| 2000/2000 [02:24<00:00, 13.89it/s]


Epoch 310 Mean Reward: 135.07583484649658
Epoch 310 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 310 test:
Test Episode 1 Reward: 88.82139587402344
Test Episode 2 Reward: 94.95274353027344
Test Episode 3 Reward: 94.95274353027344
Test Episode 4 Reward: 94.95274353027344
Test Episode 5 Reward: 94.95274353027344
Test Episode 6 Reward: 94.95274353027344
Test Episode 7 Reward: 94.95274353027344
Test Episode 8 Reward: 365.1961975097656
Test Episode 9 Reward: 94.95274353027344
Test Episode 10 Reward: 743.0668640136719
Average Test Reward: 186.1753662109375


100%|██████████| 2000/2000 [02:25<00:00, 13.74it/s]


Epoch 311 Mean Reward: 130.70939418792724


100%|██████████| 2000/2000 [02:24<00:00, 13.87it/s]


Epoch 312 Mean Reward: 128.00812632751465


100%|██████████| 2000/2000 [02:26<00:00, 13.67it/s]


Epoch 313 Mean Reward: 133.26168072509765


100%|██████████| 2000/2000 [02:25<00:00, 13.77it/s]


Epoch 314 Mean Reward: 126.78801050567627


100%|██████████| 2000/2000 [02:26<00:00, 13.65it/s]


Epoch 315 Mean Reward: 134.25693068695068


100%|██████████| 2000/2000 [02:27<00:00, 13.60it/s]


Epoch 316 Mean Reward: 138.16819677734375


100%|██████████| 2000/2000 [02:25<00:00, 13.71it/s]


Epoch 317 Mean Reward: 133.5257292404175


100%|██████████| 2000/2000 [02:26<00:00, 13.66it/s]


Epoch 318 Mean Reward: 129.11085552215576


100%|██████████| 2000/2000 [02:25<00:00, 13.78it/s]


Epoch 319 Mean Reward: 137.10815016937255


100%|██████████| 2000/2000 [02:25<00:00, 13.71it/s]


Epoch 320 Mean Reward: 134.0439838256836
Epoch 320 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 320 test:
Test Episode 1 Reward: 84.38627624511719
Test Episode 2 Reward: 94.95274353027344
Test Episode 3 Reward: 94.95274353027344
Test Episode 4 Reward: 94.95274353027344
Test Episode 5 Reward: 94.95274353027344
Test Episode 6 Reward: 94.95274353027344
Test Episode 7 Reward: 399.2431335449219
Test Episode 8 Reward: 343.638671875
Test Episode 9 Reward: 55.08589172363281
Test Episode 10 Reward: 94.95274353027344
Average Test Reward: 145.20704345703126


100%|██████████| 2000/2000 [02:25<00:00, 13.71it/s]


Epoch 321 Mean Reward: 139.7811756591797


100%|██████████| 2000/2000 [02:26<00:00, 13.68it/s]


Epoch 322 Mean Reward: 140.85573989868163


100%|██████████| 2000/2000 [02:28<00:00, 13.46it/s]


Epoch 323 Mean Reward: 138.0089122390747


100%|██████████| 2000/2000 [02:25<00:00, 13.77it/s]


Epoch 324 Mean Reward: 139.6562544555664


100%|██████████| 2000/2000 [02:37<00:00, 12.69it/s]


Epoch 325 Mean Reward: 140.0277593460083


100%|██████████| 2000/2000 [02:30<00:00, 13.29it/s]


Epoch 326 Mean Reward: 141.11491610717775


100%|██████████| 2000/2000 [02:27<00:00, 13.58it/s]


Epoch 327 Mean Reward: 141.28938678741454


100%|██████████| 2000/2000 [02:24<00:00, 13.80it/s]


Epoch 328 Mean Reward: 144.86174224853517


100%|██████████| 2000/2000 [02:24<00:00, 13.86it/s]


Epoch 329 Mean Reward: 143.0063125152588


100%|██████████| 2000/2000 [02:26<00:00, 13.68it/s]


Epoch 330 Mean Reward: 144.50166162872316
Epoch 330 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 330 test:
Test Episode 1 Reward: 94.95274353027344
Test Episode 2 Reward: 404.86895751953125
Test Episode 3 Reward: 94.95274353027344
Test Episode 4 Reward: 94.95274353027344
Test Episode 5 Reward: 122.57545471191406
Test Episode 6 Reward: 94.95274353027344
Test Episode 7 Reward: 453.12303161621094
Test Episode 8 Reward: 94.95274353027344
Test Episode 9 Reward: 94.95274353027344
Test Episode 10 Reward: 31.0908203125
Average Test Reward: 158.1374725341797


100%|██████████| 2000/2000 [02:31<00:00, 13.22it/s]


Epoch 331 Mean Reward: 143.71023934936522


100%|██████████| 2000/2000 [02:26<00:00, 13.65it/s]


Epoch 332 Mean Reward: 144.91499367523193


100%|██████████| 2000/2000 [02:26<00:00, 13.64it/s]


Epoch 333 Mean Reward: 147.15932440948487


100%|██████████| 2000/2000 [02:32<00:00, 13.14it/s]


Epoch 334 Mean Reward: 147.10029855346679


100%|██████████| 2000/2000 [02:33<00:00, 13.04it/s]


Epoch 335 Mean Reward: 142.01220748138428


100%|██████████| 2000/2000 [02:35<00:00, 12.88it/s]


Epoch 336 Mean Reward: 145.77269679260255


100%|██████████| 2000/2000 [02:26<00:00, 13.61it/s]


Epoch 337 Mean Reward: 149.8691291809082


100%|██████████| 2000/2000 [02:27<00:00, 13.57it/s]


Epoch 338 Mean Reward: 145.40118473815917


100%|██████████| 2000/2000 [02:31<00:00, 13.17it/s]


Epoch 339 Mean Reward: 148.03415155029296


100%|██████████| 2000/2000 [02:30<00:00, 13.32it/s]


Epoch 340 Mean Reward: 147.85685977172852
Epoch 340 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 340 test:
Test Episode 1 Reward: 130.24093627929688
Test Episode 2 Reward: 94.95274353027344
Test Episode 3 Reward: 94.95274353027344
Test Episode 4 Reward: 94.95274353027344
Test Episode 5 Reward: 94.95274353027344
Test Episode 6 Reward: 94.95274353027344
Test Episode 7 Reward: 163.0269317626953
Test Episode 8 Reward: 83.32136535644531
Test Episode 9 Reward: 340.4814910888672
Test Episode 10 Reward: 351.0386199951172
Average Test Reward: 154.2873062133789


100%|██████████| 2000/2000 [02:25<00:00, 13.70it/s]


Epoch 341 Mean Reward: 154.61944054412842


100%|██████████| 2000/2000 [02:24<00:00, 13.80it/s]


Epoch 342 Mean Reward: 150.22274297332763


100%|██████████| 2000/2000 [02:26<00:00, 13.64it/s]


Epoch 343 Mean Reward: 145.83095833587646


100%|██████████| 2000/2000 [02:23<00:00, 13.93it/s]


Epoch 344 Mean Reward: 151.66565408325195


100%|██████████| 2000/2000 [02:23<00:00, 13.96it/s]


Epoch 345 Mean Reward: 149.92573482513427


100%|██████████| 2000/2000 [02:23<00:00, 13.97it/s]


Epoch 346 Mean Reward: 149.19011762237548


100%|██████████| 2000/2000 [02:24<00:00, 13.86it/s]


Epoch 347 Mean Reward: 154.60252834320067


100%|██████████| 2000/2000 [02:24<00:00, 13.88it/s]


Epoch 348 Mean Reward: 147.42406478881836


100%|██████████| 2000/2000 [02:29<00:00, 13.35it/s]


Epoch 349 Mean Reward: 158.53872409057618


100%|██████████| 2000/2000 [02:26<00:00, 13.61it/s]


Epoch 350 Mean Reward: 159.60687886047364
Epoch 350 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 350 test:
Test Episode 1 Reward: 59.083709716796875
Test Episode 2 Reward: 129.714599609375
Test Episode 3 Reward: 94.61294555664062
Test Episode 4 Reward: 554.4261169433594
Test Episode 5 Reward: 343.6560974121094
Test Episode 6 Reward: 778.3361358642578
Test Episode 7 Reward: 110.80348205566406
Test Episode 8 Reward: 94.95274353027344
Test Episode 9 Reward: 92.28749084472656
Test Episode 10 Reward: 57.42742919921875
Average Test Reward: 231.5300750732422


100%|██████████| 2000/2000 [02:29<00:00, 13.38it/s]


Epoch 351 Mean Reward: 151.42050246429443


100%|██████████| 2000/2000 [02:31<00:00, 13.22it/s]


Epoch 352 Mean Reward: 155.1132053451538


100%|██████████| 2000/2000 [02:33<00:00, 13.02it/s]


Epoch 353 Mean Reward: 153.68421374511718


100%|██████████| 2000/2000 [02:31<00:00, 13.23it/s]


Epoch 354 Mean Reward: 158.60867393493652


100%|██████████| 2000/2000 [02:35<00:00, 12.83it/s]


Epoch 355 Mean Reward: 155.54192456054687


100%|██████████| 2000/2000 [02:29<00:00, 13.41it/s]


Epoch 356 Mean Reward: 155.65593061065672


100%|██████████| 2000/2000 [02:29<00:00, 13.34it/s]


Epoch 357 Mean Reward: 157.28478862762452


100%|██████████| 2000/2000 [02:29<00:00, 13.38it/s]


Epoch 358 Mean Reward: 163.45138753509522


100%|██████████| 2000/2000 [02:26<00:00, 13.65it/s]


Epoch 359 Mean Reward: 155.44209258270263


100%|██████████| 2000/2000 [02:31<00:00, 13.20it/s]


Epoch 360 Mean Reward: 155.48956537628175
Epoch 360 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 360 test:
Test Episode 1 Reward: 98.796630859375
Test Episode 2 Reward: 94.95274353027344
Test Episode 3 Reward: 94.95274353027344
Test Episode 4 Reward: 44.793853759765625
Test Episode 5 Reward: 52.55488586425781
Test Episode 6 Reward: 140.21417236328125
Test Episode 7 Reward: 94.95274353027344
Test Episode 8 Reward: 94.95274353027344
Test Episode 9 Reward: 483.1988830566406
Test Episode 10 Reward: 372.0876770019531
Average Test Reward: 157.1457077026367


100%|██████████| 2000/2000 [02:32<00:00, 13.10it/s]


Epoch 361 Mean Reward: 157.01255365753173


100%|██████████| 2000/2000 [02:30<00:00, 13.26it/s]


Epoch 362 Mean Reward: 156.06422957611085


100%|██████████| 2000/2000 [02:30<00:00, 13.28it/s]


Epoch 363 Mean Reward: 159.90442721557616


100%|██████████| 2000/2000 [02:31<00:00, 13.18it/s]


Epoch 364 Mean Reward: 160.90305407714843


100%|██████████| 2000/2000 [02:29<00:00, 13.36it/s]


Epoch 365 Mean Reward: 157.7660632019043


100%|██████████| 2000/2000 [02:34<00:00, 12.96it/s]


Epoch 366 Mean Reward: 159.94758576202392


100%|██████████| 2000/2000 [02:29<00:00, 13.34it/s]


Epoch 367 Mean Reward: 154.4008077774048


100%|██████████| 2000/2000 [02:27<00:00, 13.54it/s]


Epoch 368 Mean Reward: 159.06101593780517


100%|██████████| 2000/2000 [02:19<00:00, 14.30it/s]


Epoch 369 Mean Reward: 159.84108518981932


100%|██████████| 2000/2000 [02:21<00:00, 14.17it/s]


Epoch 370 Mean Reward: 158.78162098693846
Epoch 370 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 370 test:
Test Episode 1 Reward: 64.31797790527344
Test Episode 2 Reward: 419.51637268066406
Test Episode 3 Reward: 94.95274353027344
Test Episode 4 Reward: 94.95274353027344
Test Episode 5 Reward: 94.95274353027344
Test Episode 6 Reward: 74.39488220214844
Test Episode 7 Reward: 365.012939453125
Test Episode 8 Reward: 71.84408569335938
Test Episode 9 Reward: 386.15370178222656
Test Episode 10 Reward: 367.8740539550781
Average Test Reward: 203.39722442626953


100%|██████████| 2000/2000 [02:20<00:00, 14.21it/s]


Epoch 371 Mean Reward: 157.68590926361085


100%|██████████| 2000/2000 [02:22<00:00, 14.06it/s]


Epoch 372 Mean Reward: 160.35800583648683


100%|██████████| 2000/2000 [02:24<00:00, 13.87it/s]


Epoch 373 Mean Reward: 159.23162016296388


100%|██████████| 2000/2000 [02:20<00:00, 14.27it/s]


Epoch 374 Mean Reward: 154.84114337921142


100%|██████████| 2000/2000 [02:20<00:00, 14.28it/s]


Epoch 375 Mean Reward: 155.1146449279785


100%|██████████| 2000/2000 [02:23<00:00, 13.97it/s]


Epoch 376 Mean Reward: 156.7030251083374


100%|██████████| 2000/2000 [02:23<00:00, 13.94it/s]


Epoch 377 Mean Reward: 157.21926063537597


100%|██████████| 2000/2000 [02:21<00:00, 14.13it/s]


Epoch 378 Mean Reward: 158.43293243408203


100%|██████████| 2000/2000 [02:24<00:00, 13.83it/s]


Epoch 379 Mean Reward: 156.0287890548706


100%|██████████| 2000/2000 [02:20<00:00, 14.20it/s]


Epoch 380 Mean Reward: 156.3571813583374
Epoch 380 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 380 test:
Test Episode 1 Reward: 94.95274353027344
Test Episode 2 Reward: 94.95274353027344
Test Episode 3 Reward: 446.8427734375
Test Episode 4 Reward: 94.95274353027344
Test Episode 5 Reward: 90.31855773925781
Test Episode 6 Reward: 94.95274353027344
Test Episode 7 Reward: 94.95274353027344
Test Episode 8 Reward: 94.95274353027344
Test Episode 9 Reward: 118.47615051269531
Test Episode 10 Reward: 94.95274353027344
Average Test Reward: 132.03066864013672


100%|██████████| 2000/2000 [02:22<00:00, 14.01it/s]


Epoch 381 Mean Reward: 162.67809741210937


100%|██████████| 2000/2000 [02:23<00:00, 13.93it/s]


Epoch 382 Mean Reward: 164.11837662506105


100%|██████████| 2000/2000 [02:23<00:00, 13.98it/s]


Epoch 383 Mean Reward: 155.8616279449463


100%|██████████| 2000/2000 [02:24<00:00, 13.83it/s]


Epoch 384 Mean Reward: 157.05999005889893


100%|██████████| 2000/2000 [02:28<00:00, 13.51it/s]


Epoch 385 Mean Reward: 159.29416758728027


100%|██████████| 2000/2000 [02:24<00:00, 13.82it/s]


Epoch 386 Mean Reward: 159.81199477386474


100%|██████████| 2000/2000 [02:23<00:00, 13.93it/s]


Epoch 387 Mean Reward: 152.95711037445068


100%|██████████| 2000/2000 [02:27<00:00, 13.58it/s]


Epoch 388 Mean Reward: 158.27680979919433


100%|██████████| 2000/2000 [02:24<00:00, 13.87it/s]


Epoch 389 Mean Reward: 160.64708879089355


100%|██████████| 2000/2000 [02:23<00:00, 13.95it/s]


Epoch 390 Mean Reward: 150.14258264923095
Epoch 390 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 390 test:
Test Episode 1 Reward: 114.62875366210938
Test Episode 2 Reward: 94.95274353027344
Test Episode 3 Reward: 408.9574432373047
Test Episode 4 Reward: 94.95274353027344
Test Episode 5 Reward: 94.95274353027344
Test Episode 6 Reward: 94.95274353027344
Test Episode 7 Reward: 124.65696716308594
Test Episode 8 Reward: 85.2744140625
Test Episode 9 Reward: 104.12887573242188
Test Episode 10 Reward: 84.6474609375
Average Test Reward: 130.21048889160156


100%|██████████| 2000/2000 [02:28<00:00, 13.45it/s]


Epoch 391 Mean Reward: 155.408916557312


100%|██████████| 2000/2000 [02:22<00:00, 14.01it/s]


Epoch 392 Mean Reward: 161.4412425994873


100%|██████████| 2000/2000 [02:22<00:00, 14.01it/s]


Epoch 393 Mean Reward: 158.4338655014038


100%|██████████| 2000/2000 [02:23<00:00, 13.97it/s]


Epoch 394 Mean Reward: 163.9844273147583


100%|██████████| 2000/2000 [02:22<00:00, 14.01it/s]


Epoch 395 Mean Reward: 162.34578240966798


100%|██████████| 2000/2000 [02:22<00:00, 14.07it/s]


Epoch 396 Mean Reward: 158.48597412872314


100%|██████████| 2000/2000 [02:25<00:00, 13.75it/s]


Epoch 397 Mean Reward: 165.1335715484619


100%|██████████| 2000/2000 [02:21<00:00, 14.10it/s]


Epoch 398 Mean Reward: 157.7378997116089


100%|██████████| 2000/2000 [02:19<00:00, 14.31it/s]


Epoch 399 Mean Reward: 156.39778678131103


100%|██████████| 2000/2000 [02:21<00:00, 14.16it/s]


Epoch 400 Mean Reward: 161.1288366470337
Epoch 400 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 400 test:
Test Episode 1 Reward: 94.95274353027344
Test Episode 2 Reward: 181.20811462402344
Test Episode 3 Reward: 94.95274353027344
Test Episode 4 Reward: 94.95274353027344
Test Episode 5 Reward: 94.95274353027344
Test Episode 6 Reward: 392.19850158691406
Test Episode 7 Reward: 100.43850708007812
Test Episode 8 Reward: 94.95274353027344
Test Episode 9 Reward: 94.95274353027344
Test Episode 10 Reward: 489.7906494140625
Average Test Reward: 173.33522338867186
[(277.4553970336914, 60), (258.7253875732422, 280), (231.5300750732422, 350), (225.74791412353517, 20), (219.96622619628906, 260), (203.39722442626953, 370), (199.0168243408203, 90), (198.821826171875, 40), (189.25107421875, 210), (187.09961395263673, 290), (186.1753662109375, 310), (173.33522338867186, 400), (170.41588592529297, 120), (169.5560516357422, 270), (168.12531433105468, 80), (165.8619171142578, 200), (163.01374816

In [6]:
#Get a list of checkpoints saved during training

ckpts = tf.train.get_checkpoint_state('checkpoints').all_model_checkpoint_paths

#Test the trained model from a certain checkpoint by only choosing actions with a greedy strategy

test_reward = test_agent(DQN, num_episodes=20, training=False, load_model=True, model_dir=ckpts[-1])
print('Average Test Reward:', test_reward)


Loading model from checkpoints\deadly_corridor.ckpt-360
INFO:tensorflow:Restoring parameters from checkpoints\deadly_corridor.ckpt-360


  warn("The default mode, 'constant', will be changed to 'reflect' in "


Test Episode 1 Reward: 94.95274353027344
Test Episode 2 Reward: 43.86524963378906
Test Episode 3 Reward: 49.52287292480469
Test Episode 4 Reward: 68.5986328125
Test Episode 5 Reward: 94.95274353027344
Test Episode 6 Reward: 38.95274353027344
Test Episode 7 Reward: 94.95274353027344
Test Episode 8 Reward: 94.95274353027344
Test Episode 9 Reward: 7.710235595703125
Test Episode 10 Reward: 77.83247375488281
Test Episode 11 Reward: 94.95274353027344
Test Episode 12 Reward: 114.81394958496094
Test Episode 13 Reward: 116.73039245605469
Test Episode 14 Reward: 94.95274353027344
Test Episode 15 Reward: 94.95274353027344
Test Episode 16 Reward: 382.83551025390625
Test Episode 17 Reward: 359.22511291503906
Test Episode 18 Reward: 371.7436218261719
Test Episode 19 Reward: 94.95274353027344
Test Episode 20 Reward: 94.95274353027344
Average Test Reward: 124.32027435302734
