In [1]:
import importlib.util
import scipy.misc
import time

import tensorflow as tf
import numpy as np

from tqdm import trange
from IPython.display import HTML

#Import the vizdoom package as "vd" since it can't be installed normally on Windows

vd_location = 'C:/Anaconda3/envs/doom/Lib/site-packages/vizdoom/vizdoom.pyd'
vizdoom = importlib.util.spec_from_file_location('vizdoom',
                                                 vd_location)
vd = importlib.util.module_from_spec(vizdoom)
vizdoom.loader.exec_module(vd)


In [2]:
#Specify the game scenario and the screen format/resolution

game = vd.DoomGame()
game.set_screen_format(vd.ScreenFormat.BGR24)
game.set_depth_buffer_enabled(True)
game.set_screen_resolution(vd.ScreenResolution.RES_160X120)
game.load_config('deadly_corridor.cfg')

down_sample_ratio = 0.5
width = int(game.get_screen_width()*down_sample_ratio)
height = int(game.get_screen_height()*down_sample_ratio)
channels = game.get_screen_channels() + 1

#Specify the available actions in the scenario

available_actions = game.get_available_buttons()
actions = [list(ohe) for ohe in list(np.identity(len(available_actions)))]
num_actions = len(available_actions)

#Specify the Q-network learning parameters

frame_delay = 12
buffer_size = 50000
epochs = 400
steps_per_epoch = 2000
learning_rate = 0.005
gamma = 0
start_epsilon = 1.0
end_epsilon = 0.1
batch_size = 100
load_model = False
save_model = True
model_dir = './checkpoints/deadly_corridor.ckpt'


In [3]:
#Create a buffer object that holds a set of training experiences (state-action-reward tuples)

class Buffer():
    def __init__(self, size=1000):
        self.buffer = list()
        self.length = len(self.buffer)
        self.size = size
        
#Add a new experience to the buffer (remove the oldest experience if the buffer is already full)
        
    def add_experience(self, experience):
        if self.length + 1 >= self.size:
            self.buffer[0:(self.length + 1) - self.size] = []
        
        self.buffer.append(experience)
        self.length = len(self.buffer)
            
#Return a batch of experience arrays randomly sampled from the buffer
            
    def sample_buffer(self, sample_size):
        sample = np.random.randint(self.length, size=sample_size)
        s1 = np.concatenate([self.buffer[idx][0] for idx in sample], axis=0)
        a = np.array([self.buffer[idx][1] for idx in sample])
        r = np.array([self.buffer[idx][2] for idx in sample])
        s2 = np.concatenate([self.buffer[idx][3] for idx in sample], axis=0)
        terminal = np.array([self.buffer[idx][4] for idx in sample], dtype=np.int32)
        
        return s1, a, r, s2, terminal

#Downsample and normalize an image array representing the game state at a given time stamp

def preprocess(image, down_sample_ratio=1):
    if down_sample_ratio != 1:
        image = scipy.misc.imresize(image, down_sample_ratio)
    image = image.astype(np.float32)
    image /= 255.0
    image = np.expand_dims(image, axis=0)
    
    return image

#Test the agent using a currently training or previously trained model

def test_agent(model, num_episodes, load_model, training=True, session=None, model_dir=None):
    if load_model == True:
        sess = tf.Session()
        print('Loading model from', model_dir)
        tf.train.Saver().restore(sess, model_dir)
        
#Require an existing session if a pretrained model isn't provided
        
    elif load_model == False:
        sess = session

    game.set_sound_enabled(True)
    episode_rewards = list()
    
#Avoid reinitializing the game if this was already done by the training process
    
    if training == False:
        game.init()

    for i in range(num_episodes):
        game.new_episode()
    
        while not game.is_episode_finished():
            state = game.get_state()
            buffer = np.concatenate((state.screen_buffer,
                                     np.expand_dims(state.depth_buffer,
                                                    axis=2)),
                                    axis=2)
            state1 = preprocess(buffer, down_sample_ratio)
            action = model.choose_action(sess, state1)[0]
            reward = game.make_action(actions[action])
            
#Add a delay between each time step so that the episodes occur at normal speed

            time.sleep(0.02)
        
        episode_rewards.append(game.get_total_reward())
        print('Test Episode {} Reward: {}'.format(i + 1, game.get_total_reward()))
        time.sleep(1)
    
#Avoid ending the game so that the training process can continue
    
    if training == False:
        game.close()
    
    return np.mean(episode_rewards)


In [4]:
#Create a Q-network to estimate values and choose actions for a given state

class Q_network():
    def __init__(self, network_name, height, width, channels, learning_rate=0.001):
        self.learning_rate = learning_rate
        self.s_t = tf.placeholder(tf.float32,
                                  shape=[None, height, width, channels],
                                  name=network_name + '_state'
                                 )
        self.a_t = tf.placeholder(tf.int32,
                                  shape=[None],
                                  name=network_name + '_action'
                                 )
        self.Q_target = tf.placeholder(tf.float32,
                                       shape=[None, num_actions],
                                       name=network_name + '_Q_target'
                                      )

        self.input_layer = tf.reshape(self.s_t,
                                      [-1, height, width, channels],
                                      name=network_name + '_input_layer'
                                     )
        self.conv1 = tf.layers.conv2d(inputs=self.input_layer,
                                      filters=32,
                                      kernel_size=[8, 8],
                                      strides=[4, 4],
                                      padding='valid',
                                      activation=tf.nn.relu,
                                      name=network_name + '_conv1_layer'
                                     )
        self.conv2 = tf.layers.conv2d(inputs=self.conv1,
                                      filters=64,
                                      kernel_size=[4, 4],
                                      strides=[2, 2],
                                      padding='valid',
                                      activation=tf.nn.relu,
                                      name=network_name + '_conv2_layer'
                                     )
        self.flatten = tf.reshape(self.conv2,
                                  [-1, 6*8*64],
                                  name=network_name + '_flatten'
                                 )
        self.dense = tf.layers.dense(inputs=self.flatten,
                                      units=512,
                                      activation=tf.nn.relu,
                                      name=network_name + '_dense1_layer'
                                    )
        self.Q_values = tf.layers.dense(inputs=self.dense,
                                        units=len(actions),
                                        activation=None,
                                        name=network_name + '_output_layer'
                                       )        
    
        self.best_action = tf.argmax(self.Q_values, 1)
        self.loss = tf.losses.mean_squared_error(self.Q_values,
                                                 self.Q_target)
        self.adam = tf.train.AdamOptimizer(learning_rate=self.learning_rate,
                                           name=network_name + '_adam'
                                          )
        self.train = self.adam.minimize(self.loss)
        
    def update_lr(self, epoch):
        self.learning_rate = 0.98*self.learning_rate

    def calculate_loss(self, session, s, q):
        L, _ = session.run([self.loss, self.train],
                           feed_dict={self.s_t: s,
                                      self.Q_target: q})
    
        return L

#Return the array of Q-values and the best action associated with a given state

    def get_Q_values(self, session, s):
        Q = session.run(self.Q_values,
                        feed_dict={self.s_t: s})

        return Q
    
    def choose_action(self, session, s):
        a = session.run(self.best_action,
                        feed_dict={self.s_t: s})
    
        return a
    
#Create a list of variable update operations

def update_graph(variables):
    update_ops = list()
    
#Assign weight values from the network created first to the one created second
    
    for idx, variable in enumerate(variables[:len(variables)//2]):
        op = variable.assign(variables[idx + len(variables)//2].value())
        update_ops.append(op)
    
    return update_ops

#Update the target network parameters to match those of the online network

def update_target(ops, session):
    for op in update_ops:
        session.run(op)


In [5]:
#For each time step, collect the following data:
#The current game state
#The action that was taken taken
#The reward obtained from the chosen action
#The next game state (store the first game state if the previous action ends the episode)
#A variable indicating whether the episode is over yet


tf.reset_default_graph()

#Instantiate the target network before the online network so that it's updated correctly

target_net = Q_network(network_name='target',
                       learning_rate=learning_rate,
                       height=height,
                       width=width,
                       channels=channels)
DQN = Q_network(network_name='online',
                learning_rate=learning_rate,
                height=height,
                width=width,
                channels=channels)

exp_buffer = Buffer(size=buffer_size)
session = tf.Session()
saver = tf.train.Saver()
weights = tf.trainable_variables()

update_ops = update_graph(weights)

if load_model == True:
    print('Loading model from', model_dir)
    tf.train.Saver().restore(session, model_dir)
    
elif load_model == False:
    session.run(tf.global_variables_initializer())

game.set_sound_enabled(False)
game.init()

t = 0
epoch_rank = list()

#Accumulate experiences in the buffer using an epsilon-greedy strategy with three training phases

for epoch in range(epochs):
    epoch_rewards = list()
    
    for step in trange(steps_per_epoch, leave=True):
        experience = list()
        game.new_episode()
        
        while not game.is_episode_finished():
            state = game.get_state()
            state1 = preprocess(np.concatenate((state.screen_buffer,
                                                np.expand_dims(state.depth_buffer, axis=2)),
                                                axis=2),
                                               down_sample_ratio)
            
#Explore the environment by choosing random actions with 100% probability for the first phase of training

            if epoch < 0.3*epochs:
                action = np.random.randint(num_actions)
            
#Increase the probability of greedily choosing an action by a constant amount at each epoch in the second phase
            
            elif epoch < 0.9*epochs:
                epsilon = start_epsilon - (epoch + 1 - 0.2*epochs)*(start_epsilon-end_epsilon)/(0.7*epochs)
            
                if np.random.uniform(0, 1) <= epsilon:
                    action = np.random.randint(num_actions)
                
                else:
                    action = DQN.choose_action(session, state1)[0]

#Select a random action with 10% probability in the final phase of training
                
            else:
                if np.random.uniform(0, 1) <= end_epsilon:
                    action = np.random.randint(num_actions)
                    
                else:
                    action = DQN.choose_action(session, state1)[0]

            reward = game.make_action(actions[action], frame_delay)
            done = game.is_episode_finished()
            
            if done == False:
                state = game.get_state()
                state2 = preprocess(np.concatenate((state.screen_buffer,
                                                    np.expand_dims(state.depth_buffer, axis=2)),
                                                    axis=2),
                                                    down_sample_ratio)
        
            elif done == True:
                state2 = state1
        
#Add the experience obtained from each time step to the buffer

            t += 1
            exp_buffer.add_experience((state1, action, reward, state2, done))
        
#Sample a minibatch from the buffer if there are enough experiences in the buffer

        if exp_buffer.length > batch_size:
            s1, a, r, s2, terminal = exp_buffer.sample_buffer(batch_size)
            
#Get the target values from the target Q-network
            
            target_Q = np.max(target_net.get_Q_values(session, s2), axis=1)
            
#Train the online Q-network by using a minibatch to update the action-value function
            
            Q2 = DQN.get_Q_values(session, s1)
            Q2[np.arange(batch_size), a] = r + gamma*(1 - terminal)*target_Q
            DQN.calculate_loss(session, s1, Q2)
            
        epoch_rewards.append(game.get_total_reward())
        
#Increase the discount factor at each epoch until it reaches 0.99
    
    if gamma < 0.99:
        gamma = 1-.98*(1-gamma)
        
    elif gamma >= 0.99:
        gamma = 0.99
        
#Decrease the learning rate at each epoch

    DQN.update_lr(epoch)
    target_net.update_lr(epoch)
    
    print('Epoch {} Mean Reward: {}'.format(epoch + 1, np.mean(epoch_rewards)))
    
#Save the model, update the target network, and test the agent for 10 episodes every 10 epochs
    
    if (epoch + 1) % 10 == 0 and epoch > 0:
        if save_model == True:
            checkpoint = model_dir + '-' + str(epoch + 1)
            print('Epoch {} Model saved to {}'.format(epoch + 1, model_dir))
            saver.save(session, model_dir, global_step=epoch + 1)
            
        update_target(update_ops, session)

        print('Epoch {} test:'.format(epoch + 1))
        test_reward = test_agent(DQN, num_episodes=10,
                                 training=True,
                                 load_model=False,
                                 session=session,
                                 model_dir=model_dir)
        print('Average Test Reward:', test_reward)
        epoch_rank.append((test_reward, epoch + 1))
        
#Return a sorted list of epoch checkpoints based on average test episode reward
        
print(sorted(epoch_rank, reverse=True))
print('{} time steps experienced during training'.format(t))
game.close()
    

100%|██████████| 2000/2000 [03:17<00:00, 10.14it/s]


Epoch 1 Mean Reward: 133.93492488861085


100%|██████████| 2000/2000 [02:17<00:00, 14.50it/s]


Epoch 2 Mean Reward: 131.562821975708


100%|██████████| 2000/2000 [02:09<00:00, 15.47it/s]


Epoch 3 Mean Reward: 132.64748331451415


100%|██████████| 2000/2000 [02:18<00:00, 14.47it/s]


Epoch 4 Mean Reward: 129.6057086791992


100%|██████████| 2000/2000 [02:33<00:00, 13.03it/s]


Epoch 5 Mean Reward: 130.34570848083496


100%|██████████| 2000/2000 [02:20<00:00, 14.21it/s]


Epoch 6 Mean Reward: 132.9259105911255


100%|██████████| 2000/2000 [02:18<00:00, 14.46it/s]


Epoch 7 Mean Reward: 128.9242252960205


100%|██████████| 2000/2000 [02:28<00:00, 13.45it/s]


Epoch 8 Mean Reward: 131.26713368225097


100%|██████████| 2000/2000 [02:19<00:00, 14.30it/s]


Epoch 9 Mean Reward: 130.07278826904297


100%|██████████| 2000/2000 [02:18<00:00, 14.40it/s]


Epoch 10 Mean Reward: 132.73794338989256
Epoch 10 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 10 test:
Test Episode 1 Reward: 288.933349609375
Test Episode 2 Reward: 461.32579040527344
Test Episode 3 Reward: 279.26426696777344
Test Episode 4 Reward: 259.16624450683594
Test Episode 5 Reward: 288.933349609375
Test Episode 6 Reward: 355.9276885986328
Test Episode 7 Reward: 288.933349609375
Test Episode 8 Reward: 288.933349609375
Test Episode 9 Reward: 288.933349609375
Test Episode 10 Reward: 288.933349609375
Average Test Reward: 308.928408813


100%|██████████| 2000/2000 [02:13<00:00, 15.01it/s]


Epoch 11 Mean Reward: 131.00028078460693


100%|██████████| 2000/2000 [02:13<00:00, 14.97it/s]


Epoch 12 Mean Reward: 133.1030963821411


100%|██████████| 2000/2000 [02:19<00:00, 14.36it/s]


Epoch 13 Mean Reward: 131.08015776062012


100%|██████████| 2000/2000 [02:29<00:00, 13.35it/s]


Epoch 14 Mean Reward: 133.23720618438722


100%|██████████| 2000/2000 [02:17<00:00, 14.60it/s]


Epoch 15 Mean Reward: 130.66719827270506


100%|██████████| 2000/2000 [02:57<00:00, 11.29it/s]


Epoch 16 Mean Reward: 130.12384811401367


100%|██████████| 2000/2000 [03:20<00:00,  9.96it/s]


Epoch 17 Mean Reward: 130.1507855911255


100%|██████████| 2000/2000 [03:33<00:00,  9.36it/s]


Epoch 18 Mean Reward: 132.20118627929688


100%|██████████| 2000/2000 [04:04<00:00,  8.18it/s]


Epoch 19 Mean Reward: 133.0462350769043


100%|██████████| 2000/2000 [03:54<00:00,  8.53it/s]


Epoch 20 Mean Reward: 131.66592566680907
Epoch 20 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 20 test:
Test Episode 1 Reward: 626.3114471435547
Test Episode 2 Reward: 294.17149353027344
Test Episode 3 Reward: 294.17149353027344
Test Episode 4 Reward: 331.57989501953125
Test Episode 5 Reward: 294.17149353027344
Test Episode 6 Reward: 295.50608825683594
Test Episode 7 Reward: 276.93711853027344
Test Episode 8 Reward: 510.5562744140625
Test Episode 9 Reward: 271.6495666503906
Test Episode 10 Reward: 294.17149353027344
Average Test Reward: 348.922636414


100%|██████████| 2000/2000 [03:00<00:00, 11.06it/s]


Epoch 21 Mean Reward: 129.65494860076905


100%|██████████| 2000/2000 [02:33<00:00, 13.00it/s]


Epoch 22 Mean Reward: 128.75511841583253


100%|██████████| 2000/2000 [02:36<00:00, 12.74it/s]


Epoch 23 Mean Reward: 131.4754843673706


100%|██████████| 2000/2000 [02:59<00:00, 11.13it/s]


Epoch 24 Mean Reward: 129.56012959289552


100%|██████████| 2000/2000 [03:00<00:00, 11.06it/s]


Epoch 25 Mean Reward: 131.6890545425415


100%|██████████| 2000/2000 [02:42<00:00, 12.30it/s]


Epoch 26 Mean Reward: 130.34807764434814


100%|██████████| 2000/2000 [02:35<00:00, 12.86it/s]


Epoch 27 Mean Reward: 132.28181185150146


100%|██████████| 2000/2000 [02:45<00:00, 12.09it/s]


Epoch 28 Mean Reward: 129.54599499511718


100%|██████████| 2000/2000 [02:53<00:00, 11.52it/s]


Epoch 29 Mean Reward: 133.66890339660645


100%|██████████| 2000/2000 [02:46<00:00, 11.98it/s]


Epoch 30 Mean Reward: 130.66076070404054
Epoch 30 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 30 test:
Test Episode 1 Reward: 298.17909240722656
Test Episode 2 Reward: 287.59173583984375
Test Episode 3 Reward: 287.59173583984375
Test Episode 4 Reward: 287.59173583984375
Test Episode 5 Reward: 287.59173583984375
Test Episode 6 Reward: 287.59173583984375
Test Episode 7 Reward: 287.59173583984375
Test Episode 8 Reward: 287.59173583984375
Test Episode 9 Reward: 287.59173583984375
Test Episode 10 Reward: 287.59173583984375
Average Test Reward: 288.650471497


100%|██████████| 2000/2000 [02:52<00:00, 11.62it/s]


Epoch 31 Mean Reward: 132.8915485687256


100%|██████████| 2000/2000 [02:36<00:00, 12.78it/s]


Epoch 32 Mean Reward: 128.09580780029296


100%|██████████| 2000/2000 [02:47<00:00, 11.94it/s]


Epoch 33 Mean Reward: 132.1351231994629


100%|██████████| 2000/2000 [02:37<00:00, 12.68it/s]


Epoch 34 Mean Reward: 131.781195602417


100%|██████████| 2000/2000 [02:09<00:00, 15.45it/s]


Epoch 35 Mean Reward: 130.18927787017822


100%|██████████| 2000/2000 [02:14<00:00, 14.88it/s]


Epoch 36 Mean Reward: 134.06125072479247


100%|██████████| 2000/2000 [02:11<00:00, 15.25it/s]


Epoch 37 Mean Reward: 131.5949529724121


100%|██████████| 2000/2000 [01:55<00:00, 17.25it/s]


Epoch 38 Mean Reward: 131.5716524810791


100%|██████████| 2000/2000 [01:53<00:00, 17.57it/s]


Epoch 39 Mean Reward: 128.95631762695314


100%|██████████| 2000/2000 [01:54<00:00, 17.54it/s]


Epoch 40 Mean Reward: 135.76236150360108
Epoch 40 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 40 test:
Test Episode 1 Reward: 294.95274353027344
Test Episode 2 Reward: 294.95274353027344
Test Episode 3 Reward: 294.95274353027344
Test Episode 4 Reward: 517.4681549072266
Test Episode 5 Reward: 294.95274353027344
Test Episode 6 Reward: 240.5364227294922
Test Episode 7 Reward: 294.95274353027344
Test Episode 8 Reward: 311.61915588378906
Test Episode 9 Reward: 627.3293609619141
Test Episode 10 Reward: 294.95274353027344
Average Test Reward: 346.666955566


100%|██████████| 2000/2000 [02:09<00:00, 15.46it/s]


Epoch 41 Mean Reward: 129.56607960510254


100%|██████████| 2000/2000 [02:01<00:00, 16.49it/s]


Epoch 42 Mean Reward: 131.87327655029296


100%|██████████| 2000/2000 [01:54<00:00, 17.42it/s]


Epoch 43 Mean Reward: 132.45122970581053


100%|██████████| 2000/2000 [01:53<00:00, 17.69it/s]


Epoch 44 Mean Reward: 129.69351055908203


100%|██████████| 2000/2000 [01:53<00:00, 17.60it/s]


Epoch 45 Mean Reward: 129.02075144195555


100%|██████████| 2000/2000 [01:55<00:00, 17.33it/s]


Epoch 46 Mean Reward: 129.47149725341797


100%|██████████| 2000/2000 [01:53<00:00, 17.61it/s]


Epoch 47 Mean Reward: 131.22543043518067


100%|██████████| 2000/2000 [01:53<00:00, 17.67it/s]


Epoch 48 Mean Reward: 132.31783506011962


100%|██████████| 2000/2000 [02:04<00:00, 16.10it/s]


Epoch 49 Mean Reward: 132.78627760314941


100%|██████████| 2000/2000 [01:53<00:00, 17.67it/s]


Epoch 50 Mean Reward: 130.59035663604737
Epoch 50 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 50 test:
Test Episode 1 Reward: 311.4622344970703
Test Episode 2 Reward: 294.17149353027344
Test Episode 3 Reward: 294.17149353027344
Test Episode 4 Reward: 468.6620178222656
Test Episode 5 Reward: 294.17149353027344
Test Episode 6 Reward: 294.17149353027344
Test Episode 7 Reward: 294.17149353027344
Test Episode 8 Reward: 294.17149353027344
Test Episode 9 Reward: 249.67764282226562
Test Episode 10 Reward: 593.3132629394531
Average Test Reward: 338.814411926


100%|██████████| 2000/2000 [01:56<00:00, 17.10it/s]


Epoch 51 Mean Reward: 133.5544284362793


100%|██████████| 2000/2000 [01:55<00:00, 17.26it/s]


Epoch 52 Mean Reward: 129.441777885437


100%|██████████| 2000/2000 [02:12<00:00, 15.14it/s]


Epoch 53 Mean Reward: 130.89973543548584


100%|██████████| 2000/2000 [02:06<00:00, 15.83it/s]


Epoch 54 Mean Reward: 131.64153871154784


100%|██████████| 2000/2000 [02:05<00:00, 15.91it/s]


Epoch 55 Mean Reward: 129.78496310424805


100%|██████████| 2000/2000 [02:15<00:00, 14.75it/s]


Epoch 56 Mean Reward: 132.0399391479492


100%|██████████| 2000/2000 [02:01<00:00, 16.46it/s]


Epoch 57 Mean Reward: 128.7772922973633


100%|██████████| 2000/2000 [02:05<00:00, 15.99it/s]


Epoch 58 Mean Reward: 129.52112742614747


100%|██████████| 2000/2000 [02:07<00:00, 15.72it/s]


Epoch 59 Mean Reward: 131.88244493103028


100%|██████████| 2000/2000 [02:08<00:00, 15.61it/s]


Epoch 60 Mean Reward: 132.43470239257812
Epoch 60 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 60 test:
Test Episode 1 Reward: 294.17149353027344
Test Episode 2 Reward: 294.17149353027344
Test Episode 3 Reward: 294.17149353027344
Test Episode 4 Reward: 304.38446044921875
Test Episode 5 Reward: 606.2024841308594
Test Episode 6 Reward: 324.56675720214844
Test Episode 7 Reward: 294.17149353027344
Test Episode 8 Reward: 294.17149353027344
Test Episode 9 Reward: 434.30613708496094
Test Episode 10 Reward: 288.7121276855469
Average Test Reward: 342.90294342


100%|██████████| 2000/2000 [02:09<00:00, 15.45it/s]


Epoch 61 Mean Reward: 131.18982219696045


100%|██████████| 2000/2000 [02:00<00:00, 16.62it/s]


Epoch 62 Mean Reward: 134.66027096557616


100%|██████████| 2000/2000 [02:11<00:00, 15.16it/s]


Epoch 63 Mean Reward: 130.91221844482422


100%|██████████| 2000/2000 [01:49<00:00, 18.25it/s]


Epoch 64 Mean Reward: 132.44051485443114


100%|██████████| 2000/2000 [01:55<00:00, 17.28it/s]


Epoch 65 Mean Reward: 130.95274501800537


100%|██████████| 2000/2000 [01:56<00:00, 17.22it/s]


Epoch 66 Mean Reward: 134.3291145477295


100%|██████████| 2000/2000 [02:03<00:00, 16.14it/s]


Epoch 67 Mean Reward: 129.66474165344238


100%|██████████| 2000/2000 [02:04<00:00, 16.12it/s]


Epoch 68 Mean Reward: 131.18157988739014


100%|██████████| 2000/2000 [02:02<00:00, 16.34it/s]


Epoch 69 Mean Reward: 129.67326216888426


100%|██████████| 2000/2000 [02:07<00:00, 15.66it/s]


Epoch 70 Mean Reward: 132.21515841674804
Epoch 70 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 70 test:
Test Episode 1 Reward: 294.95274353027344
Test Episode 2 Reward: 294.95274353027344
Test Episode 3 Reward: 350.7017364501953
Test Episode 4 Reward: 294.95274353027344
Test Episode 5 Reward: 532.6937561035156
Test Episode 6 Reward: 259.28871154785156
Test Episode 7 Reward: 302.6962585449219
Test Episode 8 Reward: 294.95274353027344
Test Episode 9 Reward: 294.95274353027344
Test Episode 10 Reward: 294.95274353027344
Average Test Reward: 321.509692383


100%|██████████| 2000/2000 [02:14<00:00, 14.89it/s]


Epoch 71 Mean Reward: 131.79251457977296


100%|██████████| 2000/2000 [02:02<00:00, 16.27it/s]


Epoch 72 Mean Reward: 132.56423719787597


100%|██████████| 2000/2000 [02:02<00:00, 16.26it/s]


Epoch 73 Mean Reward: 131.0107639694214


100%|██████████| 2000/2000 [01:56<00:00, 17.23it/s]


Epoch 74 Mean Reward: 131.27923159790038


100%|██████████| 2000/2000 [01:55<00:00, 17.30it/s]


Epoch 75 Mean Reward: 130.89069940948485


100%|██████████| 2000/2000 [01:53<00:00, 17.68it/s]


Epoch 76 Mean Reward: 132.24361709594726


100%|██████████| 2000/2000 [01:56<00:00, 17.22it/s]


Epoch 77 Mean Reward: 130.49219216156007


100%|██████████| 2000/2000 [02:02<00:00, 16.35it/s]


Epoch 78 Mean Reward: 132.8803335647583


100%|██████████| 2000/2000 [01:56<00:00, 17.21it/s]


Epoch 79 Mean Reward: 129.73296813201904


100%|██████████| 2000/2000 [01:52<00:00, 17.71it/s]


Epoch 80 Mean Reward: 131.53002435302736
Epoch 80 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 80 test:
Test Episode 1 Reward: 549.262939453125
Test Episode 2 Reward: 558.4200592041016
Test Episode 3 Reward: 294.17149353027344
Test Episode 4 Reward: 590.065673828125
Test Episode 5 Reward: 294.17149353027344
Test Episode 6 Reward: 294.17149353027344
Test Episode 7 Reward: 516.5216979980469
Test Episode 8 Reward: 564.1181640625
Test Episode 9 Reward: 276.39678955078125
Test Episode 10 Reward: 294.17149353027344
Average Test Reward: 423.147129822


100%|██████████| 2000/2000 [01:54<00:00, 17.48it/s]


Epoch 81 Mean Reward: 134.09131903076172


100%|██████████| 2000/2000 [01:57<00:00, 17.06it/s]


Epoch 82 Mean Reward: 130.82856620788573


100%|██████████| 2000/2000 [01:57<00:00, 16.97it/s]


Epoch 83 Mean Reward: 130.84726680755614


100%|██████████| 2000/2000 [01:56<00:00, 17.10it/s]


Epoch 84 Mean Reward: 133.2044536743164


100%|██████████| 2000/2000 [02:02<00:00, 16.39it/s]


Epoch 85 Mean Reward: 133.67643098449707


100%|██████████| 2000/2000 [02:02<00:00, 16.30it/s]


Epoch 86 Mean Reward: 133.62575225067138


100%|██████████| 2000/2000 [01:59<00:00, 16.70it/s]


Epoch 87 Mean Reward: 131.28410820770264


100%|██████████| 2000/2000 [01:54<00:00, 17.52it/s]


Epoch 88 Mean Reward: 131.0620828781128


100%|██████████| 2000/2000 [01:51<00:00, 17.97it/s]


Epoch 89 Mean Reward: 128.81283348083497


100%|██████████| 2000/2000 [01:55<00:00, 17.31it/s]


Epoch 90 Mean Reward: 132.33297324371338
Epoch 90 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 90 test:
Test Episode 1 Reward: 629.3338775634766
Test Episode 2 Reward: 280.9598846435547
Test Episode 3 Reward: 666.7679595947266
Test Episode 4 Reward: 280.9598846435547
Test Episode 5 Reward: 280.9598846435547
Test Episode 6 Reward: 252.07742309570312
Test Episode 7 Reward: 280.9598846435547
Test Episode 8 Reward: 352.12400817871094
Test Episode 9 Reward: 567.7730865478516
Test Episode 10 Reward: 280.9598846435547
Average Test Reward: 387.28757782


100%|██████████| 2000/2000 [01:57<00:00, 16.97it/s]


Epoch 91 Mean Reward: 130.40738422393798


100%|██████████| 2000/2000 [01:53<00:00, 17.64it/s]


Epoch 92 Mean Reward: 131.55112338256836


100%|██████████| 2000/2000 [02:02<00:00, 16.39it/s]


Epoch 93 Mean Reward: 132.38552894592286


100%|██████████| 2000/2000 [01:53<00:00, 17.63it/s]


Epoch 94 Mean Reward: 129.75303490448


100%|██████████| 2000/2000 [01:51<00:00, 17.87it/s]


Epoch 95 Mean Reward: 130.95549930572508


100%|██████████| 2000/2000 [01:52<00:00, 17.75it/s]


Epoch 96 Mean Reward: 129.7568998184204


100%|██████████| 2000/2000 [01:55<00:00, 17.38it/s]


Epoch 97 Mean Reward: 130.2829044113159


100%|██████████| 2000/2000 [01:52<00:00, 17.72it/s]


Epoch 98 Mean Reward: 129.53502997589112


100%|██████████| 2000/2000 [01:51<00:00, 17.92it/s]


Epoch 99 Mean Reward: 130.39276188659667


100%|██████████| 2000/2000 [01:52<00:00, 17.74it/s]


Epoch 100 Mean Reward: 131.86783885192872
Epoch 100 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 100 test:
Test Episode 1 Reward: 290.9730682373047
Test Episode 2 Reward: 596.4975280761719
Test Episode 3 Reward: 558.5509796142578
Test Episode 4 Reward: 292.5507507324219
Test Episode 5 Reward: 290.9730682373047
Test Episode 6 Reward: 290.9730682373047
Test Episode 7 Reward: 290.9730682373047
Test Episode 8 Reward: 335.8080291748047
Test Episode 9 Reward: 290.9730682373047
Test Episode 10 Reward: 290.9730682373047
Average Test Reward: 352.924569702


100%|██████████| 2000/2000 [02:07<00:00, 15.72it/s]


Epoch 101 Mean Reward: 129.76447325897217


100%|██████████| 2000/2000 [01:58<00:00, 16.82it/s]


Epoch 102 Mean Reward: 131.50191776275634


100%|██████████| 2000/2000 [02:03<00:00, 16.13it/s]


Epoch 103 Mean Reward: 131.8704903869629


100%|██████████| 2000/2000 [01:56<00:00, 17.15it/s]


Epoch 104 Mean Reward: 131.382634765625


100%|██████████| 2000/2000 [01:57<00:00, 16.95it/s]


Epoch 105 Mean Reward: 129.74796076202392


100%|██████████| 2000/2000 [02:02<00:00, 16.33it/s]


Epoch 106 Mean Reward: 132.1029656829834


100%|██████████| 2000/2000 [01:59<00:00, 16.70it/s]


Epoch 107 Mean Reward: 131.24478758239746


100%|██████████| 2000/2000 [02:06<00:00, 15.80it/s]


Epoch 108 Mean Reward: 131.25033266448975


100%|██████████| 2000/2000 [02:05<00:00, 15.99it/s]


Epoch 109 Mean Reward: 130.95396229553222


100%|██████████| 2000/2000 [02:04<00:00, 16.05it/s]


Epoch 110 Mean Reward: 131.47763483428955
Epoch 110 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 110 test:
Test Episode 1 Reward: 295.3915100097656
Test Episode 2 Reward: 448.4543151855469
Test Episode 3 Reward: 294.95274353027344
Test Episode 4 Reward: 294.95274353027344
Test Episode 5 Reward: 294.95274353027344
Test Episode 6 Reward: 456.5648498535156
Test Episode 7 Reward: 268.1620788574219
Test Episode 8 Reward: 265.5399932861328
Test Episode 9 Reward: 172.29501342773438
Test Episode 10 Reward: 294.95274353027344
Average Test Reward: 308.621873474


100%|██████████| 2000/2000 [02:01<00:00, 16.41it/s]


Epoch 111 Mean Reward: 131.42180362701416


100%|██████████| 2000/2000 [02:02<00:00, 16.39it/s]


Epoch 112 Mean Reward: 130.17544211578368


100%|██████████| 2000/2000 [02:06<00:00, 15.75it/s]


Epoch 113 Mean Reward: 130.18141051483155


100%|██████████| 2000/2000 [02:19<00:00, 14.29it/s]


Epoch 114 Mean Reward: 132.9155608215332


100%|██████████| 2000/2000 [02:19<00:00, 14.32it/s]


Epoch 115 Mean Reward: 132.8792040939331


100%|██████████| 2000/2000 [02:16<00:00, 14.70it/s]


Epoch 116 Mean Reward: 132.3436014099121


100%|██████████| 2000/2000 [02:15<00:00, 14.80it/s]


Epoch 117 Mean Reward: 132.76997064208985


100%|██████████| 2000/2000 [02:13<00:00, 15.00it/s]


Epoch 118 Mean Reward: 130.23555868530272


100%|██████████| 2000/2000 [02:06<00:00, 15.77it/s]


Epoch 119 Mean Reward: 131.67790588378907


100%|██████████| 2000/2000 [02:09<00:00, 15.48it/s]


Epoch 120 Mean Reward: 132.27045896911622
Epoch 120 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 120 test:
Test Episode 1 Reward: 252.64703369140625
Test Episode 2 Reward: 311.12889099121094
Test Episode 3 Reward: 421.2587585449219
Test Episode 4 Reward: 282.21958923339844
Test Episode 5 Reward: 282.21958923339844
Test Episode 6 Reward: 282.21958923339844
Test Episode 7 Reward: 483.1804504394531
Test Episode 8 Reward: 282.21958923339844
Test Episode 9 Reward: 282.21958923339844
Test Episode 10 Reward: 251.06436157226562
Average Test Reward: 313.037744141


100%|██████████| 2000/2000 [02:02<00:00, 16.30it/s]


Epoch 121 Mean Reward: 162.40010106658934


100%|██████████| 2000/2000 [02:12<00:00, 15.09it/s]


Epoch 122 Mean Reward: 161.94734897613526


100%|██████████| 2000/2000 [02:05<00:00, 15.99it/s]


Epoch 123 Mean Reward: 162.2810246810913


100%|██████████| 2000/2000 [02:06<00:00, 15.86it/s]


Epoch 124 Mean Reward: 159.59966011810303


100%|██████████| 2000/2000 [02:09<00:00, 15.43it/s]


Epoch 125 Mean Reward: 161.2336898651123


100%|██████████| 2000/2000 [02:18<00:00, 14.43it/s]


Epoch 126 Mean Reward: 161.0473669128418


100%|██████████| 2000/2000 [02:24<00:00, 13.88it/s]


Epoch 127 Mean Reward: 165.69314173126222


100%|██████████| 2000/2000 [02:24<00:00, 13.88it/s]


Epoch 128 Mean Reward: 162.70233042907714


100%|██████████| 2000/2000 [02:25<00:00, 13.75it/s]


Epoch 129 Mean Reward: 164.98834187316893


100%|██████████| 2000/2000 [02:25<00:00, 13.72it/s]


Epoch 130 Mean Reward: 163.53504652404786
Epoch 130 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 130 test:
Test Episode 1 Reward: 417.2556457519531
Test Episode 2 Reward: 236.56298828125
Test Episode 3 Reward: 319.2701721191406
Test Episode 4 Reward: 274.3919982910156
Test Episode 5 Reward: 449.92181396484375
Test Episode 6 Reward: 274.3919982910156
Test Episode 7 Reward: 449.92181396484375
Test Episode 8 Reward: 274.3919982910156
Test Episode 9 Reward: 449.92181396484375
Test Episode 10 Reward: 261.8384094238281
Average Test Reward: 340.786865234


100%|██████████| 2000/2000 [02:30<00:00, 13.29it/s]


Epoch 131 Mean Reward: 167.82408461761474


100%|██████████| 2000/2000 [02:36<00:00, 12.78it/s]


Epoch 132 Mean Reward: 169.8171347885132


100%|██████████| 2000/2000 [02:32<00:00, 13.13it/s]


Epoch 133 Mean Reward: 168.50940073394776


100%|██████████| 2000/2000 [02:31<00:00, 13.19it/s]


Epoch 134 Mean Reward: 168.31395846557618


100%|██████████| 2000/2000 [02:31<00:00, 13.21it/s]


Epoch 135 Mean Reward: 169.71679455566405


100%|██████████| 2000/2000 [02:33<00:00, 13.05it/s]


Epoch 136 Mean Reward: 169.3445317840576


100%|██████████| 2000/2000 [02:33<00:00, 13.03it/s]


Epoch 137 Mean Reward: 168.3155149230957


100%|██████████| 2000/2000 [02:38<00:00, 12.63it/s]


Epoch 138 Mean Reward: 170.9488907394409


100%|██████████| 2000/2000 [02:42<00:00, 12.34it/s]


Epoch 139 Mean Reward: 173.7492399597168


100%|██████████| 2000/2000 [02:36<00:00, 12.80it/s]


Epoch 140 Mean Reward: 171.87410739135743
Epoch 140 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 140 test:
Test Episode 1 Reward: 271.9046936035156
Test Episode 2 Reward: 271.9046936035156
Test Episode 3 Reward: 317.6734619140625
Test Episode 4 Reward: 279.1693115234375
Test Episode 5 Reward: 303.5264129638672
Test Episode 6 Reward: 251.2866973876953
Test Episode 7 Reward: 271.9046936035156
Test Episode 8 Reward: 231.20059204101562
Test Episode 9 Reward: 271.9046936035156
Test Episode 10 Reward: 448.56910705566406
Average Test Reward: 291.90443573


100%|██████████| 2000/2000 [02:34<00:00, 12.97it/s]


Epoch 141 Mean Reward: 171.2896877746582


100%|██████████| 2000/2000 [02:35<00:00, 12.86it/s]


Epoch 142 Mean Reward: 174.1893318862915


100%|██████████| 2000/2000 [02:37<00:00, 12.69it/s]


Epoch 143 Mean Reward: 176.60724507904052


100%|██████████| 2000/2000 [02:43<00:00, 12.23it/s]


Epoch 144 Mean Reward: 177.23552213287354


100%|██████████| 2000/2000 [02:31<00:00, 13.16it/s]


Epoch 145 Mean Reward: 178.29634242248534


100%|██████████| 2000/2000 [02:30<00:00, 13.25it/s]


Epoch 146 Mean Reward: 176.23557376861572


100%|██████████| 2000/2000 [02:32<00:00, 13.12it/s]


Epoch 147 Mean Reward: 183.5037951889038


100%|██████████| 2000/2000 [02:29<00:00, 13.40it/s]


Epoch 148 Mean Reward: 184.19388452148436


100%|██████████| 2000/2000 [02:28<00:00, 13.45it/s]


Epoch 149 Mean Reward: 185.91744248962402


100%|██████████| 2000/2000 [02:26<00:00, 13.69it/s]


Epoch 150 Mean Reward: 179.4279231338501
Epoch 150 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 150 test:
Test Episode 1 Reward: 450.6252136230469
Test Episode 2 Reward: 319.2608642578125
Test Episode 3 Reward: 224.41148376464844
Test Episode 4 Reward: 450.6252136230469
Test Episode 5 Reward: 395.8513488769531
Test Episode 6 Reward: 258.0380554199219
Test Episode 7 Reward: 226.81756591796875
Test Episode 8 Reward: 562.4291534423828
Test Episode 9 Reward: 542.4486389160156
Test Episode 10 Reward: 438.45338439941406
Average Test Reward: 386.896092224


100%|██████████| 2000/2000 [02:40<00:00, 12.46it/s]


Epoch 151 Mean Reward: 178.370710357666


100%|██████████| 2000/2000 [02:29<00:00, 13.35it/s]


Epoch 152 Mean Reward: 180.24370245361328


100%|██████████| 2000/2000 [02:33<00:00, 13.06it/s]


Epoch 153 Mean Reward: 180.9450472946167


100%|██████████| 2000/2000 [02:33<00:00, 13.04it/s]


Epoch 154 Mean Reward: 178.48510147857667


100%|██████████| 2000/2000 [02:32<00:00, 13.09it/s]


Epoch 155 Mean Reward: 180.6392017059326


100%|██████████| 2000/2000 [02:27<00:00, 13.57it/s]


Epoch 156 Mean Reward: 183.05324063873292


100%|██████████| 2000/2000 [02:33<00:00, 13.06it/s]


Epoch 157 Mean Reward: 180.50339821624755


100%|██████████| 2000/2000 [02:24<00:00, 13.86it/s]


Epoch 158 Mean Reward: 181.25199557495117


100%|██████████| 2000/2000 [02:33<00:00, 13.06it/s]


Epoch 159 Mean Reward: 185.5660417098999


100%|██████████| 2000/2000 [02:26<00:00, 13.64it/s]


Epoch 160 Mean Reward: 182.96463228607178
Epoch 160 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 160 test:
Test Episode 1 Reward: 267.6560821533203
Test Episode 2 Reward: 267.6560821533203
Test Episode 3 Reward: 173.51446533203125
Test Episode 4 Reward: 267.6560821533203
Test Episode 5 Reward: 235.2886505126953
Test Episode 6 Reward: 531.3635559082031
Test Episode 7 Reward: 267.6560821533203
Test Episode 8 Reward: 280.7586212158203
Test Episode 9 Reward: 367.7068176269531
Test Episode 10 Reward: 565.0999755859375
Average Test Reward: 322.435641479


100%|██████████| 2000/2000 [02:29<00:00, 13.33it/s]


Epoch 161 Mean Reward: 189.77964054870606


100%|██████████| 2000/2000 [02:32<00:00, 13.12it/s]


Epoch 162 Mean Reward: 193.97101763153077


100%|██████████| 2000/2000 [02:38<00:00, 12.62it/s]


Epoch 163 Mean Reward: 191.09699501800537


100%|██████████| 2000/2000 [02:32<00:00, 13.08it/s]


Epoch 164 Mean Reward: 194.1622812805176


100%|██████████| 2000/2000 [02:33<00:00, 13.02it/s]


Epoch 165 Mean Reward: 193.03774383544922


100%|██████████| 2000/2000 [02:34<00:00, 12.94it/s]


Epoch 166 Mean Reward: 198.43969659423828


100%|██████████| 2000/2000 [02:37<00:00, 12.66it/s]


Epoch 167 Mean Reward: 197.04480265045166


100%|██████████| 2000/2000 [02:31<00:00, 13.19it/s]


Epoch 168 Mean Reward: 195.5994295578003


100%|██████████| 2000/2000 [02:32<00:00, 13.11it/s]


Epoch 169 Mean Reward: 199.63691944885255


100%|██████████| 2000/2000 [02:26<00:00, 13.65it/s]


Epoch 170 Mean Reward: 202.21635007476806
Epoch 170 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 170 test:
Test Episode 1 Reward: 266.52447509765625
Test Episode 2 Reward: 284.39183044433594
Test Episode 3 Reward: 319.6436004638672
Test Episode 4 Reward: 284.39183044433594
Test Episode 5 Reward: 560.9954071044922
Test Episode 6 Reward: 630.369384765625
Test Episode 7 Reward: 284.39183044433594
Test Episode 8 Reward: 509.2178649902344
Test Episode 9 Reward: 569.4853820800781
Test Episode 10 Reward: 284.39183044433594
Average Test Reward: 399.380343628


100%|██████████| 2000/2000 [02:28<00:00, 13.48it/s]


Epoch 171 Mean Reward: 194.8099264678955


100%|██████████| 2000/2000 [02:31<00:00, 13.16it/s]


Epoch 172 Mean Reward: 196.65312705993654


100%|██████████| 2000/2000 [02:35<00:00, 12.86it/s]


Epoch 173 Mean Reward: 192.69376835632323


100%|██████████| 2000/2000 [02:32<00:00, 13.13it/s]


Epoch 174 Mean Reward: 197.6567331161499


100%|██████████| 2000/2000 [02:38<00:00, 12.64it/s]


Epoch 175 Mean Reward: 196.01966999816895


100%|██████████| 2000/2000 [02:31<00:00, 13.18it/s]


Epoch 176 Mean Reward: 197.1432302017212


100%|██████████| 2000/2000 [02:29<00:00, 13.35it/s]


Epoch 177 Mean Reward: 200.43974779510498


100%|██████████| 2000/2000 [02:31<00:00, 13.21it/s]


Epoch 178 Mean Reward: 199.26730879974366


100%|██████████| 2000/2000 [02:26<00:00, 13.66it/s]


Epoch 179 Mean Reward: 197.2812927246094


100%|██████████| 2000/2000 [02:27<00:00, 13.56it/s]


Epoch 180 Mean Reward: 200.39874492645265
Epoch 180 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 180 test:
Test Episode 1 Reward: 266.6203155517578
Test Episode 2 Reward: 266.6203155517578
Test Episode 3 Reward: 503.7605438232422
Test Episode 4 Reward: 266.6203155517578
Test Episode 5 Reward: 266.6203155517578
Test Episode 6 Reward: 246.1166534423828
Test Episode 7 Reward: 537.1387481689453
Test Episode 8 Reward: 276.6288604736328
Test Episode 9 Reward: 266.6203155517578
Test Episode 10 Reward: 652.0753631591797
Average Test Reward: 354.882174683


100%|██████████| 2000/2000 [02:29<00:00, 13.34it/s]


Epoch 181 Mean Reward: 195.09358588409424


100%|██████████| 2000/2000 [02:30<00:00, 13.29it/s]


Epoch 182 Mean Reward: 198.71092143249513


100%|██████████| 2000/2000 [02:38<00:00, 12.61it/s]


Epoch 183 Mean Reward: 196.16852840423584


100%|██████████| 2000/2000 [02:33<00:00, 13.02it/s]


Epoch 184 Mean Reward: 200.47471948242188


100%|██████████| 2000/2000 [02:27<00:00, 13.60it/s]


Epoch 185 Mean Reward: 195.15769480895997


100%|██████████| 2000/2000 [02:32<00:00, 13.08it/s]


Epoch 186 Mean Reward: 198.2842169036865


100%|██████████| 2000/2000 [02:30<00:00, 13.28it/s]


Epoch 187 Mean Reward: 201.73552376556395


100%|██████████| 2000/2000 [02:29<00:00, 13.41it/s]


Epoch 188 Mean Reward: 200.92620138549805


100%|██████████| 2000/2000 [02:29<00:00, 13.42it/s]


Epoch 189 Mean Reward: 200.70356437683105


100%|██████████| 2000/2000 [02:31<00:00, 13.19it/s]


Epoch 190 Mean Reward: 199.9667286758423
Epoch 190 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 190 test:
Test Episode 1 Reward: 332.4478302001953
Test Episode 2 Reward: 171.1023406982422
Test Episode 3 Reward: 332.4478302001953
Test Episode 4 Reward: 332.4478302001953
Test Episode 5 Reward: 196.92572021484375
Test Episode 6 Reward: 197.82107543945312
Test Episode 7 Reward: 332.4478302001953
Test Episode 8 Reward: 332.4478302001953
Test Episode 9 Reward: 277.78565979003906
Test Episode 10 Reward: 263.3539581298828
Average Test Reward: 276.922790527


100%|██████████| 2000/2000 [02:31<00:00, 13.20it/s]


Epoch 191 Mean Reward: 214.62941815948486


100%|██████████| 2000/2000 [02:31<00:00, 13.23it/s]


Epoch 192 Mean Reward: 211.62313498687743


100%|██████████| 2000/2000 [02:34<00:00, 12.97it/s]


Epoch 193 Mean Reward: 219.78686965179443


100%|██████████| 2000/2000 [02:38<00:00, 12.66it/s]


Epoch 194 Mean Reward: 217.41974646759033


100%|██████████| 2000/2000 [02:24<00:00, 13.85it/s]


Epoch 195 Mean Reward: 210.70055032348634


100%|██████████| 2000/2000 [02:23<00:00, 13.94it/s]


Epoch 196 Mean Reward: 215.1014181137085


100%|██████████| 2000/2000 [02:24<00:00, 13.88it/s]


Epoch 197 Mean Reward: 215.83342404937744


100%|██████████| 2000/2000 [02:34<00:00, 12.91it/s]


Epoch 198 Mean Reward: 214.6204877319336


100%|██████████| 2000/2000 [02:23<00:00, 13.95it/s]


Epoch 199 Mean Reward: 223.35595570373536


100%|██████████| 2000/2000 [02:21<00:00, 14.14it/s]


Epoch 200 Mean Reward: 222.02500271606445
Epoch 200 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 200 test:
Test Episode 1 Reward: 269.8148193359375
Test Episode 2 Reward: 640.9584350585938
Test Episode 3 Reward: 269.8148193359375
Test Episode 4 Reward: 269.8148193359375
Test Episode 5 Reward: 269.8148193359375
Test Episode 6 Reward: 264.75335693359375
Test Episode 7 Reward: 243.0145721435547
Test Episode 8 Reward: 218.30191040039062
Test Episode 9 Reward: 552.2734375
Test Episode 10 Reward: 269.8148193359375
Average Test Reward: 326.837580872


100%|██████████| 2000/2000 [02:26<00:00, 13.64it/s]


Epoch 201 Mean Reward: 227.79771073913574


100%|██████████| 2000/2000 [02:26<00:00, 13.63it/s]


Epoch 202 Mean Reward: 227.75534088897706


100%|██████████| 2000/2000 [02:29<00:00, 13.38it/s]


Epoch 203 Mean Reward: 230.68150428009034


100%|██████████| 2000/2000 [02:39<00:00, 12.57it/s]


Epoch 204 Mean Reward: 229.1433711242676


100%|██████████| 2000/2000 [03:00<00:00, 11.08it/s]


Epoch 205 Mean Reward: 226.01440603637695


100%|██████████| 2000/2000 [03:04<00:00, 10.81it/s]


Epoch 206 Mean Reward: 229.04340671539308


100%|██████████| 2000/2000 [03:15<00:00, 10.25it/s]


Epoch 207 Mean Reward: 226.47810746002196


100%|██████████| 2000/2000 [03:10<00:00, 10.49it/s]


Epoch 208 Mean Reward: 220.7642706756592


100%|██████████| 2000/2000 [03:06<00:00, 10.71it/s]


Epoch 209 Mean Reward: 230.72091079711913


100%|██████████| 2000/2000 [03:11<00:00, 10.44it/s]


Epoch 210 Mean Reward: 228.11326020812987
Epoch 210 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 210 test:
Test Episode 1 Reward: 279.0960693359375
Test Episode 2 Reward: 448.8590545654297
Test Episode 3 Reward: 305.1009979248047
Test Episode 4 Reward: 279.0960693359375
Test Episode 5 Reward: 279.0960693359375
Test Episode 6 Reward: 279.0960693359375
Test Episode 7 Reward: 426.9442443847656
Test Episode 8 Reward: 279.0960693359375
Test Episode 9 Reward: 267.5223693847656
Test Episode 10 Reward: 279.0960693359375
Average Test Reward: 312.300308228


100%|██████████| 2000/2000 [03:20<00:00,  9.97it/s]


Epoch 211 Mean Reward: 231.86454457855226


100%|██████████| 2000/2000 [03:19<00:00, 10.03it/s]


Epoch 212 Mean Reward: 231.3814091873169


100%|██████████| 2000/2000 [03:14<00:00, 10.28it/s]


Epoch 213 Mean Reward: 238.72246062469483


100%|██████████| 2000/2000 [03:21<00:00,  9.94it/s]


Epoch 214 Mean Reward: 233.14604692077637


100%|██████████| 2000/2000 [03:08<00:00, 10.58it/s]


Epoch 215 Mean Reward: 236.89901845550537


100%|██████████| 2000/2000 [03:27<00:00,  9.66it/s]


Epoch 216 Mean Reward: 241.2445146331787


100%|██████████| 2000/2000 [03:20<00:00,  9.97it/s]


Epoch 217 Mean Reward: 240.34644022369386


100%|██████████| 2000/2000 [03:02<00:00, 10.94it/s]


Epoch 218 Mean Reward: 241.78044709014893


100%|██████████| 2000/2000 [03:23<00:00,  9.82it/s]


Epoch 219 Mean Reward: 243.19210262298583


100%|██████████| 2000/2000 [03:23<00:00,  9.81it/s]


Epoch 220 Mean Reward: 246.31503253173827
Epoch 220 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 220 test:
Test Episode 1 Reward: 454.28428649902344
Test Episode 2 Reward: 216.39602661132812
Test Episode 3 Reward: 267.684326171875
Test Episode 4 Reward: 276.8591003417969
Test Episode 5 Reward: 276.8591003417969
Test Episode 6 Reward: 276.8591003417969
Test Episode 7 Reward: 276.8591003417969
Test Episode 8 Reward: 276.8591003417969
Test Episode 9 Reward: 249.96182250976562
Test Episode 10 Reward: 277.0830383300781
Average Test Reward: 284.970500183


100%|██████████| 2000/2000 [04:07<00:00,  8.08it/s]


Epoch 221 Mean Reward: 248.9952032546997


100%|██████████| 2000/2000 [03:53<00:00,  8.56it/s]


Epoch 222 Mean Reward: 239.77019317626954


100%|██████████| 2000/2000 [03:54<00:00,  8.53it/s]


Epoch 223 Mean Reward: 242.55553414154053


100%|██████████| 2000/2000 [04:00<00:00,  8.33it/s]


Epoch 224 Mean Reward: 248.47738477325439


100%|██████████| 2000/2000 [03:53<00:00,  8.55it/s]


Epoch 225 Mean Reward: 250.1922244949341


100%|██████████| 2000/2000 [03:19<00:00, 10.05it/s]


Epoch 226 Mean Reward: 256.3982054672241


100%|██████████| 2000/2000 [03:14<00:00, 10.28it/s]


Epoch 227 Mean Reward: 252.18438117218017


100%|██████████| 2000/2000 [03:12<00:00, 10.37it/s]


Epoch 228 Mean Reward: 256.89030351257327


100%|██████████| 2000/2000 [03:18<00:00, 10.08it/s]


Epoch 229 Mean Reward: 259.4459737548828


100%|██████████| 2000/2000 [03:16<00:00, 10.18it/s]


Epoch 230 Mean Reward: 251.52684924316407
Epoch 230 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 230 test:
Test Episode 1 Reward: 521.4165344238281
Test Episode 2 Reward: 410.74993896484375
Test Episode 3 Reward: 234.6516571044922
Test Episode 4 Reward: 521.4165344238281
Test Episode 5 Reward: 521.4165344238281
Test Episode 6 Reward: 521.4165344238281
Test Episode 7 Reward: 521.4165344238281
Test Episode 8 Reward: 521.4165344238281
Test Episode 9 Reward: 521.4165344238281
Test Episode 10 Reward: 256.5417022705078
Average Test Reward: 455.185903931


100%|██████████| 2000/2000 [03:31<00:00,  9.47it/s]


Epoch 231 Mean Reward: 274.30054119873046


100%|██████████| 2000/2000 [03:12<00:00, 10.40it/s]


Epoch 232 Mean Reward: 278.76415469360353


100%|██████████| 2000/2000 [03:19<00:00, 10.04it/s]


Epoch 233 Mean Reward: 285.84143939971926


100%|██████████| 2000/2000 [02:56<00:00, 11.32it/s]


Epoch 234 Mean Reward: 281.30158618927004


100%|██████████| 2000/2000 [02:50<00:00, 11.70it/s]


Epoch 235 Mean Reward: 267.56237419128416


100%|██████████| 2000/2000 [03:04<00:00, 10.83it/s]


Epoch 236 Mean Reward: 274.1611530227661


100%|██████████| 2000/2000 [02:45<00:00, 12.12it/s]


Epoch 237 Mean Reward: 272.01087646484376


100%|██████████| 2000/2000 [02:44<00:00, 12.16it/s]


Epoch 238 Mean Reward: 270.8272590637207


100%|██████████| 2000/2000 [02:40<00:00, 12.50it/s]


Epoch 239 Mean Reward: 273.6701259841919


100%|██████████| 2000/2000 [02:37<00:00, 12.66it/s]


Epoch 240 Mean Reward: 278.4096883544922
Epoch 240 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 240 test:
Test Episode 1 Reward: 389.49436950683594
Test Episode 2 Reward: 262.24639892578125
Test Episode 3 Reward: 262.24639892578125
Test Episode 4 Reward: 294.0542297363281
Test Episode 5 Reward: 262.24639892578125
Test Episode 6 Reward: 262.24639892578125
Test Episode 7 Reward: 286.6294708251953
Test Episode 8 Reward: 535.7721405029297
Test Episode 9 Reward: 262.24639892578125
Test Episode 10 Reward: 262.24639892578125
Average Test Reward: 307.942860413


100%|██████████| 2000/2000 [02:24<00:00, 13.83it/s]


Epoch 241 Mean Reward: 268.4767507095337


100%|██████████| 2000/2000 [02:36<00:00, 12.82it/s]


Epoch 242 Mean Reward: 265.65922663116453


100%|██████████| 2000/2000 [02:25<00:00, 13.74it/s]


Epoch 243 Mean Reward: 269.9940614089966


100%|██████████| 2000/2000 [02:26<00:00, 13.65it/s]


Epoch 244 Mean Reward: 265.4862030792236


100%|██████████| 2000/2000 [02:21<00:00, 14.14it/s]


Epoch 245 Mean Reward: 267.43635009002685


100%|██████████| 2000/2000 [02:22<00:00, 14.02it/s]


Epoch 246 Mean Reward: 266.9960319519043


100%|██████████| 2000/2000 [02:24<00:00, 13.88it/s]


Epoch 247 Mean Reward: 272.09288021850585


100%|██████████| 2000/2000 [02:21<00:00, 14.11it/s]


Epoch 248 Mean Reward: 268.12383200836183


100%|██████████| 2000/2000 [02:26<00:00, 13.69it/s]


Epoch 249 Mean Reward: 264.9540575714111


100%|██████████| 2000/2000 [02:22<00:00, 14.06it/s]


Epoch 250 Mean Reward: 272.96288808441165
Epoch 250 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 250 test:
Test Episode 1 Reward: 450.4099578857422
Test Episode 2 Reward: 268.9580535888672
Test Episode 3 Reward: 455.39947509765625
Test Episode 4 Reward: 540.0018005371094
Test Episode 5 Reward: 217.0631561279297
Test Episode 6 Reward: 239.59201049804688
Test Episode 7 Reward: 570.2007598876953
Test Episode 8 Reward: 455.39947509765625
Test Episode 9 Reward: 243.95277404785156
Test Episode 10 Reward: 455.39947509765625
Average Test Reward: 389.637693787


100%|██████████| 2000/2000 [02:28<00:00, 13.47it/s]


Epoch 251 Mean Reward: 274.30524211120604


100%|██████████| 2000/2000 [02:36<00:00, 12.79it/s]


Epoch 252 Mean Reward: 278.77591482543943


100%|██████████| 2000/2000 [02:24<00:00, 13.89it/s]


Epoch 253 Mean Reward: 277.14466133880615


100%|██████████| 2000/2000 [02:22<00:00, 14.03it/s]


Epoch 254 Mean Reward: 274.4538245315552


100%|██████████| 2000/2000 [02:21<00:00, 14.11it/s]


Epoch 255 Mean Reward: 280.3421340866089


100%|██████████| 2000/2000 [02:22<00:00, 14.02it/s]


Epoch 256 Mean Reward: 281.10317990875245


100%|██████████| 2000/2000 [02:28<00:00, 13.47it/s]


Epoch 257 Mean Reward: 278.0898499221802


100%|██████████| 2000/2000 [02:37<00:00, 12.66it/s]


Epoch 258 Mean Reward: 289.4666047897339


100%|██████████| 2000/2000 [02:27<00:00, 13.54it/s]


Epoch 259 Mean Reward: 278.0668166275024


100%|██████████| 2000/2000 [02:24<00:00, 13.83it/s]


Epoch 260 Mean Reward: 281.4605467224121
Epoch 260 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 260 test:
Test Episode 1 Reward: 238.94927978515625
Test Episode 2 Reward: 218.98370361328125
Test Episode 3 Reward: 213.90342712402344
Test Episode 4 Reward: 247.54525756835938
Test Episode 5 Reward: 214.03472900390625
Test Episode 6 Reward: 119.32264709472656
Test Episode 7 Reward: 200.48741149902344
Test Episode 8 Reward: 234.26870727539062
Test Episode 9 Reward: 144.26406860351562
Test Episode 10 Reward: 200.48741149902344
Average Test Reward: 203.224664307


100%|██████████| 2000/2000 [02:27<00:00, 13.58it/s]


Epoch 261 Mean Reward: 268.371397644043


100%|██████████| 2000/2000 [02:31<00:00, 13.17it/s]


Epoch 262 Mean Reward: 276.9663824005127


100%|██████████| 2000/2000 [02:35<00:00, 12.89it/s]


Epoch 263 Mean Reward: 276.01190537261965


100%|██████████| 2000/2000 [02:38<00:00, 12.65it/s]


Epoch 264 Mean Reward: 277.13894660949705


100%|██████████| 2000/2000 [02:25<00:00, 13.77it/s]


Epoch 265 Mean Reward: 279.5138913345337


100%|██████████| 2000/2000 [02:27<00:00, 13.59it/s]


Epoch 266 Mean Reward: 269.98788369750974


100%|██████████| 2000/2000 [02:24<00:00, 13.83it/s]


Epoch 267 Mean Reward: 277.35893409729005


100%|██████████| 2000/2000 [02:27<00:00, 13.55it/s]


Epoch 268 Mean Reward: 275.34634316253664


100%|██████████| 2000/2000 [02:31<00:00, 13.24it/s]


Epoch 269 Mean Reward: 280.79437628173827


100%|██████████| 2000/2000 [02:36<00:00, 12.80it/s]


Epoch 270 Mean Reward: 284.38556170654294
Epoch 270 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 270 test:
Test Episode 1 Reward: 273.8999938964844
Test Episode 2 Reward: 221.9283447265625
Test Episode 3 Reward: 258.49082946777344
Test Episode 4 Reward: 258.49082946777344
Test Episode 5 Reward: 258.49082946777344
Test Episode 6 Reward: 121.72651672363281
Test Episode 7 Reward: 292.1607666015625
Test Episode 8 Reward: 258.49082946777344
Test Episode 9 Reward: 184.27685546875
Test Episode 10 Reward: 258.49082946777344
Average Test Reward: 238.644662476


100%|██████████| 2000/2000 [02:22<00:00, 14.06it/s]


Epoch 271 Mean Reward: 281.09706055450437


100%|██████████| 2000/2000 [02:21<00:00, 14.12it/s]


Epoch 272 Mean Reward: 286.8785110321045


100%|██████████| 2000/2000 [02:23<00:00, 13.92it/s]


Epoch 273 Mean Reward: 284.49105026245115


100%|██████████| 2000/2000 [02:26<00:00, 13.62it/s]


Epoch 274 Mean Reward: 293.1556699371338


100%|██████████| 2000/2000 [02:34<00:00, 12.95it/s]


Epoch 275 Mean Reward: 292.65295517730715


100%|██████████| 2000/2000 [02:29<00:00, 13.36it/s]


Epoch 276 Mean Reward: 296.441609703064


100%|██████████| 2000/2000 [02:19<00:00, 14.30it/s]


Epoch 277 Mean Reward: 298.1528808441162


100%|██████████| 2000/2000 [02:24<00:00, 13.80it/s]


Epoch 278 Mean Reward: 294.9451218032837


100%|██████████| 2000/2000 [02:24<00:00, 13.87it/s]


Epoch 279 Mean Reward: 297.18708047485353


100%|██████████| 2000/2000 [02:24<00:00, 13.88it/s]


Epoch 280 Mean Reward: 296.4979356994629
Epoch 280 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 280 test:
Test Episode 1 Reward: 246.42237854003906
Test Episode 2 Reward: 246.42237854003906
Test Episode 3 Reward: 246.42237854003906
Test Episode 4 Reward: 94.65963745117188
Test Episode 5 Reward: 246.42237854003906
Test Episode 6 Reward: 376.4867706298828
Test Episode 7 Reward: 246.42237854003906
Test Episode 8 Reward: 255.04010009765625
Test Episode 9 Reward: 251.67262268066406
Test Episode 10 Reward: 149.829833984375
Average Test Reward: 235.980085754


100%|██████████| 2000/2000 [02:28<00:00, 13.44it/s]


Epoch 281 Mean Reward: 336.43244271087644


100%|██████████| 2000/2000 [02:20<00:00, 14.28it/s]


Epoch 282 Mean Reward: 331.9265958175659


100%|██████████| 2000/2000 [02:20<00:00, 14.21it/s]


Epoch 283 Mean Reward: 333.53879790496825


100%|██████████| 2000/2000 [02:17<00:00, 14.55it/s]


Epoch 284 Mean Reward: 332.87947908782957


100%|██████████| 2000/2000 [02:15<00:00, 14.79it/s]


Epoch 285 Mean Reward: 332.140616394043


100%|██████████| 2000/2000 [02:10<00:00, 15.32it/s]


Epoch 286 Mean Reward: 339.0289132156372


100%|██████████| 2000/2000 [02:15<00:00, 14.76it/s]


Epoch 287 Mean Reward: 334.13032803344726


100%|██████████| 2000/2000 [02:27<00:00, 13.53it/s]


Epoch 288 Mean Reward: 339.9930466003418


100%|██████████| 2000/2000 [02:16<00:00, 14.69it/s]


Epoch 289 Mean Reward: 336.12112451934814


100%|██████████| 2000/2000 [02:15<00:00, 14.75it/s]


Epoch 290 Mean Reward: 338.4434233093262
Epoch 290 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 290 test:
Test Episode 1 Reward: 161.17970275878906
Test Episode 2 Reward: 425.27796936035156
Test Episode 3 Reward: 622.3391571044922
Test Episode 4 Reward: 190.55169677734375
Test Episode 5 Reward: 222.77401733398438
Test Episode 6 Reward: 392.12779235839844
Test Episode 7 Reward: 301.71397399902344
Test Episode 8 Reward: 225.2935333251953
Test Episode 9 Reward: 408.73069763183594
Test Episode 10 Reward: 225.2935333251953
Average Test Reward: 317.528207397


100%|██████████| 2000/2000 [02:13<00:00, 15.01it/s]


Epoch 291 Mean Reward: 348.7790925064087


100%|██████████| 2000/2000 [02:17<00:00, 14.59it/s]


Epoch 292 Mean Reward: 350.72058988189696


100%|██████████| 2000/2000 [02:23<00:00, 13.95it/s]


Epoch 293 Mean Reward: 354.92567977905276


100%|██████████| 2000/2000 [02:32<00:00, 13.16it/s]


Epoch 294 Mean Reward: 350.7605796661377


100%|██████████| 2000/2000 [02:21<00:00, 14.18it/s]


Epoch 295 Mean Reward: 355.51677866363525


100%|██████████| 2000/2000 [02:18<00:00, 14.41it/s]


Epoch 296 Mean Reward: 357.53587394714356


100%|██████████| 2000/2000 [02:16<00:00, 14.63it/s]


Epoch 297 Mean Reward: 356.9904749298096


100%|██████████| 2000/2000 [02:18<00:00, 14.44it/s]


Epoch 298 Mean Reward: 366.74143672943114


100%|██████████| 2000/2000 [02:18<00:00, 14.45it/s]


Epoch 299 Mean Reward: 365.8348461380005


100%|██████████| 2000/2000 [02:16<00:00, 14.68it/s]


Epoch 300 Mean Reward: 360.56500399017335
Epoch 300 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 300 test:
Test Episode 1 Reward: 384.39727783203125
Test Episode 2 Reward: 384.39727783203125
Test Episode 3 Reward: 384.39727783203125
Test Episode 4 Reward: 237.18504333496094
Test Episode 5 Reward: 384.39727783203125
Test Episode 6 Reward: 238.70880126953125
Test Episode 7 Reward: 164.9032745361328
Test Episode 8 Reward: 218.62503051757812
Test Episode 9 Reward: 384.39727783203125
Test Episode 10 Reward: 384.39727783203125
Average Test Reward: 316.580581665


100%|██████████| 2000/2000 [02:25<00:00, 13.73it/s]


Epoch 301 Mean Reward: 352.9038675765991


100%|██████████| 2000/2000 [02:21<00:00, 14.15it/s]


Epoch 302 Mean Reward: 344.25371945953367


100%|██████████| 2000/2000 [02:20<00:00, 14.23it/s]


Epoch 303 Mean Reward: 356.21454829406736


100%|██████████| 2000/2000 [02:18<00:00, 14.42it/s]


Epoch 304 Mean Reward: 359.93483665466306


100%|██████████| 2000/2000 [02:21<00:00, 14.17it/s]


Epoch 305 Mean Reward: 358.40963526916505


100%|██████████| 2000/2000 [02:22<00:00, 14.04it/s]


Epoch 306 Mean Reward: 355.80173461914063


100%|██████████| 2000/2000 [02:34<00:00, 12.94it/s]


Epoch 307 Mean Reward: 365.2139354248047


100%|██████████| 2000/2000 [02:18<00:00, 14.42it/s]


Epoch 308 Mean Reward: 367.5714106369019


100%|██████████| 2000/2000 [02:19<00:00, 14.31it/s]


Epoch 309 Mean Reward: 361.3537587585449


100%|██████████| 2000/2000 [02:18<00:00, 14.49it/s]


Epoch 310 Mean Reward: 371.03650872039793
Epoch 310 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 310 test:
Test Episode 1 Reward: 233.87403869628906
Test Episode 2 Reward: 249.55030822753906
Test Episode 3 Reward: 232.5773468017578
Test Episode 4 Reward: 210.86683654785156
Test Episode 5 Reward: 406.0144500732422
Test Episode 6 Reward: 240.80345153808594
Test Episode 7 Reward: 267.4884948730469
Test Episode 8 Reward: 239.22105407714844
Test Episode 9 Reward: 214.9190673828125
Test Episode 10 Reward: 237.12026977539062
Average Test Reward: 253.243531799


100%|██████████| 2000/2000 [02:15<00:00, 14.71it/s]


Epoch 311 Mean Reward: 379.15284004974365


100%|██████████| 2000/2000 [02:15<00:00, 14.76it/s]


Epoch 312 Mean Reward: 374.43123834228516


100%|██████████| 2000/2000 [02:30<00:00, 13.27it/s]


Epoch 313 Mean Reward: 388.8567406082153


100%|██████████| 2000/2000 [02:18<00:00, 14.43it/s]


Epoch 314 Mean Reward: 390.865551071167


100%|██████████| 2000/2000 [02:14<00:00, 14.83it/s]


Epoch 315 Mean Reward: 387.66589378356935


100%|██████████| 2000/2000 [02:13<00:00, 14.96it/s]


Epoch 316 Mean Reward: 397.3771871871948


100%|██████████| 2000/2000 [02:14<00:00, 14.87it/s]


Epoch 317 Mean Reward: 386.00983683776855


100%|██████████| 2000/2000 [02:19<00:00, 14.39it/s]


Epoch 318 Mean Reward: 398.1701009902954


100%|██████████| 2000/2000 [02:17<00:00, 14.56it/s]


Epoch 319 Mean Reward: 404.5109271316528


100%|██████████| 2000/2000 [02:27<00:00, 13.57it/s]


Epoch 320 Mean Reward: 395.54954834747315
Epoch 320 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 320 test:
Test Episode 1 Reward: 373.3124237060547
Test Episode 2 Reward: 373.3124237060547
Test Episode 3 Reward: 297.5037384033203
Test Episode 4 Reward: 373.3124237060547
Test Episode 5 Reward: 373.3124237060547
Test Episode 6 Reward: 147.09390258789062
Test Episode 7 Reward: 807.9222564697266
Test Episode 8 Reward: 373.3124237060547
Test Episode 9 Reward: 373.3124237060547
Test Episode 10 Reward: 219.6455078125
Average Test Reward: 371.203994751


100%|██████████| 2000/2000 [02:23<00:00, 13.91it/s]


Epoch 321 Mean Reward: 369.00830465698243


100%|██████████| 2000/2000 [02:22<00:00, 14.08it/s]


Epoch 322 Mean Reward: 390.9386907119751


100%|██████████| 2000/2000 [02:17<00:00, 14.49it/s]


Epoch 323 Mean Reward: 401.96267807006836


100%|██████████| 2000/2000 [02:22<00:00, 14.00it/s]


Epoch 324 Mean Reward: 404.619373008728


100%|██████████| 2000/2000 [02:19<00:00, 14.34it/s]


Epoch 325 Mean Reward: 400.56418352508547


100%|██████████| 2000/2000 [02:33<00:00, 13.06it/s]


Epoch 326 Mean Reward: 400.2545313796997


100%|██████████| 2000/2000 [02:18<00:00, 14.39it/s]


Epoch 327 Mean Reward: 410.7340474319458


100%|██████████| 2000/2000 [02:19<00:00, 14.36it/s]


Epoch 328 Mean Reward: 413.37623669433594


100%|██████████| 2000/2000 [02:20<00:00, 14.22it/s]


Epoch 329 Mean Reward: 421.0205208816528


100%|██████████| 2000/2000 [02:21<00:00, 14.12it/s]


Epoch 330 Mean Reward: 411.6798667221069
Epoch 330 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 330 test:
Test Episode 1 Reward: 189.09568786621094
Test Episode 2 Reward: 182.9643096923828
Test Episode 3 Reward: 189.09568786621094
Test Episode 4 Reward: 104.80805969238281
Test Episode 5 Reward: 103.28202819824219
Test Episode 6 Reward: 189.09568786621094
Test Episode 7 Reward: 189.09568786621094
Test Episode 8 Reward: 189.09568786621094
Test Episode 9 Reward: 189.09568786621094
Test Episode 10 Reward: 180.4085235595703
Average Test Reward: 170.603704834


100%|██████████| 2000/2000 [02:19<00:00, 14.38it/s]


Epoch 331 Mean Reward: 416.457382019043


100%|██████████| 2000/2000 [02:38<00:00, 12.59it/s]


Epoch 332 Mean Reward: 422.0310179672241


100%|██████████| 2000/2000 [02:21<00:00, 14.15it/s]


Epoch 333 Mean Reward: 427.63915870666506


100%|██████████| 2000/2000 [02:20<00:00, 14.28it/s]


Epoch 334 Mean Reward: 418.1269108963013


100%|██████████| 2000/2000 [02:21<00:00, 14.15it/s]


Epoch 335 Mean Reward: 424.2243171081543


100%|██████████| 2000/2000 [02:22<00:00, 14.00it/s]


Epoch 336 Mean Reward: 418.97834407806397


100%|██████████| 2000/2000 [02:21<00:00, 14.11it/s]


Epoch 337 Mean Reward: 429.91872412872317


100%|██████████| 2000/2000 [02:35<00:00, 12.84it/s]


Epoch 338 Mean Reward: 436.41240598297117


100%|██████████| 2000/2000 [02:32<00:00, 13.08it/s]


Epoch 339 Mean Reward: 438.193361656189


100%|██████████| 2000/2000 [02:32<00:00, 13.08it/s]


Epoch 340 Mean Reward: 445.481962310791
Epoch 340 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 340 test:
Test Episode 1 Reward: 340.831787109375
Test Episode 2 Reward: 340.831787109375
Test Episode 3 Reward: 340.831787109375
Test Episode 4 Reward: 146.87339782714844
Test Episode 5 Reward: 340.831787109375
Test Episode 6 Reward: 340.831787109375
Test Episode 7 Reward: 305.3451385498047
Test Episode 8 Reward: 340.831787109375
Test Episode 9 Reward: 147.3666229248047
Test Episode 10 Reward: 340.831787109375
Average Test Reward: 298.540766907


100%|██████████| 2000/2000 [02:19<00:00, 14.29it/s]


Epoch 341 Mean Reward: 476.25898149871824


100%|██████████| 2000/2000 [02:25<00:00, 13.74it/s]


Epoch 342 Mean Reward: 471.5369625854492


100%|██████████| 2000/2000 [02:23<00:00, 13.91it/s]


Epoch 343 Mean Reward: 480.4099699707031


100%|██████████| 2000/2000 [02:35<00:00, 12.89it/s]


Epoch 344 Mean Reward: 482.12242934417725


100%|██████████| 2000/2000 [02:21<00:00, 14.18it/s]


Epoch 345 Mean Reward: 482.78283528137206


100%|██████████| 2000/2000 [02:23<00:00, 13.92it/s]


Epoch 346 Mean Reward: 485.08477740478514


100%|██████████| 2000/2000 [02:20<00:00, 14.26it/s]


Epoch 347 Mean Reward: 482.33854129791257


100%|██████████| 2000/2000 [02:20<00:00, 14.23it/s]


Epoch 348 Mean Reward: 488.4900844345093


100%|██████████| 2000/2000 [02:21<00:00, 14.15it/s]


Epoch 349 Mean Reward: 495.63917024993896


100%|██████████| 2000/2000 [02:29<00:00, 13.34it/s]


Epoch 350 Mean Reward: 481.8550838928223
Epoch 350 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 350 test:
Test Episode 1 Reward: 303.40599060058594
Test Episode 2 Reward: 256.92462158203125
Test Episode 3 Reward: 444.858642578125
Test Episode 4 Reward: 303.40599060058594
Test Episode 5 Reward: 303.40599060058594
Test Episode 6 Reward: 303.40599060058594
Test Episode 7 Reward: 303.40599060058594
Test Episode 8 Reward: 303.40599060058594
Test Episode 9 Reward: 237.35595703125
Test Episode 10 Reward: 303.40599060058594
Average Test Reward: 306.29811554


100%|██████████| 2000/2000 [02:27<00:00, 13.60it/s]


Epoch 351 Mean Reward: 492.99869477844237


100%|██████████| 2000/2000 [02:22<00:00, 14.01it/s]


Epoch 352 Mean Reward: 509.84230615997313


100%|██████████| 2000/2000 [02:24<00:00, 13.84it/s]


Epoch 353 Mean Reward: 506.1850289916992


100%|██████████| 2000/2000 [02:21<00:00, 14.13it/s]


Epoch 354 Mean Reward: 499.2849413833618


100%|██████████| 2000/2000 [02:24<00:00, 13.84it/s]


Epoch 355 Mean Reward: 510.79199195861815


100%|██████████| 2000/2000 [02:43<00:00, 12.20it/s]


Epoch 356 Mean Reward: 509.58644453430173


100%|██████████| 2000/2000 [02:28<00:00, 13.47it/s]


Epoch 357 Mean Reward: 518.038185005188


100%|██████████| 2000/2000 [02:21<00:00, 14.15it/s]


Epoch 358 Mean Reward: 522.3775153274536


100%|██████████| 2000/2000 [02:23<00:00, 13.94it/s]


Epoch 359 Mean Reward: 524.2571730651855


100%|██████████| 2000/2000 [02:21<00:00, 14.11it/s]


Epoch 360 Mean Reward: 533.2664591217041
Epoch 360 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 360 test:
Test Episode 1 Reward: 130.53460693359375
Test Episode 2 Reward: 222.86463928222656
Test Episode 3 Reward: 177.531005859375
Test Episode 4 Reward: 185.19390869140625
Test Episode 5 Reward: 211.11927795410156
Test Episode 6 Reward: 308.01539611816406
Test Episode 7 Reward: 308.01539611816406
Test Episode 8 Reward: 260.4720458984375
Test Episode 9 Reward: 308.01539611816406
Test Episode 10 Reward: 162.8271942138672
Average Test Reward: 227.458886719


100%|██████████| 2000/2000 [02:21<00:00, 14.11it/s]


Epoch 361 Mean Reward: 542.9121098480225


100%|██████████| 2000/2000 [02:29<00:00, 13.36it/s]


Epoch 362 Mean Reward: 526.7027412338257


100%|██████████| 2000/2000 [02:23<00:00, 13.98it/s]


Epoch 363 Mean Reward: 534.941901184082


100%|██████████| 2000/2000 [02:21<00:00, 14.18it/s]


Epoch 364 Mean Reward: 533.2261844558716


100%|██████████| 2000/2000 [02:21<00:00, 14.11it/s]


Epoch 365 Mean Reward: 534.6554468994141


100%|██████████| 2000/2000 [02:20<00:00, 14.22it/s]


Epoch 366 Mean Reward: 533.2415688934326


100%|██████████| 2000/2000 [02:22<00:00, 14.00it/s]


Epoch 367 Mean Reward: 531.8920048828124


100%|██████████| 2000/2000 [02:28<00:00, 13.48it/s]


Epoch 368 Mean Reward: 519.2394010009766


100%|██████████| 2000/2000 [02:19<00:00, 14.34it/s]


Epoch 369 Mean Reward: 538.7336185531616


100%|██████████| 2000/2000 [02:25<00:00, 13.72it/s]


Epoch 370 Mean Reward: 525.3632051773071
Epoch 370 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 370 test:
Test Episode 1 Reward: 251.71701049804688
Test Episode 2 Reward: 298.4422302246094
Test Episode 3 Reward: 298.4422302246094
Test Episode 4 Reward: 385.61846923828125
Test Episode 5 Reward: 298.4422302246094
Test Episode 6 Reward: 298.4422302246094
Test Episode 7 Reward: 298.4422302246094
Test Episode 8 Reward: 298.4422302246094
Test Episode 9 Reward: 298.4422302246094
Test Episode 10 Reward: 192.25808715820312
Average Test Reward: 291.868917847


100%|██████████| 2000/2000 [02:15<00:00, 14.74it/s]


Epoch 371 Mean Reward: 483.96362155151365


100%|██████████| 2000/2000 [02:15<00:00, 14.75it/s]


Epoch 372 Mean Reward: 493.3089933853149


100%|██████████| 2000/2000 [02:17<00:00, 14.53it/s]


Epoch 373 Mean Reward: 485.97368661499024


100%|██████████| 2000/2000 [02:15<00:00, 14.77it/s]


Epoch 374 Mean Reward: 479.1789835281372


100%|██████████| 2000/2000 [02:24<00:00, 13.86it/s]


Epoch 375 Mean Reward: 492.3927512359619


100%|██████████| 2000/2000 [02:15<00:00, 14.77it/s]


Epoch 376 Mean Reward: 476.662039024353


100%|██████████| 2000/2000 [02:11<00:00, 15.25it/s]


Epoch 377 Mean Reward: 477.2345709075928


100%|██████████| 2000/2000 [02:12<00:00, 15.11it/s]


Epoch 378 Mean Reward: 472.96784623718264


100%|██████████| 2000/2000 [02:17<00:00, 14.55it/s]


Epoch 379 Mean Reward: 486.0954833755493


100%|██████████| 2000/2000 [02:17<00:00, 14.54it/s]


Epoch 380 Mean Reward: 478.36287078094483
Epoch 380 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 380 test:
Test Episode 1 Reward: 341.03431701660156
Test Episode 2 Reward: 341.03431701660156
Test Episode 3 Reward: 341.03431701660156
Test Episode 4 Reward: 197.7645263671875
Test Episode 5 Reward: 228.6368865966797
Test Episode 6 Reward: 170.04713439941406
Test Episode 7 Reward: 191.3456268310547
Test Episode 8 Reward: 180.31809997558594
Test Episode 9 Reward: 341.03431701660156
Test Episode 10 Reward: 341.03431701660156
Average Test Reward: 267.328385925


100%|██████████| 2000/2000 [02:30<00:00, 13.32it/s]


Epoch 381 Mean Reward: 516.1898448410034


100%|██████████| 2000/2000 [02:19<00:00, 14.30it/s]


Epoch 382 Mean Reward: 527.4877632598877


100%|██████████| 2000/2000 [02:16<00:00, 14.66it/s]


Epoch 383 Mean Reward: 503.5096832199097


100%|██████████| 2000/2000 [02:17<00:00, 14.50it/s]


Epoch 384 Mean Reward: 513.9422453536987


100%|██████████| 2000/2000 [02:14<00:00, 14.89it/s]


Epoch 385 Mean Reward: 490.3824281387329


100%|██████████| 2000/2000 [02:14<00:00, 14.86it/s]


Epoch 386 Mean Reward: 486.0221572113037


100%|██████████| 2000/2000 [02:14<00:00, 14.85it/s]


Epoch 387 Mean Reward: 480.979529335022


100%|██████████| 2000/2000 [02:23<00:00, 13.90it/s]


Epoch 388 Mean Reward: 494.7644642944336


100%|██████████| 2000/2000 [02:18<00:00, 14.42it/s]


Epoch 389 Mean Reward: 485.8881989593506


100%|██████████| 2000/2000 [02:13<00:00, 14.97it/s]


Epoch 390 Mean Reward: 481.265037437439
Epoch 390 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 390 test:
Test Episode 1 Reward: 352.1805419921875
Test Episode 2 Reward: 312.3889617919922
Test Episode 3 Reward: 352.1805419921875
Test Episode 4 Reward: 348.4792785644531
Test Episode 5 Reward: 321.97564697265625
Test Episode 6 Reward: 336.86474609375
Test Episode 7 Reward: 155.67501831054688
Test Episode 8 Reward: 352.1805419921875
Test Episode 9 Reward: 147.5967559814453
Test Episode 10 Reward: 430.54017639160156
Average Test Reward: 311.006221008


100%|██████████| 2000/2000 [02:15<00:00, 14.79it/s]


Epoch 391 Mean Reward: 557.8372322540283


100%|██████████| 2000/2000 [02:18<00:00, 14.49it/s]


Epoch 392 Mean Reward: 507.38216136932374


100%|██████████| 2000/2000 [02:17<00:00, 14.59it/s]


Epoch 393 Mean Reward: 501.50571934509276


100%|██████████| 2000/2000 [02:32<00:00, 13.11it/s]


Epoch 394 Mean Reward: 505.4930548400879


100%|██████████| 2000/2000 [02:20<00:00, 14.20it/s]


Epoch 395 Mean Reward: 504.011437286377


100%|██████████| 2000/2000 [02:24<00:00, 13.83it/s]


Epoch 396 Mean Reward: 512.0064322967529


100%|██████████| 2000/2000 [02:21<00:00, 14.16it/s]


Epoch 397 Mean Reward: 506.2286399078369


100%|██████████| 2000/2000 [02:21<00:00, 14.09it/s]


Epoch 398 Mean Reward: 519.1274895095826


100%|██████████| 2000/2000 [02:20<00:00, 14.22it/s]


Epoch 399 Mean Reward: 507.04965323638913


100%|██████████| 2000/2000 [02:35<00:00, 12.88it/s]


Epoch 400 Mean Reward: 522.2345840148926
Epoch 400 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 400 test:
Test Episode 1 Reward: 363.9658203125
Test Episode 2 Reward: 285.51239013671875
Test Episode 3 Reward: 202.37554931640625
Test Episode 4 Reward: 363.9658203125
Test Episode 5 Reward: 215.6171112060547
Test Episode 6 Reward: 210.72605895996094
Test Episode 7 Reward: 357.2716369628906
Test Episode 8 Reward: 352.268798828125
Test Episode 9 Reward: 363.9658203125
Test Episode 10 Reward: 363.9658203125
Average Test Reward: 307.963482666
[(455.18590393066404, 230), (423.14712982177736, 80), (399.38034362792968, 170), (389.63769378662107, 250), (387.28757781982421, 90), (386.89609222412111, 150), (371.20399475097656, 320), (354.88217468261718, 180), (352.92456970214846, 100), (348.9226364135742, 20), (346.66695556640627, 40), (342.90294342041017, 60), (340.786865234375, 130), (338.81441192626954, 50), (326.83758087158202, 200), (322.43564147949218, 160), (321.50969238281249, 70

In [8]:
#Get a list of checkpoints saved during training

ckpts = tf.train.get_checkpoint_state('checkpoints').all_model_checkpoint_paths

#Test the trained model from a certain checkpoint by only choosing actions with a greedy strategy

test_reward = test_agent(DQN, num_episodes=20, training=False, load_model=True, model_dir=ckpts[-1])
print('Average Test Reward:', test_reward)


Loading model from checkpoints\deadly_corridor.ckpt-400
INFO:tensorflow:Restoring parameters from checkpoints\deadly_corridor.ckpt-400
Test Episode 1 Reward: 589.4776611328125
Test Episode 2 Reward: 261.21299743652344
Test Episode 3 Reward: 281.9462585449219
Test Episode 4 Reward: 534.4662933349609
Test Episode 5 Reward: 136.14505004882812
Test Episode 6 Reward: 534.4662933349609
Test Episode 7 Reward: 230.34506225585938
Test Episode 8 Reward: 534.4662933349609
Test Episode 9 Reward: 534.4662933349609
Test Episode 10 Reward: 534.4662933349609
Test Episode 11 Reward: 228.29205322265625
Test Episode 12 Reward: 163.851318359375
Test Episode 13 Reward: 293.5017852783203
Test Episode 14 Reward: 563.1641998291016
Test Episode 15 Reward: 408.70989990234375
Test Episode 16 Reward: 534.4662933349609
Test Episode 17 Reward: 188.1556854248047
Test Episode 18 Reward: 224.1065673828125
Test Episode 19 Reward: 312.36061096191406
Test Episode 20 Reward: 534.4662933349609
Average Test Reward: 381.1266