In [1]:
import importlib.util
import os
import time

import numpy as np
import scipy.misc
import tensorflow as tf
from IPython.display import HTML
from tqdm import trange

#Load the compiled vizdoom extension directly from its .pyd file and expose it as "vd",
#since the package can't be installed normally on Windows

vd_location = 'C:/Anaconda3/envs/doom/Lib/site-packages/vizdoom/vizdoom.pyd'

#"vizdoom" holds the module spec; "vd" is the module object the rest of the notebook uses
vizdoom = importlib.util.spec_from_file_location('vizdoom', location=vd_location)
vd = importlib.util.module_from_spec(vizdoom)
vizdoom.loader.exec_module(vd)


In [2]:
#Configure the Doom instance: screen format, depth buffer and resolution, then the scenario file

game = vd.DoomGame()
game.set_screen_format(vd.ScreenFormat.BGR24)
game.set_depth_buffer_enabled(True)
game.set_screen_resolution(vd.ScreenResolution.RES_160X120)
game.load_config('deadly_corridor.cfg')

#Dimensions of the downsampled network input (one extra channel is reserved for the depth buffer)

down_sample_ratio = 0.5
width, height = (int(side*down_sample_ratio)
                 for side in (game.get_screen_width(), game.get_screen_height()))
channels = 1 + game.get_screen_channels()

#Build one one-hot action vector per button available in the scenario

available_actions = game.get_available_buttons()
num_actions = len(available_actions)
actions = [list(one_hot) for one_hot in np.identity(num_actions)]

#Q-network learning parameters

frame_delay = 12                #second argument passed to game.make_action during training
buffer_size = 50000             #capacity of the experience buffer
epochs = 200
steps_per_epoch = 4000          #iterations of the inner training loop per epoch
learning_rate = 0.00025
start_epsilon = 1.0
end_epsilon = 0.1
batch_size = 100
load_model = False
save_model = True
model_dir = './checkpoints/deadly_corridor.ckpt'


In [3]:
#Create a buffer object that holds a set of training experiences: (state, action, reward, next state, terminal flag) tuples

class Buffer():
    """Fixed-capacity, first-in-first-out store of training experiences.

    sample_buffer indexes every stored experience as
    (state, action, reward, next state, terminal flag).
    """

    def __init__(self, size=1000):
        self.size = size
        self.buffer = []
        self.length = 0

    def add_experience(self, experience):
        """Append an experience, evicting the oldest entries first so at most `size` are kept."""
        overflow = self.length + 1 - self.size
        if overflow >= 0:
            del self.buffer[:overflow]

        self.buffer.append(experience)
        self.length = len(self.buffer)

    def sample_buffer(self, sample_size):
        """Return a random batch (drawn with replacement) as five arrays.

        States and next states are concatenated along axis 0; actions and
        rewards are stacked; terminal flags are cast to int32.
        """
        picks = np.random.randint(self.length, size=sample_size)
        batch = [self.buffer[pick] for pick in picks]

        s1 = np.concatenate([exp[0] for exp in batch], axis=0)
        a = np.array([exp[1] for exp in batch])
        r = np.array([exp[2] for exp in batch])
        s2 = np.concatenate([exp[3] for exp in batch], axis=0)
        terminal = np.array([exp[4] for exp in batch], dtype=np.int32)

        return s1, a, r, s2, terminal

#Downsample and normalize an image array representing the game state at a given time stamp

def _legacy_imresize():
    """Return scipy.misc.imresize when this SciPy build still provides it, else None."""
    try:
        return scipy.misc.imresize
    except (AttributeError, ImportError):
        return None


def _resize_bilinear(image, ratio):
    """Bilinearly rescale the first two axes of `image` by the fraction `ratio`.

    The output size along each spatial axis is int(dim * ratio) (at least 1);
    any trailing channel axis is left untouched.
    """
    from scipy import ndimage

    image = np.asarray(image)
    rows, cols = image.shape[:2]
    new_rows = max(1, int(rows * ratio))
    new_cols = max(1, int(cols * ratio))

    #Express the target size as per-axis zoom factors; channels (if any) keep a factor of 1
    factors = [new_rows / float(rows), new_cols / float(cols)]
    factors += [1.0] * (image.ndim - 2)

    return ndimage.zoom(image, factors, order=1)


def preprocess(image, down_sample_ratio=1):
    """Downsample an image array, scale it by 1/255 and add a leading batch axis.

    Args:
        image: 2-D or 3-D image array (rows, columns[, channels]).
        down_sample_ratio: resize factor; 1 skips resizing entirely.

    Returns:
        float32 array with one extra leading axis of length 1.

    scipy.misc.imresize was removed in SciPy 1.3, so it is used only when it
    is still available; otherwise a scipy.ndimage.zoom bilinear resize is used,
    which always treats the ratio as a fraction of the original size.
    """
    if down_sample_ratio != 1:
        imresize = _legacy_imresize()
        if imresize is not None:
            image = imresize(image, down_sample_ratio)
        else:
            image = _resize_bilinear(image, down_sample_ratio)

    #astype makes a copy, so the in-place division never touches the caller's array
    image = image.astype(np.float32)
    image /= 255.0

    return np.expand_dims(image, axis=0)

#Test the agent using a currently training or previously trained model

def test_agent(model, num_episodes, load_model, training=True, session=None, model_dir=None):
    """Play `num_episodes` greedy episodes and return the mean total episode reward.

    Relies on the notebook-level `game`, `actions` and `down_sample_ratio`.

    Args:
        model: network exposing choose_action(session, state).
        num_episodes: number of test episodes to play.
        load_model: when true, open a fresh session and restore the weights
            from `model_dir`; otherwise `session` must be supplied.
        training: when False, this function initializes and closes the game itself.
        session: existing TensorFlow session (used when load_model is false).
        model_dir: checkpoint path to restore (used when load_model is true).

    Returns:
        Mean of the per-episode total rewards (NaN when num_episodes is 0).

    Raises:
        ValueError: if load_model is false and no session was provided.
    """
    owns_session = bool(load_model)

    if owns_session:
        sess = tf.Session()
        try:
            print('Loading model from', model_dir)
            tf.train.Saver().restore(sess, model_dir)
        except Exception:
            #Don't leak the session when the checkpoint can't be restored
            sess.close()
            raise

#Require an existing session if a pretrained model isn't provided

    else:
        if session is None:
            raise ValueError('test_agent requires an existing session when load_model is False')
        sess = session

    manages_game = training == False
    episode_rewards = list()

    try:
        game.set_sound_enabled(True)

#Avoid reinitializing the game if this was already done by the training process

        if manages_game:
            game.init()

        for i in range(num_episodes):
            game.new_episode()

            while not game.is_episode_finished():
                state = game.get_state()
                buffer = np.concatenate((state.screen_buffer,
                                         np.expand_dims(state.depth_buffer,
                                                        axis=2)),
                                        axis=2)
                state1 = preprocess(buffer, down_sample_ratio)
                action = model.choose_action(sess, state1)[0]

                #NOTE(review): the training loop passes frame_delay to make_action, this call does not
                game.make_action(actions[action])

#Add a delay between each time step so that the episodes occur at normal speed

                time.sleep(0.02)

            episode_rewards.append(game.get_total_reward())
            print('Test Episode {} Reward: {}'.format(i + 1, game.get_total_reward()))
            time.sleep(1)

#Avoid ending the game so that the training process can continue; always release what was opened here

    finally:
        if manages_game:
            game.close()
        if owns_session:
            sess.close()

    return np.mean(episode_rewards)


In [4]:
#Create a Q-network to estimate values and choose actions for a given state

class Q_network():
    """Convolutional Q-network built with the TensorFlow 1.x graph API.

    Two conv layers and one dense layer map a batch of states to one Q-value
    per action. All graph nodes and layers are prefixed with `network_name`.

    Args:
        network_name: prefix used for every placeholder, layer and optimizer name.
        height, width, channels: shape of a single input state.
        learning_rate: Adam learning rate.
        num_outputs: number of Q-values to predict; defaults to the
            notebook-level `num_actions`.
    """

    def __init__(self, network_name, height, width, channels, learning_rate=0.001,
                 num_outputs=None):
        if num_outputs is None:
            num_outputs = num_actions

        self.s_t = tf.placeholder(tf.float32,
                                  shape=[None, height, width, channels],
                                  name=network_name + '_state'
                                 )
        #Not fed by any method of this class; kept so the attribute stays available
        self.a_t = tf.placeholder(tf.int32,
                                  shape=[None],
                                  name=network_name + '_action'
                                 )
        self.Q_target = tf.placeholder(tf.float32,
                                       shape=[None, num_outputs],
                                       name=network_name + '_Q_target'
                                      )

        self.input_layer = tf.reshape(self.s_t,
                                      [-1, height, width, channels],
                                      name=network_name + '_input_layer'
                                     )
        self.conv1 = tf.layers.conv2d(inputs=self.input_layer,
                                      filters=32,
                                      kernel_size=[8, 8],
                                      strides=[4, 4],
                                      padding='valid',
                                      activation=tf.nn.relu,
                                      name=network_name + '_conv1_layer'
                                     )
        self.conv2 = tf.layers.conv2d(inputs=self.conv1,
                                      filters=64,
                                      kernel_size=[4, 4],
                                      strides=[2, 2],
                                      padding='valid',
                                      activation=tf.nn.relu,
                                      name=network_name + '_conv2_layer'
                                     )

        #Derive the flattened size from the conv output instead of hard-coding 6*8*64,
        #which only holds for 60x80 inputs
        flat_size = int(np.prod(self.conv2.get_shape().as_list()[1:]))
        self.flatten = tf.reshape(self.conv2,
                                  [-1, flat_size],
                                  name=network_name + '_flatten'
                                 )
        self.dense = tf.layers.dense(inputs=self.flatten,
                                      units=512,
                                      activation=tf.nn.relu,
                                      name=network_name + '_dense1_layer'
                                    )
        self.Q_values = tf.layers.dense(inputs=self.dense,
                                        units=num_outputs,
                                        activation=None,
                                        name=network_name + '_output_layer'
                                       )

        self.best_action = tf.argmax(self.Q_values, 1)
        self.loss = tf.losses.mean_squared_error(self.Q_values,
                                                 self.Q_target)
        self.adam = tf.train.AdamOptimizer(learning_rate=learning_rate,
                                           name=network_name + '_adam'
                                          )
        self.train = self.adam.minimize(self.loss)

    def calculate_loss(self, session, s, q):
        """Run one optimizer step on states `s` against targets `q` and return the loss.

        Despite the name, this also executes the training op.
        """
        L, _ = session.run([self.loss, self.train],
                           feed_dict={self.s_t: s,
                                      self.Q_target: q})

        return L

    def get_Q_values(self, session, s):
        """Return the array of Q-values predicted for the batch of states `s`."""
        Q = session.run(self.Q_values,
                        feed_dict={self.s_t: s})

        return Q

    def choose_action(self, session, s):
        """Return the index of the highest-valued action for each state in `s`."""
        a = session.run(self.best_action,
                        feed_dict={self.s_t: s})

        return a
    
#Create a list of variable update operations

def update_graph(variables):
    """Build the assign ops that copy the second half of `variables` onto the first half.

    Each variable in the first half (the network created first) receives the
    value of the variable at the same position in the second half (the network
    created second). With an odd number of variables the last one is ignored.
    """
    half = len(variables)//2
    receivers = variables[:half]
    sources = variables[half:2*half]

    return [receiver.assign(source.value())
            for receiver, source in zip(receivers, sources)]

#Update the target network parameters to match those of the online network

def update_target(ops, session):
    """Run every update operation in `ops`, in order, within `session`.

    Uses the `ops` argument itself rather than a notebook-level variable, so
    the function works with whatever list of operations the caller passes.
    """
    for op in ops:
        session.run(op)


In [5]:
#For each time step, collect the following data:
#The current game state
#The action that was taken
#The reward obtained from the chosen action
#The next game state (the pre-action state is stored again if the action ends the episode)
#A variable indicating whether the episode is over yet


def observe_state():
    """Return the preprocessed current frame: screen buffer with the depth buffer as an extra channel."""
    state = game.get_state()
    frame = np.concatenate((state.screen_buffer,
                            np.expand_dims(state.depth_buffer, axis=2)),
                           axis=2)

    return preprocess(frame, down_sample_ratio)


tf.reset_default_graph()

#Instantiate the target network before the online network so that it's updated correctly

target_net = Q_network(network_name='target',
                       learning_rate=learning_rate,
                       height=height,
                       width=width,
                       channels=channels)
DQN = Q_network(network_name='online',
                learning_rate=learning_rate,
                height=height,
                width=width,
                channels=channels)

exp_buffer = Buffer(size=buffer_size)
session = tf.Session()

#Keep every checkpoint (the default only keeps the last 5) so that any epoch in the final ranking can be restored

saver = tf.train.Saver(max_to_keep=None)
weights = tf.trainable_variables()

update_ops = update_graph(weights)

if load_model == True:
    print('Loading model from', model_dir)
    saver.restore(session, model_dir)

else:
    session.run(tf.global_variables_initializer())

#Make sure the checkpoint directory exists before training starts rather than failing at the first save

if save_model == True:
    checkpoint_dir = os.path.dirname(model_dir)

    if checkpoint_dir:
        os.makedirs(checkpoint_dir, exist_ok=True)

game.set_sound_enabled(False)
game.init()

gamma = 0
t = 0
epoch_rank = list()

#Accumulate experiences in the buffer using an epsilon-greedy strategy with three training phases
#The game is always closed afterwards, even if training is interrupted, so the cell can be re-run

try:
    for epoch in range(epochs):
        epoch_rewards = list()

#Increase the discount factor at each epoch until it reaches approximately 0.99

        gamma = 1-.9775*(1-gamma)

        for step in trange(steps_per_epoch, leave=True):
            game.new_episode()

            while not game.is_episode_finished():
                state1 = observe_state()

#Explore the environment by choosing random actions with 100% probability for the first phase of training

                if epoch < 0.3*epochs:
                    action = np.random.randint(num_actions)

#Increase the probability of greedily choosing an action by a constant amount at each epoch in the second phase
#NOTE(review): this phase starts at 0.3*epochs but the formula is anchored at 0.2*epochs with a 0.7*epochs span,
#so epsilon starts the phase below start_epsilon (about 0.865 with the current settings); left unchanged on purpose

                elif epoch < 0.9*epochs:
                    epsilon = start_epsilon - (epoch + 1 - 0.2*epochs)*(start_epsilon-end_epsilon)/(0.7*epochs)

                    if np.random.uniform(0, 1) <= epsilon:
                        action = np.random.randint(num_actions)

                    else:
                        action = DQN.choose_action(session, state1)[0]

#Select a random action with 10% probability in the final phase of training

                else:
                    if np.random.uniform(0, 1) <= end_epsilon:
                        action = np.random.randint(num_actions)

                    else:
                        action = DQN.choose_action(session, state1)[0]

                reward = game.make_action(actions[action], frame_delay)
                done = game.is_episode_finished()

#No new state is read once the episode is over, so the pre-action state is reused as a placeholder

                state2 = state1 if done else observe_state()

#Add the experience obtained from each time step to the buffer

                t += 1
                exp_buffer.add_experience((state1, action, reward, state2, done))

#Sample a minibatch from the buffer if there are enough experiences in the buffer

            if exp_buffer.length > batch_size:
                s1, a, r, s2, terminal = exp_buffer.sample_buffer(batch_size)

#Get the target values from the target Q-network

                target_Q = np.max(target_net.get_Q_values(session, s2), axis=1)

#Train the online Q-network by using a minibatch to update the action-value function

                Q2 = DQN.get_Q_values(session, s1)
                Q2[np.arange(batch_size), a] = r + gamma*(1 - terminal)*target_Q
                DQN.calculate_loss(session, s1, Q2)

            epoch_rewards.append(game.get_total_reward())

        print('Epoch {} Mean Reward: {}'.format(epoch + 1, np.mean(epoch_rewards)))

#Save the model, update the target network, and test the agent for 10 episodes every 10 epochs

        if (epoch + 1) % 10 == 0:
            if save_model == True:

#Report the path the saver actually wrote, which includes the epoch suffix

                checkpoint = saver.save(session, model_dir, global_step=epoch + 1)
                print('Epoch {} Model saved to {}'.format(epoch + 1, checkpoint))

            update_target(update_ops, session)

            print('Epoch {} test:'.format(epoch + 1))
            test_reward = test_agent(DQN, num_episodes=10,
                                     training=True,
                                     load_model=False,
                                     session=session,
                                     model_dir=model_dir)
            print('Average Test Reward:', test_reward)
            epoch_rank.append((test_reward, epoch + 1))

finally:
    game.close()

#Return a sorted list of epoch checkpoints based on average test episode reward

print(sorted(epoch_rank, reverse=True))
print('{} time steps experienced during training'.format(t))
    

100%|██████████| 4000/4000 [04:40<00:00, 14.26it/s]


Epoch 1 Mean Reward: 130.0768296508789


100%|██████████| 4000/4000 [05:48<00:00, 11.48it/s]


Epoch 2 Mean Reward: 131.73243970489503


100%|██████████| 4000/4000 [06:28<00:00, 10.29it/s]


Epoch 3 Mean Reward: 131.85058742904664


100%|██████████| 4000/4000 [06:21<00:00, 10.47it/s]


Epoch 4 Mean Reward: 131.64112398529053


100%|██████████| 4000/4000 [06:23<00:00, 10.43it/s]


Epoch 5 Mean Reward: 132.79028734970092


100%|██████████| 4000/4000 [06:19<00:00, 10.53it/s]


Epoch 6 Mean Reward: 132.47072367095947


100%|██████████| 4000/4000 [06:18<00:00, 10.56it/s]


Epoch 7 Mean Reward: 131.9784510269165


100%|██████████| 4000/4000 [06:12<00:00, 10.74it/s]


Epoch 8 Mean Reward: 131.6268042488098


100%|██████████| 4000/4000 [05:56<00:00, 11.21it/s]


Epoch 9 Mean Reward: 131.8623759727478


100%|██████████| 4000/4000 [05:31<00:00, 12.07it/s]


Epoch 10 Mean Reward: 130.9585214767456
Epoch 10 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 10 test:
Test Episode 1 Reward: 313.8831787109375
Test Episode 2 Reward: 641.9961242675781
Test Episode 3 Reward: 286.08424377441406
Test Episode 4 Reward: 294.95274353027344
Test Episode 5 Reward: 276.3576354980469
Test Episode 6 Reward: 430.6498718261719
Test Episode 7 Reward: 294.95274353027344
Test Episode 8 Reward: 280.4354553222656
Test Episode 9 Reward: 1010.2255096435547
Test Episode 10 Reward: 557.3707733154297
Average Test Reward: 438.690827942


100%|██████████| 4000/4000 [05:55<00:00, 11.25it/s]


Epoch 11 Mean Reward: 132.45178295135497


100%|██████████| 4000/4000 [05:39<00:00, 11.78it/s]


Epoch 12 Mean Reward: 131.2323169555664


100%|██████████| 4000/4000 [05:37<00:00, 11.85it/s]


Epoch 13 Mean Reward: 132.7087819480896


100%|██████████| 4000/4000 [05:17<00:00, 12.59it/s]


Epoch 14 Mean Reward: 130.8846902656555


100%|██████████| 4000/4000 [04:56<00:00, 13.47it/s]


Epoch 15 Mean Reward: 131.4717735710144


100%|██████████| 4000/4000 [05:00<00:00, 13.30it/s]


Epoch 16 Mean Reward: 129.20871812820434


100%|██████████| 4000/4000 [04:42<00:00, 14.14it/s]


Epoch 17 Mean Reward: 132.23716312789918


100%|██████████| 4000/4000 [04:27<00:00, 14.97it/s]


Epoch 18 Mean Reward: 131.71068187332153


100%|██████████| 4000/4000 [03:58<00:00, 16.76it/s]


Epoch 19 Mean Reward: 132.19569327163697


100%|██████████| 4000/4000 [04:08<00:00, 16.10it/s]


Epoch 20 Mean Reward: 130.53333921051026
Epoch 20 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 20 test:
Test Episode 1 Reward: 318.52259826660156
Test Episode 2 Reward: 291.6661071777344
Test Episode 3 Reward: 222.78773498535156
Test Episode 4 Reward: 618.0926361083984
Test Episode 5 Reward: 474.5926513671875
Test Episode 6 Reward: 508.9869842529297
Test Episode 7 Reward: 315.7142028808594
Test Episode 8 Reward: 577.1536407470703
Test Episode 9 Reward: 291.6661071777344
Test Episode 10 Reward: 524.3182373046875
Average Test Reward: 414.350090027


100%|██████████| 4000/4000 [04:04<00:00, 16.38it/s]


Epoch 21 Mean Reward: 132.990032913208


100%|██████████| 4000/4000 [04:19<00:00, 15.39it/s]


Epoch 22 Mean Reward: 131.97918474960326


100%|██████████| 4000/4000 [04:08<00:00, 16.10it/s]


Epoch 23 Mean Reward: 131.72417219161989


100%|██████████| 4000/4000 [04:02<00:00, 16.52it/s]


Epoch 24 Mean Reward: 133.03893059921265


100%|██████████| 4000/4000 [03:59<00:00, 16.68it/s]


Epoch 25 Mean Reward: 132.31222089767456


100%|██████████| 4000/4000 [04:00<00:00, 16.64it/s]


Epoch 26 Mean Reward: 131.97058220291137


100%|██████████| 4000/4000 [03:50<00:00, 17.36it/s]


Epoch 27 Mean Reward: 132.05873808670043


100%|██████████| 4000/4000 [03:49<00:00, 17.43it/s]


Epoch 28 Mean Reward: 130.07299286270143


100%|██████████| 4000/4000 [03:50<00:00, 17.32it/s]


Epoch 29 Mean Reward: 130.4089214363098


100%|██████████| 4000/4000 [03:50<00:00, 17.39it/s]


Epoch 30 Mean Reward: 131.6499128265381
Epoch 30 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 30 test:
Test Episode 1 Reward: 274.81134033203125
Test Episode 2 Reward: 274.81134033203125
Test Episode 3 Reward: 274.81134033203125
Test Episode 4 Reward: 274.81134033203125
Test Episode 5 Reward: 274.81134033203125
Test Episode 6 Reward: 274.81134033203125
Test Episode 7 Reward: 493.90296936035156
Test Episode 8 Reward: 274.81134033203125
Test Episode 9 Reward: 564.6123657226562
Test Episode 10 Reward: 276.22637939453125
Average Test Reward: 325.84210968


100%|██████████| 4000/4000 [03:48<00:00, 17.49it/s]


Epoch 31 Mean Reward: 131.73776056671142


100%|██████████| 4000/4000 [03:57<00:00, 16.82it/s]


Epoch 32 Mean Reward: 129.15716524505615


100%|██████████| 4000/4000 [04:01<00:00, 16.60it/s]


Epoch 33 Mean Reward: 130.74460025024413


100%|██████████| 4000/4000 [04:01<00:00, 16.56it/s]


Epoch 34 Mean Reward: 132.87508403015136


100%|██████████| 4000/4000 [03:57<00:00, 16.83it/s]


Epoch 35 Mean Reward: 131.93770933914183


100%|██████████| 4000/4000 [03:48<00:00, 17.48it/s]


Epoch 36 Mean Reward: 131.74214099502564


100%|██████████| 4000/4000 [03:45<00:00, 17.73it/s]


Epoch 37 Mean Reward: 131.21318769073486


100%|██████████| 4000/4000 [03:47<00:00, 17.56it/s]


Epoch 38 Mean Reward: 130.7073992729187


100%|██████████| 4000/4000 [03:49<00:00, 17.41it/s]


Epoch 39 Mean Reward: 132.07349169540404


100%|██████████| 4000/4000 [03:50<00:00, 17.33it/s]


Epoch 40 Mean Reward: 130.7550929031372
Epoch 40 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 40 test:
Test Episode 1 Reward: 700.8740692138672
Test Episode 2 Reward: 301.76324462890625
Test Episode 3 Reward: 294.95274353027344
Test Episode 4 Reward: 294.95274353027344
Test Episode 5 Reward: 294.95274353027344
Test Episode 6 Reward: 327.1780242919922
Test Episode 7 Reward: 298.47496032714844
Test Episode 8 Reward: 294.95274353027344
Test Episode 9 Reward: 273.32342529296875
Test Episode 10 Reward: 294.95274353027344
Average Test Reward: 337.637744141


100%|██████████| 4000/4000 [03:51<00:00, 17.30it/s]


Epoch 41 Mean Reward: 132.35336581039428


100%|██████████| 4000/4000 [03:56<00:00, 16.89it/s]


Epoch 42 Mean Reward: 130.14656861114503


100%|██████████| 4000/4000 [04:11<00:00, 15.91it/s]


Epoch 43 Mean Reward: 132.43070035934448


100%|██████████| 4000/4000 [04:58<00:00, 13.42it/s]


Epoch 44 Mean Reward: 131.926813911438


100%|██████████| 4000/4000 [04:09<00:00, 16.02it/s]


Epoch 45 Mean Reward: 130.01943951034545


100%|██████████| 4000/4000 [04:14<00:00, 15.69it/s]


Epoch 46 Mean Reward: 132.598670589447


100%|██████████| 4000/4000 [04:11<00:00, 15.91it/s]


Epoch 47 Mean Reward: 129.2146865539551


100%|██████████| 4000/4000 [04:06<00:00, 16.25it/s]


Epoch 48 Mean Reward: 131.96025432586669


100%|██████████| 4000/4000 [04:12<00:00, 15.87it/s]


Epoch 49 Mean Reward: 131.39969960021972


100%|██████████| 4000/4000 [04:15<00:00, 15.68it/s]


Epoch 50 Mean Reward: 131.46448286437987
Epoch 50 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 50 test:
Test Episode 1 Reward: 256.72059631347656
Test Episode 2 Reward: 294.95274353027344
Test Episode 3 Reward: 294.95274353027344
Test Episode 4 Reward: 294.95274353027344
Test Episode 5 Reward: 226.9559326171875
Test Episode 6 Reward: 294.95274353027344
Test Episode 7 Reward: 294.95274353027344
Test Episode 8 Reward: 294.95274353027344
Test Episode 9 Reward: 294.95274353027344
Test Episode 10 Reward: 294.95274353027344
Average Test Reward: 284.329847717


100%|██████████| 4000/4000 [04:35<00:00, 14.51it/s]


Epoch 51 Mean Reward: 131.48463440704344


100%|██████████| 4000/4000 [04:43<00:00, 14.09it/s]


Epoch 52 Mean Reward: 132.1972994155884


100%|██████████| 4000/4000 [05:02<00:00, 13.22it/s]


Epoch 53 Mean Reward: 131.0674154968262


100%|██████████| 4000/4000 [05:02<00:00, 13.23it/s]


Epoch 54 Mean Reward: 130.12156269073486


100%|██████████| 4000/4000 [05:38<00:00, 11.82it/s]


Epoch 55 Mean Reward: 131.47851750183105


100%|██████████| 4000/4000 [05:10<00:00, 12.88it/s]


Epoch 56 Mean Reward: 130.05734298706054


100%|██████████| 4000/4000 [04:52<00:00, 13.69it/s]


Epoch 57 Mean Reward: 131.90295315933227


100%|██████████| 4000/4000 [05:00<00:00, 13.29it/s]


Epoch 58 Mean Reward: 129.97406331253052


100%|██████████| 4000/4000 [04:37<00:00, 14.43it/s]


Epoch 59 Mean Reward: 132.26497598648072


100%|██████████| 4000/4000 [04:46<00:00, 13.95it/s]


Epoch 60 Mean Reward: 131.96677264404298
Epoch 60 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 60 test:
Test Episode 1 Reward: 570.4558563232422
Test Episode 2 Reward: 294.95274353027344
Test Episode 3 Reward: 294.95274353027344
Test Episode 4 Reward: 974.9571533203125
Test Episode 5 Reward: 294.95274353027344
Test Episode 6 Reward: 515.4047088623047
Test Episode 7 Reward: 537.3904876708984
Test Episode 8 Reward: 693.4804534912109
Test Episode 9 Reward: 301.9579620361328
Test Episode 10 Reward: 374.98167419433594
Average Test Reward: 485.348652649


100%|██████████| 4000/4000 [04:26<00:00, 15.00it/s]


Epoch 61 Mean Reward: 165.42624842834473


100%|██████████| 4000/4000 [04:12<00:00, 15.84it/s]


Epoch 62 Mean Reward: 166.48757865142824


100%|██████████| 4000/4000 [04:05<00:00, 16.31it/s]


Epoch 63 Mean Reward: 166.33145136260987


100%|██████████| 4000/4000 [04:08<00:00, 16.07it/s]


Epoch 64 Mean Reward: 168.31967315292357


100%|██████████| 4000/4000 [03:50<00:00, 17.36it/s]


Epoch 65 Mean Reward: 172.8091115913391


100%|██████████| 4000/4000 [03:56<00:00, 16.90it/s]


Epoch 66 Mean Reward: 172.42236248779298


100%|██████████| 4000/4000 [04:04<00:00, 16.38it/s]


Epoch 67 Mean Reward: 173.81450440597536


100%|██████████| 4000/4000 [04:15<00:00, 15.64it/s]


Epoch 68 Mean Reward: 175.51306463623047


100%|██████████| 4000/4000 [04:17<00:00, 15.55it/s]


Epoch 69 Mean Reward: 176.59462302398683


100%|██████████| 4000/4000 [04:25<00:00, 15.08it/s]


Epoch 70 Mean Reward: 179.48847420883178
Epoch 70 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 70 test:
Test Episode 1 Reward: 510.80609130859375
Test Episode 2 Reward: 428.7430419921875
Test Episode 3 Reward: 428.7430419921875
Test Episode 4 Reward: 428.7430419921875
Test Episode 5 Reward: 428.7430419921875
Test Episode 6 Reward: 284.4189910888672
Test Episode 7 Reward: 428.7430419921875
Test Episode 8 Reward: 242.90948486328125
Test Episode 9 Reward: 334.2296142578125
Test Episode 10 Reward: 564.8265228271484
Average Test Reward: 408.090591431


100%|██████████| 4000/4000 [04:30<00:00, 14.77it/s]


Epoch 71 Mean Reward: 181.69944111251831


100%|██████████| 4000/4000 [04:33<00:00, 14.64it/s]


Epoch 72 Mean Reward: 183.97219063568116


100%|██████████| 4000/4000 [04:23<00:00, 15.16it/s]


Epoch 73 Mean Reward: 183.54749786376954


100%|██████████| 4000/4000 [04:31<00:00, 14.72it/s]


Epoch 74 Mean Reward: 183.63579685592651


100%|██████████| 4000/4000 [04:39<00:00, 14.33it/s]


Epoch 75 Mean Reward: 187.90072861099245


100%|██████████| 4000/4000 [04:39<00:00, 14.31it/s]


Epoch 76 Mean Reward: 190.31685130310058


100%|██████████| 4000/4000 [04:49<00:00, 13.81it/s]


Epoch 77 Mean Reward: 189.65962398147582


100%|██████████| 4000/4000 [04:57<00:00, 13.45it/s]


Epoch 78 Mean Reward: 190.82087535858153


100%|██████████| 4000/4000 [04:55<00:00, 13.52it/s]


Epoch 79 Mean Reward: 193.31218514633179


100%|██████████| 4000/4000 [04:53<00:00, 13.61it/s]


Epoch 80 Mean Reward: 192.7180360069275
Epoch 80 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 80 test:
Test Episode 1 Reward: 257.60589599609375
Test Episode 2 Reward: 277.4475402832031
Test Episode 3 Reward: 277.4475402832031
Test Episode 4 Reward: 111.24708557128906
Test Episode 5 Reward: 199.1931610107422
Test Episode 6 Reward: 277.4475402832031
Test Episode 7 Reward: 610.9441833496094
Test Episode 8 Reward: 277.4475402832031
Test Episode 9 Reward: 277.4475402832031
Test Episode 10 Reward: 277.4475402832031
Average Test Reward: 284.367556763


100%|██████████| 4000/4000 [04:47<00:00, 13.91it/s]


Epoch 81 Mean Reward: 196.29880209732056


100%|██████████| 4000/4000 [04:54<00:00, 13.58it/s]


Epoch 82 Mean Reward: 199.0619907989502


100%|██████████| 4000/4000 [04:47<00:00, 13.93it/s]


Epoch 83 Mean Reward: 198.53071001434327


100%|██████████| 4000/4000 [04:53<00:00, 13.62it/s]


Epoch 84 Mean Reward: 202.02558534622193


100%|██████████| 4000/4000 [04:49<00:00, 13.81it/s]


Epoch 85 Mean Reward: 202.79804793167114


100%|██████████| 4000/4000 [04:57<00:00, 13.45it/s]


Epoch 86 Mean Reward: 203.12727702713013


100%|██████████| 4000/4000 [04:55<00:00, 13.55it/s]


Epoch 87 Mean Reward: 206.2581333656311


100%|██████████| 4000/4000 [04:53<00:00, 13.64it/s]


Epoch 88 Mean Reward: 206.8240604133606


100%|██████████| 4000/4000 [05:04<00:00, 13.15it/s]


Epoch 89 Mean Reward: 206.9593797416687


100%|██████████| 4000/4000 [05:52<00:00, 11.36it/s]


Epoch 90 Mean Reward: 215.00862691116333
Epoch 90 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 90 test:
Test Episode 1 Reward: 269.4128723144531
Test Episode 2 Reward: 269.4128723144531
Test Episode 3 Reward: 472.83290100097656
Test Episode 4 Reward: 597.7119750976562
Test Episode 5 Reward: 285.1324462890625
Test Episode 6 Reward: 289.2076110839844
Test Episode 7 Reward: 218.61282348632812
Test Episode 8 Reward: 269.4128723144531
Test Episode 9 Reward: 289.1766815185547
Test Episode 10 Reward: 269.4128723144531
Average Test Reward: 323.032592773


100%|██████████| 4000/4000 [06:01<00:00, 11.07it/s]


Epoch 91 Mean Reward: 214.1088186378479


100%|██████████| 4000/4000 [05:41<00:00, 11.72it/s]


Epoch 92 Mean Reward: 214.7110295753479


100%|██████████| 4000/4000 [05:30<00:00, 12.09it/s]


Epoch 93 Mean Reward: 213.08982878112792


100%|██████████| 4000/4000 [05:52<00:00, 11.36it/s]


Epoch 94 Mean Reward: 218.75476781463624


100%|██████████| 4000/4000 [05:45<00:00, 11.57it/s]


Epoch 95 Mean Reward: 217.33760649108888


100%|██████████| 4000/4000 [05:34<00:00, 11.97it/s]


Epoch 96 Mean Reward: 219.7101915397644


100%|██████████| 4000/4000 [04:42<00:00, 14.17it/s]


Epoch 97 Mean Reward: 223.43283879852294


100%|██████████| 4000/4000 [04:33<00:00, 14.63it/s]


Epoch 98 Mean Reward: 224.01340418624878


100%|██████████| 4000/4000 [04:21<00:00, 15.28it/s]


Epoch 99 Mean Reward: 226.79300812911987


100%|██████████| 4000/4000 [04:12<00:00, 15.83it/s]


Epoch 100 Mean Reward: 226.20066255569458
Epoch 100 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 100 test:
Test Episode 1 Reward: 285.4504699707031
Test Episode 2 Reward: 285.4504699707031
Test Episode 3 Reward: 285.4504699707031
Test Episode 4 Reward: 180.01486206054688
Test Episode 5 Reward: 285.4504699707031
Test Episode 6 Reward: 285.4504699707031
Test Episode 7 Reward: 557.2093200683594
Test Episode 8 Reward: 262.2245178222656
Test Episode 9 Reward: 223.8101806640625
Test Episode 10 Reward: 285.4504699707031
Average Test Reward: 293.596170044


100%|██████████| 4000/4000 [04:28<00:00, 14.90it/s]


Epoch 101 Mean Reward: 234.1022548980713


100%|██████████| 4000/4000 [04:25<00:00, 15.09it/s]


Epoch 102 Mean Reward: 236.37710931015016


100%|██████████| 4000/4000 [04:14<00:00, 15.74it/s]


Epoch 103 Mean Reward: 237.6172262802124


100%|██████████| 4000/4000 [04:19<00:00, 15.41it/s]


Epoch 104 Mean Reward: 238.78235174942017


100%|██████████| 4000/4000 [04:07<00:00, 16.15it/s]


Epoch 105 Mean Reward: 237.60985239028932


100%|██████████| 4000/4000 [03:56<00:00, 16.91it/s]


Epoch 106 Mean Reward: 242.525998626709


100%|██████████| 4000/4000 [04:06<00:00, 16.20it/s]


Epoch 107 Mean Reward: 243.74155359649657


100%|██████████| 4000/4000 [04:02<00:00, 16.52it/s]


Epoch 108 Mean Reward: 247.57620064163208


100%|██████████| 4000/4000 [04:14<00:00, 15.69it/s]


Epoch 109 Mean Reward: 243.91584283447267


100%|██████████| 4000/4000 [03:56<00:00, 16.92it/s]


Epoch 110 Mean Reward: 246.95599893951416
Epoch 110 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 110 test:
Test Episode 1 Reward: 301.3356628417969
Test Episode 2 Reward: 270.10693359375
Test Episode 3 Reward: 270.10693359375
Test Episode 4 Reward: 270.10693359375
Test Episode 5 Reward: 286.1065673828125
Test Episode 6 Reward: 261.53526306152344
Test Episode 7 Reward: 973.6039886474609
Test Episode 8 Reward: 270.10693359375
Test Episode 9 Reward: 410.96580505371094
Test Episode 10 Reward: 325.51023864746094
Average Test Reward: 363.948526001


100%|██████████| 4000/4000 [04:03<00:00, 16.43it/s]


Epoch 111 Mean Reward: 259.4404361190796


100%|██████████| 4000/4000 [04:16<00:00, 15.61it/s]


Epoch 112 Mean Reward: 259.1875207595825


100%|██████████| 4000/4000 [04:16<00:00, 15.60it/s]


Epoch 113 Mean Reward: 261.2108068695068


100%|██████████| 4000/4000 [04:14<00:00, 15.69it/s]


Epoch 114 Mean Reward: 258.3662541503906


100%|██████████| 4000/4000 [04:07<00:00, 16.17it/s]


Epoch 115 Mean Reward: 265.43562325668336


100%|██████████| 4000/4000 [04:13<00:00, 15.76it/s]


Epoch 116 Mean Reward: 267.82926725387574


100%|██████████| 4000/4000 [03:59<00:00, 16.68it/s]


Epoch 117 Mean Reward: 264.5720516090393


100%|██████████| 4000/4000 [03:56<00:00, 16.92it/s]


Epoch 118 Mean Reward: 265.4151044387817


100%|██████████| 4000/4000 [04:09<00:00, 16.00it/s]


Epoch 119 Mean Reward: 271.88853927993773


100%|██████████| 4000/4000 [03:58<00:00, 16.78it/s]


Epoch 120 Mean Reward: 272.28604389190673
Epoch 120 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 120 test:
Test Episode 1 Reward: 271.63800048828125
Test Episode 2 Reward: 458.1286926269531
Test Episode 3 Reward: 414.6296691894531
Test Episode 4 Reward: 382.9407653808594
Test Episode 5 Reward: 414.6296691894531
Test Episode 6 Reward: 414.6296691894531
Test Episode 7 Reward: 308.9632873535156
Test Episode 8 Reward: 414.6296691894531
Test Episode 9 Reward: 290.2635955810547
Test Episode 10 Reward: 414.6296691894531
Average Test Reward: 378.508268738


100%|██████████| 4000/4000 [03:56<00:00, 16.93it/s]


Epoch 121 Mean Reward: 274.80478967285154


100%|██████████| 4000/4000 [03:59<00:00, 16.72it/s]


Epoch 122 Mean Reward: 281.82977279663083


100%|██████████| 4000/4000 [04:16<00:00, 15.61it/s]


Epoch 123 Mean Reward: 282.39421026992795


100%|██████████| 4000/4000 [04:15<00:00, 15.65it/s]


Epoch 124 Mean Reward: 286.843053768158


100%|██████████| 4000/4000 [04:25<00:00, 15.07it/s]


Epoch 125 Mean Reward: 282.444007850647


100%|██████████| 4000/4000 [04:13<00:00, 15.81it/s]


Epoch 126 Mean Reward: 287.7027420539856


100%|██████████| 4000/4000 [04:28<00:00, 14.91it/s]


Epoch 127 Mean Reward: 284.5917051773071


100%|██████████| 4000/4000 [04:09<00:00, 16.00it/s]


Epoch 128 Mean Reward: 292.7207597465515


100%|██████████| 4000/4000 [04:12<00:00, 15.87it/s]


Epoch 129 Mean Reward: 292.73213494873045


100%|██████████| 4000/4000 [04:33<00:00, 14.64it/s]


Epoch 130 Mean Reward: 296.0211950187683
Epoch 130 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 130 test:
Test Episode 1 Reward: 332.67950439453125
Test Episode 2 Reward: 446.35328674316406
Test Episode 3 Reward: 262.7099151611328
Test Episode 4 Reward: 544.7663879394531
Test Episode 5 Reward: 260.09214782714844
Test Episode 6 Reward: 260.09214782714844
Test Episode 7 Reward: 260.09214782714844
Test Episode 8 Reward: 554.3865661621094
Test Episode 9 Reward: 260.09214782714844
Test Episode 10 Reward: 287.56414794921875
Average Test Reward: 346.882839966


100%|██████████| 4000/4000 [04:18<00:00, 15.49it/s]


Epoch 131 Mean Reward: 300.6677103652954


100%|██████████| 4000/4000 [04:17<00:00, 15.53it/s]


Epoch 132 Mean Reward: 302.45128509521487


100%|██████████| 4000/4000 [04:29<00:00, 14.84it/s]


Epoch 133 Mean Reward: 307.1645518989563


100%|██████████| 4000/4000 [04:27<00:00, 14.97it/s]


Epoch 134 Mean Reward: 308.55006903076173


100%|██████████| 4000/4000 [04:17<00:00, 15.53it/s]


Epoch 135 Mean Reward: 313.87099045562746


100%|██████████| 4000/4000 [04:16<00:00, 15.60it/s]


Epoch 136 Mean Reward: 314.729307510376


100%|██████████| 4000/4000 [04:34<00:00, 14.58it/s]


Epoch 137 Mean Reward: 314.07102401351926


100%|██████████| 4000/4000 [04:32<00:00, 14.67it/s]


Epoch 138 Mean Reward: 321.94925247573855


100%|██████████| 4000/4000 [04:31<00:00, 14.76it/s]


Epoch 139 Mean Reward: 316.2787634391785


100%|██████████| 4000/4000 [04:33<00:00, 14.65it/s]


Epoch 140 Mean Reward: 325.6750234298706
Epoch 140 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 140 test:
Test Episode 1 Reward: 354.1299133300781
Test Episode 2 Reward: 354.1299133300781
Test Episode 3 Reward: 187.0941619873047
Test Episode 4 Reward: 354.1299133300781
Test Episode 5 Reward: 354.1299133300781
Test Episode 6 Reward: 354.1299133300781
Test Episode 7 Reward: 354.1299133300781
Test Episode 8 Reward: 401.9322509765625
Test Episode 9 Reward: 181.90589904785156
Test Episode 10 Reward: 422.9242706298828
Average Test Reward: 331.863606262


100%|██████████| 4000/4000 [04:11<00:00, 15.90it/s]


Epoch 141 Mean Reward: 307.5256817970276


100%|██████████| 4000/4000 [04:21<00:00, 15.28it/s]


Epoch 142 Mean Reward: 317.64130151367186


100%|██████████| 4000/4000 [04:47<00:00, 13.92it/s]


Epoch 143 Mean Reward: 312.35162882614134


100%|██████████| 4000/4000 [04:44<00:00, 14.08it/s]


Epoch 144 Mean Reward: 317.82687058258057


100%|██████████| 4000/4000 [04:20<00:00, 15.38it/s]


Epoch 145 Mean Reward: 317.0373086433411


100%|██████████| 4000/4000 [04:12<00:00, 15.87it/s]


Epoch 146 Mean Reward: 320.4730117225647


100%|██████████| 4000/4000 [04:29<00:00, 14.82it/s]


Epoch 147 Mean Reward: 325.8952921028137


100%|██████████| 4000/4000 [04:25<00:00, 15.09it/s]


Epoch 148 Mean Reward: 333.26720097351074


100%|██████████| 4000/4000 [04:15<00:00, 15.64it/s]


Epoch 149 Mean Reward: 332.9845788154602


100%|██████████| 4000/4000 [04:25<00:00, 15.08it/s]


Epoch 150 Mean Reward: 338.5743555297852
Epoch 150 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 150 test:
Test Episode 1 Reward: 204.5345001220703
Test Episode 2 Reward: 243.20802307128906
Test Episode 3 Reward: 375.4474182128906
Test Episode 4 Reward: 396.4209442138672
Test Episode 5 Reward: 484.1993103027344
Test Episode 6 Reward: 586.4548950195312
Test Episode 7 Reward: 413.80592346191406
Test Episode 8 Reward: 375.4474182128906
Test Episode 9 Reward: 277.0556335449219
Test Episode 10 Reward: 441.7074279785156
Average Test Reward: 379.828149414


100%|██████████| 4000/4000 [04:15<00:00, 15.64it/s]


Epoch 151 Mean Reward: 353.18999223327637


100%|██████████| 4000/4000 [04:17<00:00, 15.53it/s]


Epoch 152 Mean Reward: 352.48493274307253


100%|██████████| 4000/4000 [04:30<00:00, 14.77it/s]


Epoch 153 Mean Reward: 354.19313568878175


100%|██████████| 4000/4000 [04:22<00:00, 15.26it/s]


Epoch 154 Mean Reward: 357.1776263961792


100%|██████████| 4000/4000 [04:17<00:00, 15.55it/s]


Epoch 155 Mean Reward: 363.1110735969543


100%|██████████| 4000/4000 [04:25<00:00, 15.07it/s]


Epoch 156 Mean Reward: 365.6291100769043


100%|██████████| 4000/4000 [04:14<00:00, 15.69it/s]


Epoch 157 Mean Reward: 367.25894065093996


100%|██████████| 4000/4000 [04:15<00:00, 15.64it/s]


Epoch 158 Mean Reward: 372.2971561126709


100%|██████████| 4000/4000 [04:24<00:00, 15.12it/s]


Epoch 159 Mean Reward: 375.1867770957947


100%|██████████| 4000/4000 [04:24<00:00, 15.15it/s]


Epoch 160 Mean Reward: 376.30051553344725
Epoch 160 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 160 test:
Test Episode 1 Reward: 229.4343719482422
Test Episode 2 Reward: 134.20089721679688
Test Episode 3 Reward: 134.20089721679688
Test Episode 4 Reward: 158.81008911132812
Test Episode 5 Reward: 373.1354217529297
Test Episode 6 Reward: 238.17813110351562
Test Episode 7 Reward: 134.20089721679688
Test Episode 8 Reward: 134.20089721679688
Test Episode 9 Reward: 425.47032165527344
Test Episode 10 Reward: 393.10401916503906
Average Test Reward: 235.49359436


100%|██████████| 4000/4000 [04:40<00:00, 14.24it/s]


Epoch 161 Mean Reward: 408.7112506599426


100%|██████████| 4000/4000 [04:45<00:00, 13.99it/s]


Epoch 162 Mean Reward: 416.3682893562317


100%|██████████| 4000/4000 [04:50<00:00, 13.77it/s]


Epoch 163 Mean Reward: 417.9947291412353


100%|██████████| 4000/4000 [04:41<00:00, 14.23it/s]


Epoch 164 Mean Reward: 415.39729680252077


100%|██████████| 4000/4000 [04:50<00:00, 13.79it/s]


Epoch 165 Mean Reward: 434.4940998802185


100%|██████████| 4000/4000 [04:50<00:00, 13.75it/s]


Epoch 166 Mean Reward: 437.55276114654544


100%|██████████| 4000/4000 [05:04<00:00, 13.16it/s]


Epoch 167 Mean Reward: 442.06678796386717


100%|██████████| 4000/4000 [04:53<00:00, 13.62it/s]


Epoch 168 Mean Reward: 447.2917109603882


100%|██████████| 4000/4000 [04:42<00:00, 14.18it/s]


Epoch 169 Mean Reward: 452.8744013824463


100%|██████████| 4000/4000 [05:03<00:00, 13.20it/s]


Epoch 170 Mean Reward: 453.83132860565183
Epoch 170 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 170 test:
Test Episode 1 Reward: 342.3291015625
Test Episode 2 Reward: 289.14306640625
Test Episode 3 Reward: 194.29954528808594
Test Episode 4 Reward: 386.7600555419922
Test Episode 5 Reward: 206.5336456298828
Test Episode 6 Reward: 342.3291015625
Test Episode 7 Reward: 483.4337463378906
Test Episode 8 Reward: 193.93678283691406
Test Episode 9 Reward: 208.21070861816406
Test Episode 10 Reward: 342.3291015625
Average Test Reward: 298.930485535


100%|██████████| 4000/4000 [05:06<00:00, 13.06it/s]


Epoch 171 Mean Reward: 456.2386840744019


100%|██████████| 4000/4000 [05:21<00:00, 12.46it/s]


Epoch 172 Mean Reward: 452.837574306488


100%|██████████| 4000/4000 [05:05<00:00, 13.11it/s]


Epoch 173 Mean Reward: 455.88442151260375


100%|██████████| 4000/4000 [04:52<00:00, 13.69it/s]


Epoch 174 Mean Reward: 459.4341123199463


100%|██████████| 4000/4000 [05:00<00:00, 13.32it/s]


Epoch 175 Mean Reward: 469.0754037475586


100%|██████████| 4000/4000 [04:55<00:00, 13.56it/s]


Epoch 176 Mean Reward: 460.81252319335937


100%|██████████| 4000/4000 [05:33<00:00, 11.99it/s]


Epoch 177 Mean Reward: 471.1287050704956


100%|██████████| 4000/4000 [05:34<00:00, 11.96it/s]


Epoch 178 Mean Reward: 473.3209258766174


100%|██████████| 4000/4000 [05:05<00:00, 13.08it/s]


Epoch 179 Mean Reward: 475.9151225509644


100%|██████████| 4000/4000 [05:13<00:00, 12.75it/s]


Epoch 180 Mean Reward: 479.72685542678835
Epoch 180 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 180 test:
Test Episode 1 Reward: 526.3045806884766
Test Episode 2 Reward: 155.55596923828125
Test Episode 3 Reward: 292.4201202392578
Test Episode 4 Reward: 317.9814453125
Test Episode 5 Reward: 289.29644775390625
Test Episode 6 Reward: 119.51292419433594
Test Episode 7 Reward: 254.12400817871094
Test Episode 8 Reward: 292.4201202392578
Test Episode 9 Reward: 161.8614501953125
Test Episode 10 Reward: 292.4201202392578
Average Test Reward: 270.189718628


100%|██████████| 4000/4000 [05:17<00:00, 12.60it/s]


Epoch 181 Mean Reward: 539.463283039093


100%|██████████| 4000/4000 [04:54<00:00, 13.57it/s]


Epoch 182 Mean Reward: 549.4247138557434


100%|██████████| 4000/4000 [04:40<00:00, 14.24it/s]


Epoch 183 Mean Reward: 551.1349490432739


100%|██████████| 4000/4000 [05:58<00:00, 11.17it/s]


Epoch 184 Mean Reward: 557.2832827606201


100%|██████████| 4000/4000 [06:45<00:00,  9.86it/s]


Epoch 185 Mean Reward: 555.8414978179932


100%|██████████| 4000/4000 [07:06<00:00,  9.39it/s]


Epoch 186 Mean Reward: 544.4319603767395


100%|██████████| 4000/4000 [05:08<00:00, 12.97it/s]


Epoch 187 Mean Reward: 541.9849907188416


100%|██████████| 4000/4000 [04:49<00:00, 13.82it/s]


Epoch 188 Mean Reward: 542.8068380928039


100%|██████████| 4000/4000 [04:43<00:00, 14.09it/s]


Epoch 189 Mean Reward: 541.1985078201294


100%|██████████| 4000/4000 [04:37<00:00, 14.43it/s]


Epoch 190 Mean Reward: 547.7929331550598
Epoch 190 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 190 test:
Test Episode 1 Reward: 84.07478332519531
Test Episode 2 Reward: 84.02421569824219
Test Episode 3 Reward: 84.02421569824219
Test Episode 4 Reward: 84.02421569824219
Test Episode 5 Reward: 84.00192260742188
Test Episode 6 Reward: 358.1158905029297
Test Episode 7 Reward: 84.02421569824219
Test Episode 8 Reward: 84.02421569824219
Test Episode 9 Reward: 84.02421569824219
Test Episode 10 Reward: 84.02421569824219
Average Test Reward: 111.436210632


100%|██████████| 4000/4000 [09:13<00:00,  7.23it/s]


Epoch 191 Mean Reward: 545.5392677345276


100%|██████████| 4000/4000 [07:10<00:00,  9.30it/s]


Epoch 192 Mean Reward: 554.2566869049073


100%|██████████| 4000/4000 [05:01<00:00, 13.27it/s]


Epoch 193 Mean Reward: 545.9510677986145


100%|██████████| 4000/4000 [04:54<00:00, 13.60it/s]


Epoch 194 Mean Reward: 550.7100837402344


100%|██████████| 4000/4000 [07:29<00:00,  8.89it/s]


Epoch 195 Mean Reward: 549.3324090461731


100%|██████████| 4000/4000 [07:23<00:00,  9.02it/s]


Epoch 196 Mean Reward: 546.9373362350464


100%|██████████| 4000/4000 [07:29<00:00,  8.89it/s]


Epoch 197 Mean Reward: 554.103631980896


100%|██████████| 4000/4000 [07:03<00:00,  9.44it/s]


Epoch 198 Mean Reward: 530.021135761261


100%|██████████| 4000/4000 [06:20<00:00, 10.50it/s]


Epoch 199 Mean Reward: 535.5757898445129


100%|██████████| 4000/4000 [05:28<00:00, 12.17it/s]


Epoch 200 Mean Reward: 529.2551986923218
Epoch 200 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 200 test:
Test Episode 1 Reward: 184.3282470703125
Test Episode 2 Reward: 177.83558654785156
Test Episode 3 Reward: 403.83863830566406
Test Episode 4 Reward: 403.83863830566406
Test Episode 5 Reward: 271.7496337890625
Test Episode 6 Reward: 403.83863830566406
Test Episode 7 Reward: 456.1239318847656
Test Episode 8 Reward: 187.3578643798828
Test Episode 9 Reward: 403.83863830566406
Test Episode 10 Reward: 314.6588439941406
Average Test Reward: 320.740866089
[(485.34865264892579, 60), (438.69082794189455, 10), (414.35009002685547, 20), (408.09059143066406, 70), (379.82814941406252, 150), (378.50826873779295, 120), (363.94852600097659, 110), (346.88283996582032, 130), (337.63774414062499, 40), (331.86360626220704, 140), (325.84210968017578, 30), (323.0325927734375, 90), (320.74086608886716, 200), (298.93048553466798, 170), (293.59617004394534, 100), (284.3675567626953, 80), (284.3298

In [6]:
#Get a list of checkpoints saved during training

ckpt_state = tf.train.get_checkpoint_state('checkpoints')

#get_checkpoint_state returns None when no checkpoint state is available in the directory,
#so fail with a clear message instead of an AttributeError/IndexError further down

if ckpt_state is None or not ckpt_state.all_model_checkpoint_paths:
    raise FileNotFoundError('No saved checkpoints found in "checkpoints"; '
                            'run training with save_model = True before testing')

ckpts = ckpt_state.all_model_checkpoint_paths

#Test the trained model from a certain checkpoint by only choosing actions with a greedy strategy
#(ckpts[-1] is the last entry in the list of saved checkpoint paths)

test_reward = test_agent(DQN, num_episodes=20, training=False, load_model=True, model_dir=ckpts[-1])
print('Average Test Reward:', test_reward)


Loading model from checkpoints\deadly_corridor.ckpt-200
INFO:tensorflow:Restoring parameters from checkpoints\deadly_corridor.ckpt-200
Test Episode 1 Reward: 336.47242736816406
Test Episode 2 Reward: 336.47242736816406
Test Episode 3 Reward: 336.47242736816406
Test Episode 4 Reward: 187.93832397460938
Test Episode 5 Reward: 129.0177001953125
Test Episode 6 Reward: 336.47242736816406
Test Episode 7 Reward: 496.9255828857422
Test Episode 8 Reward: 120.54534912109375
Test Episode 9 Reward: 343.6752471923828
Test Episode 10 Reward: 177.03176879882812
Test Episode 11 Reward: 220.8328857421875
Test Episode 12 Reward: 336.47242736816406
Test Episode 13 Reward: 197.3880157470703
Test Episode 14 Reward: 326.22898864746094
Test Episode 15 Reward: 336.47242736816406
Test Episode 16 Reward: 251.2376251220703
Test Episode 17 Reward: 249.87205505371094
Test Episode 18 Reward: 467.2608642578125
Test Episode 19 Reward: 336.47242736816406
Test Episode 20 Reward: 336.47242736816406
Average Test Reward: 