In [1]:
import importlib.util
import scipy.misc
import time

import tensorflow as tf
import numpy as np

from tqdm import trange
from IPython.display import HTML

#Load the compiled vizdoom extension directly from its file path and expose it as "vd",
#since the package can't be installed normally on Windows

vd_location = 'C:/Anaconda3/envs/doom/Lib/site-packages/vizdoom/vizdoom.pyd'

#Note that "vizdoom" holds the module spec (not the module itself); "vd" is the usable module

vizdoom = importlib.util.spec_from_file_location(name='vizdoom',
                                                 location=vd_location)
vd = importlib.util.module_from_spec(vizdoom)

#Executing the spec's loader is what actually populates "vd" with the extension's contents

vizdoom.loader.exec_module(vd)


In [2]:
#Configure the game instance: screen format, depth buffer and resolution, then the scenario file

game = vd.DoomGame()
game.set_screen_format(vd.ScreenFormat.BGR24)
game.set_depth_buffer_enabled(True)
game.set_screen_resolution(vd.ScreenResolution.RES_160X120)
game.load_config('deadly_corridor.cfg')

#Derive the network input size: the screen is downsampled by a fixed ratio and the
#depth buffer contributes one channel on top of the screen's own channels

down_sample_ratio = 0.5
width, height = (int(dimension*down_sample_ratio)
                 for dimension in (game.get_screen_width(), game.get_screen_height()))
channels = 1 + game.get_screen_channels()

#Build one one-hot action vector per button available in the scenario

available_actions = game.get_available_buttons()
num_actions = len(available_actions)
actions = [list(one_hot) for one_hot in np.identity(num_actions)]

#Q-network learning parameters

frame_delay = 12            #Second argument passed to game.make_action during training
buffer_size = 50000         #Maximum number of experiences kept in the replay buffer
epochs = 200                #Number of training epochs
steps_per_epoch = 2000      #Iterations per epoch (each one plays a full episode)
learning_rate = 0.001       #Learning rate handed to each network's Adam optimizer
start_epsilon = 1.0         #Exploration probability at the start of the annealing schedule
end_epsilon = 0.1           #Exploration probability used in the final training phase
batch_size = 100            #Number of experiences sampled per training update

#Checkpoint handling

load_model = False          #Restore weights from model_dir instead of initializing them
save_model = True           #Write a checkpoint whenever the agent is periodically tested
model_dir = './checkpoints/deadly_corridor.ckpt'


In [3]:
#Create a buffer object that holds a set of training experiences (state, action, reward, next state, terminal flag tuples)

class Buffer():
    """Bounded first-in-first-out store of training experiences.

    Every experience is expected to be indexable as
    (state, action, reward, next_state, terminal). At most `size` experiences
    are retained; `length` mirrors the number currently stored.
    """

    def __init__(self, size=1000):
        self.size = size
        self.buffer = []
        self.length = 0

    def add_experience(self, experience):
        """Append `experience`, first dropping the oldest entries if it would not fit."""
        overflow = (self.length + 1) - self.size

        # overflow == 0 means the new entry exactly fills the buffer, so the
        # slice below is empty and nothing is dropped.
        if overflow >= 0:
            del self.buffer[0:overflow]

        self.buffer.append(experience)
        self.length = len(self.buffer)

    def sample_buffer(self, sample_size):
        """Return (s1, a, r, s2, terminal) arrays for randomly drawn experiences.

        Indices are drawn uniformly with replacement. The state arrays are
        concatenated along axis 0, so each stored state must already carry a
        leading batch axis. Terminal flags are returned as int32.
        """
        drawn = [self.buffer[idx]
                 for idx in np.random.randint(self.length, size=sample_size)]

        s1 = np.concatenate([entry[0] for entry in drawn], axis=0)
        a = np.array([entry[1] for entry in drawn])
        r = np.array([entry[2] for entry in drawn])
        s2 = np.concatenate([entry[3] for entry in drawn], axis=0)
        terminal = np.array([entry[4] for entry in drawn], dtype=np.int32)

        return s1, a, r, s2, terminal

#Downsample and normalize an image array representing the game state at a given time stamp

def preprocess(image, down_sample_ratio=1):
    """Resize, rescale and batch a single image array.

    Args:
        image: array holding one frame (assumed to contain 0-255 values).
        down_sample_ratio: resize factor handed to scipy.misc.imresize; the
            resize step is skipped entirely when this is 1.

    Returns:
        A float32 array equal to the (optionally resized) image divided by
        255, with a new leading axis of length 1.
    """
    # NOTE(review): scipy.misc.imresize is not available in newer SciPy
    # releases - confirm the pinned SciPy version before changing the ratio.
    resized = image if down_sample_ratio == 1 else scipy.misc.imresize(image, down_sample_ratio)
    scaled = resized.astype(np.float32) / 255.0

    return scaled[np.newaxis, ...]

#Test the agent using a currently training or previously trained model

def test_agent(model, num_episodes, load_model, training=True, session=None, model_dir=None):
    """Play `num_episodes` greedy episodes with `model` and return the mean total reward.

    Relies on the module-level `game`, `actions` and `down_sample_ratio`.

    Args:
        model: object exposing `choose_action(session, state)`.
        num_episodes: number of test episodes to play.
        load_model: when true, open a fresh session and restore weights from
            `model_dir`; otherwise `session` is used as-is.
        training: True when called from the training loop, in which case the
            game is assumed to be initialized already and is left running.
            When False the game is initialized here and closed afterwards.
        session: existing TensorFlow session (required when `load_model` is false).
        model_dir: checkpoint path to restore from when `load_model` is true.

    Returns:
        Mean of the total rewards of the played episodes.

    Raises:
        ValueError: if `load_model` is false and no `session` is supplied.
    """
    if load_model:
        sess = tf.Session()
    elif session is not None:
        sess = session
    else:
        # Fail early with a clear message instead of crashing inside choose_action.
        raise ValueError('test_agent needs an existing session when load_model is False')

    try:
        if load_model:
            print('Loading model from', model_dir)
            tf.train.Saver().restore(sess, model_dir)

        game.set_sound_enabled(True)
        episode_rewards = list()

        # Avoid reinitializing the game if this was already done by the training process
        if not training:
            game.init()

        try:
            for i in range(num_episodes):
                game.new_episode()

                while not game.is_episode_finished():
                    state = game.get_state()
                    buffer = np.concatenate((state.screen_buffer,
                                             np.expand_dims(state.depth_buffer,
                                                            axis=2)),
                                            axis=2)
                    state1 = preprocess(buffer, down_sample_ratio)
                    action = model.choose_action(sess, state1)[0]
                    game.make_action(actions[action])

                    # Add a delay between each time step so that the episodes occur at normal speed
                    time.sleep(0.02)

                episode_rewards.append(game.get_total_reward())
                print('Test Episode {} Reward: {}'.format(i + 1, game.get_total_reward()))
                time.sleep(1)
        finally:
            # Avoid ending the game when training, so that the training process can continue
            if not training:
                game.close()
    finally:
        # Only close the session this function opened; a caller-supplied one stays open.
        if load_model:
            sess.close()

    return np.mean(episode_rewards)


In [4]:
#Create a Q-network to estimate values and choose actions for a given state

class Q_network():
    """Convolutional Q-network that scores every action for a batch of states.

    The graph is: two valid-padded convolutions (32 filters 8x8 stride 4, then
    64 filters 4x4 stride 2), a 512-unit dense layer and a linear output layer
    with one unit per action. The module-level `num_actions` and `actions`
    size the target placeholder and the output layer respectively.

    Args:
        network_name: prefix used for every placeholder, layer and optimizer name.
        height, width, channels: dimensions of a single input state.
        learning_rate: learning rate of the Adam optimizer.
    """

    def __init__(self, network_name, height, width, channels, learning_rate=0.001):
        self.s_t = tf.placeholder(tf.float32,
                                  shape=[None, height, width, channels],
                                  name=network_name + '_state'
                                 )
        # Kept for interface compatibility; no method of this class feeds it.
        self.a_t = tf.placeholder(tf.int32,
                                  shape=[None],
                                  name=network_name + '_action'
                                 )
        self.Q_target = tf.placeholder(tf.float32,
                                       shape=[None, num_actions],
                                       name=network_name + '_Q_target'
                                      )

        self.input_layer = tf.reshape(self.s_t,
                                      [-1, height, width, channels],
                                      name=network_name + '_input_layer'
                                     )
        self.conv1 = tf.layers.conv2d(inputs=self.input_layer,
                                      filters=32,
                                      kernel_size=[8, 8],
                                      strides=[4, 4],
                                      padding='valid',
                                      activation=tf.nn.relu,
                                      name=network_name + '_conv1_layer'
                                     )
        self.conv2 = tf.layers.conv2d(inputs=self.conv1,
                                      filters=64,
                                      kernel_size=[4, 4],
                                      strides=[2, 2],
                                      padding='valid',
                                      activation=tf.nn.relu,
                                      name=network_name + '_conv2_layer'
                                     )

        # Derive the flattened feature count from conv2's static shape instead of
        # hard-coding 6*8*64, which is only correct for 60x80 inputs.
        conv_features = int(np.prod(self.conv2.get_shape().as_list()[1:]))
        self.flatten = tf.reshape(self.conv2,
                                  [-1, conv_features],
                                  name=network_name + '_flatten'
                                 )
        self.dense = tf.layers.dense(inputs=self.flatten,
                                     units=512,
                                     activation=tf.nn.relu,
                                     name=network_name + '_dense1_layer'
                                    )
        self.Q_values = tf.layers.dense(inputs=self.dense,
                                        units=len(actions),
                                        activation=None,
                                        name=network_name + '_output_layer'
                                       )

        self.best_action = tf.argmax(self.Q_values, 1)

        # Keyword arguments make the roles explicit; the squared error itself is symmetric.
        self.loss = tf.losses.mean_squared_error(labels=self.Q_target,
                                                 predictions=self.Q_values)
        self.adam = tf.train.AdamOptimizer(learning_rate=learning_rate,
                                           name=network_name + '_adam'
                                          )
        self.train = self.adam.minimize(self.loss)

    def calculate_loss(self, session, s, q):
        """Run one optimizer step on states `s` against targets `q` and return the loss.

        Despite the name this is not a pure evaluation: the train op is
        executed together with the loss.
        """
        L, _ = session.run([self.loss, self.train],
                           feed_dict={self.s_t: s,
                                      self.Q_target: q})

        return L

    def get_Q_values(self, session, s):
        """Return the array of Q-values (one row per state, one column per action) for `s`."""
        Q = session.run(self.Q_values,
                        feed_dict={self.s_t: s})

        return Q

    def choose_action(self, session, s):
        """Return the index of the highest-valued action for each state in `s`."""
        a = session.run(self.best_action,
                        feed_dict={self.s_t: s})

        return a
    
#Create a list of variable update operations

def update_graph(variables):
    """Build the assign ops that copy the second half of `variables` onto the first half.

    `variables` is split in the middle: the i-th variable of the first half is
    assigned the value of the i-th variable of the second half. With the
    target network created before the online network, this copies the online
    weights into the target network. If the count is odd, the last variable is
    left unpaired.

    Returns:
        A list with one assign op per variable of the first half.
    """
    half = len(variables)//2

    return [destination.assign(source.value())
            for destination, source in zip(variables[:half], variables[half:])]

#Update the target network parameters to match those of the online network

def update_target(ops, session):
    """Run every update op in `ops`, in order, within `session`.

    Args:
        ops: iterable of ops to execute (e.g. the list built by update_graph).
        session: object whose `run` method executes a single op.
    """
    # Iterate over the argument itself rather than the module-level `update_ops`.
    for op in ops:
        session.run(op)


In [5]:
#For each time step, collect the following data:
#The current game state
#The action that was taken
#The reward obtained from the chosen action
#The next game state (store the first game state if the previous action ends the episode)
#A variable indicating whether the episode is over yet


tf.reset_default_graph()

#Instantiate the target network before the online network so that it's updated correctly
#(update_graph copies the second half of the trainable variables onto the first half)

network_kwargs = dict(learning_rate=learning_rate,
                      height=height,
                      width=width,
                      channels=channels)
target_net = Q_network(network_name='target', **network_kwargs)
DQN = Q_network(network_name='online', **network_kwargs)

exp_buffer = Buffer(size=buffer_size)
session = tf.Session()
saver = tf.train.Saver()
weights = tf.trainable_variables()

update_ops = update_graph(weights)

#Start from freshly initialized weights unless a saved model was requested

if load_model == False:
    session.run(tf.global_variables_initializer())

elif load_model == True:
    print('Loading model from', model_dir)
    tf.train.Saver().restore(session, model_dir)

game.set_sound_enabled(False)
game.init()

#Running totals used by the training loop: discount factor, time steps seen, test results

gamma, t = 0, 0
epoch_rank = []

#Accumulate experiences in the buffer using an epsilon-greedy strategy with three training phases

def _observe_state():
    """Return the current screen and depth buffers stacked channel-wise and preprocessed."""
    state = game.get_state()
    frame = np.concatenate((state.screen_buffer,
                            np.expand_dims(state.depth_buffer, axis=2)),
                           axis=2)

    return preprocess(frame, down_sample_ratio)

#Close the game even if training is interrupted or fails part-way through

try:
    for epoch in range(epochs):
        epoch_rewards = list()

        #Increase the discount factor at each epoch until it reaches approximately 0.99

        gamma = 1-.9775*(1-gamma)

        #Phase 1: always explore. Phase 2: anneal epsilon linearly per epoch. Phase 3: fixed end_epsilon.
        #NOTE(review): phase 1 lasts until 0.3*epochs but the annealing formula is anchored at
        #0.2*epochs, so epsilon drops below start_epsilon abruptly when phase 2 begins - confirm intended.

        always_explore = epoch < 0.3*epochs

        if always_explore:
            epsilon = 1.0

        elif epoch < 0.9*epochs:
            epsilon = start_epsilon - (epoch + 1 - 0.2*epochs)*(start_epsilon-end_epsilon)/(0.7*epochs)

        else:
            epsilon = end_epsilon

        for step in trange(steps_per_epoch, leave=True):
            game.new_episode()

            while not game.is_episode_finished():
                state1 = _observe_state()

                #No random draw is made in phase 1, exactly as before the branches were merged

                if always_explore or np.random.uniform(0, 1) <= epsilon:
                    action = np.random.randint(num_actions)

                else:
                    action = DQN.choose_action(session, state1)[0]

                reward = game.make_action(actions[action], frame_delay)
                done = game.is_episode_finished()

                #Reuse the current state as a placeholder next state when the episode has ended;
                #it is masked out of the target by the terminal flag

                state2 = state1 if done else _observe_state()

                #Add the experience obtained from each time step to the buffer

                t += 1
                exp_buffer.add_experience((state1, action, reward, state2, done))

            #Sample a minibatch from the buffer if there are enough experiences in the buffer
            #NOTE(review): this runs once per episode, not once per time step - confirm intended

            if exp_buffer.length > batch_size:
                s1, a, r, s2, terminal = exp_buffer.sample_buffer(batch_size)

                #Get the target values from the target Q-network

                target_Q = np.max(target_net.get_Q_values(session, s2), axis=1)

                #Train the online Q-network: only the Q-value of the action actually taken
                #is replaced by its bootstrapped target, so the other actions add no error

                Q2 = DQN.get_Q_values(session, s1)
                Q2[np.arange(batch_size), a] = r + gamma*(1 - terminal)*target_Q
                DQN.calculate_loss(session, s1, Q2)

            epoch_rewards.append(game.get_total_reward())

        print('Epoch {} Mean Reward: {}'.format(epoch + 1, np.mean(epoch_rewards)))

        #Save the model, update the target network, and test the agent for 10 episodes every 10 epochs

        if (epoch + 1) % 10 == 0:
            if save_model:

                #Report the checkpoint path that was actually written (it carries the epoch suffix)

                checkpoint = saver.save(session, model_dir, global_step=epoch + 1)
                print('Epoch {} Model saved to {}'.format(epoch + 1, checkpoint))

            update_target(update_ops, session)

            print('Epoch {} test:'.format(epoch + 1))
            test_reward = test_agent(DQN, num_episodes=10,
                                     training=True,
                                     load_model=False,
                                     session=session,
                                     model_dir=model_dir)
            print('Average Test Reward:', test_reward)
            epoch_rank.append((test_reward, epoch + 1))

    #Print the tested epochs sorted by average test episode reward, best first

    print(sorted(epoch_rank, reverse=True))
    print('{} time steps experienced during training'.format(t))

finally:
    game.close()
    

100%|██████████| 2000/2000 [02:13<00:00, 14.97it/s]


Epoch 1 Mean Reward: 131.20637590026857


100%|██████████| 2000/2000 [02:26<00:00, 13.69it/s]


Epoch 2 Mean Reward: 133.057186378479


100%|██████████| 2000/2000 [02:30<00:00, 13.25it/s]


Epoch 3 Mean Reward: 131.8914068222046


100%|██████████| 2000/2000 [03:07<00:00, 10.66it/s]


Epoch 4 Mean Reward: 131.41910042572022


100%|██████████| 2000/2000 [03:11<00:00, 10.47it/s]


Epoch 5 Mean Reward: 132.31385022735597


100%|██████████| 2000/2000 [03:10<00:00, 10.48it/s]


Epoch 6 Mean Reward: 130.56549856567383


100%|██████████| 2000/2000 [03:20<00:00,  9.98it/s]


Epoch 7 Mean Reward: 131.11374017333983


100%|██████████| 2000/2000 [03:24<00:00,  9.76it/s]


Epoch 8 Mean Reward: 131.60551104736328


100%|██████████| 2000/2000 [03:23<00:00,  9.81it/s]


Epoch 9 Mean Reward: 131.69972844696045


100%|██████████| 2000/2000 [03:18<00:00, 10.07it/s]


Epoch 10 Mean Reward: 129.60476893615723
Epoch 10 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 10 test:
Test Episode 1 Reward: 264.51609802246094
Test Episode 2 Reward: 100.95640563964844
Test Episode 3 Reward: 464.0071716308594
Test Episode 4 Reward: 514.6394958496094
Test Episode 5 Reward: 260.4552307128906
Test Episode 6 Reward: 264.51609802246094
Test Episode 7 Reward: 264.51609802246094
Test Episode 8 Reward: 264.51609802246094
Test Episode 9 Reward: 264.51609802246094
Test Episode 10 Reward: 224.53262329101562
Average Test Reward: 288.717141724


100%|██████████| 2000/2000 [03:11<00:00, 10.45it/s]


Epoch 11 Mean Reward: 130.3230729598999


100%|██████████| 2000/2000 [03:17<00:00, 10.11it/s]


Epoch 12 Mean Reward: 133.49271157073974


100%|██████████| 2000/2000 [03:13<00:00, 10.36it/s]


Epoch 13 Mean Reward: 130.2524232788086


100%|██████████| 2000/2000 [03:14<00:00, 10.28it/s]


Epoch 14 Mean Reward: 131.63729677581787


100%|██████████| 2000/2000 [03:13<00:00, 10.33it/s]


Epoch 15 Mean Reward: 129.50482849884034


100%|██████████| 2000/2000 [03:14<00:00, 10.28it/s]


Epoch 16 Mean Reward: 131.23255855560302


100%|██████████| 2000/2000 [03:15<00:00, 10.23it/s]


Epoch 17 Mean Reward: 131.24252757263184


100%|██████████| 2000/2000 [03:14<00:00, 10.30it/s]


Epoch 18 Mean Reward: 130.60085340118408


100%|██████████| 2000/2000 [03:09<00:00, 10.57it/s]


Epoch 19 Mean Reward: 130.82890988159178


100%|██████████| 2000/2000 [03:05<00:00, 10.79it/s]


Epoch 20 Mean Reward: 131.32710369873047
Epoch 20 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 20 test:
Test Episode 1 Reward: 269.7161560058594
Test Episode 2 Reward: 269.7161560058594
Test Episode 3 Reward: 437.1587219238281
Test Episode 4 Reward: 269.7161560058594
Test Episode 5 Reward: 269.7161560058594
Test Episode 6 Reward: 280.31324768066406
Test Episode 7 Reward: 269.7161560058594
Test Episode 8 Reward: 655.4543151855469
Test Episode 9 Reward: 224.3813934326172
Test Episode 10 Reward: 284.05975341796875
Average Test Reward: 322.994821167


100%|██████████| 2000/2000 [03:03<00:00, 10.87it/s]


Epoch 21 Mean Reward: 130.20504064178468


100%|██████████| 2000/2000 [02:56<00:00, 11.36it/s]


Epoch 22 Mean Reward: 130.0461135635376


100%|██████████| 2000/2000 [02:45<00:00, 12.10it/s]


Epoch 23 Mean Reward: 133.7899767074585


100%|██████████| 2000/2000 [02:42<00:00, 12.29it/s]


Epoch 24 Mean Reward: 132.03242538452147


100%|██████████| 2000/2000 [02:27<00:00, 13.56it/s]


Epoch 25 Mean Reward: 131.33547695159913


100%|██████████| 2000/2000 [02:45<00:00, 12.06it/s]


Epoch 26 Mean Reward: 131.77674252319335


100%|██████████| 2000/2000 [02:43<00:00, 12.25it/s]


Epoch 27 Mean Reward: 130.76213765716554


100%|██████████| 2000/2000 [02:26<00:00, 13.69it/s]


Epoch 28 Mean Reward: 130.21892264556885


100%|██████████| 2000/2000 [02:19<00:00, 14.34it/s]


Epoch 29 Mean Reward: 133.64876728057862


100%|██████████| 2000/2000 [02:16<00:00, 14.69it/s]


Epoch 30 Mean Reward: 132.25570106506348
Epoch 30 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 30 test:
Test Episode 1 Reward: 411.390869140625
Test Episode 2 Reward: 270.09959411621094
Test Episode 3 Reward: 270.09959411621094
Test Episode 4 Reward: 567.1274719238281
Test Episode 5 Reward: 321.15367126464844
Test Episode 6 Reward: 270.09959411621094
Test Episode 7 Reward: 270.09959411621094
Test Episode 8 Reward: 251.95318603515625
Test Episode 9 Reward: 270.09959411621094
Test Episode 10 Reward: 230.5533905029297
Average Test Reward: 313.267655945


100%|██████████| 2000/2000 [02:13<00:00, 14.96it/s]


Epoch 31 Mean Reward: 131.4243786239624


100%|██████████| 2000/2000 [02:10<00:00, 15.32it/s]


Epoch 32 Mean Reward: 132.0598896408081


100%|██████████| 2000/2000 [02:06<00:00, 15.85it/s]


Epoch 33 Mean Reward: 129.87737755584718


100%|██████████| 2000/2000 [02:10<00:00, 15.33it/s]


Epoch 34 Mean Reward: 131.23563849639893


100%|██████████| 2000/2000 [02:01<00:00, 16.40it/s]


Epoch 35 Mean Reward: 131.16537063598633


100%|██████████| 2000/2000 [01:59<00:00, 16.67it/s]


Epoch 36 Mean Reward: 132.718643699646


100%|██████████| 2000/2000 [01:57<00:00, 17.00it/s]


Epoch 37 Mean Reward: 130.7267359085083


100%|██████████| 2000/2000 [01:56<00:00, 17.11it/s]


Epoch 38 Mean Reward: 132.75741297912597


100%|██████████| 2000/2000 [01:58<00:00, 16.94it/s]


Epoch 39 Mean Reward: 131.84676221466066


100%|██████████| 2000/2000 [02:00<00:00, 16.56it/s]


Epoch 40 Mean Reward: 130.63669516754152
Epoch 40 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 40 test:
Test Episode 1 Reward: 422.90850830078125
Test Episode 2 Reward: 291.3838195800781
Test Episode 3 Reward: 275.18701171875
Test Episode 4 Reward: 297.46827697753906
Test Episode 5 Reward: 275.18701171875
Test Episode 6 Reward: 275.18701171875
Test Episode 7 Reward: 470.4149932861328
Test Episode 8 Reward: 275.18701171875
Test Episode 9 Reward: 559.7353210449219
Test Episode 10 Reward: 565.1266326904297
Average Test Reward: 370.778559875


100%|██████████| 2000/2000 [02:13<00:00, 15.02it/s]


Epoch 41 Mean Reward: 131.70997634887695


100%|██████████| 2000/2000 [01:58<00:00, 16.91it/s]


Epoch 42 Mean Reward: 130.03732364654542


100%|██████████| 2000/2000 [01:55<00:00, 17.35it/s]


Epoch 43 Mean Reward: 130.1307624130249


100%|██████████| 2000/2000 [01:56<00:00, 17.18it/s]


Epoch 44 Mean Reward: 132.44481954193114


100%|██████████| 2000/2000 [01:55<00:00, 17.32it/s]


Epoch 45 Mean Reward: 130.8378092880249


100%|██████████| 2000/2000 [01:58<00:00, 16.82it/s]


Epoch 46 Mean Reward: 132.3126088027954


100%|██████████| 2000/2000 [01:55<00:00, 17.27it/s]


Epoch 47 Mean Reward: 132.09410712432862


100%|██████████| 2000/2000 [02:02<00:00, 16.31it/s]


Epoch 48 Mean Reward: 130.0795542449951


100%|██████████| 2000/2000 [01:57<00:00, 17.05it/s]


Epoch 49 Mean Reward: 131.2145549697876


100%|██████████| 2000/2000 [01:56<00:00, 17.11it/s]


Epoch 50 Mean Reward: 132.22678513336183
Epoch 50 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 50 test:
Test Episode 1 Reward: 192.49594116210938
Test Episode 2 Reward: 255.6304931640625
Test Episode 3 Reward: 280.0997619628906
Test Episode 4 Reward: 456.85589599609375
Test Episode 5 Reward: 294.95274353027344
Test Episode 6 Reward: 280.24900817871094
Test Episode 7 Reward: 271.63816833496094
Test Episode 8 Reward: 294.95274353027344
Test Episode 9 Reward: 294.95274353027344
Test Episode 10 Reward: 901.4931488037109
Average Test Reward: 352.332064819


100%|██████████| 2000/2000 [01:52<00:00, 17.81it/s]


Epoch 51 Mean Reward: 131.6921770401001


100%|██████████| 2000/2000 [01:56<00:00, 17.16it/s]


Epoch 52 Mean Reward: 131.6787165145874


100%|██████████| 2000/2000 [01:59<00:00, 16.69it/s]


Epoch 53 Mean Reward: 130.65775563049317


100%|██████████| 2000/2000 [01:58<00:00, 16.91it/s]


Epoch 54 Mean Reward: 130.01766913604737


100%|██████████| 2000/2000 [02:08<00:00, 15.58it/s]


Epoch 55 Mean Reward: 134.26259873199461


100%|██████████| 2000/2000 [01:52<00:00, 17.82it/s]


Epoch 56 Mean Reward: 129.5280708694458


100%|██████████| 2000/2000 [01:51<00:00, 17.91it/s]


Epoch 57 Mean Reward: 131.0260886077881


100%|██████████| 2000/2000 [01:53<00:00, 17.58it/s]


Epoch 58 Mean Reward: 130.05763639831542


100%|██████████| 2000/2000 [01:57<00:00, 17.00it/s]


Epoch 59 Mean Reward: 131.1292155532837


100%|██████████| 2000/2000 [01:54<00:00, 17.45it/s]


Epoch 60 Mean Reward: 132.25443745422362
Epoch 60 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 60 test:
Test Episode 1 Reward: 294.95274353027344
Test Episode 2 Reward: 294.95274353027344
Test Episode 3 Reward: 247.9599609375
Test Episode 4 Reward: 294.95274353027344
Test Episode 5 Reward: 294.95274353027344
Test Episode 6 Reward: 294.95274353027344
Test Episode 7 Reward: 286.61024475097656
Test Episode 8 Reward: 294.95274353027344
Test Episode 9 Reward: 294.95274353027344
Test Episode 10 Reward: 352.09478759765625
Average Test Reward: 295.1334198


100%|██████████| 2000/2000 [01:48<00:00, 18.43it/s]


Epoch 61 Mean Reward: 163.5645233001709


100%|██████████| 2000/2000 [01:53<00:00, 17.56it/s]


Epoch 62 Mean Reward: 165.59354425811767


100%|██████████| 2000/2000 [01:51<00:00, 18.01it/s]


Epoch 63 Mean Reward: 164.50876809692383


100%|██████████| 2000/2000 [01:59<00:00, 16.77it/s]


Epoch 64 Mean Reward: 167.42004314422607


100%|██████████| 2000/2000 [01:50<00:00, 18.12it/s]


Epoch 65 Mean Reward: 169.02717230987548


100%|██████████| 2000/2000 [01:52<00:00, 17.82it/s]


Epoch 66 Mean Reward: 172.75494039916993


100%|██████████| 2000/2000 [01:52<00:00, 17.76it/s]


Epoch 67 Mean Reward: 171.9838461151123


100%|██████████| 2000/2000 [01:56<00:00, 17.18it/s]


Epoch 68 Mean Reward: 172.30646137237548


100%|██████████| 2000/2000 [01:53<00:00, 17.55it/s]


Epoch 69 Mean Reward: 178.61751589202882


100%|██████████| 2000/2000 [01:59<00:00, 16.80it/s]


Epoch 70 Mean Reward: 177.84332455444337
Epoch 70 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 70 test:
Test Episode 1 Reward: 294.95274353027344
Test Episode 2 Reward: 275.9210662841797
Test Episode 3 Reward: 294.95274353027344
Test Episode 4 Reward: 294.95274353027344
Test Episode 5 Reward: 294.95274353027344
Test Episode 6 Reward: 294.95274353027344
Test Episode 7 Reward: 294.95274353027344
Test Episode 8 Reward: 315.0540771484375
Test Episode 9 Reward: 508.6257019042969
Test Episode 10 Reward: 294.95274353027344
Average Test Reward: 316.427005005


100%|██████████| 2000/2000 [02:03<00:00, 16.20it/s]


Epoch 71 Mean Reward: 179.52298892211914


100%|██████████| 2000/2000 [01:56<00:00, 17.23it/s]


Epoch 72 Mean Reward: 181.38222532653808


100%|██████████| 2000/2000 [01:52<00:00, 17.84it/s]


Epoch 73 Mean Reward: 185.42917849731447


100%|██████████| 2000/2000 [01:51<00:00, 17.98it/s]


Epoch 74 Mean Reward: 184.52146026611328


100%|██████████| 2000/2000 [01:50<00:00, 18.10it/s]


Epoch 75 Mean Reward: 186.7009514846802


100%|██████████| 2000/2000 [01:50<00:00, 18.11it/s]


Epoch 76 Mean Reward: 187.13776959228517


100%|██████████| 2000/2000 [01:50<00:00, 18.12it/s]


Epoch 77 Mean Reward: 190.43890279388427


100%|██████████| 2000/2000 [01:53<00:00, 17.67it/s]


Epoch 78 Mean Reward: 187.11586281585693


100%|██████████| 2000/2000 [01:59<00:00, 16.72it/s]


Epoch 79 Mean Reward: 191.30172992706298


100%|██████████| 2000/2000 [01:55<00:00, 17.25it/s]


Epoch 80 Mean Reward: 195.6367014389038
Epoch 80 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 80 test:
Test Episode 1 Reward: 565.8706665039062
Test Episode 2 Reward: 283.61073303222656
Test Episode 3 Reward: 283.61073303222656
Test Episode 4 Reward: 283.61073303222656
Test Episode 5 Reward: 536.4062194824219
Test Episode 6 Reward: 283.61073303222656
Test Episode 7 Reward: 241.91311645507812
Test Episode 8 Reward: 277.8139953613281
Test Episode 9 Reward: 283.61073303222656
Test Episode 10 Reward: 283.61073303222656
Average Test Reward: 332.3668396


100%|██████████| 2000/2000 [02:02<00:00, 16.34it/s]


Epoch 81 Mean Reward: 195.814486618042


100%|██████████| 2000/2000 [01:53<00:00, 17.65it/s]


Epoch 82 Mean Reward: 199.6089612197876


100%|██████████| 2000/2000 [01:50<00:00, 18.13it/s]


Epoch 83 Mean Reward: 201.7995360107422


100%|██████████| 2000/2000 [01:50<00:00, 18.03it/s]


Epoch 84 Mean Reward: 205.32664278411866


100%|██████████| 2000/2000 [01:55<00:00, 17.39it/s]


Epoch 85 Mean Reward: 207.56846866607665


100%|██████████| 2000/2000 [01:52<00:00, 17.77it/s]


Epoch 86 Mean Reward: 203.67402701568602


100%|██████████| 2000/2000 [02:02<00:00, 16.38it/s]


Epoch 87 Mean Reward: 208.0365939025879


100%|██████████| 2000/2000 [01:51<00:00, 17.90it/s]


Epoch 88 Mean Reward: 207.96239978790283


100%|██████████| 2000/2000 [01:57<00:00, 17.09it/s]


Epoch 89 Mean Reward: 211.1049502029419


100%|██████████| 2000/2000 [01:53<00:00, 17.67it/s]


Epoch 90 Mean Reward: 212.1195323562622
Epoch 90 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 90 test:
Test Episode 1 Reward: 283.2568817138672
Test Episode 2 Reward: 464.0641784667969
Test Episode 3 Reward: 283.2568817138672
Test Episode 4 Reward: 283.2568817138672
Test Episode 5 Reward: 283.2568817138672
Test Episode 6 Reward: 283.2568817138672
Test Episode 7 Reward: 680.38916015625
Test Episode 8 Reward: 245.0606689453125
Test Episode 9 Reward: 283.2568817138672
Test Episode 10 Reward: 283.2568817138672
Average Test Reward: 337.231217957


100%|██████████| 2000/2000 [01:51<00:00, 17.98it/s]


Epoch 91 Mean Reward: 219.76105111694335


100%|██████████| 2000/2000 [01:50<00:00, 18.15it/s]


Epoch 92 Mean Reward: 216.56636153411864


100%|██████████| 2000/2000 [01:59<00:00, 16.75it/s]


Epoch 93 Mean Reward: 221.024774848938


100%|██████████| 2000/2000 [02:01<00:00, 16.50it/s]


Epoch 94 Mean Reward: 220.78270793151856


100%|██████████| 2000/2000 [01:49<00:00, 18.33it/s]


Epoch 95 Mean Reward: 224.40804636383058


100%|██████████| 2000/2000 [01:45<00:00, 18.94it/s]


Epoch 96 Mean Reward: 222.77379957580567


100%|██████████| 2000/2000 [01:47<00:00, 18.69it/s]


Epoch 97 Mean Reward: 229.14928831481933


100%|██████████| 2000/2000 [01:48<00:00, 18.42it/s]


Epoch 98 Mean Reward: 225.41577755737305


100%|██████████| 2000/2000 [01:49<00:00, 18.21it/s]


Epoch 99 Mean Reward: 232.52456301116942


100%|██████████| 2000/2000 [01:52<00:00, 17.80it/s]


Epoch 100 Mean Reward: 232.36669234466552
Epoch 100 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 100 test:
Test Episode 1 Reward: 554.3011627197266
Test Episode 2 Reward: 408.5043182373047
Test Episode 3 Reward: 568.8609161376953
Test Episode 4 Reward: 408.5043182373047
Test Episode 5 Reward: 509.5507354736328
Test Episode 6 Reward: 279.9243621826172
Test Episode 7 Reward: 554.3011627197266
Test Episode 8 Reward: 408.5043182373047
Test Episode 9 Reward: 554.3011627197266
Test Episode 10 Reward: 449.739501953125
Average Test Reward: 469.649195862


100%|██████████| 2000/2000 [01:53<00:00, 17.60it/s]


Epoch 101 Mean Reward: 236.59270111846925


100%|██████████| 2000/2000 [01:57<00:00, 17.08it/s]


Epoch 102 Mean Reward: 238.63062355804442


100%|██████████| 2000/2000 [01:49<00:00, 18.28it/s]


Epoch 103 Mean Reward: 238.01099868011474


100%|██████████| 2000/2000 [01:51<00:00, 17.87it/s]


Epoch 104 Mean Reward: 241.18932048797606


100%|██████████| 2000/2000 [01:50<00:00, 18.09it/s]


Epoch 105 Mean Reward: 244.98950969696045


100%|██████████| 2000/2000 [01:52<00:00, 17.84it/s]


Epoch 106 Mean Reward: 242.49417646026612


100%|██████████| 2000/2000 [01:53<00:00, 17.59it/s]


Epoch 107 Mean Reward: 245.05974826049805


100%|██████████| 2000/2000 [01:50<00:00, 18.02it/s]


Epoch 108 Mean Reward: 248.84493265533447


100%|██████████| 2000/2000 [01:55<00:00, 17.37it/s]


Epoch 109 Mean Reward: 244.11198652648926


100%|██████████| 2000/2000 [02:07<00:00, 15.66it/s]


Epoch 110 Mean Reward: 253.526777053833
Epoch 110 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 110 test:
Test Episode 1 Reward: 649.2366333007812
Test Episode 2 Reward: 274.50537109375
Test Episode 3 Reward: 329.61016845703125
Test Episode 4 Reward: 274.50537109375
Test Episode 5 Reward: 547.9924774169922
Test Episode 6 Reward: 292.44642639160156
Test Episode 7 Reward: 327.7057189941406
Test Episode 8 Reward: 461.6768035888672
Test Episode 9 Reward: 274.50537109375
Test Episode 10 Reward: 262.58863830566406
Average Test Reward: 369.477297974


100%|██████████| 2000/2000 [01:58<00:00, 16.93it/s]


Epoch 111 Mean Reward: 252.00183375549315


100%|██████████| 2000/2000 [01:59<00:00, 16.72it/s]


Epoch 112 Mean Reward: 258.05440036773683


100%|██████████| 2000/2000 [02:01<00:00, 16.43it/s]


Epoch 113 Mean Reward: 259.9622224884033


100%|██████████| 2000/2000 [02:00<00:00, 16.63it/s]


Epoch 114 Mean Reward: 265.62524797821044


100%|██████████| 2000/2000 [01:59<00:00, 16.69it/s]


Epoch 115 Mean Reward: 269.71281997680666


100%|██████████| 2000/2000 [01:53<00:00, 17.61it/s]


Epoch 116 Mean Reward: 268.58051694488523


100%|██████████| 2000/2000 [02:15<00:00, 14.81it/s]


Epoch 117 Mean Reward: 271.4864945983887


100%|██████████| 2000/2000 [01:54<00:00, 17.46it/s]


Epoch 118 Mean Reward: 277.0050301361084


100%|██████████| 2000/2000 [02:02<00:00, 16.36it/s]


Epoch 119 Mean Reward: 278.390573928833


100%|██████████| 2000/2000 [01:52<00:00, 17.71it/s]


Epoch 120 Mean Reward: 278.4549535903931
Epoch 120 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 120 test:
Test Episode 1 Reward: 589.8568420410156
Test Episode 2 Reward: 433.3393096923828
Test Episode 3 Reward: 220.64112854003906
Test Episode 4 Reward: 433.3393096923828
Test Episode 5 Reward: 499.2216339111328
Test Episode 6 Reward: 279.7518615722656
Test Episode 7 Reward: 565.3341674804688
Test Episode 8 Reward: 433.3393096923828
Test Episode 9 Reward: 519.3215942382812
Test Episode 10 Reward: 433.3393096923828
Average Test Reward: 440.748446655


100%|██████████| 2000/2000 [01:54<00:00, 17.49it/s]


Epoch 121 Mean Reward: 272.1907244262695


100%|██████████| 2000/2000 [01:53<00:00, 17.63it/s]


Epoch 122 Mean Reward: 271.8118034286499


100%|██████████| 2000/2000 [01:51<00:00, 18.00it/s]


Epoch 123 Mean Reward: 272.43620394134524


100%|██████████| 2000/2000 [01:56<00:00, 17.23it/s]


Epoch 124 Mean Reward: 274.28623601531984


100%|██████████| 2000/2000 [02:08<00:00, 15.61it/s]


Epoch 125 Mean Reward: 280.51985722351077


100%|██████████| 2000/2000 [01:49<00:00, 18.25it/s]


Epoch 126 Mean Reward: 282.98553855895995


100%|██████████| 2000/2000 [01:48<00:00, 18.44it/s]


Epoch 127 Mean Reward: 279.73729192352295


100%|██████████| 2000/2000 [01:50<00:00, 18.10it/s]


Epoch 128 Mean Reward: 277.8429881515503


100%|██████████| 2000/2000 [01:52<00:00, 17.72it/s]


Epoch 129 Mean Reward: 286.8635755233765


100%|██████████| 2000/2000 [01:51<00:00, 17.94it/s]


Epoch 130 Mean Reward: 290.7985429840088
Epoch 130 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 130 test:
Test Episode 1 Reward: 566.2972106933594
Test Episode 2 Reward: 264.96099853515625
Test Episode 3 Reward: 389.8818817138672
Test Episode 4 Reward: 278.9537658691406
Test Episode 5 Reward: 278.9537658691406
Test Episode 6 Reward: 278.9537658691406
Test Episode 7 Reward: 278.9537658691406
Test Episode 8 Reward: 549.0405426025391
Test Episode 9 Reward: 278.9537658691406
Test Episode 10 Reward: 300.62586975097656
Average Test Reward: 346.557533264


100%|██████████| 2000/2000 [01:53<00:00, 17.56it/s]


Epoch 131 Mean Reward: 313.04249699401856


100%|██████████| 2000/2000 [02:02<00:00, 16.26it/s]


Epoch 132 Mean Reward: 312.1344484863281


100%|██████████| 2000/2000 [01:59<00:00, 16.72it/s]


Epoch 133 Mean Reward: 308.71012237548825


100%|██████████| 2000/2000 [01:50<00:00, 18.18it/s]


Epoch 134 Mean Reward: 321.5422250595093


100%|██████████| 2000/2000 [01:48<00:00, 18.51it/s]


Epoch 135 Mean Reward: 318.91040743255616


100%|██████████| 2000/2000 [01:49<00:00, 18.31it/s]


Epoch 136 Mean Reward: 326.61759368133545


100%|██████████| 2000/2000 [01:51<00:00, 18.00it/s]


Epoch 137 Mean Reward: 322.5748907623291


100%|██████████| 2000/2000 [01:50<00:00, 18.05it/s]


Epoch 138 Mean Reward: 320.32940660858156


100%|██████████| 2000/2000 [01:52<00:00, 17.72it/s]


Epoch 139 Mean Reward: 321.9869648513794


100%|██████████| 2000/2000 [01:57<00:00, 17.04it/s]


Epoch 140 Mean Reward: 328.44694805908205
Epoch 140 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 140 test:
Test Episode 1 Reward: 442.9654083251953
Test Episode 2 Reward: 442.9654083251953
Test Episode 3 Reward: 442.9654083251953
Test Episode 4 Reward: 227.25901794433594
Test Episode 5 Reward: 442.9654083251953
Test Episode 6 Reward: 442.9654083251953
Test Episode 7 Reward: 223.1187744140625
Test Episode 8 Reward: 442.9654083251953
Test Episode 9 Reward: 268.7892761230469
Test Episode 10 Reward: 442.9654083251953
Average Test Reward: 381.992492676


100%|██████████| 2000/2000 [01:50<00:00, 18.10it/s]


Epoch 141 Mean Reward: 340.2140690612793


100%|██████████| 2000/2000 [01:52<00:00, 17.85it/s]


Epoch 142 Mean Reward: 335.42985098266604


100%|██████████| 2000/2000 [01:51<00:00, 17.90it/s]


Epoch 143 Mean Reward: 343.08025065612793


100%|██████████| 2000/2000 [01:52<00:00, 17.72it/s]


Epoch 144 Mean Reward: 338.7953411712646


100%|██████████| 2000/2000 [01:54<00:00, 17.47it/s]


Epoch 145 Mean Reward: 347.06300775909426


100%|██████████| 2000/2000 [01:54<00:00, 17.40it/s]


Epoch 146 Mean Reward: 341.4664251556396


100%|██████████| 2000/2000 [02:07<00:00, 15.75it/s]


Epoch 147 Mean Reward: 344.67511011505127


100%|██████████| 2000/2000 [01:57<00:00, 17.09it/s]


Epoch 148 Mean Reward: 352.0231597595215


100%|██████████| 2000/2000 [01:50<00:00, 18.16it/s]


Epoch 149 Mean Reward: 345.6266469497681


100%|██████████| 2000/2000 [01:52<00:00, 17.81it/s]


Epoch 150 Mean Reward: 355.83319284057615
Epoch 150 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 150 test:
Test Episode 1 Reward: 267.1122741699219
Test Episode 2 Reward: 267.1122741699219
Test Episode 3 Reward: 267.1122741699219
Test Episode 4 Reward: 267.1122741699219
Test Episode 5 Reward: 267.1122741699219
Test Episode 6 Reward: 229.29595947265625
Test Episode 7 Reward: 196.74769592285156
Test Episode 8 Reward: 267.1122741699219
Test Episode 9 Reward: 267.1122741699219
Test Episode 10 Reward: 603.4388122558594
Average Test Reward: 289.926838684


100%|██████████| 2000/2000 [01:52<00:00, 17.82it/s]


Epoch 151 Mean Reward: 361.89661605072024


100%|██████████| 2000/2000 [01:52<00:00, 17.76it/s]


Epoch 152 Mean Reward: 364.56293074798583


100%|██████████| 2000/2000 [01:52<00:00, 17.72it/s]


Epoch 153 Mean Reward: 369.88603582000735


100%|██████████| 2000/2000 [01:49<00:00, 18.24it/s]


Epoch 154 Mean Reward: 369.60674907684324


100%|██████████| 2000/2000 [01:51<00:00, 17.97it/s]


Epoch 155 Mean Reward: 374.1327777709961


100%|██████████| 2000/2000 [02:11<00:00, 15.15it/s]


Epoch 156 Mean Reward: 383.00821771240237


100%|██████████| 2000/2000 [01:49<00:00, 18.21it/s]


Epoch 157 Mean Reward: 381.9009682006836


100%|██████████| 2000/2000 [01:50<00:00, 18.05it/s]


Epoch 158 Mean Reward: 390.7051066055298


100%|██████████| 2000/2000 [01:50<00:00, 18.15it/s]


Epoch 159 Mean Reward: 395.9182024230957


100%|██████████| 2000/2000 [01:52<00:00, 17.76it/s]


Epoch 160 Mean Reward: 402.7312011489868
Epoch 160 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 160 test:
Test Episode 1 Reward: 409.42735290527344
Test Episode 2 Reward: 409.42735290527344
Test Episode 3 Reward: 409.42735290527344
Test Episode 4 Reward: 409.42735290527344
Test Episode 5 Reward: 493.9055938720703
Test Episode 6 Reward: 468.6914825439453
Test Episode 7 Reward: 182.6830291748047
Test Episode 8 Reward: 409.42735290527344
Test Episode 9 Reward: 260.16900634765625
Test Episode 10 Reward: 409.42735290527344
Average Test Reward: 386.201322937


100%|██████████| 2000/2000 [01:49<00:00, 18.24it/s]


Epoch 161 Mean Reward: 406.9395208816528


100%|██████████| 2000/2000 [01:48<00:00, 18.46it/s]


Epoch 162 Mean Reward: 411.8677098617554


100%|██████████| 2000/2000 [01:48<00:00, 18.51it/s]


Epoch 163 Mean Reward: 394.21250789642335


100%|██████████| 2000/2000 [02:03<00:00, 16.19it/s]


Epoch 164 Mean Reward: 403.53686395263674


100%|██████████| 2000/2000 [01:48<00:00, 18.38it/s]


Epoch 165 Mean Reward: 401.45045484924316


100%|██████████| 2000/2000 [01:50<00:00, 18.11it/s]


Epoch 166 Mean Reward: 402.68201430511476


100%|██████████| 2000/2000 [01:53<00:00, 17.66it/s]


Epoch 167 Mean Reward: 413.8208252182007


100%|██████████| 2000/2000 [01:52<00:00, 17.84it/s]


Epoch 168 Mean Reward: 417.8688429336548


100%|██████████| 2000/2000 [01:52<00:00, 17.82it/s]


Epoch 169 Mean Reward: 425.13627980804443


100%|██████████| 2000/2000 [01:52<00:00, 17.84it/s]


Epoch 170 Mean Reward: 431.07256774139404
Epoch 170 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 170 test:
Test Episode 1 Reward: 280.17100524902344
Test Episode 2 Reward: 547.0067138671875
Test Episode 3 Reward: 280.17100524902344
Test Episode 4 Reward: 250.56629943847656
Test Episode 5 Reward: 280.17100524902344
Test Episode 6 Reward: 297.5193176269531
Test Episode 7 Reward: 280.17100524902344
Test Episode 8 Reward: 700.7275543212891
Test Episode 9 Reward: 280.17100524902344
Test Episode 10 Reward: 304.5230407714844
Average Test Reward: 350.119795227


100%|██████████| 2000/2000 [01:59<00:00, 16.70it/s]


Epoch 171 Mean Reward: 461.59445695495606


100%|██████████| 2000/2000 [02:05<00:00, 15.90it/s]


Epoch 172 Mean Reward: 465.9225468826294


100%|██████████| 2000/2000 [01:54<00:00, 17.46it/s]


Epoch 173 Mean Reward: 476.3599847946167


100%|██████████| 2000/2000 [01:53<00:00, 17.61it/s]


Epoch 174 Mean Reward: 482.754192276001


100%|██████████| 2000/2000 [01:56<00:00, 17.16it/s]


Epoch 175 Mean Reward: 487.606626335144


100%|██████████| 2000/2000 [01:56<00:00, 17.19it/s]


Epoch 176 Mean Reward: 491.13189595794677


100%|██████████| 2000/2000 [01:52<00:00, 17.80it/s]


Epoch 177 Mean Reward: 486.65547205352783


100%|██████████| 2000/2000 [02:09<00:00, 15.50it/s]


Epoch 178 Mean Reward: 491.3819272994995


100%|██████████| 2000/2000 [01:53<00:00, 17.67it/s]


Epoch 179 Mean Reward: 524.4509245147705


100%|██████████| 2000/2000 [02:01<00:00, 16.46it/s]


Epoch 180 Mean Reward: 522.7353393707275
Epoch 180 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 180 test:
Test Episode 1 Reward: 110.2789306640625
Test Episode 2 Reward: 229.00808715820312
Test Episode 3 Reward: 224.3655242919922
Test Episode 4 Reward: 224.3655242919922
Test Episode 5 Reward: 224.3655242919922
Test Episode 6 Reward: 202.8043670654297
Test Episode 7 Reward: 224.3655242919922
Test Episode 8 Reward: 224.3655242919922
Test Episode 9 Reward: 224.3655242919922
Test Episode 10 Reward: 224.3655242919922
Average Test Reward: 211.265005493


100%|██████████| 2000/2000 [01:55<00:00, 17.35it/s]


Epoch 181 Mean Reward: 526.3715040588379


100%|██████████| 2000/2000 [01:53<00:00, 17.65it/s]


Epoch 182 Mean Reward: 528.9832680892945


100%|██████████| 2000/2000 [01:54<00:00, 17.40it/s]


Epoch 183 Mean Reward: 533.1092561340332


100%|██████████| 2000/2000 [01:53<00:00, 17.69it/s]


Epoch 184 Mean Reward: 535.2428143081665


100%|██████████| 2000/2000 [01:51<00:00, 17.91it/s]


Epoch 185 Mean Reward: 532.3555188522339


100%|██████████| 2000/2000 [02:07<00:00, 15.74it/s]


Epoch 186 Mean Reward: 531.5433667449951


100%|██████████| 2000/2000 [01:50<00:00, 18.05it/s]


Epoch 187 Mean Reward: 529.0116111602783


100%|██████████| 2000/2000 [01:59<00:00, 16.68it/s]


Epoch 188 Mean Reward: 529.1577212524414


100%|██████████| 2000/2000 [01:51<00:00, 17.86it/s]


Epoch 189 Mean Reward: 533.4689818649292


100%|██████████| 2000/2000 [01:54<00:00, 17.54it/s]


Epoch 190 Mean Reward: 536.9188267822266
Epoch 190 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 190 test:
Test Episode 1 Reward: 373.61695861816406
Test Episode 2 Reward: 535.854736328125
Test Episode 3 Reward: 506.0320129394531
Test Episode 4 Reward: 393.14263916015625
Test Episode 5 Reward: 275.61334228515625
Test Episode 6 Reward: 373.61695861816406
Test Episode 7 Reward: 373.61695861816406
Test Episode 8 Reward: 373.61695861816406
Test Episode 9 Reward: 373.61695861816406
Test Episode 10 Reward: 241.77830505371094
Average Test Reward: 382.050582886


100%|██████████| 2000/2000 [01:54<00:00, 17.45it/s]


Epoch 191 Mean Reward: 507.3721851501465


100%|██████████| 2000/2000 [01:53<00:00, 17.57it/s]


Epoch 192 Mean Reward: 503.6634953765869


100%|██████████| 2000/2000 [01:53<00:00, 17.57it/s]


Epoch 193 Mean Reward: 493.4234599227905


100%|██████████| 2000/2000 [02:09<00:00, 15.47it/s]


Epoch 194 Mean Reward: 491.18895847320556


100%|██████████| 2000/2000 [01:57<00:00, 16.95it/s]


Epoch 195 Mean Reward: 496.6380216293335


100%|██████████| 2000/2000 [01:55<00:00, 17.35it/s]


Epoch 196 Mean Reward: 487.02569772338865


100%|██████████| 2000/2000 [01:55<00:00, 17.27it/s]


Epoch 197 Mean Reward: 486.6309999771118


100%|██████████| 2000/2000 [02:00<00:00, 16.55it/s]


Epoch 198 Mean Reward: 492.0435645675659


100%|██████████| 2000/2000 [01:57<00:00, 17.02it/s]


Epoch 199 Mean Reward: 491.4137775039673


100%|██████████| 2000/2000 [01:57<00:00, 16.96it/s]


Epoch 200 Mean Reward: 490.0376734542847
Epoch 200 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 200 test:
Test Episode 1 Reward: 139.0738525390625
Test Episode 2 Reward: 528.2173156738281
Test Episode 3 Reward: 256.66539001464844
Test Episode 4 Reward: 255.59927368164062
Test Episode 5 Reward: 256.66539001464844
Test Episode 6 Reward: 256.66539001464844
Test Episode 7 Reward: 256.66539001464844
Test Episode 8 Reward: 210.04180908203125
Test Episode 9 Reward: 256.66539001464844
Test Episode 10 Reward: 256.66539001464844
Average Test Reward: 267.292459106
[(469.64919586181639, 100), (440.74844665527343, 120), (386.2013229370117, 160), (382.05058288574219, 190), (381.99249267578125, 140), (370.77855987548827, 40), (369.4772979736328, 110), (352.33206481933593, 50), (350.11979522705076, 170), (346.55753326416016, 130), (337.23121795654299, 90), (332.36683959960936, 80), (322.99482116699221, 20), (316.4270050048828, 70), (313.26765594482424, 30), (295.1334197998047, 60), (289.926

In [7]:
#Get a list of checkpoints saved during training

ckpt_state = tf.train.get_checkpoint_state('checkpoints')

#get_checkpoint_state returns None when the directory contains no checkpoint state file,
#so fail with a clear message instead of an AttributeError/IndexError further down

if ckpt_state is None or not ckpt_state.all_model_checkpoint_paths:
    raise FileNotFoundError('No saved checkpoints found in the "checkpoints" directory; '
                            'run training with save_model = True first')

ckpts = ckpt_state.all_model_checkpoint_paths

#Test the trained model from a certain checkpoint by only choosing actions with a greedy strategy
#(ckpts[-1] is the last path recorded in the checkpoint state, i.e. the latest save; pick another
#index to evaluate an earlier checkpoint that is still on disk)

test_reward = test_agent(DQN, num_episodes=20, training=False, load_model=True, model_dir=ckpts[-1])
print('Average Test Reward:', test_reward)


Loading model from checkpoints\deadly_corridor.ckpt-200
INFO:tensorflow:Restoring parameters from checkpoints\deadly_corridor.ckpt-200
Test Episode 1 Reward: 474.33421325683594
Test Episode 2 Reward: 265.6115264892578
Test Episode 3 Reward: 332.4654846191406
Test Episode 4 Reward: 209.7943572998047
Test Episode 5 Reward: 332.4654846191406
Test Episode 6 Reward: 280.36224365234375
Test Episode 7 Reward: 221.31776428222656
Test Episode 8 Reward: 90.81770324707031
Test Episode 9 Reward: 332.4654846191406
Test Episode 10 Reward: 163.6245574951172
Test Episode 11 Reward: 332.4654846191406
Test Episode 12 Reward: 274.8756103515625
Test Episode 13 Reward: 332.4654846191406
Test Episode 14 Reward: 483.62811279296875
Test Episode 15 Reward: 332.4654846191406
Test Episode 16 Reward: 332.4654846191406
Test Episode 17 Reward: 164.77279663085938
Test Episode 18 Reward: 332.4654846191406
Test Episode 19 Reward: 302.7548828125
Test Episode 20 Reward: 205.616455078125
Average Test Reward: 289.86170501