In [1]:
import importlib.util
import scipy.misc
import time

import tensorflow as tf
import numpy as np

from tqdm import trange
from IPython.display import HTML

#Import the vizdoom package as "vd" since it can't be installed normally on Windows

vd_location = 'C:/Anaconda3/envs/doom/Lib/site-packages/vizdoom/vizdoom.pyd'
vizdoom = importlib.util.spec_from_file_location('vizdoom',
                                                 vd_location)
vd = importlib.util.module_from_spec(vizdoom)
vizdoom.loader.exec_module(vd)


In [2]:
#Specify the game scenario and the screen format/resolution

game = vd.DoomGame()
game.set_screen_format(vd.ScreenFormat.BGR24)
game.set_depth_buffer_enabled(True)
game.set_screen_resolution(vd.ScreenResolution.RES_160X120)
game.load_config('deadly_corridor.cfg')

down_sample_ratio = 0.5
width = int(game.get_screen_width()*down_sample_ratio)
height = int(game.get_screen_height()*down_sample_ratio)
channels = game.get_screen_channels() + 1

#Specify the available actions in the scenario

available_actions = game.get_available_buttons()
actions = [list(ohe) for ohe in list(np.identity(len(available_actions)))]
num_actions = len(available_actions)

#Specify the Q-network learning parameters

frame_delay = 12
buffer_size = 50000
epochs = 80
steps_per_epoch = 2000
discount_factor = 0.99
learning_rate = 0.001
start_epsilon = 1.0
end_epsilon = 0.1
batch_size = 100
load_model = False
save_model = True
model_dir = './checkpoints/deadly_corridor.ckpt'


In [3]:
#Create a buffer object that holds a set of training experiences (state-action-reward tuples)

class Buffer():
    def __init__(self, size=1000):
        self.buffer = list()
        self.length = len(self.buffer)
        self.size = size
        
#Add a new experience to the buffer (remove the oldest experience if the buffer is already full)
        
    def add_experience(self, experience):
        if self.length + 1 >= self.size:
            self.buffer[0:(self.length + 1) - self.size] = []
        
        self.buffer.append(experience)
        self.length = len(self.buffer)
            
#Return a batch of experience arrays randomly sampled from the buffer
            
    def sample_buffer(self, sample_size):
        sample = np.random.randint(self.length, size=sample_size)
        s1 = np.concatenate([self.buffer[idx][0] for idx in sample], axis=0)
        a = np.array([self.buffer[idx][1] for idx in sample])
        r = np.array([self.buffer[idx][2] for idx in sample])
        s2 = np.concatenate([self.buffer[idx][3] for idx in sample], axis=0)
        terminal = np.array([self.buffer[idx][4] for idx in sample], dtype=np.int32)
        
        return s1, a, r, s2, terminal

#Downsample and normalize an image array representing the game state at a given time stamp

def preprocess(image, down_sample_ratio=1):
    if down_sample_ratio != 1:
        image = scipy.misc.imresize(image, down_sample_ratio)
    image = image.astype(np.float32)
    image /= 255.0
    image = np.expand_dims(image, axis=0)
    
    return image

#Test the agent using a currently training or previously trained model

def test_agent(model, num_episodes, load_model, training=True, session=None, model_dir=None):
    if load_model == True:
        sess = tf.Session()
        print('Loading model from', model_dir)
        tf.train.Saver().restore(sess, model_dir)
        
#Require an existing session if a pretrained model isn't provided
        
    elif load_model == False:
        sess = session

    game.set_sound_enabled(True)
    episode_rewards = list()
    
#Avoid reinitializing the game if this was already done by the training process
    
    if training == False:
        game.init()

    for i in range(num_episodes):
        game.new_episode()
    
        while not game.is_episode_finished():
            state = game.get_state()
            buffer = np.concatenate((state.screen_buffer,
                                     np.expand_dims(state.depth_buffer,
                                                    axis=2)),
                                    axis=2)
            state1 = preprocess(buffer, down_sample_ratio)
            action = model.choose_action(sess, state1)[0]
            reward = game.make_action(actions[action])
            
#Add a delay between each time step so that the episodes occur at normal speed

            time.sleep(0.02)
        
        episode_rewards.append(game.get_total_reward())
        print('Test Episode {} Reward: {}'.format(i + 1, game.get_total_reward()))
        time.sleep(1)
    
#Avoid ending the game so that the training process can continue
    
    if training == False:
        game.close()
    
    return ('Average Test Reward:', np.mean(episode_rewards))


In [4]:
#Create a Q-network to estimate values and choose actions for a given state

class Q_network():
    def __init__(self, height, width, channels, learning_rate=0.001):
        self.s_t = tf.placeholder(tf.float32,
                                  shape=[None, height, width, channels],
                                  name='state')
        self.a_t = tf.placeholder(tf.int32,
                                  shape=[None],
                                  name='action')
        self.Q_target = tf.placeholder(tf.float32,
                                       shape=[None, num_actions],
                                       name='Q_target')

        self.input_layer = tf.reshape(self.s_t,
                                      [-1, height, width, channels],
                                      name='input_layer')
        self.conv1 = tf.layers.conv2d(inputs=self.input_layer,
                                      filters=32,
                                      kernel_size=[8, 8],
                                      strides=[4, 4],
                                      padding='valid',
                                      activation=tf.nn.relu,
                                      name='conv1_layer')
        self.conv2 = tf.layers.conv2d(inputs=self.conv1,
                                      filters=64,
                                      kernel_size=[4, 4],
                                      strides=[2, 2],
                                      padding='valid',
                                      activation=tf.nn.relu,
                                      name='conv2_layer')
        self.flatten = tf.reshape(self.conv2,
                                  [-1, 6*8*64],
                                  name='flatten')
        self.dense = tf.layers.dense(inputs=self.flatten,
                                      units=512,
                                      activation=tf.nn.relu,
                                      name='dense1_layer')
        self.Q_values = tf.layers.dense(inputs=self.dense,
                                        units=len(actions),
                                        activation=None,
                                        name='output_layer')        
    
        self.best_action = tf.argmax(self.Q_values, 1)
        self.loss = tf.losses.mean_squared_error(self.Q_values,
                                                 self.Q_target)
        self.adam = tf.train.AdamOptimizer(learning_rate=learning_rate,
                                           name='adam')
        self.train = self.adam.minimize(self.loss)

    def calculate_loss(self, session, s, q):
        L, _ = session.run([self.loss, self.train],
                           feed_dict={self.s_t: s,
                                      self.Q_target: q})
    
        return L

#Return the array of Q-values and the best action associated with a given state

    def get_Q_values(self, session, s):
        Q = session.run(self.Q_values,
                        feed_dict={self.s_t: s})

        return Q
    
    def choose_action(self, session, s):
        a = session.run(self.best_action,
                        feed_dict={self.s_t: s})
    
        return a


In [5]:
#For each time step, collect the following data:
#The current game state
#The action that was taken taken
#The reward obtained from the chosen action
#The next game state (store the first game state if the previous action ends the episode)
#A variable indicating whether the episode is over yet

tf.reset_default_graph()
DQN = Q_network(learning_rate=learning_rate,
                height=height,
                width=width,
                channels=channels)
exp_buffer = Buffer(size=buffer_size)
session = tf.Session()
saver = tf.train.Saver()

if load_model == True:
    print('Loading model from', model_dir)
    tf.train.Saver().restore(session, model_dir)
    
elif load_model == False:
    session.run(tf.global_variables_initializer())

game.set_sound_enabled(False)
game.init()
t = 0

#Accumulate experiences in the buffer using an epsilon-greedy strategy with three training phases

for epoch in range(epochs):
    epoch_rewards = list()
    
    for step in trange(steps_per_epoch, leave=True):
        experience = list()
        game.new_episode()
        
        while not game.is_episode_finished():
            state = game.get_state()
            state1 = preprocess(np.concatenate((state.screen_buffer,
                                                np.expand_dims(state.depth_buffer, axis=2)),
                                                axis=2),
                                               down_sample_ratio)
            
#Explore the environment by choosing random actions with 100% probability for the first phase of training

            if epoch < 0.3*epochs:
                action = np.random.randint(num_actions)
            
#Increase the probability of greedily choosing an action by a constant amount at each epoch in the second phase
            
            elif epoch < 0.9*epochs:
                epsilon = start_epsilon - (epoch + 1 - 0.2*epochs)*(start_epsilon-end_epsilon)/(0.7*epochs)
            
                if np.random.uniform(0, 1) <= epsilon:
                    action = np.random.randint(num_actions)
                
                else:
                    action = DQN.choose_action(session, state1)[0]

#Select a random action with 10% probability in the final phase of training
                
            else:
                if np.random.uniform(0, 1) <= end_epsilon:
                    action = np.random.randint(num_actions)
                    
                else:
                    action = DQN.choose_action(session, state1)[0]

            reward = game.make_action(actions[action], frame_delay)
            done = game.is_episode_finished()
            
            if done == False:
                state = game.get_state()
                state2 = preprocess(np.concatenate((state.screen_buffer,
                                                    np.expand_dims(state.depth_buffer, axis=2)),
                                                    axis=2),
                                                    down_sample_ratio)
        
            elif done == True:
                state2 = state1
        
#Add the experience obtained from each time step to the buffer

            t += 1
            exp_buffer.add_experience((state1, action, reward, state2, done))
        
#Sample a minibatch from the buffer if there are enough experiences in the buffer

        if exp_buffer.length > batch_size:
            s1, a, r, s2, terminal = exp_buffer.sample_buffer(batch_size)
            
#Train the Q-network by using the minibatch to update the action-value function Q
            
            Q2 = np.max(DQN.get_Q_values(session, s2), axis=1)
            target_Q = DQN.get_Q_values(session, s1)
            target_Q[np.arange(batch_size), a] = r + discount_factor*(1 - terminal)*Q2
            DQN.calculate_loss(session, s1, target_Q)
            
        epoch_rewards.append(game.get_total_reward())
        
    print('Epoch {} Mean Reward: {}'.format(epoch + 1, np.mean(epoch_rewards)))
    
#Test the agent's performance for 10 episodes and save the model every 10 epochs
    
    if (epoch + 1) % 10 == 0 and epoch > 0:
        print('Epoch {} test:'.format(epoch + 1))
        test_agent(DQN, num_episodes=10, training=True,
                   load_model=False, session=session, model_dir=model_dir)
        
        if save_model == True:
            print('Epoch {} Model saved to {}'.format(epoch + 1, model_dir))
            saver.save(session, model_dir, global_step=epoch + 1)
        
print('{} time steps experienced during training'.format(t))
game.close()
    

100%|██████████| 20/20 [00:02<00:00,  9.73it/s]


Epoch 1 Mean Reward: 120.39130401611328


100%|██████████| 20/20 [00:01<00:00, 14.15it/s]


Epoch 2 Mean Reward: 154.05533828735352


100%|██████████| 20/20 [00:02<00:00,  9.91it/s]


Epoch 3 Mean Reward: 131.62899780273438


100%|██████████| 20/20 [00:01<00:00, 13.07it/s]


Epoch 4 Mean Reward: 157.8271598815918


100%|██████████| 20/20 [00:01<00:00, 14.76it/s]


Epoch 5 Mean Reward: 119.66508865356445


100%|██████████| 20/20 [00:01<00:00, 13.66it/s]


Epoch 6 Mean Reward: 139.97867813110352


100%|██████████| 20/20 [00:01<00:00, 17.08it/s]


Epoch 7 Mean Reward: 122.11528778076172


100%|██████████| 20/20 [00:01<00:00, 13.63it/s]


Epoch 8 Mean Reward: 130.174365234375


100%|██████████| 20/20 [00:01<00:00, 15.42it/s]


Epoch 9 Mean Reward: 128.75669708251954


100%|██████████| 20/20 [00:01<00:00, 12.11it/s]


Epoch 10 Mean Reward: 142.89727706909179
Epoch 10 test:
Test Episode 1 Reward: 294.95274353027344
Test Episode 2 Reward: 631.3865814208984
Test Episode 3 Reward: 300.270751953125
Test Episode 4 Reward: 294.95274353027344
Test Episode 5 Reward: 294.95274353027344
Test Episode 6 Reward: 329.85675048828125
Test Episode 7 Reward: 313.27296447753906
Test Episode 8 Reward: 294.95274353027344
Test Episode 9 Reward: 514.5715789794922
Test Episode 10 Reward: 212.5015106201172
Epoch 10 Model saved to ./checkpoints/deadly_corridor.ckpt


100%|██████████| 20/20 [00:01<00:00, 14.94it/s]


Epoch 11 Mean Reward: 131.68692779541016


100%|██████████| 20/20 [00:01<00:00, 12.71it/s]


Epoch 12 Mean Reward: 135.68497161865236


100%|██████████| 20/20 [00:01<00:00, 16.59it/s]


Epoch 13 Mean Reward: 132.87233047485353


100%|██████████| 20/20 [00:01<00:00, 14.11it/s]


Epoch 14 Mean Reward: 121.21613998413086


100%|██████████| 20/20 [00:01<00:00, 11.73it/s]


Epoch 15 Mean Reward: 147.86905975341796


100%|██████████| 20/20 [00:01<00:00, 11.98it/s]


Epoch 16 Mean Reward: 128.43447189331056


100%|██████████| 20/20 [00:01<00:00, 14.21it/s]


Epoch 17 Mean Reward: 150.1641647338867


100%|██████████| 20/20 [00:01<00:00, 12.61it/s]


Epoch 18 Mean Reward: 140.14100723266603


100%|██████████| 20/20 [00:01<00:00, 15.99it/s]


Epoch 19 Mean Reward: 130.25592803955078


100%|██████████| 20/20 [00:01<00:00, 14.57it/s]


Epoch 20 Mean Reward: 145.81566314697267
Epoch 20 test:
Test Episode 1 Reward: 294.95274353027344
Test Episode 2 Reward: 294.95274353027344
Test Episode 3 Reward: 294.95274353027344
Test Episode 4 Reward: 263.36476135253906
Test Episode 5 Reward: 307.1249084472656
Test Episode 6 Reward: 294.95274353027344
Test Episode 7 Reward: 294.95274353027344
Test Episode 8 Reward: 294.95274353027344
Test Episode 9 Reward: 294.95274353027344
Test Episode 10 Reward: 294.95274353027344
Epoch 20 Model saved to ./checkpoints/deadly_corridor.ckpt


100%|██████████| 20/20 [00:01<00:00, 17.32it/s]


Epoch 21 Mean Reward: 119.53192520141602


100%|██████████| 20/20 [00:01<00:00, 17.59it/s]


Epoch 22 Mean Reward: 119.22627334594726


100%|██████████| 20/20 [00:01<00:00, 15.78it/s]


Epoch 23 Mean Reward: 121.72506256103516


100%|██████████| 20/20 [00:01<00:00, 17.06it/s]


Epoch 24 Mean Reward: 120.31939849853515


100%|██████████| 20/20 [00:01<00:00, 18.68it/s]


Epoch 25 Mean Reward: 158.79324111938476


100%|██████████| 20/20 [00:01<00:00, 15.62it/s]


Epoch 26 Mean Reward: 142.45037918090821


100%|██████████| 20/20 [00:01<00:00, 18.05it/s]


Epoch 27 Mean Reward: 181.48605346679688


100%|██████████| 20/20 [00:01<00:00, 14.53it/s]


Epoch 28 Mean Reward: 180.91883697509766


100%|██████████| 20/20 [00:01<00:00, 16.96it/s]


Epoch 29 Mean Reward: 133.78098754882814


100%|██████████| 20/20 [00:01<00:00, 10.64it/s]


Epoch 30 Mean Reward: 166.66860656738282
Epoch 30 test:
Test Episode 1 Reward: 294.95274353027344
Test Episode 2 Reward: 294.95274353027344
Test Episode 3 Reward: 302.1759796142578
Test Episode 4 Reward: 294.95274353027344
Test Episode 5 Reward: 257.14605712890625
Test Episode 6 Reward: 328.3160858154297
Test Episode 7 Reward: 294.95274353027344
Test Episode 8 Reward: 640.3885650634766
Test Episode 9 Reward: 578.2197875976562
Test Episode 10 Reward: 294.95274353027344
Epoch 30 Model saved to ./checkpoints/deadly_corridor.ckpt


100%|██████████| 20/20 [00:01<00:00, 16.58it/s]


Epoch 31 Mean Reward: 180.05858840942383


100%|██████████| 20/20 [00:01<00:00, 16.69it/s]


Epoch 32 Mean Reward: 178.94553833007814


100%|██████████| 20/20 [00:01<00:00, 19.32it/s]


Epoch 33 Mean Reward: 170.79694519042968


100%|██████████| 20/20 [00:01<00:00, 16.89it/s]


Epoch 34 Mean Reward: 234.27135467529297


100%|██████████| 20/20 [00:01<00:00, 16.64it/s]


Epoch 35 Mean Reward: 213.22036056518556


100%|██████████| 20/20 [00:01<00:00, 15.83it/s]


Epoch 36 Mean Reward: 191.69307479858398


100%|██████████| 20/20 [00:01<00:00, 16.31it/s]


Epoch 37 Mean Reward: 174.00246658325196


100%|██████████| 20/20 [00:01<00:00, 16.86it/s]


Epoch 38 Mean Reward: 184.84504165649415


100%|██████████| 20/20 [00:01<00:00, 19.24it/s]


Epoch 39 Mean Reward: 231.9142807006836


100%|██████████| 20/20 [00:01<00:00, 18.67it/s]


Epoch 40 Mean Reward: 209.11266326904297
Epoch 40 test:
Test Episode 1 Reward: 284.82667541503906
Test Episode 2 Reward: 284.82667541503906
Test Episode 3 Reward: 271.8874206542969
Test Episode 4 Reward: 284.82667541503906
Test Episode 5 Reward: 639.2828674316406
Test Episode 6 Reward: 284.82667541503906
Test Episode 7 Reward: 585.1678009033203
Test Episode 8 Reward: 284.82667541503906
Test Episode 9 Reward: 467.19029235839844
Test Episode 10 Reward: 318.2055358886719
Epoch 40 Model saved to ./checkpoints/deadly_corridor.ckpt


100%|██████████| 20/20 [00:01<00:00, 14.57it/s]


Epoch 41 Mean Reward: 267.28471755981445


100%|██████████| 20/20 [00:01<00:00, 16.12it/s]


Epoch 42 Mean Reward: 248.21056518554687


100%|██████████| 20/20 [00:01<00:00, 15.36it/s]


Epoch 43 Mean Reward: 217.63948974609374


100%|██████████| 20/20 [00:01<00:00, 16.75it/s]


Epoch 44 Mean Reward: 213.0647285461426


100%|██████████| 20/20 [00:01<00:00, 18.26it/s]


Epoch 45 Mean Reward: 267.9881958007812


100%|██████████| 20/20 [00:01<00:00, 13.11it/s]


Epoch 46 Mean Reward: 175.98081130981444


100%|██████████| 20/20 [00:00<00:00, 20.83it/s]


Epoch 47 Mean Reward: 219.65376663208008


100%|██████████| 20/20 [00:01<00:00, 15.36it/s]


Epoch 48 Mean Reward: 171.19925842285156


100%|██████████| 20/20 [00:01<00:00, 16.68it/s]


Epoch 49 Mean Reward: 295.0933242797852


100%|██████████| 20/20 [00:01<00:00, 15.98it/s]


Epoch 50 Mean Reward: 197.61941986083986
Epoch 50 test:
Test Episode 1 Reward: 294.95274353027344
Test Episode 2 Reward: 465.12831115722656
Test Episode 3 Reward: 294.95274353027344
Test Episode 4 Reward: 281.6623840332031
Test Episode 5 Reward: 294.95274353027344
Test Episode 6 Reward: 294.95274353027344
Test Episode 7 Reward: 294.95274353027344
Test Episode 8 Reward: 658.4134521484375
Test Episode 9 Reward: 294.95274353027344
Test Episode 10 Reward: 286.1171569824219
Epoch 50 Model saved to ./checkpoints/deadly_corridor.ckpt


100%|██████████| 20/20 [00:00<00:00, 20.93it/s]


Epoch 51 Mean Reward: 304.8505989074707


100%|██████████| 20/20 [00:01<00:00, 17.60it/s]


Epoch 52 Mean Reward: 214.39270935058593


100%|██████████| 20/20 [00:01<00:00, 18.02it/s]


Epoch 53 Mean Reward: 279.42830657958984


100%|██████████| 20/20 [00:01<00:00, 17.32it/s]


Epoch 54 Mean Reward: 191.01748962402343


100%|██████████| 20/20 [00:00<00:00, 20.92it/s]


Epoch 55 Mean Reward: 318.2202423095703


100%|██████████| 20/20 [00:01<00:00, 19.61it/s]


Epoch 56 Mean Reward: 177.66642684936522


100%|██████████| 20/20 [00:00<00:00, 20.26it/s]


Epoch 57 Mean Reward: 305.10694885253906


100%|██████████| 20/20 [00:01<00:00, 18.59it/s]


Epoch 58 Mean Reward: 199.37093811035157


100%|██████████| 20/20 [00:00<00:00, 21.35it/s]


Epoch 59 Mean Reward: 189.43438796997071


100%|██████████| 20/20 [00:01<00:00, 19.98it/s]


Epoch 60 Mean Reward: 300.1001663208008
Epoch 60 test:
Test Episode 1 Reward: 84.17875671386719
Test Episode 2 Reward: 84.02227783203125
Test Episode 3 Reward: 84.02159118652344
Test Episode 4 Reward: 84.17875671386719
Test Episode 5 Reward: 84.04368591308594
Test Episode 6 Reward: 84.06700134277344
Test Episode 7 Reward: 84.17875671386719
Test Episode 8 Reward: 84.17875671386719
Test Episode 9 Reward: 84.17875671386719
Test Episode 10 Reward: 84.02261352539062
Epoch 60 Model saved to ./checkpoints/deadly_corridor.ckpt


100%|██████████| 20/20 [00:01<00:00, 16.94it/s]


Epoch 61 Mean Reward: 246.79349365234376


100%|██████████| 20/20 [00:01<00:00, 19.07it/s]


Epoch 62 Mean Reward: 350.71970138549807


100%|██████████| 20/20 [00:01<00:00, 18.92it/s]


Epoch 63 Mean Reward: 168.43794479370118


100%|██████████| 20/20 [00:00<00:00, 21.74it/s]


Epoch 64 Mean Reward: 302.8137855529785


100%|██████████| 20/20 [00:01<00:00, 17.22it/s]


Epoch 65 Mean Reward: 181.43633728027345


100%|██████████| 20/20 [00:00<00:00, 20.20it/s]


Epoch 66 Mean Reward: 293.8214813232422


100%|██████████| 20/20 [00:01<00:00, 18.84it/s]


Epoch 67 Mean Reward: 148.88097076416017


100%|██████████| 20/20 [00:01<00:00, 16.72it/s]


Epoch 68 Mean Reward: 315.20680770874026


100%|██████████| 20/20 [00:01<00:00, 19.08it/s]


Epoch 69 Mean Reward: 201.23438568115233


100%|██████████| 20/20 [00:00<00:00, 22.73it/s]


Epoch 70 Mean Reward: 333.4384735107422
Epoch 70 test:
Test Episode 1 Reward: 294.95274353027344
Test Episode 2 Reward: 294.95274353027344
Test Episode 3 Reward: 294.95274353027344
Test Episode 4 Reward: 294.95274353027344
Test Episode 5 Reward: 337.1773681640625
Test Episode 6 Reward: 563.5897827148438
Test Episode 7 Reward: 312.7727966308594
Test Episode 8 Reward: 294.95274353027344
Test Episode 9 Reward: 757.5793914794922
Test Episode 10 Reward: 279.5070037841797
Epoch 70 Model saved to ./checkpoints/deadly_corridor.ckpt


100%|██████████| 20/20 [00:01<00:00, 18.45it/s]


Epoch 71 Mean Reward: 247.86219177246093


100%|██████████| 20/20 [00:00<00:00, 20.42it/s]


Epoch 72 Mean Reward: 387.7639678955078


100%|██████████| 20/20 [00:00<00:00, 22.09it/s]


Epoch 73 Mean Reward: 203.09404449462892


100%|██████████| 20/20 [00:00<00:00, 22.68it/s]


Epoch 74 Mean Reward: 378.9828620910645


100%|██████████| 20/20 [00:01<00:00, 17.24it/s]


Epoch 75 Mean Reward: 199.167391204834


100%|██████████| 20/20 [00:00<00:00, 25.23it/s]


Epoch 76 Mean Reward: 356.6153953552246


100%|██████████| 20/20 [00:00<00:00, 20.68it/s]


Epoch 77 Mean Reward: 152.5507843017578


100%|██████████| 20/20 [00:00<00:00, 23.84it/s]


Epoch 78 Mean Reward: 299.77059707641604


100%|██████████| 20/20 [00:00<00:00, 21.08it/s]


Epoch 79 Mean Reward: 263.74560165405273


100%|██████████| 20/20 [00:00<00:00, 21.50it/s]


Epoch 80 Mean Reward: 332.44045028686526
Epoch 80 test:
Test Episode 1 Reward: 294.95274353027344
Test Episode 2 Reward: 294.95274353027344
Test Episode 3 Reward: 550.5181121826172
Test Episode 4 Reward: 259.6361541748047
Test Episode 5 Reward: 294.95274353027344
Test Episode 6 Reward: 567.1110076904297
Test Episode 7 Reward: 294.95274353027344
Test Episode 8 Reward: 281.06443786621094
Test Episode 9 Reward: 294.95274353027344
Test Episode 10 Reward: 294.95274353027344
Epoch 80 Model saved to ./checkpoints/deadly_corridor.ckpt
12060 time steps experienced during training


In [6]:
#Test the fully trained model by only choosing actions with a greedy strategy

test_agent(DQN, num_episodes=20, training=False, load_model=True, model_dir=model_dir)


Loading model from ./checkpoints/deadly_corridor.ckpt
INFO:tensorflow:Restoring parameters from ./checkpoints/deadly_corridor.ckpt


NotFoundError: Unsuccessful TensorSliceReader constructor: Failed to find any matching files for ./checkpoints/deadly_corridor.ckpt
	 [[Node: save_1/RestoreV2_13 = RestoreV2[dtypes=[DT_FLOAT], _device="/job:localhost/replica:0/task:0/device:CPU:0"](_arg_save_1/Const_0_0, save_1/RestoreV2_13/tensor_names, save_1/RestoreV2_13/shape_and_slices)]]
	 [[Node: save_1/RestoreV2_5/_51 = _Recv[client_terminated=false, recv_device="/job:localhost/replica:0/task:0/device:GPU:0", send_device="/job:localhost/replica:0/task:0/device:CPU:0", send_device_incarnation=1, tensor_name="edge_106_save_1/RestoreV2_5", tensor_type=DT_FLOAT, _device="/job:localhost/replica:0/task:0/device:GPU:0"]()]]

Caused by op 'save_1/RestoreV2_13', defined at:
  File "C:\Anaconda3\envs\doom\lib\site-packages\spyder\utils\ipython\start_kernel.py", line 268, in <module>
    main()
  File "C:\Anaconda3\envs\doom\lib\site-packages\spyder\utils\ipython\start_kernel.py", line 264, in main
    kernel.start()
  File "C:\Anaconda3\envs\doom\lib\site-packages\ipykernel\kernelapp.py", line 478, in start
    self.io_loop.start()
  File "C:\Anaconda3\envs\doom\lib\site-packages\zmq\eventloop\ioloop.py", line 177, in start
    super(ZMQIOLoop, self).start()
  File "C:\Anaconda3\envs\doom\lib\site-packages\tornado\ioloop.py", line 888, in start
    handler_func(fd_obj, events)
  File "C:\Anaconda3\envs\doom\lib\site-packages\tornado\stack_context.py", line 277, in null_wrapper
    return fn(*args, **kwargs)
  File "C:\Anaconda3\envs\doom\lib\site-packages\zmq\eventloop\zmqstream.py", line 440, in _handle_events
    self._handle_recv()
  File "C:\Anaconda3\envs\doom\lib\site-packages\zmq\eventloop\zmqstream.py", line 472, in _handle_recv
    self._run_callback(callback, msg)
  File "C:\Anaconda3\envs\doom\lib\site-packages\zmq\eventloop\zmqstream.py", line 414, in _run_callback
    callback(*args, **kwargs)
  File "C:\Anaconda3\envs\doom\lib\site-packages\tornado\stack_context.py", line 277, in null_wrapper
    return fn(*args, **kwargs)
  File "C:\Anaconda3\envs\doom\lib\site-packages\ipykernel\kernelbase.py", line 281, in dispatcher
    return self.dispatch_shell(stream, msg)
  File "C:\Anaconda3\envs\doom\lib\site-packages\ipykernel\kernelbase.py", line 232, in dispatch_shell
    handler(stream, idents, msg)
  File "C:\Anaconda3\envs\doom\lib\site-packages\ipykernel\kernelbase.py", line 397, in execute_request
    user_expressions, allow_stdin)
  File "C:\Anaconda3\envs\doom\lib\site-packages\ipykernel\ipkernel.py", line 208, in do_execute
    res = shell.run_cell(code, store_history=store_history, silent=silent)
  File "C:\Anaconda3\envs\doom\lib\site-packages\ipykernel\zmqshell.py", line 533, in run_cell
    return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
  File "C:\Anaconda3\envs\doom\lib\site-packages\IPython\core\interactiveshell.py", line 2728, in run_cell
    interactivity=interactivity, compiler=compiler, result=result)
  File "C:\Anaconda3\envs\doom\lib\site-packages\IPython\core\interactiveshell.py", line 2856, in run_ast_nodes
    if self.run_code(code, result):
  File "C:\Anaconda3\envs\doom\lib\site-packages\IPython\core\interactiveshell.py", line 2910, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-6-0ad9011c8a3c>", line 3, in <module>
    test_agent(DQN, num_episodes=20, training=False, load_model=True, model_dir=model_dir)
  File "<ipython-input-3-5b4e6bd2e895>", line 47, in test_agent
    tf.train.Saver().restore(sess, model_dir)
  File "C:\Anaconda3\envs\doom\lib\site-packages\tensorflow\python\training\saver.py", line 1218, in __init__
    self.build()
  File "C:\Anaconda3\envs\doom\lib\site-packages\tensorflow\python\training\saver.py", line 1227, in build
    self._build(self._filename, build_save=True, build_restore=True)
  File "C:\Anaconda3\envs\doom\lib\site-packages\tensorflow\python\training\saver.py", line 1263, in _build
    build_save=build_save, build_restore=build_restore)
  File "C:\Anaconda3\envs\doom\lib\site-packages\tensorflow\python\training\saver.py", line 751, in _build_internal
    restore_sequentially, reshape)
  File "C:\Anaconda3\envs\doom\lib\site-packages\tensorflow\python\training\saver.py", line 427, in _AddRestoreOps
    tensors = self.restore_op(filename_tensor, saveable, preferred_shard)
  File "C:\Anaconda3\envs\doom\lib\site-packages\tensorflow\python\training\saver.py", line 267, in restore_op
    [spec.tensor.dtype])[0])
  File "C:\Anaconda3\envs\doom\lib\site-packages\tensorflow\python\ops\gen_io_ops.py", line 1020, in restore_v2
    shape_and_slices=shape_and_slices, dtypes=dtypes, name=name)
  File "C:\Anaconda3\envs\doom\lib\site-packages\tensorflow\python\framework\op_def_library.py", line 787, in _apply_op_helper
    op_def=op_def)
  File "C:\Anaconda3\envs\doom\lib\site-packages\tensorflow\python\framework\ops.py", line 2956, in create_op
    op_def=op_def)
  File "C:\Anaconda3\envs\doom\lib\site-packages\tensorflow\python\framework\ops.py", line 1470, in __init__
    self._traceback = self._graph._extract_stack()  # pylint: disable=protected-access

NotFoundError (see above for traceback): Unsuccessful TensorSliceReader constructor: Failed to find any matching files for ./checkpoints/deadly_corridor.ckpt
	 [[Node: save_1/RestoreV2_13 = RestoreV2[dtypes=[DT_FLOAT], _device="/job:localhost/replica:0/task:0/device:CPU:0"](_arg_save_1/Const_0_0, save_1/RestoreV2_13/tensor_names, save_1/RestoreV2_13/shape_and_slices)]]
	 [[Node: save_1/RestoreV2_5/_51 = _Recv[client_terminated=false, recv_device="/job:localhost/replica:0/task:0/device:GPU:0", send_device="/job:localhost/replica:0/task:0/device:CPU:0", send_device_incarnation=1, tensor_name="edge_106_save_1/RestoreV2_5", tensor_type=DT_FLOAT, _device="/job:localhost/replica:0/task:0/device:GPU:0"]()]]
