In [1]:
import importlib.util
import scipy.misc
import time

import tensorflow as tf
import numpy as np

from tqdm import trange
from IPython.display import HTML

#Import the vizdoom package as "vd" since it can't be installed normally on Windows.
#The compiled extension is loaded directly from its file path, so vd_location is
#machine-specific and has to be edited to match the local environment.

vd_location = 'C:/Anaconda3/envs/doom/Lib/site-packages/vizdoom/vizdoom.pyd'
#NOTE: despite its name, "vizdoom" holds the import spec; the usable module object is "vd"
vizdoom = importlib.util.spec_from_file_location('vizdoom',
                                                 vd_location)
vd = importlib.util.module_from_spec(vizdoom)
#Run the extension's initialisation code so that "vd" is populated
vizdoom.loader.exec_module(vd)


In [2]:
#Specify the game scenario and the screen format/resolution

game = vd.DoomGame()
game.set_screen_format(vd.ScreenFormat.BGR24)
#The depth buffer is appended to the screen buffer as an extra input channel later on
game.set_depth_buffer_enabled(True)
game.set_screen_resolution(vd.ScreenResolution.RES_160X120)
#NOTE(review): load_config runs after the setters above; if the .cfg file also sets the
#screen format/resolution, confirm which value takes effect
game.load_config('deadly_corridor.cfg')

#Size of the network input: the game frame scaled by down_sample_ratio, with one more
#channel than the screen buffer to make room for the depth buffer

down_sample_ratio = 0.5
width = int(game.get_screen_width()*down_sample_ratio)
height = int(game.get_screen_height()*down_sample_ratio)
channels = game.get_screen_channels() + 1

#Specify the available actions in the scenario
#Each action is a one-hot list (a row of the identity matrix) that presses a single button

available_actions = game.get_available_buttons()
actions = [list(ohe) for ohe in list(np.identity(len(available_actions)))]
num_actions = len(available_actions)

#Specify the Q-network learning parameters

#Second argument of game.make_action during training (presumably the number of tics
#each chosen action is held for - confirm against the ViZDoom API)
frame_delay = 12
#Maximum number of experiences kept in the replay buffer
buffer_size = 50000
epochs = 80
#Number of episodes played per epoch (the training loop starts a new episode at every step)
steps_per_epoch = 2000
discount_factor = 0.75
learning_rate = 0.001
#Exploration rate at the start and at the end of the epsilon-greedy schedule
start_epsilon = 1.0
end_epsilon = 0.1
#Number of experiences sampled from the buffer for each training update
batch_size = 100
#Restore the weights from model_dir instead of initialising them randomly
load_model = False
#Write a checkpoint every time the agent is evaluated during training
save_model = True
#Path prefix of the checkpoint files (the epoch number is appended when saving)
model_dir = './checkpoints/deadly_corridor.ckpt'


In [3]:
#Create a buffer object that holds a set of training experiences (state-action-reward tuples)

class Buffer():
    """Bounded FIFO store of training experiences.

    Each experience is expected to be an indexable record laid out as
    (state, action, reward, next_state, terminal). `buffer` holds the
    records oldest-first, `length` mirrors len(buffer) and `size` is the
    capacity used to decide when old records are evicted.
    """

    def __init__(self, size=1000):
        self.size = size
        self.buffer = []
        self.length = 0

    def add_experience(self, experience):
        """Append `experience`, first dropping the oldest records if the buffer is full."""
        overflow = self.length + 1 - self.size

        #A non-negative overflow means the capacity has been reached; trim from the front
        if overflow >= 0:
            del self.buffer[:overflow]

        self.buffer.append(experience)
        self.length = len(self.buffer)

    def sample_buffer(self, sample_size):
        """Draw `sample_size` experiences uniformly at random, with replacement.

        Returns the tuple (s1, a, r, s2, terminal): the states and next states
        are concatenated along axis 0, the actions and rewards are stacked into
        arrays and the terminal flags are converted to int32.
        """
        chosen = [self.buffer[idx]
                  for idx in np.random.randint(self.length, size=sample_size)]

        s1 = np.concatenate([exp[0] for exp in chosen], axis=0)
        a = np.array([exp[1] for exp in chosen])
        r = np.array([exp[2] for exp in chosen])
        s2 = np.concatenate([exp[3] for exp in chosen], axis=0)
        terminal = np.array([exp[4] for exp in chosen], dtype=np.int32)

        return s1, a, r, s2, terminal

#Downsample and normalize an image array representing the game state at a given time stamp

def preprocess(image, down_sample_ratio=1):
    """Turn a raw frame into a normalized, single-item batch for the network.

    The frame is resized with scipy.misc.imresize unless `down_sample_ratio`
    equals 1, converted to float32, divided by 255 and given a leading batch
    axis of length one. The input array itself is not modified.

    NOTE: scipy.misc.imresize only exists in SciPy releases before 1.3, so the
    resizing path needs an environment with an older SciPy.
    """
    frame = image if down_sample_ratio == 1 else scipy.misc.imresize(image, down_sample_ratio)
    scaled = frame.astype(np.float32) / 255.0

    return scaled[np.newaxis, ...]

#Test the agent using a currently training or previously trained model

def test_agent(model, num_episodes, load_model, training=True, session=None, model_dir=None):
    """Play `num_episodes` episodes with purely greedy actions and report the mean reward.

    Parameters:
        model: network exposing choose_action(session, state).
        num_episodes: number of episodes to play.
        load_model: when true, a new session is opened and the weights are
            restored from `model_dir`; the session is closed again on return.
        training: pass True when the caller has already initialised the game
            (it is then neither initialised nor closed here).
        session: existing session to evaluate with; required when
            `load_model` is false.
        model_dir: checkpoint path to restore from when `load_model` is true.

    Returns:
        The tuple ('Average Test Reward:', mean of the episodes' total rewards).

    Raises:
        ValueError: if `load_model` is false and no session was supplied.
    """
    owns_session = bool(load_model)
    sess = tf.Session() if owns_session else session

#Require an existing session if a pretrained model isn't provided

    if sess is None:
        raise ValueError('test_agent needs an existing session when load_model is False')

    episode_rewards = list()
    game_started = False

    try:
        if owns_session:
            print('Loading model from', model_dir)
            tf.train.Saver().restore(sess, model_dir)

        game.set_sound_enabled(True)

#Avoid reinitializing the game if this was already done by the training process

        if not training:
            game.init()
            game_started = True

        for i in range(num_episodes):
            game.new_episode()

            while not game.is_episode_finished():
                state = game.get_state()
                buffer = np.concatenate((state.screen_buffer,
                                         np.expand_dims(state.depth_buffer,
                                                        axis=2)),
                                        axis=2)
                state1 = preprocess(buffer, down_sample_ratio)
                action = model.choose_action(sess, state1)[0]
                game.make_action(actions[action])

#Add a delay between each time step so that the episodes occur at normal speed

                time.sleep(0.02)

            episode_rewards.append(game.get_total_reward())
            print('Test Episode {} Reward: {}'.format(i + 1, game.get_total_reward()))
            time.sleep(1)

    finally:

#Avoid ending the game so that the training process can continue

        if game_started:
            game.close()

#Release the session only if it was opened by this function

        if owns_session:
            sess.close()

    return ('Average Test Reward:', np.mean(episode_rewards))


In [4]:
#Create a Q-network to estimate values and choose actions for a given state

class Q_network():
    """Convolutional Q-network: two conv layers, one hidden dense layer and a linear output.

    The output layer has one unit per action (module-level `num_actions`).
    All graph nodes are prefixed with `network_name` so that several networks
    can live in the same graph.

    Parameters:
        network_name: prefix for the names of the placeholders and layers.
        height, width, channels: shape of a single preprocessed input frame.
        learning_rate: learning rate of the Adam optimizer.
    """

    def __init__(self, network_name, height, width, channels, learning_rate=0.001):
        self.s_t = tf.placeholder(tf.float32,
                                  shape=[None, height, width, channels],
                                  name=network_name + '_state'
                                 )
        #Not consumed by any op in this class; kept so the graph and attributes stay unchanged
        self.a_t = tf.placeholder(tf.int32,
                                  shape=[None],
                                  name=network_name + '_action'
                                 )
        self.Q_target = tf.placeholder(tf.float32,
                                       shape=[None, num_actions],
                                       name=network_name + '_Q_target'
                                      )

        self.input_layer = tf.reshape(self.s_t,
                                      [-1, height, width, channels],
                                      name=network_name + '_input_layer'
                                     )
        self.conv1 = tf.layers.conv2d(inputs=self.input_layer,
                                      filters=32,
                                      kernel_size=[8, 8],
                                      strides=[4, 4],
                                      padding='valid',
                                      activation=tf.nn.relu,
                                      name=network_name + '_conv1_layer'
                                     )
        self.conv2 = tf.layers.conv2d(inputs=self.conv1,
                                      filters=64,
                                      kernel_size=[4, 4],
                                      strides=[2, 2],
                                      padding='valid',
                                      activation=tf.nn.relu,
                                      name=network_name + '_conv2_layer'
                                     )

        #Derive the flattened size from the conv output instead of hard-coding it, so the
        #network also builds for input sizes other than 60x80 (which gives 6*8*64)
        conv_height, conv_width, conv_filters = self.conv2.get_shape().as_list()[1:]
        self.flatten = tf.reshape(self.conv2,
                                  [-1, conv_height*conv_width*conv_filters],
                                  name=network_name + '_flatten'
                                 )
        self.dense = tf.layers.dense(inputs=self.flatten,
                                     units=512,
                                     activation=tf.nn.relu,
                                     name=network_name + '_dense1_layer'
                                    )
        self.Q_values = tf.layers.dense(inputs=self.dense,
                                        units=num_actions,
                                        activation=None,
                                        name=network_name + '_output_layer'
                                       )

        self.best_action = tf.argmax(self.Q_values, 1)
        self.loss = tf.losses.mean_squared_error(labels=self.Q_target,
                                                 predictions=self.Q_values)
        self.adam = tf.train.AdamOptimizer(learning_rate=learning_rate,
                                           name=network_name + '_adam'
                                          )
        self.train = self.adam.minimize(self.loss)

#Run one optimization step on a batch of states and target Q-values and return the loss

    def calculate_loss(self, session, s, q):
        L, _ = session.run([self.loss, self.train],
                           feed_dict={self.s_t: s,
                                      self.Q_target: q})

        return L

#Return the array of Q-values associated with a batch of states

    def get_Q_values(self, session, s):
        Q = session.run(self.Q_values,
                        feed_dict={self.s_t: s})

        return Q

#Return the index of the highest-valued action for each state in a batch

    def choose_action(self, session, s):
        a = session.run(self.best_action,
                        feed_dict={self.s_t: s})

        return a
    
#Create a list of variable update operations

def update_graph(variables):
    """Build the assign ops that copy the second half of `variables` onto the first half.

    The list is split in the middle: element i of the first half is assigned
    the value of element i of the second half. With an odd number of
    variables the last one is left out.
    """
    half = len(variables) // 2

#Assign weight values from the network created first to the one created second

    return [destination.assign(source.value())
            for destination, source in zip(variables[:half], variables[half:])]

#Update the target network parameters to match those of the online network

def update_target(ops, session):
    """Run every update operation in `ops`, in order, within `session`.

    Parameters:
        ops: iterable of operations to execute (e.g. the assign ops that copy
            the online network's weights to the target network).
        session: object whose run(op) method executes a single operation.
    """
    #Iterate over the argument rather than a module-level list of operations
    for op in ops:
        session.run(op)


In [5]:
#For each time step, collect the following data:
#The current game state
#The action that was taken
#The reward obtained from the chosen action
#The next game state (store the first game state if the previous action ends the episode)
#A variable indicating whether the episode is over yet


#Start from an empty graph so that re-running this cell doesn't duplicate the networks

tf.reset_default_graph()

#Instantiate the target network before the online network so that it's updated correctly
#(update_graph copies the second half of the trainable variables onto the first half)

target_net = Q_network(network_name='target',
                       learning_rate=learning_rate,
                       height=height,
                       width=width,
                       channels=channels)
DQN = Q_network(network_name='online',
                learning_rate=learning_rate,
                height=height,
                width=width,
                channels=channels)

exp_buffer = Buffer(size=buffer_size)
session = tf.Session()
saver = tf.train.Saver()
weights = tf.trainable_variables()

update_ops = update_graph(weights)

#Either restore the variables from a checkpoint or initialize them; using a plain
#if/else guarantees that one of the two always happens

if load_model:
    print('Loading model from', model_dir)
    saver.restore(session, model_dir)

else:
    session.run(tf.global_variables_initializer())

game.set_sound_enabled(False)
game.init()

#t counts the time steps experienced; epoch_rank collects (mean reward, epoch) pairs

t = 0
epoch_rank = list()

#Accumulate experiences in the buffer using an epsilon-greedy strategy with three training phases
#Every step of an epoch plays one full episode and is followed by a single training update

for epoch in range(epochs):
    epoch_rewards = list()

    for step in trange(steps_per_epoch, leave=True):
        game.new_episode()

        while not game.is_episode_finished():
            state = game.get_state()
            state1 = preprocess(np.concatenate((state.screen_buffer,
                                                np.expand_dims(state.depth_buffer, axis=2)),
                                               axis=2),
                                down_sample_ratio)

#Explore the environment by choosing random actions with 100% probability for the first phase of training

            if epoch < 0.3*epochs:
                action = np.random.randint(num_actions)

#Increase the probability of greedily choosing an action by a constant amount at each epoch in the second phase
#NOTE(review): the ramp is anchored at 0.2*epochs although the first phase lasts until 0.3*epochs,
#so epsilon steps down from 1.0 when this phase begins - confirm that this is intended

            elif epoch < 0.9*epochs:
                epsilon = start_epsilon - (epoch + 1 - 0.2*epochs)*(start_epsilon-end_epsilon)/(0.7*epochs)

                if np.random.uniform(0, 1) <= epsilon:
                    action = np.random.randint(num_actions)

                else:
                    action = DQN.choose_action(session, state1)[0]

#Select a random action with 10% probability in the final phase of training

            else:
                if np.random.uniform(0, 1) <= end_epsilon:
                    action = np.random.randint(num_actions)

                else:
                    action = DQN.choose_action(session, state1)[0]

            reward = game.make_action(actions[action], frame_delay)
            done = game.is_episode_finished()

            if not done:
                state = game.get_state()
                state2 = preprocess(np.concatenate((state.screen_buffer,
                                                    np.expand_dims(state.depth_buffer, axis=2)),
                                                   axis=2),
                                    down_sample_ratio)

#No next state exists once the episode is over; reuse the current state as a placeholder
#(its value is masked out by the terminal flag when the targets are computed)

            else:
                state2 = state1

#Add the experience obtained from each time step to the buffer

            t += 1
            exp_buffer.add_experience((state1, action, reward, state2, done))

#Sample a minibatch from the buffer if there are enough experiences in the buffer

        if exp_buffer.length > batch_size:
            s1, a, r, s2, terminal = exp_buffer.sample_buffer(batch_size)

#Get the target values from the target Q-network

            target_Q = np.max(target_net.get_Q_values(session, s2), axis=1)

#Train the online Q-network by using a minibatch to update the action-value function
#Only the entries of the actions that were taken are replaced by their targets

            Q2 = DQN.get_Q_values(session, s1)
            Q2[np.arange(batch_size), a] = r + discount_factor*(1 - terminal)*target_Q
            DQN.calculate_loss(session, s1, Q2)

        epoch_rewards.append(game.get_total_reward())

    print('Epoch {} Mean Reward: {}'.format(epoch + 1, np.mean(epoch_rewards)))

#Save the model, update the target network, and test the agent for 10 episodes every 10 epochs

    if (epoch + 1) % 10 == 0:
        epoch_rank.append((np.mean(epoch_rewards), epoch + 1))

#Report the path that was actually written (the epoch number is appended to model_dir)

        if save_model:
            checkpoint = saver.save(session, model_dir, global_step=epoch + 1)
            print('Epoch {} Model saved to {}'.format(epoch + 1, checkpoint))

        update_target(update_ops, session)

        print('Epoch {} test:'.format(epoch + 1))
        print(test_agent(DQN, num_episodes=10,
                         training=True,
                         load_model=False,
                         session=session,
                         model_dir=model_dir))

#Print the recorded epochs sorted by their mean training episode reward, best first

print(sorted(epoch_rank, reverse=True))
print('{} time steps experienced during training'.format(t))
game.close()
    

100%|██████████| 2000/2000 [03:37<00:00,  9.20it/s]


Epoch 1 Mean Reward: 130.22627645111083


100%|██████████| 2000/2000 [02:52<00:00, 11.58it/s]


Epoch 2 Mean Reward: 134.51816317749024


100%|██████████| 2000/2000 [03:11<00:00, 10.47it/s]


Epoch 3 Mean Reward: 130.79105252075195


100%|██████████| 2000/2000 [03:06<00:00, 10.72it/s]


Epoch 4 Mean Reward: 130.91410857391358


100%|██████████| 2000/2000 [02:58<00:00, 11.19it/s]


Epoch 5 Mean Reward: 130.57677927398683


100%|██████████| 2000/2000 [03:24<00:00,  9.80it/s]


Epoch 6 Mean Reward: 132.42309548950195


100%|██████████| 2000/2000 [02:56<00:00, 11.34it/s]


Epoch 7 Mean Reward: 131.9567905883789


100%|██████████| 2000/2000 [02:56<00:00, 11.33it/s]


Epoch 8 Mean Reward: 131.45893687438965


100%|██████████| 2000/2000 [02:48<00:00, 11.84it/s]


Epoch 9 Mean Reward: 132.1183303833008


100%|██████████| 2000/2000 [02:46<00:00, 12.03it/s]


Epoch 10 Mean Reward: 133.49377787780762
Epoch 10 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 10 test:
Test Episode 1 Reward: 226.78720092773438
Test Episode 2 Reward: 541.5596160888672
Test Episode 3 Reward: 344.442138671875
Test Episode 4 Reward: 344.442138671875
Test Episode 5 Reward: 344.442138671875
Test Episode 6 Reward: 344.442138671875
Test Episode 7 Reward: 344.442138671875
Test Episode 8 Reward: 344.442138671875
Test Episode 9 Reward: 344.442138671875
Test Episode 10 Reward: 344.442138671875
('Average Test Reward:', 352.38839263916014)


100%|██████████| 2000/2000 [03:13<00:00, 10.33it/s]


Epoch 11 Mean Reward: 130.14234243011475


100%|██████████| 2000/2000 [03:16<00:00, 10.17it/s]


Epoch 12 Mean Reward: 132.95560752105712


100%|██████████| 2000/2000 [02:52<00:00, 11.58it/s]


Epoch 13 Mean Reward: 131.0480877685547


100%|██████████| 2000/2000 [02:53<00:00, 11.54it/s]


Epoch 14 Mean Reward: 128.0188042755127


100%|██████████| 2000/2000 [02:55<00:00, 11.41it/s]


Epoch 15 Mean Reward: 130.37146461486816


100%|██████████| 2000/2000 [02:48<00:00, 11.85it/s]


Epoch 16 Mean Reward: 131.2083526763916


100%|██████████| 2000/2000 [02:47<00:00, 11.97it/s]


Epoch 17 Mean Reward: 132.5362410812378


100%|██████████| 2000/2000 [02:50<00:00, 11.73it/s]


Epoch 18 Mean Reward: 130.49721478271485


100%|██████████| 2000/2000 [02:49<00:00, 11.80it/s]


Epoch 19 Mean Reward: 131.60849320983885


100%|██████████| 2000/2000 [02:50<00:00, 11.74it/s]


Epoch 20 Mean Reward: 129.30863298797607
Epoch 20 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 20 test:
Test Episode 1 Reward: 279.0060729980469
Test Episode 2 Reward: 510.0224151611328
Test Episode 3 Reward: 279.0060729980469
Test Episode 4 Reward: 315.7782897949219
Test Episode 5 Reward: 279.0060729980469
Test Episode 6 Reward: 563.895263671875
Test Episode 7 Reward: 98.09750366210938
Test Episode 8 Reward: 619.8581237792969
Test Episode 9 Reward: 279.0060729980469
Test Episode 10 Reward: 279.0060729980469
('Average Test Reward:', 350.26819610595703)


100%|██████████| 2000/2000 [02:47<00:00, 11.91it/s]


Epoch 21 Mean Reward: 131.9026269683838


100%|██████████| 2000/2000 [02:46<00:00, 12.02it/s]


Epoch 22 Mean Reward: 129.11614985656738


100%|██████████| 2000/2000 [02:44<00:00, 12.17it/s]


Epoch 23 Mean Reward: 131.10017154693602


100%|██████████| 2000/2000 [02:44<00:00, 12.13it/s]


Epoch 24 Mean Reward: 132.2241854171753


100%|██████████| 2000/2000 [03:24<00:00,  9.76it/s]


Epoch 25 Mean Reward: 164.55961542510985


100%|██████████| 2000/2000 [03:13<00:00, 10.33it/s]


Epoch 26 Mean Reward: 165.9850161819458


100%|██████████| 2000/2000 [03:14<00:00, 10.26it/s]


Epoch 27 Mean Reward: 172.69815673065185


100%|██████████| 2000/2000 [03:23<00:00,  9.82it/s]


Epoch 28 Mean Reward: 179.77751403808594


100%|██████████| 2000/2000 [03:20<00:00,  9.96it/s]


Epoch 29 Mean Reward: 181.11722457885742


100%|██████████| 2000/2000 [03:23<00:00,  9.84it/s]


Epoch 30 Mean Reward: 187.4103791732788
Epoch 30 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 30 test:
Test Episode 1 Reward: 294.95274353027344
Test Episode 2 Reward: 294.95274353027344
Test Episode 3 Reward: 628.3979644775391
Test Episode 4 Reward: 294.95274353027344
Test Episode 5 Reward: 453.07073974609375
Test Episode 6 Reward: 436.69554138183594
Test Episode 7 Reward: 294.95274353027344
Test Episode 8 Reward: 294.95274353027344
Test Episode 9 Reward: 646.2454071044922
Test Episode 10 Reward: 316.8426818847656
('Average Test Reward:', 395.60160522460939)


100%|██████████| 2000/2000 [03:27<00:00,  9.65it/s]


Epoch 31 Mean Reward: 193.75594888305665


100%|██████████| 2000/2000 [03:22<00:00,  9.89it/s]


Epoch 32 Mean Reward: 195.87126916503905


100%|██████████| 2000/2000 [03:24<00:00,  9.76it/s]


Epoch 33 Mean Reward: 199.13965762329101


100%|██████████| 2000/2000 [03:24<00:00,  9.77it/s]


Epoch 34 Mean Reward: 203.63886737060548


100%|██████████| 2000/2000 [03:26<00:00,  9.67it/s]


Epoch 35 Mean Reward: 212.4102200012207


100%|██████████| 2000/2000 [03:24<00:00,  9.78it/s]


Epoch 36 Mean Reward: 211.66010665893555


100%|██████████| 2000/2000 [03:31<00:00,  9.47it/s]


Epoch 37 Mean Reward: 220.39235485076904


100%|██████████| 2000/2000 [03:32<00:00,  9.42it/s]


Epoch 38 Mean Reward: 222.24457292175293


100%|██████████| 2000/2000 [03:28<00:00,  9.57it/s]


Epoch 39 Mean Reward: 225.80115264892578


100%|██████████| 2000/2000 [03:31<00:00,  9.47it/s]


Epoch 40 Mean Reward: 235.4856062850952
Epoch 40 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 40 test:
Test Episode 1 Reward: 288.68162536621094
Test Episode 2 Reward: 523.4003448486328
Test Episode 3 Reward: 288.68162536621094
Test Episode 4 Reward: 572.4085998535156
Test Episode 5 Reward: 244.4740753173828
Test Episode 6 Reward: 317.6009826660156
Test Episode 7 Reward: 505.3304901123047
Test Episode 8 Reward: 350.8273620605469
Test Episode 9 Reward: 288.68162536621094
Test Episode 10 Reward: 259.5741424560547
('Average Test Reward:', 363.96608734130859)


100%|██████████| 2000/2000 [03:31<00:00,  9.43it/s]


Epoch 41 Mean Reward: 237.11937521362304


100%|██████████| 2000/2000 [03:27<00:00,  9.62it/s]


Epoch 42 Mean Reward: 241.99600649261475


100%|██████████| 2000/2000 [03:27<00:00,  9.64it/s]


Epoch 43 Mean Reward: 239.86270484161378


100%|██████████| 2000/2000 [03:33<00:00,  9.37it/s]


Epoch 44 Mean Reward: 253.58983532714845


100%|██████████| 2000/2000 [03:37<00:00,  9.19it/s]


Epoch 45 Mean Reward: 250.91648341369628


100%|██████████| 2000/2000 [03:37<00:00,  9.20it/s]


Epoch 46 Mean Reward: 259.4569700241089


100%|██████████| 2000/2000 [03:38<00:00,  9.16it/s]


Epoch 47 Mean Reward: 262.80324201965334


100%|██████████| 2000/2000 [03:37<00:00,  9.21it/s]


Epoch 48 Mean Reward: 265.3439067230225


100%|██████████| 2000/2000 [03:39<00:00,  9.12it/s]


Epoch 49 Mean Reward: 271.24021926116944


100%|██████████| 2000/2000 [03:41<00:00,  9.03it/s]


Epoch 50 Mean Reward: 282.3534098129272
Epoch 50 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 50 test:
Test Episode 1 Reward: 288.6736145019531
Test Episode 2 Reward: 260.38990783691406
Test Episode 3 Reward: 288.6736145019531
Test Episode 4 Reward: 316.6221618652344
Test Episode 5 Reward: 288.6736145019531
Test Episode 6 Reward: 288.6736145019531
Test Episode 7 Reward: 281.7825622558594
Test Episode 8 Reward: 1041.0503540039062
Test Episode 9 Reward: 327.6304473876953
Test Episode 10 Reward: 588.8087158203125
('Average Test Reward:', 397.09786071777341)


100%|██████████| 2000/2000 [03:42<00:00,  9.01it/s]


Epoch 51 Mean Reward: 294.8150086288452


100%|██████████| 2000/2000 [03:42<00:00,  8.98it/s]


Epoch 52 Mean Reward: 292.51391342163083


100%|██████████| 2000/2000 [03:40<00:00,  9.07it/s]


Epoch 53 Mean Reward: 308.40388306427


100%|██████████| 2000/2000 [03:33<00:00,  9.35it/s]


Epoch 54 Mean Reward: 313.07802807617185


100%|██████████| 2000/2000 [03:31<00:00,  9.48it/s]


Epoch 55 Mean Reward: 317.4867276687622


100%|██████████| 2000/2000 [03:37<00:00,  9.18it/s]


Epoch 56 Mean Reward: 324.30731047058106


100%|██████████| 2000/2000 [03:35<00:00,  9.28it/s]


Epoch 57 Mean Reward: 331.31592361450197


100%|██████████| 2000/2000 [03:38<00:00,  9.17it/s]


Epoch 58 Mean Reward: 338.96638208770753


100%|██████████| 2000/2000 [03:43<00:00,  8.95it/s]


Epoch 59 Mean Reward: 351.65807095336913


100%|██████████| 2000/2000 [03:43<00:00,  8.94it/s]


Epoch 60 Mean Reward: 361.01266443634034
Epoch 60 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 60 test:
Test Episode 1 Reward: 288.68162536621094
Test Episode 2 Reward: 288.68162536621094
Test Episode 3 Reward: 554.5430603027344
Test Episode 4 Reward: 288.68162536621094
Test Episode 5 Reward: 288.68162536621094
Test Episode 6 Reward: 259.55804443359375
Test Episode 7 Reward: 288.68162536621094
Test Episode 8 Reward: 565.3264770507812
Test Episode 9 Reward: 288.68162536621094
Test Episode 10 Reward: 288.68162536621094
('Average Test Reward:', 340.01989593505857)


100%|██████████| 2000/2000 [03:38<00:00,  9.15it/s]


Epoch 61 Mean Reward: 364.13893077850344


100%|██████████| 2000/2000 [03:27<00:00,  9.62it/s]


Epoch 62 Mean Reward: 374.53940602111817


100%|██████████| 2000/2000 [02:51<00:00, 11.64it/s]


Epoch 63 Mean Reward: 376.26286662292483


100%|██████████| 2000/2000 [03:17<00:00, 10.15it/s]


Epoch 64 Mean Reward: 379.5990826873779


100%|██████████| 2000/2000 [03:15<00:00, 10.21it/s]


Epoch 65 Mean Reward: 387.723549369812


100%|██████████| 2000/2000 [01:53<00:00, 17.60it/s]


Epoch 66 Mean Reward: 399.78635751342773


100%|██████████| 2000/2000 [01:50<00:00, 18.04it/s]


Epoch 67 Mean Reward: 408.8314730758667


100%|██████████| 2000/2000 [01:48<00:00, 18.44it/s]


Epoch 68 Mean Reward: 416.6059064788818


100%|██████████| 2000/2000 [01:47<00:00, 18.60it/s]


Epoch 69 Mean Reward: 425.6775333328247


100%|██████████| 2000/2000 [01:48<00:00, 18.37it/s]


Epoch 70 Mean Reward: 432.81911361694335
Epoch 70 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 70 test:
Test Episode 1 Reward: 287.1919708251953
Test Episode 2 Reward: 287.1919708251953
Test Episode 3 Reward: 562.4911346435547
Test Episode 4 Reward: 287.1919708251953
Test Episode 5 Reward: 769.7454833984375
Test Episode 6 Reward: 676.4645385742188
Test Episode 7 Reward: 492.7627716064453
Test Episode 8 Reward: 351.0941467285156
Test Episode 9 Reward: 287.1919708251953
Test Episode 10 Reward: 257.4929504394531
('Average Test Reward:', 425.88189086914065)


100%|██████████| 2000/2000 [01:50<00:00, 18.12it/s]


Epoch 71 Mean Reward: 440.33210068511966


100%|██████████| 2000/2000 [01:47<00:00, 18.66it/s]


Epoch 72 Mean Reward: 443.35806465148926


100%|██████████| 2000/2000 [02:08<00:00, 15.58it/s]


Epoch 73 Mean Reward: 447.29509090423585


100%|██████████| 2000/2000 [03:21<00:00,  9.91it/s]


Epoch 74 Mean Reward: 446.39959513092043


100%|██████████| 2000/2000 [02:35<00:00, 12.82it/s]


Epoch 75 Mean Reward: 444.86860160064697


100%|██████████| 2000/2000 [01:48<00:00, 18.44it/s]


Epoch 76 Mean Reward: 448.83525344085695


100%|██████████| 2000/2000 [01:46<00:00, 18.73it/s]


Epoch 77 Mean Reward: 447.6067451324463


100%|██████████| 2000/2000 [01:52<00:00, 17.80it/s]


Epoch 78 Mean Reward: 442.4372561645508


100%|██████████| 2000/2000 [01:49<00:00, 18.31it/s]


Epoch 79 Mean Reward: 451.96415175628664


100%|██████████| 2000/2000 [01:50<00:00, 18.13it/s]


Epoch 80 Mean Reward: 444.92636431121826
Epoch 80 Model saved to ./checkpoints/deadly_corridor.ckpt
Epoch 80 test:
Test Episode 1 Reward: 288.68162536621094
Test Episode 2 Reward: 265.7804718017578
Test Episode 3 Reward: 288.68162536621094
Test Episode 4 Reward: 524.2005310058594
Test Episode 5 Reward: 549.0877685546875
Test Episode 6 Reward: 288.68162536621094
Test Episode 7 Reward: 517.6818084716797
Test Episode 8 Reward: 561.9625091552734
Test Episode 9 Reward: 288.68162536621094
Test Episode 10 Reward: 288.68162536621094
('Average Test Reward:', 386.21212158203127)
[(451.96415175628664, 79), (448.83525344085695, 76), (447.60674513244629, 77), (447.29509090423585, 73), (446.39959513092043, 74), (444.92636431121826, 80), (444.86860160064697, 75), (443.35806465148926, 72), (442.43725616455077, 78), (440.33210068511966, 71), (432.81911361694335, 70), (425.67753333282468, 69), (416.60590647888182, 68), (408.83147307586671, 67), (399.78635751342773, 66), (387.72354936981202, 65), (379.59

In [6]:
#Look up the checkpoints that are still registered in the "checkpoints" directory

checkpoint_state = tf.train.get_checkpoint_state('checkpoints')
ckpts = checkpoint_state.all_model_checkpoint_paths

#Evaluate the checkpoint that is fourth from the end of that list: test_agent restores it
#into a new session and always plays the action with the highest Q-value

test_summary = test_agent(DQN,
                          num_episodes=20,
                          training=False,
                          load_model=True,
                          model_dir=ckpts[-4])
print(test_summary)


Loading model from checkpoints\deadly_corridor.ckpt-50
INFO:tensorflow:Restoring parameters from checkpoints\deadly_corridor.ckpt-50
Test Episode 1 Reward: 288.6736145019531
Test Episode 2 Reward: 318.01182556152344
Test Episode 3 Reward: 478.78614807128906
Test Episode 4 Reward: 288.6736145019531
Test Episode 5 Reward: 264.34947204589844
Test Episode 6 Reward: 652.1700592041016
Test Episode 7 Reward: 221.61668395996094
Test Episode 8 Reward: 288.6736145019531
Test Episode 9 Reward: 288.6736145019531
Test Episode 10 Reward: 511.9719543457031
Test Episode 11 Reward: 288.6736145019531
Test Episode 12 Reward: 309.7176055908203
Test Episode 13 Reward: 275.09674072265625
Test Episode 14 Reward: 524.8068237304688
Test Episode 15 Reward: 324.20448303222656
Test Episode 16 Reward: 288.6736145019531
Test Episode 17 Reward: 288.6736145019531
Test Episode 18 Reward: 520.2238616943359
Test Episode 19 Reward: 304.08660888671875
Test Episode 20 Reward: 288.6736145019531
('Average Test Reward:', 350.