In [1]:
import importlib.util
import scipy.misc
import time

import tensorflow as tf
import numpy as np

from tqdm import trange
from IPython.display import HTML

#Import the vizdoom package as "vd" since it can't be installed normally on Windows

vd_location = 'C:/Anaconda3/envs/doom/Lib/site-packages/vizdoom/vizdoom.pyd'
vizdoom = importlib.util.spec_from_file_location('vizdoom',
                                                 vd_location)
vd = importlib.util.module_from_spec(vizdoom)
vizdoom.loader.exec_module(vd)


In [38]:
#Specify the game scenario and the screen format/resolution

game = vd.DoomGame()
game.set_screen_format(vd.ScreenFormat.BGR24)
game.set_depth_buffer_enabled(True)
game.set_screen_resolution(vd.ScreenResolution.RES_160X120)
game.load_config('deadly_corridor.cfg')

down_sample_ratio = 0.5
width = int(game.get_screen_width()*down_sample_ratio)
height = int(game.get_screen_height()*down_sample_ratio)
channels = game.get_screen_channels() + 1

#Specify the available actions in the scenario

available_actions = game.get_available_buttons()
actions = [list(ohe) for ohe in list(np.identity(len(available_actions)))]
num_actions = len(available_actions)

#Specify the Q-network learning parameters

frame_delay = 12
buffer_size = 50000
epochs = 40
steps_per_epoch = 2000
discount_factor = 0.99
learning_rate = 0.001
start_epsilon = 1.0
end_epsilon = 0.1
batch_size = 100
load_model = True
save_model = True
model_dir = './checkpoints/deadly_corridor.ckpt'


In [41]:
#Create a buffer object that holds a set of training experiences (state-action-reward tuples)

class Buffer():
    def __init__(self, size=1000):
        self.buffer = list()
        self.length = len(self.buffer)
        self.size = size
        
#Add a new experience to the buffer (remove the oldest experience if the buffer is already full)
        
    def add_experience(self, experience):
        if self.length + 1 >= self.size:
            self.buffer[0:(self.length + 1) - self.size] = []
        
        self.buffer.append(experience)
        self.length = len(self.buffer)
            
#Return a batch of experience arrays randomly sampled from the buffer
            
    def sample_buffer(self, sample_size):
        sample = np.random.randint(self.length, size=sample_size)
        s1 = np.concatenate([self.buffer[idx][0] for idx in sample], axis=0)
        a = np.array([self.buffer[idx][1] for idx in sample])
        r = np.array([self.buffer[idx][2] for idx in sample])
        s2 = np.concatenate([self.buffer[idx][3] for idx in sample], axis=0)
        terminal = np.array([self.buffer[idx][4] for idx in sample], dtype=np.int32)
        
        return s1, a, r, s2, terminal

#Downsample and normalize an image array representing the game state at a given time stamp

def preprocess(image, down_sample_ratio=1):
    if down_sample_ratio != 1:
        image = scipy.misc.imresize(image, down_sample_ratio)
    image = image.astype(np.float32)
    image /= 255.0
    image = np.expand_dims(image, axis=0)
    
    return image

#Test the agent using a currently training or previously trained model

def test_agent(num_episodes, load_model, training=True, session=None, model_dir=None):
    if load_model == True:
        sess = tf.Session()
        print('Loading model from', model_dir)
        tf.train.Saver().restore(sess, model_dir)
        
    elif load_model == False:
        sess = session

    game.set_sound_enabled(True)
    episode_rewards = list()
    
    if training == False:
        game.init()

    for i in range(num_episodes):
        game.new_episode()
    
        while not game.is_episode_finished():
            state = game.get_state()
            buffer = np.concatenate((state.screen_buffer,
                                     np.expand_dims(state.depth_buffer,
                                                    axis=2)),
                                    axis=2)
            state1 = preprocess(buffer, down_sample_ratio)
            action = choose_action(sess, state1)[0]
            reward = game.make_action(actions[action])
            time.sleep(0.02)
        
        episode_rewards.append(game.get_total_reward())
        print('Test Episode {} Reward: {}'.format(i + 1, game.get_total_reward()))
        time.sleep(1)
    
    if training == False:
        game.close()
    
    return ('Average Test Reward:', np.mean(episode_rewards))


In [42]:
#Create a Q-network to estimate values and choose actions for a given state

tf.reset_default_graph()

s_t = tf.placeholder(tf.float32, shape=[None, height, width, channels], name='state')
a_t = tf.placeholder(tf.int32, shape=[None], name='action')
Q_target = tf.placeholder(tf.float32, shape=[None, num_actions], name='Q_target')

input_layer = tf.reshape(s_t, [-1, height, width, channels], name='input_layer')
conv1 = tf.layers.conv2d(inputs=input_layer,
                         filters=32,
                         kernel_size=[8, 8],
                         strides=[4, 4],
                         padding='valid',
                         activation=tf.nn.relu,
                         name='conv1_layer')
conv2 = tf.layers.conv2d(inputs=conv1,
                         filters=64,
                         kernel_size=[4, 4],
                         strides=[2, 2],
                         padding='valid',
                         activation=tf.nn.relu,
                         name='conv2_layer')
flatten = tf.reshape(conv2, [-1, 6*8*64], name='flatten')
dense1 = tf.layers.dense(inputs=flatten,
                         units=512,
                         activation=tf.nn.relu,
                         name='dense1_layer')
Q_values = tf.layers.dense(inputs=dense1,
                           units=len(actions),
                           activation=None,
                           name='output_layer')        
    
best_action = tf.argmax(Q_values, 1)
loss = tf.losses.mean_squared_error(Q_values, Q_target)
adam = tf.train.AdamOptimizer(learning_rate=learning_rate, name='adam').minimize(loss)

def calculate_loss(session, s, q):
    L, _ = session.run([loss, adam], feed_dict={s_t: s, Q_target: q})
    
    return L

#Return the array of Q-values and the best action associated with a given state

def get_Q_values(session, s):
    Q = session.run(Q_values, feed_dict={s_t: s})

    return Q
    
def choose_action(session, s):
    a = session.run(best_action, feed_dict={s_t: s})
    
    return a


In [43]:
#For each time step, collect the following data:
#The current game state
#The action that was taken taken
#The reward obtained from the chosen action
#The next game state (store the first game state if the previous action ends the episode)
#A variable indicating whether the episode is over yet


exp_buffer = Buffer(size=buffer_size)
session = tf.Session()
saver = tf.train.Saver()

if load_model == True:
    print('Loading model from', model_dir)
    tf.train.Saver().restore(session, model_dir)
    
elif load_model == False:
    session.run(tf.global_variables_initializer())

game.set_sound_enabled(False)
game.init()
t = 0

#Accumulate experiences in the buffer using an epsilon-greedy strategy with three training phases

for epoch in range(epochs):
    epoch_rewards = list()
    
    for step in trange(steps_per_epoch, leave=True):
        experience = list()
        game.new_episode()
        
        while not game.is_episode_finished():
            state = game.get_state()
            state1 = preprocess(np.concatenate((state.screen_buffer,
                                                np.expand_dims(state.depth_buffer, axis=2)),
                                                axis=2),
                                               down_sample_ratio)
            
#Explore the environment by choosing random actions with 100% probability for the first phase of training

            if epoch < 0.3*epochs:
                action = np.random.randint(num_actions)
            
#Increase the probability of greedily choosing an action by a constant amount at each epoch in the second phase
            
            elif epoch < 0.9*epochs:
                epsilon = start_epsilon - (epoch + 1 - 0.2*epochs)*(start_epsilon-end_epsilon)/(0.7*epochs)
            
                if np.random.uniform(0, 1) <= epsilon:
                    action = np.random.randint(num_actions)
                
                else:
                    action = choose_action(session, state1)[0]

#Select a random action with 10% probability in the final phase of training
                
            else:
                if np.random.uniform(0, 1) <= end_epsilon:
                    action = np.random.randint(num_actions)
                    
                else:
                    action = choose_action(session, state1)[0]

            reward = game.make_action(actions[action], frame_delay)
            done = game.is_episode_finished()
            
            if done == False:
                state = game.get_state()
                state2 = preprocess(np.concatenate((state.screen_buffer,
                                                    np.expand_dims(state.depth_buffer, axis=2)),
                                                    axis=2),
                                                    down_sample_ratio)
        
            elif done == True:
                state2 = state1
        
#Add the experience obtained from each time step to the buffer

            t += 1
            exp_buffer.add_experience((state1, action, reward, state2, done))
        
#Sample a minibatch from the buffer if there are enough experiences in the buffer

        if exp_buffer.length > batch_size:
            s1, a, r, s2, terminal = exp_buffer.sample_buffer(batch_size)
            
#Train the Q-network by using the minibatch to update the action-value function Q
            
            Q2 = np.max(get_Q_values(session, s2), axis=1)
            target_Q = get_Q_values(session, s1)
            target_Q[np.arange(batch_size), a] = r + discount_factor*(1 - terminal)*Q2
            calculate_loss(session, s1, target_Q)
            
        epoch_rewards.append(game.get_total_reward())
        
    print('Epoch {} Mean Reward: {}'.format(epoch + 1, np.mean(epoch_rewards)))
    
    if (epoch + 1) % 10 == 0 and epoch > 0:
        print('Epoch {} test:'.format(epoch + 1))
        test_agent(num_episodes=10, load_model=True, model_dir=model_dir)
        
        if save_model == True:
            print('Epoch {} Model saved to {}'.format(epoch + 1, model_dir))
            saver.save(session, model_dir, global_step=epoch + 1)
        
print('{} time steps experienced during training'.format(t))
game.close()
    

Loading model from ./checkpoints/deadly_corridor.ckpt
INFO:tensorflow:Restoring parameters from ./checkpoints/deadly_corridor.ckpt


100%|██████████| 200/200 [00:29<00:00,  6.74it/s]


Epoch 1 Mean Reward: 124.19927780151367
Epoch 1 Model saved to ./checkpoints/deadly_corridor.ckpt
Loading model from ./checkpoints/deadly_corridor.ckpt
INFO:tensorflow:Restoring parameters from ./checkpoints/deadly_corridor.ckpt
Test Episode 1 Reward: 84.17875671386719
Test Episode 2 Reward: 84.0882568359375
Test Episode 3 Reward: 84.04368591308594
Test Episode 4 Reward: 84.17875671386719
Test Episode 5 Reward: 84.17875671386719
Test Episode 6 Reward: 84.02003479003906
Test Episode 7 Reward: 84.11338806152344
Test Episode 8 Reward: 84.17875671386719
Test Episode 9 Reward: 84.02165222167969
Test Episode 10 Reward: 84.08804321289062


100%|██████████| 200/200 [00:27<00:00,  7.18it/s]


Epoch 2 Mean Reward: 123.75431213378906
Epoch 2 Model saved to ./checkpoints/deadly_corridor.ckpt


100%|██████████| 200/200 [00:26<00:00,  7.43it/s]


Epoch 3 Mean Reward: 130.18446998596193
Epoch 3 Model saved to ./checkpoints/deadly_corridor.ckpt


100%|██████████| 200/200 [00:27<00:00,  7.27it/s]


Epoch 4 Mean Reward: 131.7388217163086
Epoch 4 Model saved to ./checkpoints/deadly_corridor.ckpt


100%|██████████| 200/200 [00:31<00:00,  6.31it/s]


Epoch 5 Mean Reward: 135.2420574951172
Epoch 5 Model saved to ./checkpoints/deadly_corridor.ckpt


100%|██████████| 200/200 [00:30<00:00,  6.64it/s]


Epoch 6 Mean Reward: 140.85665336608886
Epoch 6 Model saved to ./checkpoints/deadly_corridor.ckpt


100%|██████████| 200/200 [00:31<00:00,  6.43it/s]


Epoch 7 Mean Reward: 131.1745918273926
Epoch 7 Model saved to ./checkpoints/deadly_corridor.ckpt


100%|██████████| 200/200 [00:29<00:00,  6.77it/s]


Epoch 8 Mean Reward: 136.09171058654786
Epoch 8 Model saved to ./checkpoints/deadly_corridor.ckpt


100%|██████████| 200/200 [00:29<00:00,  6.88it/s]


Epoch 9 Mean Reward: 133.3774912261963
Epoch 9 Model saved to ./checkpoints/deadly_corridor.ckpt


100%|██████████| 200/200 [00:33<00:00,  5.97it/s]


Epoch 10 Mean Reward: 137.90078353881836
Epoch 10 Model saved to ./checkpoints/deadly_corridor.ckpt


100%|██████████| 200/200 [00:44<00:00,  4.50it/s]


Epoch 11 Mean Reward: 136.7114636993408
Epoch 11 Model saved to ./checkpoints/deadly_corridor.ckpt
Loading model from ./checkpoints/deadly_corridor.ckpt
INFO:tensorflow:Restoring parameters from ./checkpoints/deadly_corridor.ckpt
Test Episode 1 Reward: 84.17875671386719
Test Episode 2 Reward: 84.02400207519531
Test Episode 3 Reward: 84.17875671386719
Test Episode 4 Reward: 84.17875671386719
Test Episode 5 Reward: 84.04461669921875
Test Episode 6 Reward: 84.06651306152344
Test Episode 7 Reward: 84.02003479003906
Test Episode 8 Reward: 84.04368591308594
Test Episode 9 Reward: 84.17875671386719
Test Episode 10 Reward: 84.13008117675781


100%|██████████| 200/200 [00:39<00:00,  5.02it/s]


Epoch 12 Mean Reward: 134.58923225402833
Epoch 12 Model saved to ./checkpoints/deadly_corridor.ckpt


100%|██████████| 200/200 [00:56<00:00,  3.51it/s]


Epoch 13 Mean Reward: 130.9185460662842
Epoch 13 Model saved to ./checkpoints/deadly_corridor.ckpt


100%|██████████| 200/200 [01:00<00:00,  3.31it/s]


Epoch 14 Mean Reward: 138.94172439575195
Epoch 14 Model saved to ./checkpoints/deadly_corridor.ckpt


100%|██████████| 200/200 [00:51<00:00,  3.90it/s]


Epoch 15 Mean Reward: 124.83926788330078
Epoch 15 Model saved to ./checkpoints/deadly_corridor.ckpt


100%|██████████| 200/200 [01:24<00:00,  2.38it/s]


Epoch 16 Mean Reward: 141.11394065856933
Epoch 16 Model saved to ./checkpoints/deadly_corridor.ckpt


100%|██████████| 200/200 [01:32<00:00,  2.16it/s]


Epoch 17 Mean Reward: 142.12135818481445
Epoch 17 Model saved to ./checkpoints/deadly_corridor.ckpt


100%|██████████| 200/200 [01:53<00:00,  1.77it/s]


Epoch 18 Mean Reward: 123.14656517028808
Epoch 18 Model saved to ./checkpoints/deadly_corridor.ckpt


100%|██████████| 200/200 [01:46<00:00,  1.88it/s]


Epoch 19 Mean Reward: 140.68948944091798
Epoch 19 Model saved to ./checkpoints/deadly_corridor.ckpt


100%|██████████| 200/200 [02:01<00:00,  1.64it/s]


Epoch 20 Mean Reward: 116.32652534484863
Epoch 20 Model saved to ./checkpoints/deadly_corridor.ckpt


100%|██████████| 200/200 [02:23<00:00,  1.39it/s]


Epoch 21 Mean Reward: 104.45860725402832
Epoch 21 Model saved to ./checkpoints/deadly_corridor.ckpt
Loading model from ./checkpoints/deadly_corridor.ckpt
INFO:tensorflow:Restoring parameters from ./checkpoints/deadly_corridor.ckpt
Test Episode 1 Reward: 84.06651306152344
Test Episode 2 Reward: 84.17875671386719
Test Episode 3 Reward: 84.021484375
Test Episode 4 Reward: 84.11024475097656
Test Episode 5 Reward: 84.06504821777344
Test Episode 6 Reward: 84.17875671386719
Test Episode 7 Reward: 84.13105773925781
Test Episode 8 Reward: 84.17875671386719
Test Episode 9 Reward: 84.02398681640625
Test Episode 10 Reward: 84.17875671386719


100%|██████████| 200/200 [02:02<00:00,  1.64it/s]


Epoch 22 Mean Reward: 199.75455345153807
Epoch 22 Model saved to ./checkpoints/deadly_corridor.ckpt


100%|██████████| 200/200 [01:50<00:00,  1.82it/s]


Epoch 23 Mean Reward: 113.43198715209961
Epoch 23 Model saved to ./checkpoints/deadly_corridor.ckpt


100%|██████████| 200/200 [01:30<00:00,  2.20it/s]


Epoch 24 Mean Reward: 127.56523567199707
Epoch 24 Model saved to ./checkpoints/deadly_corridor.ckpt


100%|██████████| 200/200 [01:33<00:00,  2.14it/s]


Epoch 25 Mean Reward: 147.83740425109863
Epoch 25 Model saved to ./checkpoints/deadly_corridor.ckpt


100%|██████████| 200/200 [00:53<00:00,  3.72it/s]


Epoch 26 Mean Reward: 186.9592300415039
Epoch 26 Model saved to ./checkpoints/deadly_corridor.ckpt


100%|██████████| 200/200 [00:50<00:00,  3.96it/s]


Epoch 27 Mean Reward: 180.36884506225587
Epoch 27 Model saved to ./checkpoints/deadly_corridor.ckpt


100%|██████████| 200/200 [01:11<00:00,  2.78it/s]


Epoch 28 Mean Reward: 153.12448875427245
Epoch 28 Model saved to ./checkpoints/deadly_corridor.ckpt


100%|██████████| 200/200 [01:22<00:00,  2.41it/s]


Epoch 29 Mean Reward: 168.1335831451416
Epoch 29 Model saved to ./checkpoints/deadly_corridor.ckpt


100%|██████████| 200/200 [01:16<00:00,  2.62it/s]


Epoch 30 Mean Reward: 114.92005027770996
Epoch 30 Model saved to ./checkpoints/deadly_corridor.ckpt
163454 time steps experienced during training


In [37]:
epoch = 50
if epoch % 10 == 0:
        test_agent(10, training=False, load_model=False, session=session, model_dir=model_dir)

Test Episode 1 Reward: 84.00428771972656
Test Episode 2 Reward: 84.17875671386719
Test Episode 3 Reward: 84.17875671386719
Test Episode 4 Reward: 84.02398681640625
Test Episode 5 Reward: 84.0443115234375
Test Episode 6 Reward: 84.02200317382812
Test Episode 7 Reward: 84.0416259765625
Test Episode 8 Reward: 84.17875671386719
Test Episode 9 Reward: 84.02398681640625
Test Episode 10 Reward: 84.17875671386719


In [27]:
#Test the fully trained model by only choosing actions with a greedy strategy

#tf.reset_default_graph()
session = tf.Session()

if load_model == True:
    #saver = tf.train.import_meta_graph(model_dir + '.meta')
    #session.run(tf.global_variables_initializer())
    print('Loading model from', model_dir)
    #saver.restore(session, model_dir)
    tf.train.Saver().restore(session, model_dir)

game.set_sound_enabled(True)
game.init()
episode_rewards = list()

for i in range(5):
    game.new_episode()
    
    while not game.is_episode_finished():
        state = game.get_state()
        state1 = preprocess(np.concatenate((state.screen_buffer,
                                            np.expand_dims(state.depth_buffer, axis=2)),
                                            axis=2),
                            down_sample_ratio)
        action = choose_action(session, state1)[0]
        reward = game.make_action(actions[action])
        time.sleep(0.02)
        
    episode_rewards.append(game.get_total_reward())
    print('Test Episode {} Reward: {}'.format(i + 1, game.get_total_reward()))
    time.sleep(1)
    
game.close()
print('Average Test Reward:', np.mean(episode_rewards))


Loading model from ./checkpoints/deadly_corridor.ckpt
INFO:tensorflow:Restoring parameters from ./checkpoints/deadly_corridor.ckpt
Test Episode 1 Reward: 84.03608703613281
Test Episode 2 Reward: 84.17875671386719
Test Episode 3 Reward: 84.02197265625
Test Episode 4 Reward: 84.17875671386719
Test Episode 5 Reward: 84.17875671386719
Average Test Reward: 84.1188659668


In [36]:
game.close()