In [1]:
import importlib.util
import os
import time

import numpy as np
import scipy.misc
import tensorflow as tf

from IPython.display import HTML
from tqdm import trange

#Load the compiled vizdoom extension module straight from its .pyd file and bind it to "vd",
#since the package can't be installed normally on Windows

vizdoom_pyd = 'C:/Anaconda3/envs/doom/Lib/site-packages/vizdoom/vizdoom.pyd'
vizdoom = importlib.util.spec_from_file_location('vizdoom', vizdoom_pyd)
vd = importlib.util.module_from_spec(vizdoom)
vizdoom.loader.exec_module(vd)


In [2]:
#Specify the game scenario and the screen format/resolution
#NOTE(review): basic.cfg is loaded after the screen settings below; presumably it could override them - confirm

game = vd.DoomGame()
game.set_screen_format(vd.ScreenFormat.BGR24)
game.set_screen_resolution(vd.ScreenResolution.RES_640X480)
game.load_config('basic.cfg')

#Dimensions of the downsampled frames fed to the Q-network (screen size scaled by the ratio, truncated to int)

down_sample_ratio = 0.125
width = int(game.get_screen_width()*down_sample_ratio)
height = int(game.get_screen_height()*down_sample_ratio)
channels = game.get_screen_channels()

#Specify the available actions in the scenario
#Each action is a one-hot list handed to game.make_action; the slot order presumably matches the
#buttons declared in basic.cfg - TODO confirm

left = [1, 0, 0]
right = [0, 1, 0]
shoot = [0, 0, 1]
actions = [left, right, shoot]
num_actions = len(actions)

#Specify the Q-network learning parameters

frame_delay = 12            #second argument passed to game.make_action for every training action
buffer_size = 10000         #capacity of the experience replay buffer
epochs = 30                 #number of training epochs
steps_per_epoch = 2000      #iterations of the per-epoch training loop (each one plays a full episode)
discount_factor = 0.99      #weight applied to the next state's best Q-value in the training target
learning_rate = 0.001       #learning rate given to the Adam optimizer
start_epsilon = 1.0         #exploration probability at the start of the annealing phase
end_epsilon = 0.1           #exploration probability at the end of annealing and in the final phase
batch_size = 100            #number of experiences sampled from the buffer per network update
load_model = True           #restore the checkpoint at model_dir before the greedy test run
save_model = True           #write a checkpoint to model_dir once training finishes
model_dir = './checkpoints/basic.ckpt'


In [3]:
#Create a buffer object that holds a set of training experiences (state, action, reward, next state, terminal flag tuples)

class Buffer():
    """First-in-first-out store of training experiences with a bounded capacity.

    Attributes:
        buffer: list of the stored experiences, oldest first.
        length: number of experiences currently held in `buffer`.
        size: capacity used to decide when the oldest experiences are dropped.
    """

    def __init__(self, size=1000):
        self.size = size
        self.buffer = []
        self.length = 0

    def add_experience(self, experience):
        """Append an experience, first discarding the oldest ones if the buffer is full."""
        overflow = (self.length + 1) - self.size

        #A zero overflow deletes an empty slice, i.e. nothing is removed
        if overflow >= 0:
            del self.buffer[0:overflow]

        self.buffer.append(experience)
        self.length = len(self.buffer)

    def sample_buffer(self, sample_size):
        """Return a random batch of experiences as five arrays.

        Indices are drawn independently with np.random.randint, so the same
        experience can appear more than once in a batch.

        Each experience is read by position: 0 = state, 1 = action, 2 = reward,
        3 = next state, 4 = terminal flag. The state arrays are concatenated
        along axis 0; the terminal flags are converted to int32.

        Returns:
            The tuple (s1, a, r, s2, terminal).
        """
        chosen = [self.buffer[i] for i in np.random.randint(self.length, size=sample_size)]

        s1 = np.concatenate([exp[0] for exp in chosen], axis=0)
        a = np.array([exp[1] for exp in chosen])
        r = np.array([exp[2] for exp in chosen])
        s2 = np.concatenate([exp[3] for exp in chosen], axis=0)
        terminal = np.array([exp[4] for exp in chosen], dtype=np.int32)

        return s1, a, r, s2, terminal

#Downsample and normalize an image array representing the game state at a given time stamp

def preprocess(image, down_sample_ratio=1):
    """Resize a frame, scale it by 1/255 as float32 and add a leading batch axis.

    Args:
        image: array holding one frame.
        down_sample_ratio: passed to scipy.misc.imresize as the target size
            (a float is treated as a fraction of the current size); a value
            of 1 skips resizing entirely.

    Returns:
        A new float32 array with shape (1,) + resized_image.shape.

    Note:
        scipy.misc.imresize was deprecated in SciPy 1.0 and removed in 1.3, so
        the resizing path needs an older SciPy with Pillow installed.
    """
    resized = image if down_sample_ratio == 1 else scipy.misc.imresize(image, down_sample_ratio)
    scaled = resized.astype(np.float32) / 255.0

    return scaled[np.newaxis]
    

In [4]:
#Create a Q-network to estimate values and choose actions for a given state

tf.reset_default_graph()

#Placeholders: a batch of preprocessed frames, the chosen actions, and the regression target for every action
#(a_t is kept for compatibility; no op defined in this cell consumes it)

s_t = tf.placeholder(tf.float32, shape=[None, height, width, channels], name='state')
a_t = tf.placeholder(tf.int32, shape=[None], name='action')
Q_target = tf.placeholder(tf.float32, shape=[None, num_actions], name='Q_target')

#Two strided convolutions followed by a fully connected layer and a linear output with one unit per action

input_layer = tf.reshape(s_t, [-1, height, width, channels], name='input_layer')
conv1 = tf.layers.conv2d(inputs=input_layer,
                         filters=32,
                         kernel_size=[8, 8],
                         strides=[4, 4],
                         padding='valid',
                         activation=tf.nn.relu,
                         name='conv1_layer')
conv2 = tf.layers.conv2d(inputs=conv1,
                         filters=64,
                         kernel_size=[4, 4],
                         strides=[2, 2],
                         padding='valid',
                         activation=tf.nn.relu,
                         name='conv2_layer')

#Derive the flattened size from conv2's static shape instead of hard-coding 6*8*64: with a different
#frame size the hard-coded reshape would either fail or silently regroup activations across the batch

flat_dim = int(np.prod(conv2.get_shape().as_list()[1:]))
flatten = tf.reshape(conv2, [-1, flat_dim], name='flatten')
dense1 = tf.layers.dense(inputs=flatten,
                         units=512,
                         activation=tf.nn.relu,
                         name='dense1_layer')
Q_values = tf.layers.dense(inputs=dense1,
                           units=num_actions,
                           activation=None,
                           name='output_layer')

#Greedy action per state, mean squared error against the targets, and the Adam training op
#("adam" is the op returned by minimize(), not the optimizer object)

best_action = tf.argmax(Q_values, 1)
loss = tf.losses.mean_squared_error(labels=Q_target, predictions=Q_values)
adam = tf.train.AdamOptimizer(learning_rate=learning_rate, name='adam').minimize(loss)

def calculate_loss(session, s, q):
    """Run one training step on a batch and return its loss.

    Despite the name, this evaluates the `adam` training op together with
    `loss`, so every call also updates the network weights.

    Args:
        session: session in which the graph is run.
        s: batch of states fed to the `s_t` placeholder.
        q: target Q-values fed to the `Q_target` placeholder.

    Returns:
        The value of `loss` for this batch.
    """
    feeds = {s_t: s, Q_target: q}
    batch_loss, _ = session.run([loss, adam], feed_dict=feeds)

    return batch_loss

#Return the array of Q-values and the best action associated with a given state

def get_Q_values(session, s):
    """Evaluate `Q_values` for the batch of states `s` fed to the `s_t` placeholder."""
    return session.run(Q_values, feed_dict={s_t: s})
    
def choose_action(session, s):
    """Evaluate `best_action` (argmax of the Q-values along axis 1) for the batch of states `s`.

    The result holds one action index per state in the batch.
    """
    return session.run(best_action, feed_dict={s_t: s})
    

In [31]:
#Play the game by choosing random actions drawn from a uniform distribution to act as a baseline example

game.set_sound_enabled(True)
game.init()
episode_rewards = list()

#The try/finally closes the game even if the loop raises or the cell is interrupted

try:
    for i in range(20):
        game.new_episode()

        while not game.is_episode_finished():
            action = np.random.randint(num_actions)
            game.make_action(actions[action])

            #Insert a 0.02 second delay after each time step so that the episode is played at normal speed
            time.sleep(0.02)

        episode_rewards.append(game.get_total_reward())
        print('Random Episode {} Reward: {}'.format(i + 1, game.get_total_reward()))
        time.sleep(1)

finally:
    game.close()

print('Average Random Reward:', np.mean(episode_rewards))


Random Episode 1 Reward: 95.0
Random Episode 2 Reward: -229.0
Random Episode 3 Reward: -380.0
Random Episode 4 Reward: -104.0
Random Episode 5 Reward: -107.0
Random Episode 6 Reward: -68.0
Random Episode 7 Reward: -380.0
Random Episode 8 Reward: -375.0
Random Episode 9 Reward: 95.0
Random Episode 10 Reward: -380.0
Random Episode 11 Reward: -375.0
Random Episode 12 Reward: -380.0
Random Episode 13 Reward: 93.0
Random Episode 14 Reward: 95.0
Random Episode 15 Reward: 94.0
Random Episode 16 Reward: 89.0
Random Episode 17 Reward: 95.0
Random Episode 18 Reward: -380.0
Random Episode 19 Reward: 93.0
Random Episode 20 Reward: 63.0
Average Random Reward: -117.3


In [47]:
#Embed a recording of the untrained agent playing 20 episodes
#(rendered as a 640x480 iframe pointing at a Google Drive preview link)

HTML('<iframe src="https://drive.google.com/file/d/1ZqdB9cqy-GbpPF-OY1Cmp6w_hVogZK_p/preview" width="640" height="480"></iframe>')


In [5]:
#For each time step, collect the following data:
#The current game state
#The action that was taken
#The reward obtained from the chosen action
#The next game state (store the first game state if the previous action ends the episode)
#A variable indicating whether the episode is over yet


exp_buffer = Buffer(size=buffer_size)
session = tf.Session()
session.run(tf.global_variables_initializer())

game.set_sound_enabled(False)
game.init()
t = 0

#Accumulate experiences in the buffer using an epsilon-greedy strategy with three training phases
#(the try/finally closes the game even if training raises or the cell is interrupted)

try:
    for epoch in range(epochs):
        epoch_rewards = list()

        #Each iteration plays one full episode and then performs at most one network update
        for step in trange(steps_per_epoch, leave=True):
            game.new_episode()

            while not game.is_episode_finished():
                state = game.get_state()
                state1 = preprocess(state.screen_buffer, down_sample_ratio)

                #Explore the environment by choosing random actions with 100% probability for the first phase of training
                if epoch < 0.2*epochs:
                    action = np.random.randint(num_actions)

                #Increase the probability of greedily choosing an action by a constant amount at each epoch in the second phase
                elif epoch < 0.9*epochs:
                    epsilon = start_epsilon - (epoch + 1 - 0.2*epochs)*(start_epsilon-end_epsilon)/(0.7*epochs)

                    if np.random.uniform(0, 1) <= epsilon:
                        action = np.random.randint(num_actions)

                    else:
                        action = choose_action(session, state1)[0]

                #Select a random action with 10% probability in the final phase of training
                else:
                    if np.random.uniform(0, 1) <= end_epsilon:
                        action = np.random.randint(num_actions)

                    else:
                        action = choose_action(session, state1)[0]

                reward = game.make_action(actions[action], frame_delay)
                done = game.is_episode_finished()

                #A finished episode has no next frame, so state1 is stored as a stand-in;
                #its Q-value is multiplied by (1 - terminal) = 0 when the target is built below
                if not done:
                    state = game.get_state()
                    state2 = preprocess(state.screen_buffer, down_sample_ratio)

                else:
                    state2 = state1

                #Add the experience obtained from each time step to the buffer
                t += 1
                exp_buffer.add_experience((state1, action, reward, state2, done))

            #Sample a minibatch from the buffer if there are enough experiences in the buffer
            if exp_buffer.length > batch_size:
                s1, a, r, s2, terminal = exp_buffer.sample_buffer(batch_size)

                #Train the Q-network by using the minibatch to update the action-value function Q:
                #only the entry of the action actually taken is replaced, so the other actions contribute zero error
                Q2 = np.max(get_Q_values(session, s2), axis=1)
                target_Q = get_Q_values(session, s1)
                target_Q[np.arange(batch_size), a] = r + discount_factor*(1 - terminal)*Q2
                calculate_loss(session, s1, target_Q)

            epoch_rewards.append(game.get_total_reward())

        print('Epoch {} Mean Reward: {}'.format(epoch + 1, np.mean(epoch_rewards)))

    print('{} time steps experienced during training'.format(t))

finally:
    game.close()

#Save only after training completed; create the checkpoint directory first because Saver.save
#fails when the parent directory is missing, and report success only once the save has returned

if save_model:
    ckpt_parent = os.path.dirname(model_dir)

    if ckpt_parent:
        os.makedirs(ckpt_parent, exist_ok=True)

    tf.train.Saver().save(session, model_dir)
    print('Model saved to', model_dir)


100%|██████████████████████████████████████████████████████████████████████████████| 2000/2000 [06:37<00:00,  5.04it/s]


Epoch 1 Mean Reward: -129.916


100%|██████████████████████████████████████████████████████████████████████████████| 2000/2000 [06:39<00:00,  5.00it/s]


Epoch 2 Mean Reward: -126.474


100%|██████████████████████████████████████████████████████████████████████████████| 2000/2000 [06:36<00:00,  5.05it/s]


Epoch 3 Mean Reward: -134.6735


100%|██████████████████████████████████████████████████████████████████████████████| 2000/2000 [06:31<00:00,  5.11it/s]


Epoch 4 Mean Reward: -130.9035


100%|██████████████████████████████████████████████████████████████████████████████| 2000/2000 [05:43<00:00,  5.82it/s]


Epoch 5 Mean Reward: -124.179


100%|██████████████████████████████████████████████████████████████████████████████| 2000/2000 [05:36<00:00,  5.95it/s]


Epoch 6 Mean Reward: -132.4785


100%|██████████████████████████████████████████████████████████████████████████████| 2000/2000 [04:45<00:00,  6.99it/s]


Epoch 7 Mean Reward: -97.5435


100%|██████████████████████████████████████████████████████████████████████████████| 2000/2000 [04:13<00:00,  7.89it/s]


Epoch 8 Mean Reward: -77.2585


100%|██████████████████████████████████████████████████████████████████████████████| 2000/2000 [04:00<00:00,  8.32it/s]


Epoch 9 Mean Reward: -54.1945


100%|██████████████████████████████████████████████████████████████████████████████| 2000/2000 [04:00<00:00,  8.31it/s]


Epoch 10 Mean Reward: -42.6085


100%|██████████████████████████████████████████████████████████████████████████████| 2000/2000 [03:26<00:00,  9.69it/s]


Epoch 11 Mean Reward: -15.587


100%|██████████████████████████████████████████████████████████████████████████████| 2000/2000 [03:09<00:00, 10.57it/s]


Epoch 12 Mean Reward: 4.0555


100%|██████████████████████████████████████████████████████████████████████████████| 2000/2000 [03:03<00:00, 10.90it/s]


Epoch 13 Mean Reward: 12.07


100%|██████████████████████████████████████████████████████████████████████████████| 2000/2000 [02:54<00:00, 11.47it/s]


Epoch 14 Mean Reward: 22.372


100%|██████████████████████████████████████████████████████████████████████████████| 2000/2000 [02:52<00:00, 11.59it/s]


Epoch 15 Mean Reward: 30.8725


100%|██████████████████████████████████████████████████████████████████████████████| 2000/2000 [02:28<00:00, 13.43it/s]


Epoch 16 Mean Reward: 41.701


100%|██████████████████████████████████████████████████████████████████████████████| 2000/2000 [02:19<00:00, 14.34it/s]


Epoch 17 Mean Reward: 49.7745


100%|██████████████████████████████████████████████████████████████████████████████| 2000/2000 [02:28<00:00, 13.46it/s]


Epoch 18 Mean Reward: 56.3865


100%|██████████████████████████████████████████████████████████████████████████████| 2000/2000 [02:25<00:00, 13.78it/s]


Epoch 19 Mean Reward: 60.1255


100%|██████████████████████████████████████████████████████████████████████████████| 2000/2000 [02:07<00:00, 15.71it/s]


Epoch 20 Mean Reward: 65.39


100%|██████████████████████████████████████████████████████████████████████████████| 2000/2000 [02:10<00:00, 15.38it/s]


Epoch 21 Mean Reward: 68.1245


100%|██████████████████████████████████████████████████████████████████████████████| 2000/2000 [01:57<00:00, 16.99it/s]


Epoch 22 Mean Reward: 72.0775


100%|██████████████████████████████████████████████████████████████████████████████| 2000/2000 [01:54<00:00, 17.46it/s]


Epoch 23 Mean Reward: 74.286


100%|██████████████████████████████████████████████████████████████████████████████| 2000/2000 [01:51<00:00, 17.94it/s]


Epoch 24 Mean Reward: 76.1515


100%|██████████████████████████████████████████████████████████████████████████████| 2000/2000 [01:53<00:00, 17.64it/s]


Epoch 25 Mean Reward: 78.7695


100%|██████████████████████████████████████████████████████████████████████████████| 2000/2000 [01:46<00:00, 18.81it/s]


Epoch 26 Mean Reward: 80.227


100%|██████████████████████████████████████████████████████████████████████████████| 2000/2000 [01:37<00:00, 20.42it/s]


Epoch 27 Mean Reward: 82.4585


100%|██████████████████████████████████████████████████████████████████████████████| 2000/2000 [01:40<00:00, 19.94it/s]


Epoch 28 Mean Reward: 82.0455


100%|██████████████████████████████████████████████████████████████████████████████| 2000/2000 [01:40<00:00, 19.87it/s]


Epoch 29 Mean Reward: 82.4905


100%|██████████████████████████████████████████████████████████████████████████████| 2000/2000 [02:30<00:00, 13.30it/s]


Epoch 30 Mean Reward: 82.151
424216 time steps experienced during training


In [29]:
#Test the fully trained model by only choosing actions with a greedy strategy
#(when load_model is False the session left over from the training cell is reused)

if load_model:
    session = tf.Session()
    print('Loading model from', model_dir)
    tf.train.Saver().restore(session, model_dir)

game.set_sound_enabled(True)
game.init()
episode_rewards = list()

#The try/finally closes the game even if the loop raises or the cell is interrupted

try:
    for i in range(20):
        game.new_episode()

        while not game.is_episode_finished():
            state = game.get_state()
            state1 = preprocess(state.screen_buffer, down_sample_ratio)
            action = choose_action(session, state1)[0]

            #NOTE(review): no frame_delay is passed here, unlike during training - confirm this is intended
            game.make_action(actions[action])
            time.sleep(0.02)

        episode_rewards.append(game.get_total_reward())
        print('Test Episode {} Reward: {}'.format(i + 1, game.get_total_reward()))
        time.sleep(1)

finally:
    game.close()

print('Average Test Reward:', np.mean(episode_rewards))


Test Episode 1 Reward: 61.0
Test Episode 2 Reward: 95.0
Test Episode 3 Reward: 93.0
Test Episode 4 Reward: 68.0
Test Episode 5 Reward: 62.0
Test Episode 6 Reward: 95.0
Test Episode 7 Reward: 68.0
Test Episode 8 Reward: 54.0
Test Episode 9 Reward: 95.0
Test Episode 10 Reward: 66.0
Test Episode 11 Reward: 70.0
Test Episode 12 Reward: 57.0
Test Episode 13 Reward: 95.0
Test Episode 14 Reward: 62.0
Test Episode 15 Reward: 75.0
Test Episode 16 Reward: 87.0
Test Episode 17 Reward: 68.0
Test Episode 18 Reward: 95.0
Test Episode 19 Reward: 95.0
Test Episode 20 Reward: 95.0
Average Test Reward: 77.8


In [46]:
#Embed a recording of the fully trained agent playing 20 episodes
#(rendered as a 640x480 iframe pointing at a Google Drive preview link)

HTML('<iframe src="https://drive.google.com/file/d/1azoMIdvmOAPBHoQTkVoNA8DRWw9iOCsm/preview" width="640" height="480"></iframe>')
