In [1]:
import importlib.util
import scipy.misc
import time

import tensorflow as tf
import numpy as np

from tqdm import trange
from IPython.display import HTML

#Load the compiled ViZDoom extension module directly from its file path and bind it to "vd",
#since the package can't be installed normally on Windows

#NOTE(review): absolute, machine-specific path - adjust to the local environment before running
vd_location = 'C:/Anaconda3/envs/doom/Lib/site-packages/vizdoom/vizdoom.pyd'

#"vizdoom" holds the module spec (not the module); "vd" is the module object built from it
vizdoom = importlib.util.spec_from_file_location(name='vizdoom', location=vd_location)
vd = importlib.util.module_from_spec(vizdoom)

#Execute the module so that its contents become available on "vd"
vizdoom.loader.exec_module(vd)


In [2]:
#Specify the game scenario and the screen format/resolution
#(the setter calls run before load_config, exactly as in the original cell order)

game = vd.DoomGame()
game.set_screen_format(vd.ScreenFormat.BGR24)
game.set_depth_buffer_enabled(True)
game.set_screen_resolution(vd.ScreenResolution.RES_160X120)
game.load_config('deadly_corridor.cfg')

#Dimensions of the preprocessed network input: the screen size scaled by the
#down-sampling ratio, and the screen channels plus one extra channel for the depth buffer

down_sample_ratio = 0.5
width, height = (int(side*down_sample_ratio)
                 for side in (game.get_screen_width(), game.get_screen_height()))
channels = 1 + game.get_screen_channels()

#Specify the available actions in the scenario
#Each action is a one-hot list with one entry per available button

available_actions = game.get_available_buttons()
num_actions = len(available_actions)
actions = [list(one_hot) for one_hot in np.identity(num_actions)]

#Specify the Q-network learning parameters

frame_delay = 12                        #passed as the second argument of game.make_action during training
buffer_size = 50000                     #maximum number of experiences kept in the replay buffer
epochs = 4
steps_per_epoch = 200                   #episodes played (and at most as many minibatch updates) per epoch
discount_factor = 0.99
learning_rate = 0.001
start_epsilon, end_epsilon = 1.0, 0.1   #exploration rate at the start/end of the decay phase
batch_size = 100
load_model, save_model = False, True
model_dir = './checkpoints/deadly_corridor.ckpt'


In [3]:
#Create a buffer object that holds a set of training experiences (state, action, reward, next state, terminal flag)

class Buffer():
    """Bounded first-in-first-out store of experiences with random sampling.

    Attributes:
        buffer: list holding the stored experiences, oldest first.
        length: number of experiences currently stored (kept in sync on every add).
        size: maximum number of experiences retained.
    """

    def __init__(self, size=1000):
        self.size = size
        self.buffer = []
        self.length = 0

    def add_experience(self, experience):
        """Append an experience, dropping the oldest entries so the buffer never exceeds `size`."""
        # Number of entries that must leave the front to make room for the new one.
        surplus = (self.length + 1) - self.size
        if surplus >= 0:
            del self.buffer[0:surplus]

        self.buffer.append(experience)
        self.length = len(self.buffer)

    def sample_buffer(self, sample_size):
        """Return a batch of experience arrays randomly sampled (with replacement) from the buffer.

        Each stored experience is indexed as (state, action, reward, next_state, terminal).
        States are concatenated along axis 0; the terminal flags are returned as int32.

        Returns:
            Tuple (s1, a, r, s2, terminal) of numpy arrays.
        """
        picks = np.random.randint(self.length, size=sample_size)
        drawn = [self.buffer[pick] for pick in picks]

        s1 = np.concatenate([exp[0] for exp in drawn], axis=0)
        a = np.array([exp[1] for exp in drawn])
        r = np.array([exp[2] for exp in drawn])
        s2 = np.concatenate([exp[3] for exp in drawn], axis=0)
        terminal = np.array([exp[4] for exp in drawn], dtype=np.int32)

        return s1, a, r, s2, terminal

#Downsample and normalize an image array representing the game state at a given time stamp

def preprocess(image, down_sample_ratio=1):
    """Downsample and normalize an image array, then add a leading batch axis.

    Args:
        image: array with at least two dimensions (rows, columns[, channels]).
        down_sample_ratio: fraction of the original size to resize to; a value
            of 1 skips resizing entirely.

    Returns:
        float32 array of the (optionally resized) image divided by 255, with a
        new axis of length 1 inserted at position 0.
    """
    if down_sample_ratio != 1:
        # scipy.misc.imresize was removed in SciPy 1.3; keep using it when the
        # installed SciPy still provides it so existing environments behave
        # exactly as before.
        legacy_resize = getattr(getattr(scipy, 'misc', None), 'imresize', None)

        if legacy_resize is not None:
            image = legacy_resize(image, down_sample_ratio)
        else:
            # Fallback for newer SciPy: linear interpolation over the two
            # spatial axes only. The target size is truncated with int() so it
            # matches the width/height computed from the same ratio elsewhere.
            # Only fractional ratios are handled here.
            from scipy import ndimage

            rows, cols = image.shape[:2]
            zoom_factors = (int(rows*down_sample_ratio)/rows,
                            int(cols*down_sample_ratio)/cols)
            zoom_factors += (1,)*(image.ndim - 2)
            image = ndimage.zoom(image, zoom_factors, order=1)

    # astype returns a new array, so the in-place division never touches the caller's data
    image = image.astype(np.float32)
    image /= 255.0
    image = np.expand_dims(image, axis=0)

    return image

#Test the agent using a currently training or previously trained model

def test_agent(model, num_episodes, load_model, training=True, session=None, model_dir=None):
    if load_model == True:
        sess = tf.Session()
        print('Loading model from', model_dir)
        tf.train.Saver().restore(sess, model_dir)
        
#Require an existing session if a pretrained model isn't provided
        
    elif load_model == False:
        sess = session

    game.set_sound_enabled(True)
    episode_rewards = list()
    
#Avoid reinitializing the game if this was already done by the training process
    
    if training == False:
        game.init()

    for i in range(num_episodes):
        game.new_episode()
    
        while not game.is_episode_finished():
            state = game.get_state()
            buffer = np.concatenate((state.screen_buffer,
                                     np.expand_dims(state.depth_buffer,
                                                    axis=2)),
                                    axis=2)
            state1 = preprocess(buffer, down_sample_ratio)
            action = model.choose_action(sess, state1)[0]
            reward = game.make_action(actions[action])
            
#Add a delay between each time step so that the episodes occur at normal speed

            time.sleep(0.02)
        
        episode_rewards.append(game.get_total_reward())
        print('Test Episode {} Reward: {}'.format(i + 1, game.get_total_reward()))
        time.sleep(1)
    
#Avoid ending the game so that the training process can continue
    
    if training == False:
        game.close()
    
    return ('Average Test Reward:', np.mean(episode_rewards))


In [4]:
#Create a Q-network to estimate values and choose actions for a given state

class Q_network():
    """Convolutional Q-network that estimates action values for a game state.

    The graph is: input -> conv (32 filters, 8x8, stride 4) -> conv (64 filters,
    4x4, stride 2) -> dense (512, ReLU) -> linear output with one unit per action.
    All ops and layers are prefixed with `network_name`.

    Relies on the module-level `num_actions` for the width of the output layer
    and of the `Q_target` placeholder.

    Args:
        network_name: prefix for the names of the placeholders, layers and optimizer.
        height, width, channels: dimensions of a single input state.
        learning_rate: learning rate of the Adam optimizer.
    """

    def __init__(self, network_name, height, width, channels, learning_rate=0.001):
        self.s_t = tf.placeholder(tf.float32,
                                  shape=[None, height, width, channels],
                                  name=network_name + '_state'
                                 )
        # Not referenced by any op built in this class; kept as a public attribute
        self.a_t = tf.placeholder(tf.int32,
                                  shape=[None],
                                  name=network_name + '_action'
                                 )
        self.Q_target = tf.placeholder(tf.float32,
                                       shape=[None, num_actions],
                                       name=network_name + '_Q_target'
                                      )

        self.input_layer = tf.reshape(self.s_t,
                                      [-1, height, width, channels],
                                      name=network_name + '_input_layer'
                                     )
        self.conv1 = tf.layers.conv2d(inputs=self.input_layer,
                                      filters=32,
                                      kernel_size=[8, 8],
                                      strides=[4, 4],
                                      padding='valid',
                                      activation=tf.nn.relu,
                                      name=network_name + '_conv1_layer'
                                     )
        self.conv2 = tf.layers.conv2d(inputs=self.conv1,
                                      filters=64,
                                      kernel_size=[4, 4],
                                      strides=[2, 2],
                                      padding='valid',
                                      activation=tf.nn.relu,
                                      name=network_name + '_conv2_layer'
                                     )

        # Derive the flattened size from the static conv2 output shape instead
        # of hard-coding 6*8*64, which is only correct for 60x80 inputs.
        flat_size = int(np.prod(self.conv2.get_shape().as_list()[1:]))
        self.flatten = tf.reshape(self.conv2,
                                  [-1, flat_size],
                                  name=network_name + '_flatten'
                                 )
        self.dense = tf.layers.dense(inputs=self.flatten,
                                     units=512,
                                     activation=tf.nn.relu,
                                     name=network_name + '_dense1_layer'
                                    )
        # Same width as the Q_target placeholder so that the loss shapes agree
        self.Q_values = tf.layers.dense(inputs=self.dense,
                                        units=num_actions,
                                        activation=None,
                                        name=network_name + '_output_layer'
                                       )

        self.best_action = tf.argmax(self.Q_values, 1)
        self.loss = tf.losses.mean_squared_error(labels=self.Q_target,
                                                 predictions=self.Q_values)
        self.adam = tf.train.AdamOptimizer(learning_rate=learning_rate,
                                           name=network_name + '_adam'
                                          )
        self.train = self.adam.minimize(self.loss)

    def calculate_loss(self, session, s, q):
        """Run one optimizer step on states `s` with targets `q` and return the loss.

        Despite its name, this method also evaluates the training op, so every
        call updates the network weights.
        """
        L, _ = session.run([self.loss, self.train],
                           feed_dict={self.s_t: s,
                                      self.Q_target: q})

        return L

    def get_Q_values(self, session, s):
        """Return the array of Q-values for the batch of states `s`."""
        Q = session.run(self.Q_values,
                        feed_dict={self.s_t: s})

        return Q

    def choose_action(self, session, s):
        """Return, for each state in `s`, the index of the action with the largest Q-value."""
        a = session.run(self.best_action,
                        feed_dict={self.s_t: s})

        return a
    
#Create a list of variable update operations

def update_graph(variables):
    """Create a list of variable update operations.

    The list of variables is split in the middle: every variable in the first
    half is paired with the variable at the same position in the second half,
    and the resulting op assigns the second-half value to the first-half
    variable. In other words, the network whose variables come last is copied
    into the network whose variables come first.

    Returns:
        List of assign ops, one per variable in the first half.
    """
    half = len(variables)//2
    receivers = variables[:half]
    sources = variables[half:]

    # zip stops at the shorter (first) half, so a trailing unpaired variable is ignored
    return [receiver.assign(source.value())
            for receiver, source in zip(receivers, sources)]

#Update the target network to match the online network

def update_target(ops, session):
    """Run every update operation in `ops`, in order, within `session`.

    Args:
        ops: iterable of operations to run (e.g. the list built by update_graph).
        session: object whose `run` method executes a single operation.
    """
    # Iterate over the argument rather than a module-level list, so the
    # function works with whichever ops the caller passes in.
    for op in ops:
        session.run(op)


In [5]:
#Build the target and online networks, then train the online network from replayed experiences
#For each time step, collect the following data:
#The current game state
#The action that was taken
#The reward obtained from the chosen action
#The next game state (the current state is stored again if the action ends the episode)
#A variable indicating whether the episode is over yet

tf.reset_default_graph()

#Instantiate the target network before the online network so that it's updated correctly:
#update_graph copies the second half of the trainable variables (online) into the first half (target)

target_net = Q_network(network_name='target',
                       learning_rate=learning_rate,
                       height=height,
                       width=width,
                       channels=channels)
DQN = Q_network(network_name='online',
                learning_rate=learning_rate,
                height=height,
                width=width,
                channels=channels)

exp_buffer = Buffer(size=buffer_size)
session = tf.Session()
saver = tf.train.Saver()
weights = tf.trainable_variables()

update_ops = update_graph(weights)

if load_model:
    print('Loading model from', model_dir)
    saver.restore(session, model_dir)

else:
    session.run(tf.global_variables_initializer())

    #Start the target network from the same weights as the online network,
    #since the two are initialized independently

    update_target(update_ops, session)


def observe_state():
    """Return the current game state as a preprocessed array (screen channels plus depth)."""
    state = game.get_state()
    frame = np.concatenate((state.screen_buffer,
                            np.expand_dims(state.depth_buffer, axis=2)),
                           axis=2)

    return preprocess(frame, down_sample_ratio)


game.set_sound_enabled(False)
game.init()
t = 0

#Accumulate experiences in the buffer using an epsilon-greedy strategy with three training phases

try:
    for epoch in range(epochs):
        epoch_rewards = list()

        #The exploration rate only depends on the epoch, so it is computed once per epoch

        #Explore the environment by choosing random actions with 100% probability for the first phase of training

        if epoch < 0.3*epochs:
            epsilon = 1.0

        #Increase the probability of greedily choosing an action by a constant amount at each epoch in the second phase
        #(clamped so that the rate never falls below end_epsilon when the number of epochs is small)

        elif epoch < 0.9*epochs:
            epsilon = start_epsilon - (epoch + 1 - 0.2*epochs)*(start_epsilon-end_epsilon)/(0.7*epochs)
            epsilon = max(end_epsilon, epsilon)

        #Select a random action with end_epsilon probability in the final phase of training

        else:
            epsilon = end_epsilon

        for step in trange(steps_per_epoch, leave=True):
            game.new_episode()

            while not game.is_episode_finished():
                state1 = observe_state()

                if np.random.uniform(0, 1) <= epsilon:
                    action = np.random.randint(num_actions)

                else:
                    action = DQN.choose_action(session, state1)[0]

                reward = game.make_action(actions[action], frame_delay)
                done = game.is_episode_finished()

                #The next state of a terminal transition is never used (it is masked
                #by the terminal flag below), so the current state is stored in its place

                state2 = state1 if done else observe_state()

                #Add the experience obtained from each time step to the buffer

                t += 1
                exp_buffer.add_experience((state1, action, reward, state2, done))

            #Sample a minibatch from the buffer if there are enough experiences in the buffer

            if exp_buffer.length > batch_size:
                s1, a, r, s2, terminal = exp_buffer.sample_buffer(batch_size)

                #Train the Q-network by using the minibatch to update the action-value function Q
                #The bootstrap value comes from the target network; only the entry of the
                #action that was taken is replaced, so the other actions contribute no error

                Q2 = np.max(target_net.get_Q_values(session, s2), axis=1)
                target_Q = DQN.get_Q_values(session, s1)
                target_Q[np.arange(batch_size), a] = r + discount_factor*(1 - terminal)*Q2
                DQN.calculate_loss(session, s1, target_Q)

            epoch_rewards.append(game.get_total_reward())

        print('Epoch {} Mean Reward: {}'.format(epoch + 1, np.mean(epoch_rewards)))

        #Save the model, update the target network, and test the agent for 10 episodes every 10 epochs

        if (epoch + 1) % 10 == 0:
            if save_model:
                #saver.save returns the path prefix actually written (model_dir plus the global step)
                checkpoint = saver.save(session, model_dir, global_step=epoch + 1)
                print('Epoch {} Model saved to {}'.format(epoch + 1, checkpoint))

            update_target(update_ops, session)

            print('Epoch {} test:'.format(epoch + 1))
            print(test_agent(DQN, num_episodes=10,
                             training=True,
                             load_model=False,
                             session=session,
                             model_dir=model_dir))

    print('{} time steps experienced during training'.format(t))

#Close the game even if training is interrupted or fails

finally:
    game.close()
    

100%|████████████████████████████████████████████████████████████████████████████████| 200/200 [00:14<00:00, 14.24it/s]


Epoch 1 Mean Reward: 136.92948776245117
Epoch 1 Tensor("target_conv1_layer/kernel/read:0", shape=(8, 8, 4, 32), dtype=float32) [[[[ -5.38073480e-04  -3.90165709e-02   5.01258597e-02 ...,
      4.49389890e-02  -4.48479578e-02  -1.00529194e-02]
   [  1.35227814e-02   1.62318647e-02  -1.80295631e-02 ...,
     -4.06825393e-02   1.69028491e-02  -3.54929343e-02]
   [  3.41364965e-02   4.91783693e-02   4.56413180e-02 ...,
     -3.05127185e-02   8.53919610e-03   2.36751437e-02]
   [ -1.91343166e-02  -4.46616970e-02  -3.01158763e-02 ...,
     -1.56481639e-02  -4.76997830e-02   5.02218977e-02]]

  [[ -4.33390737e-02  -4.58763018e-02  -2.37723775e-02 ...,
     -2.23583337e-02   3.28511447e-02   3.04002091e-02]
   [  3.28202918e-03   4.29666415e-02  -4.06604931e-02 ...,
     -3.86675298e-02  -1.27191283e-02  -2.88914386e-02]
   [  1.00928508e-02   4.32154536e-03   7.97413662e-03 ...,
     -3.35233659e-02   2.16503143e-02   4.64262441e-02]
   [  4.65580672e-02  -9.18387622e-03   3.49955559e-02 ...,

     -4.33436632e-02   6.15068153e-03   4.45641950e-02]]]]
Epoch 1 Tensor("target_conv1_layer/bias/read:0", shape=(32,), dtype=float32) [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
Epoch 1 Tensor("target_conv2_layer/kernel/read:0", shape=(4, 4, 32, 64), dtype=float32) [[[[ 0.032681    0.00825977 -0.01125285 ...,  0.02822976 -0.00067407
    -0.00958267]
   [ 0.01089126 -0.03351262  0.00642926 ..., -0.00859715  0.03406656
    -0.05463187]
   [-0.05444732 -0.02222523 -0.04492816 ..., -0.03293735  0.03493433
     0.00555679]
   ..., 
   [-0.01567777  0.00838165 -0.02406426 ..., -0.01122496 -0.01800856
    -0.04048254]
   [ 0.05420281  0.01812902 -0.03565072 ...,  0.02238421 -0.05198269
     0.05231963]
   [ 0.04173176  0.04316385 -0.0466224  ...,  0.05345303  0.05892639
     0.04998359]]

  [[-0.00045007 -0.04575011 -0.04264088 ...,  0.05416721 -0.02603503
    -0.02666256]
   [ 0.04246594 -0.05946474  0.0

    -0.01187237]]]]
Epoch 1 Tensor("target_conv2_layer/bias/read:0", shape=(64,), dtype=float32) [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
Epoch 1 Tensor("target_dense1_layer/kernel/read:0", shape=(3072, 512), dtype=float32) [[ 0.02279261  0.01348449  0.02987393 ...,  0.02522606 -0.03943726
   0.0114601 ]
 [-0.00607321  0.01227226 -0.00700982 ...,  0.03629587  0.02542537
  -0.01761506]
 [-0.00178795  0.00329674 -0.01419873 ...,  0.03999633  0.02126062
  -0.0193316 ]
 ..., 
 [-0.03002015 -0.02281206  0.01064405 ..., -0.00666411 -0.01820347
  -0.01083244]
 [ 0.0213048  -0.01909448 -0.03053279 ...,  0.00832694  0.02645934
  -0.02168998]
 [-0.03539062 -0.00154861 -0.04051494 ...,  0.00901387 -0.03726084
  -0.03243659]]
Epoch 1 Tensor("target_dense1_layer/bias/read:0", shape=

    -0.01229834]]]]
Epoch 1 Tensor("online_conv1_layer/bias/read:0", shape=(32,), dtype=float32) [ 0.00998046  0.01096126  0.01665642  0.01380155  0.01089968  0.01011455
  0.01214929  0.01704935  0.02970435  0.0166959   0.01646263  0.01333768
  0.00114722  0.00473172 -0.00135617  0.01264965  0.00559393 -0.00270905
  0.0143431   0.01332767  0.01058558  0.01604313  0.00953063  0.00286364
  0.0088952   0.01140923  0.01373933  0.01363416  0.0117148   0.01612695
  0.00405253  0.01375497]
Epoch 1 Tensor("online_conv2_layer/kernel/read:0", shape=(4, 4, 32, 64), dtype=float32) [[[[ -3.43095399e-02   6.85858876e-02  -1.34868901e-02 ...,
      2.77280528e-02   6.15937263e-02   6.13318346e-02]
   [ -4.26065847e-02   5.70590124e-02   6.96490705e-02 ...,
      1.12478146e-02   9.22244880e-03  -1.81617271e-02]
   [ -3.13232206e-02  -3.60172801e-02  -1.23584084e-02 ...,
      1.64321940e-02   1.92311034e-02   7.63725862e-02]
   ..., 
   [ -1.32054016e-02   3.47100720e-02   7.26103038e-02 ...,
      1

      9.52579838e-04  -1.70628577e-02   2.22146558e-03]]]]
Epoch 1 Tensor("online_conv2_layer/bias/read:0", shape=(64,), dtype=float32) [ 0.02068944  0.02074827  0.02308786  0.02086914  0.02048927  0.02270553
 -0.00589217  0.0137466   0.02306501  0.01479056  0.02033488  0.02260095
  0.02324379  0.01265869  0.02377167 -0.00711189  0.0241143   0.0239802
  0.02073328  0.01686767  0.02001903  0.01884886  0.02127007  0.00910504
  0.01586481  0.02105849  0.02108392  0.0198582   0.02430466  0.00530597
  0.01755042  0.02039582  0.02099302  0.02396529  0.01740743  0.02339384
  0.02237461  0.0236814   0.01211906  0.0239194   0.02026177  0.00031011
  0.01615382  0.01853841 -0.00037012  0.02007431  0.02128727 -0.00210031
  0.01668294 -0.00402681  0.01843907 -0.00534579  0.02298356  0.02329294
  0.02229211 -0.00165216 -0.00285526  0.02284266  0.02300842  0.02413777
  0.02360903  0.01751695  0.02334722  0.02263661]
Epoch 1 Tensor("online_dense1_layer/kernel/read:0", shape=(3072, 512), dtype=float32)

   2.20262576e-02   1.42398570e-02  -5.86492196e-03   8.69065430e-03]
Epoch 1 Tensor("online_output_layer/kernel/read:0", shape=(512, 7), dtype=float32) [[ 0.1006219   0.06215213 -0.02072052 ..., -0.08145713  0.12197734
   0.09986423]
 [ 0.03404297 -0.06102786  0.09961747 ..., -0.07118447 -0.0132134
   0.03345056]
 [ 0.00941239  0.0795709  -0.05583194 ...,  0.04885146  0.03136585
   0.05439398]
 ..., 
 [ 0.10475917 -0.00380158  0.02263753 ..., -0.05470352  0.09015651
   0.05390124]
 [ 0.05318509 -0.03894806  0.01236578 ..., -0.07919432 -0.0711684
  -0.07880869]
 [ 0.03134237  0.06957599 -0.06693304 ..., -0.04053574  0.02550854
   0.06807113]]
Epoch 1 Tensor("online_output_layer/bias/read:0", shape=(7,), dtype=float32) [ 0.02344326  0.03701505  0.02362838  0.01862667  0.02696524  0.01354078
  0.01452726]


100%|████████████████████████████████████████████████████████████████████████████████| 200/200 [00:14<00:00, 13.87it/s]


Epoch 2 Mean Reward: 137.14943664550782
Epoch 2 Tensor("target_conv1_layer/kernel/read:0", shape=(8, 8, 4, 32), dtype=float32) [[[[  5.00485860e-02  -1.45031214e-02  -1.37521503e-02 ...,
      6.28788322e-02   4.36173519e-03   1.70186218e-02]
   [ -3.50263193e-02  -1.55144942e-03   2.07747016e-02 ...,
      3.75305042e-02  -2.72120163e-02   3.77959281e-04]
   [  1.25594595e-02   1.16664090e-03   1.60876438e-02 ...,
      3.33568677e-02  -6.32177740e-02   6.59097312e-03]
   [ -1.22172637e-02  -2.33157910e-02   6.96150586e-02 ...,
      4.99590077e-02  -3.66195552e-02  -1.39286239e-02]]

  [[  5.29896393e-02   1.75568130e-04   2.76541919e-03 ...,
      4.77953590e-02  -5.97909763e-02   4.14104015e-03]
   [  7.77238607e-03   4.44106907e-02   3.36174555e-02 ...,
      6.01929836e-02  -3.22375372e-02   4.04368192e-02]
   [  4.30519469e-02  -5.00971414e-02  -3.93620767e-02 ...,
      3.03440671e-02  -6.80513605e-02  -4.37459052e-02]
   [  4.74970375e-06  -2.04133848e-03  -1.61342584e-02 ...,

      6.49228552e-03   5.60613759e-02  -1.87998693e-02]]]]
Epoch 2 Tensor("target_conv1_layer/bias/read:0", shape=(32,), dtype=float32) [ 0.00503446  0.00772467  0.02463803  0.01962376  0.01283305  0.00651487
  0.00689674  0.02612324  0.05186253  0.02471139  0.02322371  0.01771112
  0.0029885   0.00790194  0.00213891  0.01556958  0.00720619 -0.00141298
  0.02240219  0.01861885  0.01223848  0.02152901  0.00853516  0.00519983
  0.00502043  0.0144525   0.0211486   0.01946189  0.00742501  0.0214534
  0.01214941  0.00926718]
Epoch 2 Tensor("target_conv2_layer/kernel/read:0", shape=(4, 4, 32, 64), dtype=float32) [[[[-0.04206598  0.06119433 -0.02149013 ...,  0.0201619   0.05412949
     0.05389601]
   [-0.05083889  0.0493154   0.06099827 ...,  0.00166565  0.00150822
    -0.02634924]
   [-0.02978264 -0.03344428 -0.01265548 ...,  0.01193694  0.02246441
     0.07758146]
   ..., 
   [-0.01404844  0.0348702   0.07030737 ..., -0.00477346  0.04183599
     0.03684872]
   [-0.04268995  0.00833096 -0.01

    -0.00549175]]]]
Epoch 2 Tensor("target_conv2_layer/bias/read:0", shape=(64,), dtype=float32) [ 0.05114191  0.05143592  0.04923395  0.05268323  0.0513421   0.0497993
 -0.00589217  0.0274605   0.05352336  0.02590189  0.04785303  0.04893831
  0.05475503  0.01652328  0.05435876 -0.00711189  0.05741706  0.05586756
  0.0527437   0.03035812  0.05017113  0.03545658  0.04186175  0.01256429
  0.02877858  0.0526733   0.03944463  0.04879654  0.058095    0.00530597
  0.03421588  0.04582443  0.0401909   0.0574468   0.02821111  0.05239864
  0.04853762  0.05307013  0.01378862  0.05771947  0.04784242  0.0003767
  0.03528848  0.03710266 -0.0001781   0.04287852  0.04159566 -0.00210031
  0.03350197 -0.00403178  0.04387469 -0.00533555  0.05095491  0.05315076
  0.04861747 -0.0016093  -0.00276602  0.05375302  0.05510601  0.05690867
  0.05381001  0.03379512  0.05571711  0.05090351]
Epoch 2 Tensor("target_dense1_layer/kernel/read:0", shape=(3072, 512), dtype=float32) [[-0.03751493 -0.03747647 -0.00594528 .

   5.72973639e-02   4.73111495e-02  -5.86492196e-03   1.08399978e-02]
Epoch 2 Tensor("target_output_layer/kernel/read:0", shape=(512, 7), dtype=float32) [[ 0.1006219   0.06215213 -0.02072052 ..., -0.08145713  0.12197734
   0.09986423]
 [ 0.03404297 -0.06102786  0.09961747 ..., -0.07118447 -0.0132134
   0.03345056]
 [ 0.00680809  0.07618912 -0.05882051 ...,  0.04376627  0.02868569
   0.05054599]
 ..., 
 [ 0.10308767 -0.00491428  0.0222906  ..., -0.05513031  0.09036234
   0.04904212]
 [ 0.05318509 -0.03894806  0.01236578 ..., -0.07919432 -0.0711684
  -0.07880869]
 [ 0.02978794  0.0655316  -0.06265574 ..., -0.04329158  0.02960177
   0.06716526]]
Epoch 2 Tensor("target_output_layer/bias/read:0", shape=(7,), dtype=float32) [ 0.06784763  0.07446461  0.04852601  0.04029027  0.07093964  0.03324635
  0.04612995]
Epoch 2 Tensor("online_conv1_layer/kernel/read:0", shape=(8, 8, 4, 32), dtype=float32) [[[[  5.00485860e-02  -1.45031214e-02  -1.37521503e-02 ...,
      6.28788322e-02   4.36173519e-03 

      6.49228552e-03   5.60613759e-02  -1.87998693e-02]]]]
Epoch 2 Tensor("online_conv1_layer/bias/read:0", shape=(32,), dtype=float32) [ 0.00503446  0.00772467  0.02463803  0.01962376  0.01283305  0.00651487
  0.00689674  0.02612324  0.05186253  0.02471139  0.02322371  0.01771112
  0.0029885   0.00790194  0.00213891  0.01556958  0.00720619 -0.00141298
  0.02240219  0.01861885  0.01223848  0.02152901  0.00853516  0.00519983
  0.00502043  0.0144525   0.0211486   0.01946189  0.00742501  0.0214534
  0.01214941  0.00926718]
Epoch 2 Tensor("online_conv2_layer/kernel/read:0", shape=(4, 4, 32, 64), dtype=float32) [[[[-0.04206598  0.06119433 -0.02149013 ...,  0.0201619   0.05412949
     0.05389601]
   [-0.05083889  0.0493154   0.06099827 ...,  0.00166565  0.00150822
    -0.02634924]
   [-0.02978264 -0.03344428 -0.01265548 ...,  0.01193694  0.02246441
     0.07758146]
   ..., 
   [-0.01404844  0.0348702   0.07030737 ..., -0.00477346  0.04183599
     0.03684872]
   [-0.04268995  0.00833096 -0.01

    -0.00549175]]]]
Epoch 2 Tensor("online_conv2_layer/bias/read:0", shape=(64,), dtype=float32) [ 0.05114191  0.05143592  0.04923395  0.05268323  0.0513421   0.0497993
 -0.00589217  0.0274605   0.05352336  0.02590189  0.04785303  0.04893831
  0.05475503  0.01652328  0.05435876 -0.00711189  0.05741706  0.05586756
  0.0527437   0.03035812  0.05017113  0.03545658  0.04186175  0.01256429
  0.02877858  0.0526733   0.03944463  0.04879654  0.058095    0.00530597
  0.03421588  0.04582443  0.0401909   0.0574468   0.02821111  0.05239864
  0.04853762  0.05307013  0.01378862  0.05771947  0.04784242  0.0003767
  0.03528848  0.03710266 -0.0001781   0.04287852  0.04159566 -0.00210031
  0.03350197 -0.00403178  0.04387469 -0.00533555  0.05095491  0.05315076
  0.04861747 -0.0016093  -0.00276602  0.05375302  0.05510601  0.05690867
  0.05381001  0.03379512  0.05571711  0.05090351]
Epoch 2 Tensor("online_dense1_layer/kernel/read:0", shape=(3072, 512), dtype=float32) [[-0.03751493 -0.03747647 -0.00594528 .

   5.72973639e-02   4.73111495e-02  -5.86492196e-03   1.08399978e-02]
Epoch 2 Tensor("online_output_layer/kernel/read:0", shape=(512, 7), dtype=float32) [[ 0.1006219   0.06215213 -0.02072052 ..., -0.08145713  0.12197734
   0.09986423]
 [ 0.03404297 -0.06102786  0.09961747 ..., -0.07118447 -0.0132134
   0.03345056]
 [ 0.00680809  0.07618912 -0.05882051 ...,  0.04376627  0.02868569
   0.05054599]
 ..., 
 [ 0.10308767 -0.00491428  0.0222906  ..., -0.05513031  0.09036234
   0.04904212]
 [ 0.05318509 -0.03894806  0.01236578 ..., -0.07919432 -0.0711684
  -0.07880869]
 [ 0.02978794  0.0655316  -0.06265574 ..., -0.04329158  0.02960177
   0.06716526]]
Epoch 2 Tensor("online_output_layer/bias/read:0", shape=(7,), dtype=float32) [ 0.06784763  0.07446461  0.04852601  0.04029027  0.07093964  0.03324635
  0.04612995]


100%|████████████████████████████████████████████████████████████████████████████████| 200/200 [00:10<00:00, 18.83it/s]


Epoch 3 Mean Reward: 232.87641006469727
Epoch 3 Tensor("target_conv1_layer/kernel/read:0", shape=(8, 8, 4, 32), dtype=float32) [[[[  5.00485860e-02  -1.45031214e-02  -1.37521503e-02 ...,
      6.28788322e-02   4.36173519e-03   1.70186218e-02]
   [ -3.50263193e-02  -1.55144942e-03   2.07747016e-02 ...,
      3.75305042e-02  -2.72120163e-02   3.77959281e-04]
   [  1.25594595e-02   1.16664090e-03   1.60876438e-02 ...,
      3.33568677e-02  -6.32177740e-02   6.59097312e-03]
   [ -1.22172637e-02  -2.33157910e-02   6.96150586e-02 ...,
      4.99590077e-02  -3.66195552e-02  -1.39286239e-02]]

  [[  5.29896393e-02   1.75568130e-04   2.76541919e-03 ...,
      4.77953590e-02  -5.97909763e-02   4.14104015e-03]
   [  7.77238607e-03   4.44106907e-02   3.36174555e-02 ...,
      6.01929836e-02  -3.22375372e-02   4.04368192e-02]
   [  4.30519469e-02  -5.00971414e-02  -3.93620767e-02 ...,
      3.03440671e-02  -6.80513605e-02  -4.37459052e-02]
   [  4.74970375e-06  -2.04133848e-03  -1.61342584e-02 ...,

      6.49228552e-03   5.60613759e-02  -1.87998693e-02]]]]
Epoch 3 Tensor("target_conv1_layer/bias/read:0", shape=(32,), dtype=float32) [ 0.00503446  0.00772467  0.02463803  0.01962376  0.01283305  0.00651487
  0.00689674  0.02612324  0.05186253  0.02471139  0.02322371  0.01771112
  0.0029885   0.00790194  0.00213891  0.01556958  0.00720619 -0.00141298
  0.02240219  0.01861885  0.01223848  0.02152901  0.00853516  0.00519983
  0.00502043  0.0144525   0.0211486   0.01946189  0.00742501  0.0214534
  0.01214941  0.00926718]
Epoch 3 Tensor("target_conv2_layer/kernel/read:0", shape=(4, 4, 32, 64), dtype=float32) [[[[-0.04206598  0.06119433 -0.02149013 ...,  0.0201619   0.05412949
     0.05389601]
   [-0.05083889  0.0493154   0.06099827 ...,  0.00166565  0.00150822
    -0.02634924]
   [-0.02978264 -0.03344428 -0.01265548 ...,  0.01193694  0.02246441
     0.07758146]
   ..., 
   [-0.01404844  0.0348702   0.07030737 ..., -0.00477346  0.04183599
     0.03684872]
   [-0.04268995  0.00833096 -0.01

    -0.00549175]]]]
Epoch 3 Tensor("target_conv2_layer/bias/read:0", shape=(64,), dtype=float32) [ 0.05114191  0.05143592  0.04923395  0.05268323  0.0513421   0.0497993
 -0.00589217  0.0274605   0.05352336  0.02590189  0.04785303  0.04893831
  0.05475503  0.01652328  0.05435876 -0.00711189  0.05741706  0.05586756
  0.0527437   0.03035812  0.05017113  0.03545658  0.04186175  0.01256429
  0.02877858  0.0526733   0.03944463  0.04879654  0.058095    0.00530597
  0.03421588  0.04582443  0.0401909   0.0574468   0.02821111  0.05239864
  0.04853762  0.05307013  0.01378862  0.05771947  0.04784242  0.0003767
  0.03528848  0.03710266 -0.0001781   0.04287852  0.04159566 -0.00210031
  0.03350197 -0.00403178  0.04387469 -0.00533555  0.05095491  0.05315076
  0.04861747 -0.0016093  -0.00276602  0.05375302  0.05510601  0.05690867
  0.05381001  0.03379512  0.05571711  0.05090351]
Epoch 3 Tensor("target_dense1_layer/kernel/read:0", shape=(3072, 512), dtype=float32) [[-0.03751493 -0.03747647 -0.00594528 .

   5.72973639e-02   4.73111495e-02  -5.86492196e-03   1.08399978e-02]
Epoch 3 Tensor("target_output_layer/kernel/read:0", shape=(512, 7), dtype=float32) [[ 0.1006219   0.06215213 -0.02072052 ..., -0.08145713  0.12197734
   0.09986423]
 [ 0.03404297 -0.06102786  0.09961747 ..., -0.07118447 -0.0132134
   0.03345056]
 [ 0.00680809  0.07618912 -0.05882051 ...,  0.04376627  0.02868569
   0.05054599]
 ..., 
 [ 0.10308767 -0.00491428  0.0222906  ..., -0.05513031  0.09036234
   0.04904212]
 [ 0.05318509 -0.03894806  0.01236578 ..., -0.07919432 -0.0711684
  -0.07880869]
 [ 0.02978794  0.0655316  -0.06265574 ..., -0.04329158  0.02960177
   0.06716526]]
Epoch 3 Tensor("target_output_layer/bias/read:0", shape=(7,), dtype=float32) [ 0.06784763  0.07446461  0.04852601  0.04029027  0.07093964  0.03324635
  0.04612995]
Epoch 3 Tensor("online_conv1_layer/kernel/read:0", shape=(8, 8, 4, 32), dtype=float32) [[[[  5.12748510e-02  -1.69229489e-02  -1.15544600e-02 ...,
      6.29799441e-02   5.10200765e-03 

      1.35142850e-02   5.56298830e-02  -1.76649224e-02]]]]
Epoch 3 Tensor("online_conv1_layer/bias/read:0", shape=(32,), dtype=float32) [ 0.00687332  0.00763064  0.03236208  0.02429168  0.01864812  0.00893374
  0.00802453  0.03466252  0.09549967  0.03240167  0.02930646  0.02702974
  0.00741037  0.01461632  0.01060915  0.02254215  0.01345192 -0.0001409
  0.02999371  0.02329127  0.01328965  0.02630598  0.00695488  0.01123187
  0.0038334   0.01638747  0.0279552   0.02469253  0.00897928  0.02600464
  0.03029324  0.01054183]
Epoch 3 Tensor("online_conv2_layer/kernel/read:0", shape=(4, 4, 32, 64), dtype=float32) [[[[-0.04002456  0.06360849 -0.02010221 ...,  0.02103647  0.05666557
     0.05605999]
   [-0.05275203  0.04758933  0.05897632 ...,  0.00021194 -0.00021308
    -0.02819478]
   [-0.02766566 -0.030036   -0.01234872 ...,  0.0097659   0.02656346
     0.07951225]
   ..., 
   [-0.01459383  0.03547522  0.06853212 ..., -0.00803598  0.04299522
     0.03629647]
   [-0.03562127  0.01579934 -0.00

    -0.00354857]]]]
Epoch 3 Tensor("online_conv2_layer/bias/read:0", shape=(64,), dtype=float32) [ 0.07139094  0.07249916  0.06407811  0.07456315  0.07210473  0.06666331
 -0.00589217  0.02988605  0.07392171  0.02682064  0.06465647  0.06483692
  0.07693965  0.01497072  0.07508919 -0.00711189  0.08124378  0.07827206
  0.0753528   0.03415906  0.06983702  0.03964742  0.05049747  0.00778302
  0.03037943  0.0747849   0.04653237  0.06793791  0.08276246  0.00530597
  0.03819938  0.05934241  0.04600928  0.0813863   0.02904304  0.07088833
  0.0645208   0.07197579  0.01122945  0.08211575  0.06560335  0.00046927
  0.04338513  0.042655   -0.00017808  0.05309337  0.05067303 -0.00210031
  0.03813276 -0.00403184  0.05766851 -0.00530824  0.06868371  0.0727151
  0.06464674 -0.00092881 -0.00268086  0.07468679  0.07722269  0.0801424
  0.07448834  0.0375482   0.0789535   0.06881034]
Epoch 3 Tensor("online_dense1_layer/kernel/read:0", shape=(3072, 512), dtype=float32) [[-0.03751493 -0.03747647 -0.01130008 .

   7.42573291e-02   6.02619126e-02  -5.86492196e-03   1.08185094e-02]
Epoch 3 Tensor("online_output_layer/kernel/read:0", shape=(512, 7), dtype=float32) [[ 0.1006219   0.06215213 -0.02072052 ..., -0.08145713  0.12197734
   0.09986423]
 [ 0.03404297 -0.06102786  0.09961747 ..., -0.07118447 -0.0132134
   0.03345056]
 [ 0.00515776  0.07516403 -0.05484526 ...,  0.04857498  0.03017473
   0.05226852]
 ..., 
 [ 0.10287991 -0.0016571   0.02950205 ..., -0.04296145  0.09533649
   0.05023082]
 [ 0.05318509 -0.03894806  0.01236578 ..., -0.07919432 -0.0711684
  -0.07880869]
 [ 0.02977259  0.06551363 -0.06266053 ..., -0.04330845  0.02959852
   0.06715276]]
Epoch 3 Tensor("online_output_layer/bias/read:0", shape=(7,), dtype=float32) [ 0.0912843   0.10060911  0.07806488  0.04029471  0.11149506  0.04482936
  0.08377936]


100%|████████████████████████████████████████████████████████████████████████████████| 200/200 [00:09<00:00, 21.61it/s]


Epoch 4 Mean Reward: 259.16441040039064
Epoch 4 Tensor("target_conv1_layer/kernel/read:0", shape=(8, 8, 4, 32), dtype=float32) [[[[  5.18262908e-02  -1.76942386e-02  -8.80439766e-03 ...,
      6.23698086e-02   9.58576985e-03   1.48198428e-02]
   [ -3.27046327e-02  -5.49641019e-03   2.47977767e-02 ...,
      3.61153744e-02  -2.61285044e-02  -1.89471163e-03]
   [  1.54423583e-02  -2.16127629e-03   1.85667258e-02 ...,
      3.12425643e-02  -6.24984875e-02   4.04794887e-03]
   [ -1.04238773e-02  -2.34391019e-02   8.35516006e-02 ...,
      5.63822612e-02  -4.02660780e-02  -1.32640274e-02]]

  [[  5.44033796e-02  -1.93366827e-03   5.85590769e-03 ...,
      4.59830835e-02  -6.01914264e-02   2.41814647e-03]
   [  8.63781013e-03   4.13494706e-02   3.35667394e-02 ...,
      5.55699244e-02  -3.42977084e-02   3.75230648e-02]
   [  4.33811918e-02  -5.25615178e-02  -4.61723469e-02 ...,
      2.03608144e-02  -7.03802034e-02  -4.74250093e-02]
   [  1.33634929e-03  -2.28548888e-03  -1.24431518e-03 ...,

      1.53160160e-02   5.08010611e-02  -1.69739015e-02]]]]
Epoch 4 Tensor("target_conv1_layer/bias/read:0", shape=(32,), dtype=float32) [ 0.00821518  0.00797846  0.0380997   0.02533304  0.02404024  0.01028251
  0.00976532  0.04161205  0.15016034  0.03819393  0.03298371  0.03505682
  0.01237697  0.02330031  0.0218413   0.02956758  0.01746091 -0.0005924
  0.0358631   0.02482431  0.0096363   0.02792755  0.0023239   0.01795165
  0.00379562  0.01392857  0.03213799  0.02669661  0.01043417  0.02745037
  0.06390282  0.01202095]
Epoch 4 Tensor("target_conv2_layer/kernel/read:0", shape=(4, 4, 32, 64), dtype=float32) [[[[-0.03913163  0.06493122 -0.02006002 ...,  0.02058386  0.05815026
     0.05697547]
   [-0.05406087  0.04633566  0.05748347 ..., -0.00069478 -0.00136831
    -0.02954008]
   [-0.0259845  -0.02649065 -0.01401004 ...,  0.00672808  0.03160282
     0.080462  ]
   ..., 
   [-0.0169897   0.0345169   0.06411554 ..., -0.01185559  0.04316273
     0.03345779]
   [-0.02585646  0.02680165  0.00

    -0.00370837]]]]
Epoch 4 Tensor("target_conv2_layer/bias/read:0", shape=(64,), dtype=float32) [ 0.09029818  0.09348553  0.07476953  0.09664004  0.09332771  0.08164314
 -0.00589217  0.03182727  0.0937581   0.0300987   0.07776768  0.0776246
  0.0995565   0.0137414   0.09503114 -0.00711189  0.10635817  0.10055987
  0.09870144  0.04191434  0.08870799  0.04026045  0.05396047  0.00341845
  0.03329061  0.09707917  0.04648458  0.08571221  0.10912129  0.00530597
  0.03966041  0.06812066  0.04709508  0.1074002   0.02765803  0.08801778
  0.07677183  0.08962462  0.01048387  0.10739353  0.08012608  0.00308253
  0.04737995  0.04288788 -0.00017808  0.05784631  0.05399861 -0.00210031
  0.04106256 -0.00403184  0.06793603 -0.00525366  0.08465961  0.09164605
  0.07822563  0.02998095 -0.00263949  0.09551606  0.09936899  0.10414565
  0.09478296  0.03597187  0.1034136   0.08506134]
Epoch 4 Tensor("target_dense1_layer/kernel/read:0", shape=(3072, 512), dtype=float32) [[-0.03751493 -0.03747647 -0.0104574  

   1.15194358e-01   9.24459323e-02  -5.86492196e-03   1.08185094e-02]
Epoch 4 Tensor("target_output_layer/kernel/read:0", shape=(512, 7), dtype=float32) [[ 0.1006219   0.06215213 -0.02072052 ..., -0.08145713  0.12197734
   0.09986423]
 [ 0.03404297 -0.06102786  0.09961747 ..., -0.07118447 -0.0132134
   0.03345056]
 [ 0.00486251  0.07200358 -0.05394989 ...,  0.04727814  0.02996204
   0.05429955]
 ..., 
 [ 0.10827197  0.00164223  0.03618358 ..., -0.03556941  0.09737757
   0.05615848]
 [ 0.05318509 -0.03894806  0.01236578 ..., -0.07919432 -0.0711684
  -0.07880869]
 [ 0.02977259  0.06551363 -0.06266053 ..., -0.04330845  0.02959852
   0.06715276]]
Epoch 4 Tensor("target_output_layer/bias/read:0", shape=(7,), dtype=float32) [ 0.13366784  0.13944715  0.0954923   0.06735868  0.13339001  0.04791654
  0.10652097]
Epoch 4 Tensor("online_conv1_layer/kernel/read:0", shape=(8, 8, 4, 32), dtype=float32) [[[[  5.18262908e-02  -1.76942386e-02  -8.80439766e-03 ...,
      6.23698086e-02   9.58576985e-03 

      1.53160160e-02   5.08010611e-02  -1.69739015e-02]]]]
Epoch 4 Tensor("online_conv1_layer/bias/read:0", shape=(32,), dtype=float32) [ 0.00821518  0.00797846  0.0380997   0.02533304  0.02404024  0.01028251
  0.00976532  0.04161205  0.15016034  0.03819393  0.03298371  0.03505682
  0.01237697  0.02330031  0.0218413   0.02956758  0.01746091 -0.0005924
  0.0358631   0.02482431  0.0096363   0.02792755  0.0023239   0.01795165
  0.00379562  0.01392857  0.03213799  0.02669661  0.01043417  0.02745037
  0.06390282  0.01202095]
Epoch 4 Tensor("online_conv2_layer/kernel/read:0", shape=(4, 4, 32, 64), dtype=float32) [[[[-0.03913163  0.06493122 -0.02006002 ...,  0.02058386  0.05815026
     0.05697547]
   [-0.05406087  0.04633566  0.05748347 ..., -0.00069478 -0.00136831
    -0.02954008]
   [-0.0259845  -0.02649065 -0.01401004 ...,  0.00672808  0.03160282
     0.080462  ]
   ..., 
   [-0.0169897   0.0345169   0.06411554 ..., -0.01185559  0.04316273
     0.03345779]
   [-0.02585646  0.02680165  0.00

    -0.00370837]]]]
Epoch 4 Tensor("online_conv2_layer/bias/read:0", shape=(64,), dtype=float32) [ 0.09029818  0.09348553  0.07476953  0.09664004  0.09332771  0.08164314
 -0.00589217  0.03182727  0.0937581   0.0300987   0.07776768  0.0776246
  0.0995565   0.0137414   0.09503114 -0.00711189  0.10635817  0.10055987
  0.09870144  0.04191434  0.08870799  0.04026045  0.05396047  0.00341845
  0.03329061  0.09707917  0.04648458  0.08571221  0.10912129  0.00530597
  0.03966041  0.06812066  0.04709508  0.1074002   0.02765803  0.08801778
  0.07677183  0.08962462  0.01048387  0.10739353  0.08012608  0.00308253
  0.04737995  0.04288788 -0.00017808  0.05784631  0.05399861 -0.00210031
  0.04106256 -0.00403184  0.06793603 -0.00525366  0.08465961  0.09164605
  0.07822563  0.02998095 -0.00263949  0.09551606  0.09936899  0.10414565
  0.09478296  0.03597187  0.1034136   0.08506134]
Epoch 4 Tensor("online_dense1_layer/kernel/read:0", shape=(3072, 512), dtype=float32) [[-0.03751493 -0.03747647 -0.0104574  

   1.15194358e-01   9.24459323e-02  -5.86492196e-03   1.08185094e-02]
Epoch 4 Tensor("online_output_layer/kernel/read:0", shape=(512, 7), dtype=float32) [[ 0.1006219   0.06215213 -0.02072052 ..., -0.08145713  0.12197734
   0.09986423]
 [ 0.03404297 -0.06102786  0.09961747 ..., -0.07118447 -0.0132134
   0.03345056]
 [ 0.00486251  0.07200358 -0.05394989 ...,  0.04727814  0.02996204
   0.05429955]
 ..., 
 [ 0.10827197  0.00164223  0.03618358 ..., -0.03556941  0.09737757
   0.05615848]
 [ 0.05318509 -0.03894806  0.01236578 ..., -0.07919432 -0.0711684
  -0.07880869]
 [ 0.02977259  0.06551363 -0.06266053 ..., -0.04330845  0.02959852
   0.06715276]]
Epoch 4 Tensor("online_output_layer/bias/read:0", shape=(7,), dtype=float32) [ 0.13366784  0.13944715  0.0954923   0.06735868  0.13339001  0.04791654
  0.10652097]
6105 time steps experienced during training


In [None]:
#Get a list of checkpoints saved during training

# tf.train.get_checkpoint_state returns None when the directory contains no
# checkpoint state file, so fail with a clear message instead of an
# AttributeError raised on None.
ckpt_state = tf.train.get_checkpoint_state('checkpoints')
if ckpt_state is None or not ckpt_state.all_model_checkpoint_paths:
    raise FileNotFoundError("No saved checkpoints found in 'checkpoints'; "
                            "run the training cell with model saving enabled first.")

# Copy the paths into a plain list (ordered oldest to newest) for indexing
ckpts = list(ckpt_state.all_model_checkpoint_paths)

#Test the trained model from a certain checkpoint by only choosing actions with a greedy strategy

# Use the second-most-recent checkpoint as before; if only a single checkpoint
# was saved, fall back to it rather than raising an IndexError.
test_ckpt = ckpts[-2] if len(ckpts) >= 2 else ckpts[-1]

print(test_agent(DQN, num_episodes=20, training=False, load_model=True, model_dir=test_ckpt))
