In [3]:
import importlib.util
import time

import tensorflow as tf
import numpy as np
import vizdoom as vd

from skimage.transform import rescale
from tqdm import trange
from IPython.display import HTML


In [4]:
#Specify the game scenario and the screen format/resolution

game = vd.DoomGame()
game.set_screen_format(vd.ScreenFormat.BGR24)
game.set_screen_resolution(vd.ScreenResolution.RES_640X480)
game.set_depth_buffer_enabled(False)
game.load_config('take_cover.cfg')

down_sample_ratio = 0.125
width = int(game.get_screen_width()*down_sample_ratio)
height = int(game.get_screen_height()*down_sample_ratio)
channels = game.get_screen_channels() + int(game.is_depth_buffer_enabled())

#Specify the available actions in the scenario

available_actions = game.get_available_buttons()
actions = [list(ohe) for ohe in list(np.identity(len(available_actions)))]
num_actions = len(available_actions)

#Specify the Q-network learning parameters

frame_delay = 12
buffer_size = 25000
epochs = 200
steps_per_epoch = 2000
learning_rate = 0.0025
gamma = 0
start_epsilon = 1.0
end_epsilon = 0.1
batch_size = 100
load_model = False
save_model = True
model_dir = './checkpoints/take_cover.ckpt'
num_ckpts = 10


In [5]:
#Create a buffer object that holds a set of training experiences (state-action-reward tuples)

class Buffer():
    def __init__(self, size=1000):
        self.buffer = list()
        self.length = len(self.buffer)
        self.size = size
        
#Add a new experience to the buffer (remove the oldest experience if the buffer is already full)
        
    def add_experience(self, experience):
        if self.length + 1 >= self.size:
            self.buffer[0:(self.length + 1) - self.size] = []
        
        self.buffer.append(experience)
        self.length = len(self.buffer)
            
#Return a batch of experience arrays randomly sampled from the buffer
            
    def sample_buffer(self, sample_size):
        sample = np.random.randint(self.length, size=sample_size)
        s1 = np.concatenate([self.buffer[idx][0] for idx in sample], axis=0)
        a = np.array([self.buffer[idx][1] for idx in sample])
        r = np.array([self.buffer[idx][2] for idx in sample])
        s2 = np.concatenate([self.buffer[idx][3] for idx in sample], axis=0)
        terminal = np.array([self.buffer[idx][4] for idx in sample], dtype=np.int32)
        
        return s1, a, r, s2, terminal

#Downsample and normalize an image array representing the game state at a given time stamp

def preprocess(image, down_sample_ratio=1):
    if float(down_sample_ratio) != 1.0:
        image = rescale(image=image,
                        scale=(down_sample_ratio,
                               down_sample_ratio),
                        mode='reflect')
    image = image.astype(np.float32)
    image = np.expand_dims(image, axis=0)

    return image

#Test the agent using a currently training or previously trained model

def test_agent(model, num_episodes, load_model, depth, training=True, session=None, model_dir=None):
    if load_model == True:
        sess = tf.Session()
        print('Loading model from', model_dir)
        tf.train.Saver().restore(sess, model_dir)
        
#Require an existing session if a pretrained model isn't provided
        
    elif load_model == False:
        sess = session

    game.set_sound_enabled(True)
    episode_rewards = list()
    
#Avoid reinitializing the game if this was already done by the training process
    
    if training == False:
        game.init()

    for i in range(num_episodes):
        game.new_episode()
    
        while not game.is_episode_finished():
            state = game.get_state()
            
            if depth == False:
                state_buffer = np.moveaxis(state.screen_buffer, 0, 2)
            
            elif depth == True:
                depth_buffer = state.depth_buffer
                state_buffer = np.stack((state.screen_buffer,
                                         depth_buffer), axis=-1)
                
            state1 = preprocess(state_buffer, down_sample_ratio)
            action = model.choose_action(sess, state1)[0]
            reward = game.make_action(actions[action])
            
#Add a delay between each time step so that the episodes occur at normal speed

            time.sleep(0.02)
        
        episode_rewards.append(game.get_total_reward())
        print('Test Episode {} Reward: {}'.format(i + 1, game.get_total_reward()))
        time.sleep(1)
    
#Avoid ending the game so that the training process can continue
    
    if training == False:
        game.close()
    
    return np.mean(episode_rewards)


In [6]:
#Create a Q-network to estimate values and choose actions for a given state

class Q_network():
    def __init__(self, network_name, height, width, channels, learning_rate=0.001):
        self.learning_rate = learning_rate
        self.s_t = tf.placeholder(tf.float32,
                                  shape=[None, height, width, channels],
                                  name=network_name + '_state'
                                 )
        self.a_t = tf.placeholder(tf.int32,
                                  shape=[None],
                                  name=network_name + '_action'
                                 )
        self.Q_target = tf.placeholder(tf.float32,
                                       shape=[None, num_actions],
                                       name=network_name + '_Q_target'
                                      )

        self.input_layer = tf.reshape(self.s_t,
                                      [-1, height, width, channels],
                                      name=network_name + '_input_layer'
                                     )
        self.conv1 = tf.layers.conv2d(inputs=self.input_layer,
                                      filters=32,
                                      kernel_size=[8, 8],
                                      strides=[4, 4],
                                      padding='valid',
                                      activation=tf.nn.relu,
                                      name=network_name + '_conv1_layer'
                                     )
        self.conv2 = tf.layers.conv2d(inputs=self.conv1,
                                      filters=64,
                                      kernel_size=[4, 4],
                                      strides=[2, 2],
                                      padding='valid',
                                      activation=tf.nn.relu,
                                      name=network_name + '_conv2_layer'
                                     )
        self.flatten = tf.layers.flatten(self.conv2,
                                         name=network_name + '_flatten'
                                        )
        self.dense = tf.layers.dense(inputs=self.flatten,
                                      units=512,
                                      activation=tf.nn.relu,
                                      name=network_name + '_dense1_layer'
                                    )
        self.Q_values = tf.layers.dense(inputs=self.dense,
                                        units=len(actions),
                                        activation=None,
                                        name=network_name + '_output_layer'
                                       )        
    
        self.best_action = tf.argmax(self.Q_values, 1)
        self.loss = tf.losses.mean_squared_error(self.Q_values,
                                                 self.Q_target)
        self.adam = tf.train.AdamOptimizer(learning_rate=self.learning_rate,
                                           name=network_name + '_adam'
                                          )
        self.train = self.adam.minimize(self.loss)
        
    def update_lr(self):
        self.learning_rate = 0.98*self.learning_rate
        
        return self.learning_rate

    def calculate_loss(self, session, s, q):
        L, _ = session.run([self.loss, self.train],
                           feed_dict={self.s_t: s,
                                      self.Q_target: q})
    
        return L

#Return the array of Q-values and the best action associated with a given state

    def get_Q_values(self, session, s):
        Q = session.run(self.Q_values,
                        feed_dict={self.s_t: s})

        return Q
    
    def choose_action(self, session, s):
        a = session.run(self.best_action,
                        feed_dict={self.s_t: s})
    
        return a
    
#Create a list of variable update operations

def update_graph(variables):
    update_ops = list()
    
#Assign weight values from the network created first to the one created second
    
    for idx, variable in enumerate(variables[:len(variables)//2]):
        op = variable.assign(variables[idx + len(variables)//2].value())
        update_ops.append(op)
    
    return update_ops

#Update the target network parameters to match those of the online network

def update_target(ops, session):
    for op in update_ops:
        session.run(op)


In [7]:
#For each time step, collect the following data:
#The current game state
#The action that was taken taken
#The reward obtained from the chosen action
#The next game state (store the first game state if the previous action ends the episode)
#A variable indicating whether the episode is over yet


tf.reset_default_graph()

#Instantiate the target network before the online network so that it's updated correctly

target_net = Q_network(network_name='target',
                       learning_rate=learning_rate,
                       height=height,
                       width=width,
                       channels=channels)
DQN = Q_network(network_name='online',
                learning_rate=learning_rate,
                height=height,
                width=width,
                channels=channels)

exp_buffer = Buffer(size=buffer_size)
session = tf.Session()
saver = tf.train.Saver(max_to_keep=num_ckpts, reshape=True)
weights = tf.trainable_variables()

update_ops = update_graph(weights)

if load_model == True:
    print('Loading model from', model_dir)
    tf.train.Saver().restore(session, model_dir)
    
elif load_model == False:
    session.run(tf.global_variables_initializer())

game.set_sound_enabled(False)
game.init()

t = 0
epoch_rank = list()
epoch_rank_depth = list()

#Accumulate experiences in the buffer using an epsilon-greedy strategy with three training phases

for epoch in range(epochs):
    epoch_rewards = list()
    
    for step in trange(steps_per_epoch, leave=True):
        experience = list()
        game.new_episode()
        
        while not game.is_episode_finished():
            state = game.get_state()
            
#Substitute an array of zeros for the depth buffer if that setting is disabled

            if game.is_depth_buffer_enabled() == False:
                state1_buffer = np.moveaxis(state.screen_buffer, 0, 2)
            else:
                depth_buffer = state.depth_buffer
                state1_buffer = np.stack((state.screen_buffer,
                                          depth_buffer), axis=-1)
                
            state1 = preprocess(state1_buffer, down_sample_ratio)
            
#Explore the environment by choosing random actions with 100% probability for the first phase of training

            if epoch < 0.2*epochs:
                action = np.random.randint(num_actions)
            
#Increase the probability of greedily choosing an action by a constant amount at each epoch in the second phase
            
            elif epoch < 0.9*epochs:
                epsilon = start_epsilon - (epoch + 1 - 0.2*epochs)*(start_epsilon-end_epsilon)/(0.7*epochs)
            
                if np.random.uniform(0, 1) <= epsilon:
                    action = np.random.randint(num_actions)
                else:
                    action = DQN.choose_action(session, state1)[0]

#Select a random action with 10% probability in the final phase of training
                
            else:
                if np.random.uniform(0, 1) <= end_epsilon:
                    action = np.random.randint(num_actions)
                else:
                    action = DQN.choose_action(session, state1)[0]

            reward = game.make_action(actions[action], frame_delay)
            done = game.is_episode_finished()
            
            if done == False:
                state = game.get_state()
                
                if game.is_depth_buffer_enabled() == False:
                    state2_buffer = np.moveaxis(state.screen_buffer, 0, 2)
                else:
                    depth_buffer = state.depth_buffer
                    state2_buffer = np.stack((state.screen_buffer,
                                              depth_buffer), axis=-1)
                
                state2 = preprocess(state2_buffer, down_sample_ratio)
                
            elif done == True:
                state2 = state1
        
#Add the experience obtained from each time step to the buffer

            t += 1
            exp_buffer.add_experience((state1, action, reward, state2, done))
        
#Sample a minibatch from the buffer if there are enough experiences in the buffer

        if exp_buffer.length > batch_size:
            s1, a, r, s2, terminal = exp_buffer.sample_buffer(batch_size)
            
#Get the target values from the target Q-network
            
            target_Q = np.max(target_net.get_Q_values(session, s2), axis=1)
            
#Train the online Q-network by using a minibatch to update the action-value function
            
            Q2 = DQN.get_Q_values(session, s1)
            Q2[np.arange(batch_size), a] = r + gamma*(1 - terminal)*target_Q
            DQN.calculate_loss(session, s1, Q2)
            
        epoch_rewards.append(game.get_total_reward())
        
#Increase the discount factor at each epoch until it reaches 0.99
    
    if gamma < 0.99:
        gamma = 1-.98*(1-gamma)
    elif gamma >= 0.99:
        gamma = 0.99
        
#Decrease the learning rate at each epoch

    DQN.update_lr()
    target_net.update_lr()
    
    print('Epoch {} Mean Reward: {}'.format(epoch + 1, np.mean(epoch_rewards)))
    
#Update the target network every 10 epochs
    
    if (epoch + 1) % 10 == 0 and epoch > 0:
        update_target(update_ops, session)
        
#Save the model and test the agent for 10 episodes every 20 epochs
    
    if (epoch + 1) % 10 == 0 and epoch > 0:
        if save_model == True:
            checkpoint = model_dir + '-' + str(epoch + 1)
            print('Epoch {} Model saved to {}'.format(epoch + 1, model_dir))
            saver.save(session, model_dir, global_step=epoch + 1)
            
        update_target(update_ops, session)

#Test the agent both with and without the depth buffer given
        
        print('Epoch {} test:'.format(epoch + 1))
        test_reward = test_agent(DQN, num_episodes=10,
                                 training=True,
                                 load_model=False,
                                 depth=False,
                                 session=session,
                                 model_dir=model_dir)
        print('Average Test Reward:', test_reward)
        
        epoch_rank_depth.append((test_reward, epoch + 1))
        epoch_rank.append((test_reward, epoch + 1))
        
#Return a sorted list of epoch checkpoints based on average test episode reward
        
print(sorted(epoch_rank, reverse=True))
print('{} time steps experienced during training'.format(t))
game.close()
    

100%|██████████| 2000/2000 [08:39<00:00,  3.85it/s]


Epoch 1 Mean Reward: 395.2325


100%|██████████| 2000/2000 [10:49<00:00,  3.08it/s]


Epoch 2 Mean Reward: 398.9645


100%|██████████| 2000/2000 [14:48<00:00,  2.25it/s]


Epoch 3 Mean Reward: 399.998


100%|██████████| 2000/2000 [14:59<00:00,  2.22it/s]


Epoch 4 Mean Reward: 390.0215


100%|██████████| 2000/2000 [11:38<00:00,  2.86it/s]


Epoch 5 Mean Reward: 388.4615


100%|██████████| 2000/2000 [10:24<00:00,  3.20it/s]


Epoch 6 Mean Reward: 389.777


100%|██████████| 2000/2000 [12:39<00:00,  2.63it/s]


Epoch 7 Mean Reward: 392.988


100%|██████████| 2000/2000 [08:37<00:00,  3.86it/s]


Epoch 8 Mean Reward: 392.9875


100%|██████████| 2000/2000 [11:24<00:00,  2.92it/s]


Epoch 9 Mean Reward: 392.8345


100%|██████████| 2000/2000 [12:38<00:00,  2.64it/s]


Epoch 10 Mean Reward: 389.4645
Epoch 10 Model saved to ./checkpoints/take_cover.ckpt
Epoch 10 test:
Test Episode 1 Reward: 219.0
Test Episode 2 Reward: 292.0
Test Episode 3 Reward: 219.0
Test Episode 4 Reward: 219.0
Test Episode 5 Reward: 248.0
Test Episode 6 Reward: 219.0
Test Episode 7 Reward: 165.0
Test Episode 8 Reward: 219.0
Test Episode 9 Reward: 139.0
Test Episode 10 Reward: 232.0
Average Test Reward: 217.1


100%|██████████| 2000/2000 [11:45<00:00,  2.83it/s]


Epoch 11 Mean Reward: 394.697


100%|██████████| 2000/2000 [12:18<00:00,  2.71it/s]


Epoch 12 Mean Reward: 397.646


100%|██████████| 2000/2000 [12:33<00:00,  2.65it/s]


Epoch 13 Mean Reward: 393.736


100%|██████████| 2000/2000 [12:20<00:00,  2.70it/s]


Epoch 14 Mean Reward: 395.663


100%|██████████| 2000/2000 [13:02<00:00,  2.56it/s]


Epoch 15 Mean Reward: 390.174


100%|██████████| 2000/2000 [16:50<00:00,  1.98it/s]


Epoch 16 Mean Reward: 388.615


100%|██████████| 2000/2000 [16:48<00:00,  1.98it/s]


Epoch 17 Mean Reward: 393.3045


100%|██████████| 2000/2000 [13:48<00:00,  2.41it/s]


Epoch 18 Mean Reward: 393.8945


100%|██████████| 2000/2000 [11:29<00:00,  2.90it/s]


Epoch 19 Mean Reward: 391.5235


100%|██████████| 2000/2000 [11:44<00:00,  2.84it/s]


Epoch 20 Mean Reward: 390.8105
Epoch 20 Model saved to ./checkpoints/take_cover.ckpt
Epoch 20 test:
Test Episode 1 Reward: 375.0
Test Episode 2 Reward: 191.0
Test Episode 3 Reward: 321.0
Test Episode 4 Reward: 976.0
Test Episode 5 Reward: 372.0
Test Episode 6 Reward: 532.0
Test Episode 7 Reward: 375.0
Test Episode 8 Reward: 375.0
Test Episode 9 Reward: 375.0
Test Episode 10 Reward: 375.0
Average Test Reward: 426.7


100%|██████████| 2000/2000 [08:39<00:00,  3.85it/s]


Epoch 21 Mean Reward: 393.8735


100%|██████████| 2000/2000 [08:52<00:00,  3.76it/s]


Epoch 22 Mean Reward: 397.0425


100%|██████████| 2000/2000 [08:52<00:00,  3.76it/s]


Epoch 23 Mean Reward: 401.2745


100%|██████████| 2000/2000 [08:45<00:00,  3.81it/s]


Epoch 24 Mean Reward: 394.5335


100%|██████████| 2000/2000 [08:40<00:00,  3.85it/s]


Epoch 25 Mean Reward: 393.5985


100%|██████████| 2000/2000 [08:41<00:00,  3.84it/s]


Epoch 26 Mean Reward: 395.8495


100%|██████████| 2000/2000 [08:29<00:00,  3.93it/s]


Epoch 27 Mean Reward: 388.2095


100%|██████████| 2000/2000 [08:35<00:00,  3.88it/s]


Epoch 28 Mean Reward: 390.7565


100%|██████████| 2000/2000 [08:26<00:00,  3.95it/s]


Epoch 29 Mean Reward: 382.6955


100%|██████████| 2000/2000 [08:29<00:00,  3.92it/s]


Epoch 30 Mean Reward: 388.8065
Epoch 30 Model saved to ./checkpoints/take_cover.ckpt
Epoch 30 test:
Test Episode 1 Reward: 219.0
Test Episode 2 Reward: 167.0
Test Episode 3 Reward: 219.0
Test Episode 4 Reward: 240.0
Test Episode 5 Reward: 159.0
Test Episode 6 Reward: 149.0
Test Episode 7 Reward: 136.0
Test Episode 8 Reward: 180.0
Test Episode 9 Reward: 175.0
Test Episode 10 Reward: 169.0
Average Test Reward: 181.3


100%|██████████| 2000/2000 [09:06<00:00,  3.66it/s]


Epoch 31 Mean Reward: 395.9095


100%|██████████| 2000/2000 [08:37<00:00,  3.87it/s]


Epoch 32 Mean Reward: 394.0405


100%|██████████| 2000/2000 [08:32<00:00,  3.90it/s]


Epoch 33 Mean Reward: 397.0735


100%|██████████| 2000/2000 [08:36<00:00,  3.87it/s]


Epoch 34 Mean Reward: 395.165


100%|██████████| 2000/2000 [08:26<00:00,  3.95it/s]


Epoch 35 Mean Reward: 389.423


100%|██████████| 2000/2000 [08:28<00:00,  3.93it/s]


Epoch 36 Mean Reward: 394.017


100%|██████████| 2000/2000 [08:31<00:00,  3.91it/s]


Epoch 37 Mean Reward: 393.069


100%|██████████| 2000/2000 [08:45<00:00,  3.80it/s]


Epoch 38 Mean Reward: 400.075


100%|██████████| 2000/2000 [08:43<00:00,  3.82it/s]


Epoch 39 Mean Reward: 397.4745


100%|██████████| 2000/2000 [08:35<00:00,  3.88it/s]


Epoch 40 Mean Reward: 389.958
Epoch 40 Model saved to ./checkpoints/take_cover.ckpt
Epoch 40 test:
Test Episode 1 Reward: 435.0
Test Episode 2 Reward: 470.0
Test Episode 3 Reward: 435.0
Test Episode 4 Reward: 435.0
Test Episode 5 Reward: 117.0
Test Episode 6 Reward: 726.0
Test Episode 7 Reward: 480.0
Test Episode 8 Reward: 905.0
Test Episode 9 Reward: 435.0
Test Episode 10 Reward: 435.0
Average Test Reward: 487.3


100%|██████████| 2000/2000 [09:06<00:00,  3.66it/s]


Epoch 41 Mean Reward: 391.5645


100%|██████████| 2000/2000 [09:48<00:00,  3.40it/s]


Epoch 42 Mean Reward: 387.9565


100%|██████████| 2000/2000 [08:49<00:00,  3.78it/s]


Epoch 43 Mean Reward: 384.848


100%|██████████| 2000/2000 [08:49<00:00,  3.77it/s]


Epoch 44 Mean Reward: 396.919


100%|██████████| 2000/2000 [09:06<00:00,  3.66it/s]


Epoch 45 Mean Reward: 395.5575


100%|██████████| 2000/2000 [08:50<00:00,  3.77it/s]


Epoch 46 Mean Reward: 391.451


100%|██████████| 2000/2000 [08:41<00:00,  3.83it/s]


Epoch 47 Mean Reward: 391.883


100%|██████████| 2000/2000 [08:53<00:00,  3.75it/s]


Epoch 48 Mean Reward: 398.3225


100%|██████████| 2000/2000 [08:54<00:00,  3.74it/s]


Epoch 49 Mean Reward: 399.957


100%|██████████| 2000/2000 [08:50<00:00,  3.77it/s]


Epoch 50 Mean Reward: 393.984
Epoch 50 Model saved to ./checkpoints/take_cover.ckpt
Epoch 50 test:
Test Episode 1 Reward: 269.0
Test Episode 2 Reward: 269.0
Test Episode 3 Reward: 161.0
Test Episode 4 Reward: 269.0
Test Episode 5 Reward: 496.0
Test Episode 6 Reward: 269.0
Test Episode 7 Reward: 440.0
Test Episode 8 Reward: 491.0
Test Episode 9 Reward: 269.0
Test Episode 10 Reward: 161.0
Average Test Reward: 309.4


100%|██████████| 2000/2000 [08:47<00:00,  3.79it/s]


Epoch 51 Mean Reward: 389.8545


100%|██████████| 2000/2000 [08:58<00:00,  3.72it/s]


Epoch 52 Mean Reward: 396.5905


100%|██████████| 2000/2000 [08:57<00:00,  3.72it/s]


Epoch 53 Mean Reward: 393.3605


100%|██████████| 2000/2000 [08:53<00:00,  3.75it/s]


Epoch 54 Mean Reward: 394.096


100%|██████████| 2000/2000 [09:19<00:00,  3.57it/s]


Epoch 55 Mean Reward: 391.4845


100%|██████████| 2000/2000 [09:50<00:00,  3.39it/s]


Epoch 56 Mean Reward: 392.2925


100%|██████████| 2000/2000 [09:29<00:00,  3.51it/s]


Epoch 57 Mean Reward: 394.17


100%|██████████| 2000/2000 [11:27<00:00,  2.91it/s]


Epoch 58 Mean Reward: 391.553


100%|██████████| 2000/2000 [12:23<00:00,  2.69it/s]


Epoch 59 Mean Reward: 392.749


100%|██████████| 2000/2000 [11:23<00:00,  2.92it/s]


Epoch 60 Mean Reward: 389.193
Epoch 60 Model saved to ./checkpoints/take_cover.ckpt
Epoch 60 test:
Test Episode 1 Reward: 269.0
Test Episode 2 Reward: 218.0
Test Episode 3 Reward: 228.0
Test Episode 4 Reward: 269.0
Test Episode 5 Reward: 280.0
Test Episode 6 Reward: 184.0
Test Episode 7 Reward: 112.0
Test Episode 8 Reward: 271.0
Test Episode 9 Reward: 315.0
Test Episode 10 Reward: 322.0
Average Test Reward: 246.8


100%|██████████| 2000/2000 [08:46<00:00,  3.80it/s]


Epoch 61 Mean Reward: 394.778


100%|██████████| 2000/2000 [09:05<00:00,  3.66it/s]


Epoch 62 Mean Reward: 397.8115


100%|██████████| 2000/2000 [08:47<00:00,  3.79it/s]


Epoch 63 Mean Reward: 394.1235


100%|██████████| 2000/2000 [09:07<00:00,  3.65it/s]


Epoch 64 Mean Reward: 400.2845


100%|██████████| 2000/2000 [08:57<00:00,  3.72it/s]


Epoch 65 Mean Reward: 393.9055


100%|██████████| 2000/2000 [12:13<00:00,  2.73it/s]


Epoch 66 Mean Reward: 395.423


100%|██████████| 2000/2000 [13:01<00:00,  2.56it/s]


Epoch 67 Mean Reward: 397.1135


100%|██████████| 2000/2000 [11:25<00:00,  2.92it/s]


Epoch 68 Mean Reward: 397.683


100%|██████████| 2000/2000 [11:17<00:00,  2.95it/s]


Epoch 69 Mean Reward: 392.526


100%|██████████| 2000/2000 [12:45<00:00,  2.61it/s]


Epoch 70 Mean Reward: 393.7305
Epoch 70 Model saved to ./checkpoints/take_cover.ckpt
Epoch 70 test:
Test Episode 1 Reward: 270.0
Test Episode 2 Reward: 224.0
Test Episode 3 Reward: 166.0
Test Episode 4 Reward: 270.0
Test Episode 5 Reward: 270.0
Test Episode 6 Reward: 270.0
Test Episode 7 Reward: 270.0
Test Episode 8 Reward: 270.0
Test Episode 9 Reward: 270.0
Test Episode 10 Reward: 258.0
Average Test Reward: 253.8


100%|██████████| 2000/2000 [11:37<00:00,  2.87it/s]


Epoch 71 Mean Reward: 392.858


100%|██████████| 2000/2000 [12:16<00:00,  2.72it/s]


Epoch 72 Mean Reward: 411.661


100%|██████████| 2000/2000 [11:50<00:00,  2.82it/s]


Epoch 73 Mean Reward: 401.809


100%|██████████| 2000/2000 [11:47<00:00,  2.83it/s]


Epoch 74 Mean Reward: 396.961


100%|██████████| 2000/2000 [11:57<00:00,  2.79it/s]


Epoch 75 Mean Reward: 394.392


100%|██████████| 2000/2000 [12:08<00:00,  2.75it/s]


Epoch 76 Mean Reward: 398.349


100%|██████████| 2000/2000 [12:04<00:00,  2.76it/s]


Epoch 77 Mean Reward: 401.9865


100%|██████████| 2000/2000 [12:04<00:00,  2.76it/s]


Epoch 78 Mean Reward: 401.3905


100%|██████████| 2000/2000 [11:57<00:00,  2.79it/s]


Epoch 79 Mean Reward: 397.029


100%|██████████| 2000/2000 [12:16<00:00,  2.71it/s]


Epoch 80 Mean Reward: 407.3415
Epoch 80 Model saved to ./checkpoints/take_cover.ckpt
Epoch 80 test:
Test Episode 1 Reward: 288.0
Test Episode 2 Reward: 210.0
Test Episode 3 Reward: 287.0
Test Episode 4 Reward: 218.0
Test Episode 5 Reward: 218.0
Test Episode 6 Reward: 166.0
Test Episode 7 Reward: 218.0
Test Episode 8 Reward: 142.0
Test Episode 9 Reward: 218.0
Test Episode 10 Reward: 218.0
Average Test Reward: 218.3


100%|██████████| 2000/2000 [12:14<00:00,  2.72it/s]


Epoch 81 Mean Reward: 405.8345


100%|██████████| 2000/2000 [12:44<00:00,  2.62it/s]


Epoch 82 Mean Reward: 418.8325


100%|██████████| 2000/2000 [12:14<00:00,  2.72it/s]


Epoch 83 Mean Reward: 405.6345


100%|██████████| 2000/2000 [12:17<00:00,  2.71it/s]


Epoch 84 Mean Reward: 406.8175


100%|██████████| 2000/2000 [12:11<00:00,  2.74it/s]


Epoch 85 Mean Reward: 403.0565


100%|██████████| 2000/2000 [12:37<00:00,  2.64it/s]


Epoch 86 Mean Reward: 415.616


100%|██████████| 2000/2000 [12:09<00:00,  2.74it/s]


Epoch 87 Mean Reward: 402.3125


100%|██████████| 2000/2000 [12:26<00:00,  2.68it/s]


Epoch 88 Mean Reward: 410.56


100%|██████████| 2000/2000 [12:25<00:00,  2.68it/s]


Epoch 89 Mean Reward: 412.394


100%|██████████| 2000/2000 [12:17<00:00,  2.71it/s]


Epoch 90 Mean Reward: 407.0995
Epoch 90 Model saved to ./checkpoints/take_cover.ckpt
Epoch 90 test:
Test Episode 1 Reward: 258.0
Test Episode 2 Reward: 222.0
Test Episode 3 Reward: 168.0
Test Episode 4 Reward: 222.0
Test Episode 5 Reward: 222.0
Test Episode 6 Reward: 222.0
Test Episode 7 Reward: 129.0
Test Episode 8 Reward: 222.0
Test Episode 9 Reward: 222.0
Test Episode 10 Reward: 292.0
Average Test Reward: 217.9


100%|██████████| 2000/2000 [11:48<00:00,  2.82it/s]


Epoch 91 Mean Reward: 403.2305


100%|██████████| 2000/2000 [11:57<00:00,  2.79it/s]


Epoch 92 Mean Reward: 410.3655


100%|██████████| 2000/2000 [11:57<00:00,  2.79it/s]


Epoch 93 Mean Reward: 409.0245


100%|██████████| 2000/2000 [12:35<00:00,  2.65it/s]


Epoch 94 Mean Reward: 428.481


100%|██████████| 2000/2000 [12:02<00:00,  2.77it/s]


Epoch 95 Mean Reward: 410.7775


100%|██████████| 2000/2000 [12:23<00:00,  2.69it/s]


Epoch 96 Mean Reward: 423.6075


100%|██████████| 2000/2000 [12:05<00:00,  2.76it/s]


Epoch 97 Mean Reward: 416.136


100%|██████████| 2000/2000 [12:11<00:00,  2.73it/s]


Epoch 98 Mean Reward: 416.5475


100%|██████████| 2000/2000 [12:07<00:00,  2.75it/s]


Epoch 99 Mean Reward: 417.188


100%|██████████| 2000/2000 [12:07<00:00,  2.75it/s]


Epoch 100 Mean Reward: 419.771
Epoch 100 Model saved to ./checkpoints/take_cover.ckpt
Epoch 100 test:
Test Episode 1 Reward: 219.0
Test Episode 2 Reward: 219.0
Test Episode 3 Reward: 219.0
Test Episode 4 Reward: 131.0
Test Episode 5 Reward: 204.0
Test Episode 6 Reward: 219.0
Test Episode 7 Reward: 219.0
Test Episode 8 Reward: 219.0
Test Episode 9 Reward: 124.0
Test Episode 10 Reward: 219.0
Average Test Reward: 199.2


100%|██████████| 2000/2000 [11:56<00:00,  2.79it/s]


Epoch 101 Mean Reward: 414.119


100%|██████████| 2000/2000 [11:53<00:00,  2.80it/s]


Epoch 102 Mean Reward: 408.98


100%|██████████| 2000/2000 [11:45<00:00,  2.83it/s]


Epoch 103 Mean Reward: 405.703


100%|██████████| 2000/2000 [11:56<00:00,  2.79it/s]


Epoch 104 Mean Reward: 411.569


100%|██████████| 2000/2000 [12:03<00:00,  2.77it/s]


Epoch 105 Mean Reward: 414.0145


100%|██████████| 2000/2000 [11:39<00:00,  2.86it/s]


Epoch 106 Mean Reward: 404.1955


100%|██████████| 2000/2000 [11:38<00:00,  2.86it/s]


Epoch 107 Mean Reward: 407.869


100%|██████████| 2000/2000 [11:57<00:00,  2.79it/s]


Epoch 108 Mean Reward: 419.2175


100%|██████████| 2000/2000 [12:21<00:00,  2.70it/s]


Epoch 109 Mean Reward: 428.3205


100%|██████████| 2000/2000 [12:03<00:00,  2.76it/s]


Epoch 110 Mean Reward: 419.8955
Epoch 110 Model saved to ./checkpoints/take_cover.ckpt
Epoch 110 test:
Test Episode 1 Reward: 142.0
Test Episode 2 Reward: 407.0
Test Episode 3 Reward: 271.0
Test Episode 4 Reward: 407.0
Test Episode 5 Reward: 407.0
Test Episode 6 Reward: 179.0
Test Episode 7 Reward: 135.0
Test Episode 8 Reward: 503.0
Test Episode 9 Reward: 407.0
Test Episode 10 Reward: 333.0
Average Test Reward: 319.1


100%|██████████| 2000/2000 [11:27<00:00,  2.91it/s]


Epoch 111 Mean Reward: 398.7595


100%|██████████| 2000/2000 [11:36<00:00,  2.87it/s]


Epoch 112 Mean Reward: 402.7715


100%|██████████| 2000/2000 [11:50<00:00,  2.82it/s]


Epoch 113 Mean Reward: 412.891


100%|██████████| 2000/2000 [12:35<00:00,  2.65it/s]


Epoch 114 Mean Reward: 420.371


100%|██████████| 2000/2000 [14:43<00:00,  2.26it/s]


Epoch 115 Mean Reward: 403.4015


100%|██████████| 2000/2000 [13:23<00:00,  2.49it/s]


Epoch 116 Mean Reward: 409.711


100%|██████████| 2000/2000 [11:31<00:00,  2.89it/s]


Epoch 117 Mean Reward: 402.729


100%|██████████| 2000/2000 [11:48<00:00,  2.82it/s]


Epoch 118 Mean Reward: 414.8575


100%|██████████| 2000/2000 [11:33<00:00,  2.88it/s]


Epoch 119 Mean Reward: 403.1385


100%|██████████| 2000/2000 [11:23<00:00,  2.93it/s]


Epoch 120 Mean Reward: 400.7385
Epoch 120 Model saved to ./checkpoints/take_cover.ckpt
Epoch 120 test:
Test Episode 1 Reward: 617.0
Test Episode 2 Reward: 617.0
Test Episode 3 Reward: 617.0
Test Episode 4 Reward: 617.0
Test Episode 5 Reward: 137.0
Test Episode 6 Reward: 144.0
Test Episode 7 Reward: 269.0
Test Episode 8 Reward: 617.0
Test Episode 9 Reward: 213.0
Test Episode 10 Reward: 410.0
Average Test Reward: 425.8


100%|██████████| 2000/2000 [11:52<00:00,  2.81it/s]


Epoch 121 Mean Reward: 419.5055


100%|██████████| 2000/2000 [12:52<00:00,  2.59it/s]


Epoch 122 Mean Reward: 423.428


100%|██████████| 2000/2000 [14:54<00:00,  2.24it/s]


Epoch 123 Mean Reward: 414.885


100%|██████████| 2000/2000 [13:57<00:00,  2.39it/s]


Epoch 124 Mean Reward: 421.3


100%|██████████| 2000/2000 [17:30<00:00,  1.90it/s]


Epoch 125 Mean Reward: 416.873


100%|██████████| 2000/2000 [17:16<00:00,  1.93it/s]


Epoch 126 Mean Reward: 425.878


100%|██████████| 2000/2000 [14:32<00:00,  2.29it/s]


Epoch 127 Mean Reward: 446.0145


100%|██████████| 2000/2000 [13:03<00:00,  2.55it/s]


Epoch 128 Mean Reward: 436.3665


100%|██████████| 2000/2000 [12:30<00:00,  2.67it/s]


Epoch 129 Mean Reward: 426.4055


100%|██████████| 2000/2000 [12:36<00:00,  2.64it/s]


Epoch 130 Mean Reward: 429.072
Epoch 130 Model saved to ./checkpoints/take_cover.ckpt
Epoch 130 test:
Test Episode 1 Reward: 359.0
Test Episode 2 Reward: 425.0
Test Episode 3 Reward: 372.0
Test Episode 4 Reward: 170.0
Test Episode 5 Reward: 439.0
Test Episode 6 Reward: 367.0
Test Episode 7 Reward: 236.0
Test Episode 8 Reward: 807.0
Test Episode 9 Reward: 636.0
Test Episode 10 Reward: 807.0
Average Test Reward: 461.8


100%|██████████| 2000/2000 [12:49<00:00,  2.60it/s]


Epoch 131 Mean Reward: 429.845


100%|██████████| 2000/2000 [12:57<00:00,  2.57it/s]


Epoch 132 Mean Reward: 439.3325


100%|██████████| 2000/2000 [13:00<00:00,  2.56it/s]


Epoch 133 Mean Reward: 445.333


100%|██████████| 2000/2000 [12:15<00:00,  2.72it/s]


Epoch 134 Mean Reward: 441.206


100%|██████████| 2000/2000 [11:56<00:00,  2.79it/s]


Epoch 135 Mean Reward: 431.66


100%|██████████| 2000/2000 [11:54<00:00,  2.80it/s]


Epoch 136 Mean Reward: 431.118


100%|██████████| 2000/2000 [11:56<00:00,  2.79it/s]


Epoch 137 Mean Reward: 433.717


100%|██████████| 2000/2000 [11:52<00:00,  2.81it/s]


Epoch 138 Mean Reward: 435.8055


100%|██████████| 2000/2000 [11:37<00:00,  2.87it/s]


Epoch 139 Mean Reward: 426.3965


100%|██████████| 2000/2000 [11:27<00:00,  2.91it/s]


Epoch 140 Mean Reward: 417.9815
Epoch 140 Model saved to ./checkpoints/take_cover.ckpt
Epoch 140 test:
Test Episode 1 Reward: 920.0
Test Episode 2 Reward: 920.0
Test Episode 3 Reward: 920.0
Test Episode 4 Reward: 318.0
Test Episode 5 Reward: 920.0
Test Episode 6 Reward: 920.0
Test Episode 7 Reward: 135.0
Test Episode 8 Reward: 171.0
Test Episode 9 Reward: 920.0
Test Episode 10 Reward: 920.0
Average Test Reward: 706.4


100%|██████████| 2000/2000 [11:52<00:00,  2.81it/s]


Epoch 141 Mean Reward: 442.445


100%|██████████| 2000/2000 [11:53<00:00,  2.80it/s]


Epoch 142 Mean Reward: 446.193


100%|██████████| 2000/2000 [11:29<00:00,  2.90it/s]


Epoch 143 Mean Reward: 430.3015


100%|██████████| 2000/2000 [11:58<00:00,  2.78it/s]


Epoch 144 Mean Reward: 448.6685


100%|██████████| 2000/2000 [11:37<00:00,  2.87it/s]


Epoch 145 Mean Reward: 437.0465


100%|██████████| 2000/2000 [11:33<00:00,  2.88it/s]


Epoch 146 Mean Reward: 435.2235


100%|██████████| 2000/2000 [11:50<00:00,  2.81it/s]


Epoch 147 Mean Reward: 447.7955


100%|██████████| 2000/2000 [11:25<00:00,  2.92it/s]


Epoch 148 Mean Reward: 433.1415


100%|██████████| 2000/2000 [11:49<00:00,  2.82it/s]


Epoch 149 Mean Reward: 431.733


100%|██████████| 2000/2000 [12:11<00:00,  2.73it/s]


Epoch 150 Mean Reward: 434.245
Epoch 150 Model saved to ./checkpoints/take_cover.ckpt
Epoch 150 test:
Test Episode 1 Reward: 991.0
Test Episode 2 Reward: 991.0
Test Episode 3 Reward: 310.0
Test Episode 4 Reward: 991.0
Test Episode 5 Reward: 991.0
Test Episode 6 Reward: 131.0
Test Episode 7 Reward: 326.0
Test Episode 8 Reward: 991.0
Test Episode 9 Reward: 439.0
Test Episode 10 Reward: 991.0
Average Test Reward: 715.2


100%|██████████| 2000/2000 [11:40<00:00,  2.86it/s]


Epoch 151 Mean Reward: 445.356


100%|██████████| 2000/2000 [11:50<00:00,  2.81it/s]


Epoch 152 Mean Reward: 448.629


100%|██████████| 2000/2000 [11:56<00:00,  2.79it/s]


Epoch 153 Mean Reward: 459.8595


100%|██████████| 2000/2000 [11:09<00:00,  2.99it/s]


Epoch 154 Mean Reward: 432.859


100%|██████████| 2000/2000 [11:20<00:00,  2.94it/s]


Epoch 155 Mean Reward: 440.852


100%|██████████| 2000/2000 [11:43<00:00,  2.84it/s]


Epoch 156 Mean Reward: 462.7625


100%|██████████| 2000/2000 [11:57<00:00,  2.79it/s]


Epoch 157 Mean Reward: 469.11


100%|██████████| 2000/2000 [12:08<00:00,  2.75it/s]


Epoch 158 Mean Reward: 479.832


100%|██████████| 2000/2000 [12:15<00:00,  2.72it/s]


Epoch 159 Mean Reward: 465.9475


100%|██████████| 2000/2000 [12:27<00:00,  2.68it/s]


Epoch 160 Mean Reward: 473.1855
Epoch 160 Model saved to ./checkpoints/take_cover.ckpt
Epoch 160 test:
Test Episode 1 Reward: 215.0
Test Episode 2 Reward: 215.0
Test Episode 3 Reward: 215.0
Test Episode 4 Reward: 117.0
Test Episode 5 Reward: 257.0
Test Episode 6 Reward: 215.0
Test Episode 7 Reward: 242.0
Test Episode 8 Reward: 325.0
Test Episode 9 Reward: 215.0
Test Episode 10 Reward: 191.0
Average Test Reward: 220.7


100%|██████████| 2000/2000 [15:17<00:00,  2.18it/s]


Epoch 161 Mean Reward: 449.5195


100%|██████████| 2000/2000 [18:25<00:00,  1.81it/s]


Epoch 162 Mean Reward: 476.32


100%|██████████| 2000/2000 [18:08<00:00,  1.84it/s]


Epoch 163 Mean Reward: 460.6735


100%|██████████| 2000/2000 [15:32<00:00,  2.14it/s]


Epoch 164 Mean Reward: 466.8125


100%|██████████| 2000/2000 [18:10<00:00,  1.83it/s]


Epoch 165 Mean Reward: 473.6895


100%|██████████| 2000/2000 [18:04<00:00,  1.84it/s]


Epoch 166 Mean Reward: 462.2035


100%|██████████| 2000/2000 [15:57<00:00,  2.09it/s]


Epoch 167 Mean Reward: 455.0795


100%|██████████| 2000/2000 [16:12<00:00,  2.06it/s]


Epoch 168 Mean Reward: 458.3555


100%|██████████| 2000/2000 [16:12<00:00,  2.06it/s]


Epoch 169 Mean Reward: 463.071


100%|██████████| 2000/2000 [22:14<00:00,  1.50it/s]


Epoch 170 Mean Reward: 490.117
Epoch 170 Model saved to ./checkpoints/take_cover.ckpt
Epoch 170 test:
Test Episode 1 Reward: 525.0
Test Episode 2 Reward: 292.0
Test Episode 3 Reward: 193.0
Test Episode 4 Reward: 292.0
Test Episode 5 Reward: 292.0
Test Episode 6 Reward: 266.0
Test Episode 7 Reward: 292.0
Test Episode 8 Reward: 292.0
Test Episode 9 Reward: 138.0
Test Episode 10 Reward: 292.0
Average Test Reward: 287.4


100%|██████████| 2000/2000 [18:23<00:00,  1.81it/s]


Epoch 171 Mean Reward: 465.7305


100%|██████████| 2000/2000 [17:11<00:00,  1.94it/s]


Epoch 172 Mean Reward: 480.026


100%|██████████| 2000/2000 [15:58<00:00,  2.09it/s]


Epoch 173 Mean Reward: 475.4215


100%|██████████| 2000/2000 [13:28<00:00,  2.47it/s]


Epoch 174 Mean Reward: 488.506


100%|██████████| 2000/2000 [12:35<00:00,  2.65it/s]


Epoch 175 Mean Reward: 479.416


100%|██████████| 2000/2000 [13:03<00:00,  2.55it/s]


Epoch 176 Mean Reward: 492.386


100%|██████████| 2000/2000 [15:31<00:00,  2.15it/s]


Epoch 177 Mean Reward: 482.582


100%|██████████| 2000/2000 [17:55<00:00,  1.86it/s]


Epoch 178 Mean Reward: 503.442


100%|██████████| 2000/2000 [14:20<00:00,  2.32it/s]


Epoch 179 Mean Reward: 505.4875


100%|██████████| 2000/2000 [19:24<00:00,  1.72it/s]


Epoch 180 Mean Reward: 499.389
Epoch 180 Model saved to ./checkpoints/take_cover.ckpt
Epoch 180 test:
Test Episode 1 Reward: 338.0
Test Episode 2 Reward: 338.0
Test Episode 3 Reward: 338.0
Test Episode 4 Reward: 337.0
Test Episode 5 Reward: 186.0
Test Episode 6 Reward: 338.0
Test Episode 7 Reward: 338.0
Test Episode 8 Reward: 770.0
Test Episode 9 Reward: 338.0
Test Episode 10 Reward: 338.0
Average Test Reward: 365.9


100%|██████████| 2000/2000 [19:20<00:00,  1.72it/s]


Epoch 181 Mean Reward: 481.464


100%|██████████| 2000/2000 [16:02<00:00,  2.08it/s]


Epoch 182 Mean Reward: 486.259


100%|██████████| 2000/2000 [14:52<00:00,  2.24it/s]


Epoch 183 Mean Reward: 499.6335


100%|██████████| 2000/2000 [13:23<00:00,  2.49it/s]


Epoch 184 Mean Reward: 493.8505


100%|██████████| 2000/2000 [17:59<00:00,  1.85it/s]


Epoch 185 Mean Reward: 487.7175


100%|██████████| 2000/2000 [15:56<00:00,  2.09it/s]


Epoch 186 Mean Reward: 479.28


100%|██████████| 2000/2000 [13:35<00:00,  2.45it/s]


Epoch 187 Mean Reward: 499.0015


100%|██████████| 2000/2000 [13:07<00:00,  2.54it/s]


Epoch 188 Mean Reward: 491.855


100%|██████████| 2000/2000 [13:27<00:00,  2.48it/s]


Epoch 189 Mean Reward: 510.0185


100%|██████████| 2000/2000 [12:35<00:00,  2.65it/s]


Epoch 190 Mean Reward: 477.589
Epoch 190 Model saved to ./checkpoints/take_cover.ckpt
Epoch 190 test:
Test Episode 1 Reward: 443.0
Test Episode 2 Reward: 189.0
Test Episode 3 Reward: 261.0
Test Episode 4 Reward: 277.0
Test Episode 5 Reward: 222.0
Test Episode 6 Reward: 222.0
Test Episode 7 Reward: 222.0
Test Episode 8 Reward: 222.0
Test Episode 9 Reward: 222.0
Test Episode 10 Reward: 222.0
Average Test Reward: 250.2


100%|██████████| 2000/2000 [12:16<00:00,  2.72it/s]


Epoch 191 Mean Reward: 488.1905


100%|██████████| 2000/2000 [12:11<00:00,  2.73it/s]


Epoch 192 Mean Reward: 475.7795


100%|██████████| 2000/2000 [12:00<00:00,  2.78it/s]


Epoch 193 Mean Reward: 471.1165


100%|██████████| 2000/2000 [12:03<00:00,  2.76it/s]


Epoch 194 Mean Reward: 473.743


100%|██████████| 2000/2000 [12:12<00:00,  2.73it/s]


Epoch 195 Mean Reward: 481.091


100%|██████████| 2000/2000 [12:20<00:00,  2.70it/s]


Epoch 196 Mean Reward: 485.9025


100%|██████████| 2000/2000 [12:34<00:00,  2.65it/s]


Epoch 197 Mean Reward: 498.582


100%|██████████| 2000/2000 [12:32<00:00,  2.66it/s]


Epoch 198 Mean Reward: 490.1165


100%|██████████| 2000/2000 [12:06<00:00,  2.75it/s]


Epoch 199 Mean Reward: 474.1785


100%|██████████| 2000/2000 [12:27<00:00,  2.67it/s]


Epoch 200 Mean Reward: 487.3145
Epoch 200 Model saved to ./checkpoints/take_cover.ckpt
Epoch 200 test:
Test Episode 1 Reward: 233.0
Test Episode 2 Reward: 434.0
Test Episode 3 Reward: 434.0
Test Episode 4 Reward: 113.0
Test Episode 5 Reward: 302.0
Test Episode 6 Reward: 173.0
Test Episode 7 Reward: 434.0
Test Episode 8 Reward: 157.0
Test Episode 9 Reward: 171.0
Test Episode 10 Reward: 434.0
Average Test Reward: 288.5
[(715.2, 150), (706.4, 140), (487.3, 40), (461.8, 130), (426.7, 20), (425.8, 120), (365.9, 180), (319.1, 110), (309.4, 50), (288.5, 200), (287.4, 170), (253.8, 70), (250.2, 190), (246.8, 60), (220.7, 160), (218.3, 80), (217.9, 90), (217.1, 10), (199.2, 100), (181.3, 30)]
14318015 time steps experienced during training


In [None]:
#Get a list of checkpoints saved during training

ckpts = tf.train.get_checkpoint_state('checkpoints').all_model_checkpoint_paths

#Test the trained model from a certain checkpoint by only choosing actions with a greedy strategy

test_reward = test_agent(DQN, num_episodes=20,
                             training=False,
                             load_model=True,
                             depth=False,
                             model_dir=ckpts[-6])
print('Average Test Reward:', test_reward)


Loading model from checkpoints\take_cover.ckpt-150
INFO:tensorflow:Restoring parameters from checkpoints\take_cover.ckpt-150
Test Episode 1 Reward: 991.0
Test Episode 2 Reward: 247.0
Test Episode 3 Reward: 991.0
Test Episode 4 Reward: 200.0
Test Episode 5 Reward: 991.0
Test Episode 6 Reward: 163.0
Test Episode 7 Reward: 991.0
Test Episode 8 Reward: 188.0
Test Episode 9 Reward: 991.0
Test Episode 10 Reward: 183.0
Test Episode 11 Reward: 991.0
Test Episode 12 Reward: 537.0
Test Episode 13 Reward: 991.0
Test Episode 14 Reward: 991.0
Test Episode 15 Reward: 585.0
Test Episode 16 Reward: 638.0
Test Episode 17 Reward: 162.0
Test Episode 18 Reward: 991.0
Test Episode 19 Reward: 235.0
