In [1]:
import gym
from vizdoom import gym_wrapper

from typing import Dict, Tuple
import tensorflow as tf
from tensorflow import keras
import numpy as np
from collections import deque
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import layers
from agent import Agent
from reinforce_agent import BaselineREINFORCEAgent
from state import StateManager
from action import Action
from variables import GAMMA, SIGMA

In [2]:
# Create the ViZDoom "Health Gathering Supreme" environment through gym.
# frame_skip=4 is passed to the environment; presumably each chosen action is
# repeated for 4 game frames -- confirm against the vizdoom gym wrapper.
env = gym.make("VizdoomHealthGatheringSupreme-v0", frame_skip=4)



We can move forward or turn left/right.

In [3]:
# Display the environment's action space as the cell output.
env.action_space

Discrete(3)

The observation space contains:
- The 240x320 RGB frame
- The health of the player

In [4]:
# Display the environment's observation space as the cell output.
env.observation_space

Dict(gamevariables:Box(-3.4028235e+38, 3.4028235e+38, (1,), float32), rgb:Box(0, 255, (240, 320, 3), uint8))

In [5]:
# Build the agent that the functions below use through the global name `model`.
# The first argument (3) presumably is the number of actions, matching the
# Discrete(3) action space -- confirm against BaselineREINFORCEAgent.
# The optimizer is Adam with learning rate 1e-2 and clipnorm=40.0.
model = BaselineREINFORCEAgent(3, Adam(learning_rate=1e-2, clipnorm=40.0))

In [6]:
def play_one_step(env, state, state_manager):
    """Let the global ``model`` act once in ``env`` and return the transition.

    Args:
        env: Environment whose ``step`` returns a 4-tuple
            ``(observation, reward, done, info)``.
        state: State handed to ``model.choose_action``.
        state_manager: Object whose ``get_current_state`` turns the next
            frame into the next state.

    Returns:
        Tuple ``(state, action_info, reward, next_state, done)`` where
        ``action_info`` is whatever ``model.choose_action`` returned.
    """
    action_info = model.choose_action(state)
    observation, reward, done, _info = env.step(action_info['action'].value)
    # Move the last axis of the RGB frame to the front before building the state.
    channels_first = observation['rgb'].transpose(2, 0, 1)
    next_state = state_manager.get_current_state(channels_first)
    return state, action_info, reward, next_state, done

In [7]:
def play_episode(env, init_state, state_manager, max_steps=2100, gamma=0.99):
    """Play one episode with the current policy and compute normalized returns.

    Args:
        env: Environment, passed through to ``play_one_step``.
        init_state: State the episode starts from.
        state_manager: Passed through to ``play_one_step``.
        max_steps: Capacity of the episode buffer; at most ``max_steps - 1``
            steps are played. Must be at least 2.
        gamma: Discount factor used when accumulating the returns.

    Returns:
        Tuple ``(states, actions, rewards, returns, next_states, dones)``.
        All but ``returns`` are ``np.array``s built from the recorded
        transitions; ``returns`` holds the discounted returns, standardized
        to zero mean and unit standard deviation over the episode.

    Raises:
        ValueError: If ``max_steps`` is smaller than 2 (no step could be played).
    """
    if max_steps < 2:
        raise ValueError("max_steps must be at least 2 so that one step can be played")

    episode_buffer = deque([], maxlen=max_steps)
    state = init_state
    for _ in range(max_steps - 1):
        # Act on the *current* state. Passing the initial state here would make
        # the agent pick every action of the episode from the first observation.
        state, a_info, reward, next_state, done = play_one_step(env, state, state_manager)
        episode_buffer.append((state, a_info, reward, next_state, done))
        if done:
            break
        state = next_state

    # Split the list of transitions into one array per field.
    states, actions, rewards, next_states, dones = (
        np.array([experience[i] for experience in episode_buffer])
        for i in range(5))

    # Accumulate discounted returns from the last step backwards.
    returns = []
    discounted_sum = tf.constant(0.0)
    for reward in rewards[::-1]:
        discounted_sum = reward + gamma * discounted_sum
        returns.append(discounted_sum)
    returns = np.stack(returns[::-1])

    # Standardize the returns. The epsilon keeps the division finite when all
    # returns are equal (e.g. a one-step episode), which would otherwise yield NaN.
    eps = np.finfo(np.float32).eps.item()
    returns = (returns - tf.math.reduce_mean(returns)) / (tf.math.reduce_std(returns) + eps)
    return states, actions, rewards, returns, next_states, dones

In [8]:
def training_step(env, initial_state, state_manager):
    """Play one episode and apply one REINFORCE-with-baseline update to ``model``.

    Args:
        env: Environment, passed through to ``play_episode``.
        initial_state: State the episode starts from.
        state_manager: Passed through to ``play_episode``.

    Returns:
        The per-step rewards of the episode, as returned by ``play_episode``.
    """
    # The rollout itself runs inside the tape: the policy and value outputs used
    # in the loss are the ones produced while acting, so that forward pass must
    # be recorded for the gradient to reach the model's parameters.
    with tf.GradientTape() as tape:
        _, actions, rewards, returns, _, _ = play_episode(env, initial_state, state_manager)

        # One value estimate per step, flattened to shape (T,).
        # Assumes each a['value'] holds a single number -- TODO confirm.
        v_st_pred = tf.reshape(tf.stack([a['value'] for a in actions]), [-1])

        # Probability the policy assigned to the action actually taken.
        a_probs = tf.stack([a['policy'] for a in actions])
        a_indices = tf.stack([a['action'].value for a in actions])
        a_probs = tf.gather(a_probs, a_indices, batch_dims=1)
        # Flatten so the product below is element-wise per step. Multiplying a
        # (T, 1) advantage with (T,) log-probabilities would broadcast to (T, T).
        a_log_probs = tf.reshape(tf.math.log(a_probs), [-1])

        # Advantage of the sampled return over the baseline. It is treated as a
        # constant in the actor loss so the policy term does not also push
        # gradients through the value estimate.
        advantage = tf.stop_gradient(returns - v_st_pred)

        # Actor loss
        actor_loss = -tf.reduce_sum(advantage * a_log_probs)
        # Critic loss (the baseline is fitted to the returns)
        critic_loss = tf.keras.losses.Huber(reduction=tf.keras.losses.Reduction.SUM)(
            returns, v_st_pred)
        # Total loss. No entropy term is included (it was disabled in the
        # original loss as well).
        loss = actor_loss + critic_loss

    # Gradient of the loss with respect to the model's parameters
    grads = tape.gradient(loss, model.trainable_variables)
    # Apply the update
    model.optimizer.apply_gradients(zip(grads, model.trainable_variables))
    return rewards

In [9]:
from tqdm import trange

# Rolling window with the rewards of the 10 most recent episodes; its mean is
# shown in the progress bar.
last_scores = deque([], maxlen=10)
try:
    with trange(500) as t:
        for ep_num in t:
            # A new StateManager is created for every episode.
            state_manager = StateManager()
            initial_obs = env.reset()
            initial_state = state_manager.get_current_state(initial_obs['rgb'].transpose(2,0,1))
            rewards = training_step(env, initial_state, state_manager)
            episode_reward = sum(rewards)
            last_scores.append(episode_reward)
            rewards_mean = np.mean(last_scores)
            t.set_description(f'Episode reward: {episode_reward:.2f}, mean reward: {rewards_mean:.2f}')
finally:
    # Close the environment even when training is interrupted or raises, so the
    # environment is not left open behind an aborted run.
    env.close()

Episode reward: 348.00, mean reward: 316.00:   2%|▏         | 9/500 [01:22<1:13:51,  9.02s/it]

Run a few episodes with the trained model, rendering the environment at every step.

In [10]:
def testing_step(env, initial_state, state_manager):
    """Play one rendered episode with the global ``model`` and collect the rewards.

    No parameter update happens here; the model is only queried for actions.

    Args:
        env: Environment whose ``step`` returns a 4-tuple
            ``(observation, reward, done, info)`` and which supports ``render``.
        initial_state: State the episode starts from.
        state_manager: Object whose ``get_current_state`` turns each new frame
            into the next state.

    Returns:
        List with the reward received at every step of the episode.
    """
    current_state = initial_state
    collected_rewards = []
    env.render()
    # Same step cap as the original loop: at most 2100 - 1 steps.
    for _ in range(2100 - 1):
        chosen = model.choose_action(current_state)
        observation, reward, done, _info = env.step(chosen['action'].value)
        current_state = state_manager.get_current_state(
            observation['rgb'].transpose(2, 0, 1))
        env.render()
        collected_rewards.append(reward)
        if done:
            break
    return collected_rewards

In [11]:
env = gym.make("VizdoomHealthGatheringSupreme-v0", frame_skip=4)

# Evaluation keeps its own score window. Appending to the window used during
# training would mix training-episode rewards into the displayed test mean.
test_scores = deque([], maxlen=10)
try:
    with trange(10) as t:
        for ep_num in t:
            # A new StateManager is created for every episode.
            state_manager = StateManager()
            initial_obs = env.reset()
            initial_state = state_manager.get_current_state(initial_obs['rgb'].transpose(2,0,1))
            rewards = testing_step(env, initial_state, state_manager)
            episode_reward = sum(rewards)
            test_scores.append(episode_reward)
            rewards_mean = np.mean(test_scores)
            t.set_description(f'Episode reward: {episode_reward:.2f}, mean reward: {rewards_mean:.2f}')
finally:
    # Close the environment even when the run is interrupted or raises.
    env.close()

Episode reward: 316.00, mean reward: 312.80:  50%|█████     | 5/10 [00:32<00:43,  8.63s/it]

: 

: 