In [None]:
import gym
from vizdoom import gym_wrapper
import time
import tensorflow as tf
from tensorflow import keras
import numpy as np
from collections import deque
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import layers
from reinforce_agent import BaselineREINFORCEAgent
from state import StateManager
from tqdm import tqdm, trange

# Number of episodes each training loop in this notebook runs for.
TRAINING_STEPS = 500
# Upper bound on the number of environment steps in one DQN training episode.
MAX_EP_STEP = 2100

# Scenario analysis

In [None]:
# Instantiate the ViZDoom "Health Gathering Supreme" scenario through gym.
# frame_skip=4 is passed to the environment constructor; presumably each action
# is repeated for 4 game frames — TODO confirm against the vizdoom gym wrapper.
# NOTE(review): this instance is never closed before `env` is rebound further down.
env = gym.make("VizdoomHealthGatheringSupreme-v0", frame_skip=4)

We can move forward or turn left/right.

In [None]:
# Display the action space exposed by the environment (cell output).
env.action_space

The observation space contains:
- The 240x320 RGB frame
- The health of the player

In [None]:
# Display the observation space exposed by the environment (cell output).
env.observation_space

# Baseline (random moves)

In [None]:
env = gym.make("VizdoomHealthGatheringSupreme-v0", frame_skip=4)

# Render random rollouts for five episodes to get a reference score for an
# agent that has learned nothing.
try:
    for _ in range(5):
        done = False
        obs = env.reset()
        ep_rewards = []
        while not done:
            obs, rew, done, info = env.step(env.action_space.sample())
            # Throttle to roughly 30 rendered frames per second so the rollout is watchable.
            time.sleep(1/30)
            env.render()
            ep_rewards.append(rew)
        episode_reward = sum(ep_rewards)
        print(f"Episode reward: {episode_reward}")
finally:
    # Release the game instance even if the cell is interrupted mid-episode.
    env.close()

# DQN

In [None]:
# Q-network. Four stride-2 "same" convolutions shrink the 42x42 input to 3x3
# (42 -> 21 -> 11 -> 6 -> 3) with 32 channels. The feature maps are then laid
# out as a sequence of 32 vectors of length 9 (one per channel) for the LSTM,
# followed by a dense head that emits one Q-value per action (3 outputs).
model_dqn = keras.Sequential(
    [layers.Input(shape=(42, 42, 4))]
    + [
        layers.Conv2D(
            filters=n_filters,
            kernel_size=3,
            strides=(2, 2),
            padding='same',
            activation='relu',
            kernel_regularizer=tf.keras.regularizers.L2(0.01),
        )
        for n_filters in (8, 16, 16, 32)
    ]
    + [
        # (3, 3, 32) -> (32, 3, 3): put the channel axis first before flattening.
        layers.Permute((3, 1, 2), input_shape=(3, 3, 32)),
        layers.Reshape((32, 9)),
        layers.LSTM(64),
        keras.layers.Dense(64, activation="relu"),
        keras.layers.Dense(32, activation="relu"),
        keras.layers.Dense(3),
    ]
)

In [None]:
def epsilon_greedy_policy(state, epsilon=0):
    """Pick an action index with an epsilon-greedy rule over `model_dqn`'s Q-values.

    Args:
        state: object whose `repr` attribute is the network input for one state
            (no batch axis; one is added here).
        epsilon: probability of choosing a uniformly random action instead of
            the greedy one. The default of 0 is fully greedy.

    Returns:
        An integer action index in {0, 1, 2}.
    """
    if np.random.rand() < epsilon:
        return np.random.randint(3)
    # Call the model directly rather than through `predict`: this function runs
    # once per environment step on a single sample, where `predict`'s per-call
    # setup (and its progress output) dominates the cost.
    Q_values = model_dqn(tf.expand_dims(state.repr, axis=0), training=False)
    return np.argmax(Q_values.numpy()[0])

In [None]:
# Experience replay memory for DQN. Bounded to 2000 entries: once full, appending
# a new transition silently drops the oldest one.
replay_buffer = deque(maxlen=2000)

In [None]:
def sample_experiences(batch_size):
    """Draw `batch_size` transitions uniformly at random from `replay_buffer`.

    Indices are drawn independently, so the same transition may appear more
    than once in a batch.

    Returns:
        A 5-tuple of arrays `(states, actions, rewards, next_states, dones)`,
        each stacking the corresponding field of the sampled transitions.
    """
    picks = np.random.randint(len(replay_buffer), size=batch_size)
    sampled = [replay_buffer[i] for i in picks]
    # Transpose the list of 5-field transitions into five per-field arrays.
    columns = []
    for field in range(5):
        columns.append(np.array([transition[field] for transition in sampled]))
    states, actions, rewards, next_states, dones = columns
    return states, actions, rewards, next_states, dones

In [None]:
# Number of transitions passed to each DQN training step.
batch_size = 32
# Discount applied to the bootstrapped next-state value in the Q-learning target.
discount_factor = 0.99
# Optimizer used to update the DQN weights.
optimizer = keras.optimizers.Adam(learning_rate=1e-3)
# Loss between the target Q-values and the predicted Q-values of the taken actions.
loss_fn = keras.losses.mean_squared_error

In [None]:
def training_step(batch_size):
    """Run one DQN gradient step on a minibatch sampled from the replay buffer.

    The regression target for each sampled transition is
    `reward + (1 - done) * discount_factor * max_a' Q(next_state, a')`, and the
    loss compares it with the Q-value the network assigns to the action that
    was actually taken.

    Args:
        batch_size: number of transitions to sample.
    """
    experiences = sample_experiences(batch_size)
    states, actions, rewards, next_states, dones = experiences
    next_Q_values = model_dqn.predict(next_states, verbose=0)
    max_next_Q_values = np.max(next_Q_values, axis=1)
    target_Q_values = (rewards + (1-dones)*discount_factor*max_next_Q_values)
    # Make the target a column vector so it lines up element-wise with
    # `Q_values` (shape (batch, 1)). Left as shape (batch,), the subtraction
    # inside the loss broadcasts to (batch, batch) and every prediction is
    # compared against every target.
    target_Q_values = target_Q_values.reshape(-1, 1)
    # One-hot mask that keeps only the Q-value of the action taken in each transition.
    mask = tf.one_hot(actions, 3)
    with tf.GradientTape() as tape:
        all_Q_values = model_dqn(states)
        Q_values = tf.reduce_sum(all_Q_values*mask, axis=1, keepdims=True)
        loss = tf.reduce_mean(loss_fn(target_Q_values,Q_values))
    grads = tape.gradient(loss, model_dqn.trainable_variables)
    optimizer.apply_gradients(zip(grads, model_dqn.trainable_variables))

In [None]:
def play_one_step(env, state, epsilon, state_manager):
    """Take one epsilon-greedy step in `env` and record the transition.

    The chosen action is applied to the environment, the returned frame is
    turned into the next state by `state_manager`, and the transition
    `(state.repr, action, reward, next_state.repr, done)` is appended to
    `replay_buffer`.

    Returns:
        `(next_state, reward, done, info)`.
    """
    chosen = epsilon_greedy_policy(state, epsilon)
    observation, reward, done, info = env.step(chosen)
    # Move the last axis of the RGB frame to the front before handing it to the
    # state manager (channels-first, assuming the frame is H x W x C).
    frame = observation['rgb'].transpose(2,0,1)
    next_state = state_manager.get_current_state(frame)
    transition = (state.repr, chosen, reward, next_state.repr, done)
    replay_buffer.append(transition)
    return next_state, reward, done, info

In [None]:
env = gym.make("VizdoomHealthGatheringSupreme-v0", frame_skip=4)

# Rolling window of the most recent episode scores, used for the running mean.
last_scores = deque([], maxlen=20)
try:
    for ep_num in range(TRAINING_STEPS):
        initial_obs = env.reset()
        state_manager = StateManager()
        # Build the first state once per episode; afterwards the state returned
        # by `play_one_step` is carried forward so the agent acts on what it
        # currently sees rather than on the first frame of the episode.
        state = state_manager.get_current_state(initial_obs['rgb'].transpose(2,0,1))
        # Linear decay of the exploration rate from 1 towards a floor of 0.01
        # over the whole training run.
        epsilon = max(1 - ep_num/TRAINING_STEPS, 0.01)
        rewards = []
        with trange(MAX_EP_STEP) as pbar:
            # Iterating over the bar already advances it; no manual update needed.
            for ep_step in pbar:
                state, reward, done, info = play_one_step(env, state, epsilon, state_manager)
                rewards.append(reward)
                if done:
                    break
                elif ep_num > 1:
                    # Skip learning during the first two episodes so the replay
                    # buffer holds some experience before gradient steps start.
                    training_step(batch_size)
        episode_reward = sum(rewards)
        last_scores.append(episode_reward)
        rewards_mean = np.mean(last_scores)
        print(f'Episode reward: {episode_reward:.2f}, mean reward: {rewards_mean:.2f}')
finally:
    # Release the game instance even if training is interrupted.
    env.close()

# REINFORCE with Baseline

In [None]:
# REINFORCE-with-baseline agent, trained with Adam (lr 1e-2, per-gradient norm clipped at 40).
# The first argument is presumably the number of discrete actions — TODO confirm
# against BaselineREINFORCEAgent's constructor.
model = BaselineREINFORCEAgent(3, Adam(learning_rate=1e-2, clipnorm=40.0))

In [None]:
def play_one_step(env, state, state_manager):
    """Let the REINFORCE agent take one step in `env`.

    NOTE: this redefines the `play_one_step` declared for DQN earlier in the
    notebook, with a different signature; cells below this one get this version.

    Returns:
        `(state, a_info, reward, next_state, done)`, where `a_info` is whatever
        `model.choose_action` returned for `state`.
    """
    action_info = model.choose_action(state, training=True)
    observation, reward, done, _ = env.step(action_info['action'].value)
    # Move the last axis of the RGB frame to the front before handing it to the
    # state manager (channels-first, assuming the frame is H x W x C).
    frame = observation['rgb'].transpose(2,0,1)
    successor = state_manager.get_current_state(frame)
    return (state, action_info, reward, successor, done)

In [None]:
def play_episode(env, init_state, state_manager, max_steps=2100, discount=0.99):
    """Roll out one episode with the current policy and compute normalized returns.

    Args:
        env: environment to step; it must already have been reset.
        init_state: state corresponding to the observation returned by the reset.
        state_manager: object that turns raw frames into states.
        max_steps: step budget; at most `max_steps - 1` environment steps are taken.
        discount: discount factor used for the returns.

    Returns:
        `(states, actions, rewards, returns, next_states, dones)`. All but
        `returns` are arrays with one entry per step taken; `returns` is a
        float32 tensor holding the discounted return of every step, standardized
        to zero mean and unit standard deviation over the episode.
    """
    episode_buffer = []
    state = init_state
    for _ in range(max_steps - 1):
        # Act from the *current* state so the rollout actually advances.
        transition = play_one_step(env, state, state_manager)
        episode_buffer.append(transition)
        _, _, _, next_state, done = transition
        if done:
            break
        state = next_state
    states, actions, rewards, next_states, dones = (
        np.array([experience[i] for experience in episode_buffer])
        for i in range(5))
    # Discounted returns, accumulated backwards from the last step.
    returns = np.zeros(len(rewards), dtype=np.float32)
    discounted_sum = 0.0
    for i in reversed(range(len(rewards))):
        discounted_sum = rewards[i] + discount * discounted_sum
        returns[i] = discounted_sum
    returns = tf.convert_to_tensor(returns)
    # The small epsilon keeps the division finite when every return is identical
    # (for example a one-step episode), which would otherwise produce NaNs.
    eps = np.finfo(np.float32).eps.item()
    returns = (returns - tf.math.reduce_mean(returns)) / (tf.math.reduce_std(returns) + eps)
    return states, actions, rewards, returns, next_states, dones

In [None]:
def training_step(env, initial_state, state_manager):
    """Play one episode and apply a single REINFORCE-with-baseline update.

    The episode is played inside the gradient tape so that the policy
    probabilities and value estimates returned by the agent stay differentiable.

    NOTE: this redefines the `training_step` declared for DQN earlier in the
    notebook, with a different signature.

    Args:
        env: environment to play in; it must already have been reset.
        initial_state: state corresponding to the observation returned by the reset.
        state_manager: object that turns raw frames into states.

    Returns:
        The array of per-step rewards collected during the episode.
    """
    with tf.GradientTape() as tape:
        # 1) Collect one episode of experience with the current policy.
        _, actions, rewards, returns, _, _ = play_episode(env, initial_state, state_manager)
        n_steps = len(actions)
        # 2) Flatten everything to shape (n_steps,) so the element-wise products
        #    below can never broadcast into an (n_steps, n_steps) matrix.
        returns = tf.reshape(returns, [-1])
        v_st_pred = tf.reshape(tf.stack([a['value'] for a in actions]), [-1])
        # One row of action probabilities per step (assumes each a['policy']
        # holds exactly one distribution over the actions).
        a_probs = tf.reshape(tf.stack([a['policy'] for a in actions]), [n_steps, -1])
        a_indices = tf.stack([a['action'].value for a in actions])
        # Probability of the action actually taken at each step.
        a_probs = tf.gather(a_probs, a_indices, batch_dims=1)
        # Clip away exact zeros so the log stays finite.
        a_log_probs = tf.math.log(tf.clip_by_value(a_probs, 1e-8, 1.0))
        # 3) Advantage of the observed return over the baseline. It only scales
        #    the policy gradient, so it is detached: the critic must learn from
        #    its own loss, not from the actor term.
        delta = tf.stop_gradient(returns - v_st_pred)
        # 4) Actor loss: negative advantage-weighted log-likelihood.
        actor_loss = -tf.reduce_sum(delta * a_log_probs)
        # 5) Critic loss: regress the value estimates onto the returns.
        critic_loss = tf.keras.losses.Huber(reduction=tf.keras.losses.Reduction.SUM)(
            returns, v_st_pred)
        # 6) Total loss.
        loss = actor_loss + critic_loss
    # 7) Gradient of the loss with respect to the model's parameters.
    grads = tape.gradient(loss, model.trainable_variables)
    # 8) Apply the update.
    model.optimizer.apply_gradients(zip(grads, model.trainable_variables))
    return rewards

In [None]:
# Number of REINFORCE training episodes. This rebinds the constant of the same
# name declared at the top of the notebook (same value).
TRAINING_STEPS = 500

In [None]:
from tqdm import trange

# The environment used for DQN training was closed at the end of that section,
# so a fresh instance is needed here.
env = gym.make("VizdoomHealthGatheringSupreme-v0", frame_skip=4)

# Rolling window of the most recent episode scores, used for the running mean.
last_scores = deque([], maxlen=10)
try:
    with trange(TRAINING_STEPS) as t:
        for ep_num in t:
            # New state manager for every episode.
            state_manager = StateManager()
            initial_obs = env.reset()
            initial_state = state_manager.get_current_state(initial_obs['rgb'].transpose(2,0,1))
            # One call plays a full episode and applies one policy/value update.
            rewards = training_step(env, initial_state, state_manager)
            episode_reward = sum(rewards)
            last_scores.append(episode_reward)
            rewards_mean = np.mean(last_scores)
            t.set_description(f'Episode reward: {episode_reward:.2f}, mean reward: {rewards_mean:.2f}')
finally:
    # Release the game instance even if training is interrupted.
    env.close()

Show some episodes played by the trained model.

In [None]:
def testing_step(env, initial_state, state_manager, max_steps=2100):
    """Play one rendered episode with the trained agent, without learning.

    Args:
        env: environment to play in; it must already have been reset.
        initial_state: state corresponding to the observation returned by the reset.
        state_manager: object that turns raw frames into states.
        max_steps: step budget; at most `max_steps - 1` environment steps are taken.

    Returns:
        The list of per-step rewards collected during the episode.
    """
    state = initial_state
    rewards = []
    # Show the initial frame before the first action is taken.
    env.render()
    for _ in range(max_steps - 1):
        a_info = model.choose_action(state)
        next_obv, reward, done, _ = env.step(a_info['action'].value)
        state = state_manager.get_current_state(next_obv['rgb'].transpose(2,0,1))
        env.render()
        rewards.append(reward)
        if done:
            break
    return rewards

In [None]:
env = gym.make("VizdoomHealthGatheringSupreme-v0", frame_skip=4)

# Start from an empty score window so the reported mean covers evaluation
# episodes only, not the scores left over from training.
last_scores = deque([], maxlen=10)
try:
    with trange(10) as t:
        for ep_num in t:
            # New state manager for every episode.
            state_manager = StateManager()
            initial_obs = env.reset()
            initial_state = state_manager.get_current_state(initial_obs['rgb'].transpose(2,0,1))
            rewards = testing_step(env, initial_state, state_manager)
            episode_reward = sum(rewards)
            last_scores.append(episode_reward)
            rewards_mean = np.mean(last_scores)
            t.set_description(f'Episode reward: {episode_reward:.2f}, mean reward: {rewards_mean:.2f}')
finally:
    # Release the game instance even if evaluation is interrupted.
    env.close()