In [12]:
import collections
import gym
import numpy as np
import statistics
import tensorflow as tf
import tqdm

from matplotlib import pyplot as plt
from keras.api import keras
from typing import Any, List, Sequence, Tuple

In [7]:
# Build the CartPole environment that the rest of the notebook steps through.
env = gym.make("CartPole-v0")

# Use one seed for NumPy, TensorFlow and the environment so that repeated
# runs of the experiment are reproducible.
seed = 42
np.random.seed(seed)
tf.random.set_seed(seed)
env.seed(seed)

# float32 machine epsilon as a plain Python float; a small value for
# stabilizing division operations.
eps = float(np.finfo(np.float32).eps)

In [None]:
# Annotation-only aliases for the two outputs of `ActorCritic.call`; both are
# plain `tf.Tensor` objects.
ActorPredictions = tf.Tensor
CriticPredictions = tf.Tensor

class ActorCritic(keras.Model):
  """Actor-critic network with one shared hidden layer and two heads.

  The shared ReLU layer feeds an actor head with `num_actions` outputs and a
  critic head with a single output. Neither head applies an activation.
  """

  def __init__(self, num_actions: int, num_hidden_units: int):
    """Creates the shared layer and the two output heads.

    Args:
      num_actions: Number of units in the actor head.
      num_hidden_units: Number of units in the shared hidden layer.
    """
    super().__init__()
    self.num_actions = num_actions
    self.num_hidden_units = num_hidden_units

    # Layers are instantiated in the order shared -> actor -> critic.
    self.common = keras.layers.Dense(
      units=num_hidden_units, activation=tf.nn.relu
    )
    self.actor = keras.layers.Dense(units=num_actions)
    self.critic = keras.layers.Dense(units=1)

  def call(
    self, inputs: tf.Tensor, **kwargs
  ) -> Tuple[ActorPredictions, CriticPredictions]:
    """Runs the shared layer, then both heads.

    Args:
      inputs: Tensor fed to the shared hidden layer.
      **kwargs: Forwarded unchanged to every layer call.

    Returns:
      A `(actor_output, critic_output)` tuple.
    """
    hidden = self.common(inputs, **kwargs)
    actor_output = self.actor(hidden, **kwargs)
    critic_output = self.critic(hidden, **kwargs)
    return actor_output, critic_output
    
# Size the actor head from the environment's discrete action space.
num_actions = env.action_space.n  # 2: move left and move right
# Width of the hidden layer shared by the actor and critic heads.
num_hidden_units = 28
# Single model instance built from the two sizes above.
model = ActorCritic(num_actions, num_hidden_units)


In [10]:
# Wrap OpenAI Gym's `env.step` call as an operation in a TensorFlow function.
# This allows it to be included in a callable TensorFlow graph.

# Annotation-only aliases: the action passed to `env_step` and each value it
# returns are annotated as NumPy arrays.
Action = np.ndarray
State = np.ndarray
Reward = np.ndarray
DoneFlag = np.ndarray
def env_step(action: Action) -> Tuple[State, Reward, DoneFlag]:
  """Steps the module-level `env` once and returns the results as NumPy values.

  Args:
    action: Action passed unchanged to `env.step`.

  Returns:
    A `(state, reward, done)` tuple: the observation cast to float32, and the
    reward and done flag each converted to an int32 array.
  """
  # `env.step` returns four values; the trailing info dict is not used.
  observation, step_reward, is_done, _ = env.step(action)

  next_state = observation.astype(np.float32)
  reward_int = np.array(step_reward, np.int32)
  done_int = np.array(is_done, np.int32)
  return (next_state, reward_int, done_int)

def tf_env_step(action: tf.Tensor) -> List[tf.Tensor]:
  """Runs `env_step` as a TensorFlow op via `tf.numpy_function`.

  Args:
    action: Tensor holding the action to hand to `env_step`.

  Returns:
    The three outputs of `env_step` as tensors with dtypes float32, int32 and
    int32 (state, reward, done flag).
  """
  # The return annotation uses `typing.List` rather than the built-in
  # `list[...]`: annotations are evaluated when the function is defined, and
  # subscripting `list` raises TypeError on Python < 3.9. It also matches the
  # `typing` style used throughout this file.
  return tf.numpy_function(env_step, [action], [tf.float32, tf.int32, tf.int32])


In [13]:
# Annotation-only aliases for the three tensors returned by `run_episode`.
ActionProbs = tf.Tensor
Values = tf.Tensor
Rewards = tf.Tensor
def run_episode(
  initial_state: tf.Tensor,
  model: keras.Model,
  max_steps: int,
) -> Tuple[ActionProbs, Values, Rewards]:
  """Runs a single episode to collect training data.

  Args:
    initial_state: Unbatched observation the episode starts from; a batch
      dimension is added before each model call.
    model: Model that returns `(action_logits, value)` for a batched state.
    max_steps: Upper bound on the number of environment steps taken.

  Returns:
    A tuple `(action_probs, values, rewards)` of stacked tensors with one
    entry per step taken: the probability the policy assigned to the sampled
    action (float32), the critic's value estimate (float32), and the reward
    returned by the environment (int32).
  """
  action_probs = tf.TensorArray(dtype=tf.float32, size=0, dynamic_size=True)
  values = tf.TensorArray(dtype=tf.float32, size=0, dynamic_size=True)
  rewards = tf.TensorArray(dtype=tf.int32, size=0, dynamic_size=True)

  initial_state_shape = initial_state.shape
  state = initial_state

  for t in tf.range(max_steps):
    # Convert state into a batched tensor (batch size = 1)
    state = tf.expand_dims(state, 0)

    # Run the model to get action logits and the critic value. The model is
    # invoked through `__call__` rather than `.call` so Keras' own call
    # handling is not bypassed.
    action_logits_t, value = model(state)

    # Sample next action from the action probability distribution
    action = tf.random.categorical(action_logits_t, 1)[0, 0]
    action_probs_t = tf.nn.softmax(action_logits_t)

    # Store critic values
    values = values.write(t, tf.squeeze(value))

    # Store the probability (not the log probability) of the action chosen
    action_probs = action_probs.write(t, action_probs_t[0, action])

    # Apply action to the environment to get next state and reward
    state, reward, done = tf_env_step(action)
    # `tf.numpy_function` outputs carry no static shape; restore it.
    state.set_shape(initial_state_shape)

    # Store reward
    rewards = rewards.write(t, reward)

    # Stop as soon as the environment reports the episode has ended. The
    # condition must be the tensor itself: comparing a tensor to `True` with
    # `is` is a Python identity test that is always False, so the loop would
    # never exit early.
    if tf.cast(done, tf.bool):
      break

  action_probs = action_probs.stack()
  values = values.stack()
  rewards = rewards.stack()

  return action_probs, values, rewards
