In [1]:
import collections
import gym
import numpy as np
import statistics
import tensorflow as tf
import tqdm

from matplotlib import pyplot as plt
from tensorflow.keras import layers
from typing import Any, List, Sequence, Tuple

# Create the environment
env = gym.make("CartPole-v0")

# Set seed for experiment reproducibility
seed = 42
env.seed(seed)
tf.random.set_seed(seed)
np.random.seed(seed)

# Small epsilon value for stabilizing division operations
eps = np.finfo(np.float32).eps.item()

Wiki of CartPole-v0: https://github-wiki-see.page/m/openai/gym/wiki/CartPole-v0

In [2]:
print(env.observation_space)

Box([-4.8000002e+00 -3.4028235e+38 -4.1887903e-01 -3.4028235e+38], [4.8000002e+00 3.4028235e+38 4.1887903e-01 3.4028235e+38], (4,), float32)


The envirionment returns a reward 1 at each step. Define the (step) reward threshold:

In [3]:
step_rew0 = 15 # step reward threshold

Implement a task of "achieving a position going beyond of range given by 'cart_pos'", and define a one-off reward and a probability threshold of archieving this task:

In [4]:
cart_pos = 0.10 # the position for the task

one_off_reward = 10 # one-off reward
task_prob0 = 0.8 # the probability threhold of archieving the above task

class PosTask():
  def __init__(
      self,
      ini_status=0):
    super().__init__()
    self.status = ini_status
    self.position = cart_pos
    
  def update(self, pos):
    ## The two 'if' conditoins ensures to gets a score 1 
    ## only when the target is achieved for the FIRST time.
    if self.status == 1:
        self.status = -1
    
    if abs(pos) >= self.position and self.status == 0:
        self.status = 1
    else:
        pass

  def check(self):
    return self.status

  def reset(self):
    self.status = 0
    
task = PosTask()

Each ActorCritic network models the environment for one robot. The variables "num_agents" and "num_tasks" below define the number of robots and tasks, respectively.

In [5]:
num_agents = 2
num_tasks = 1

class ActorCritic(tf.keras.Model):
  """Combined actor-critic network."""

  def __init__(
      self, 
      num_actions: int, 
      num_hidden_units: int,
      name=None):
    """Initialize."""
    super().__init__(name=name)

    self.common = layers.Dense(num_hidden_units, activation="relu")
    self.actor = layers.Dense(num_actions)
    ## Set "critic" layer dim to 'num_tasks+1'.
    self.critic = layers.Dense(num_tasks+1)

  def call(self, inputs: tf.Tensor) -> Tuple[tf.Tensor, tf.Tensor]:
    x = self.common(inputs)
    return self.actor(x), self.critic(x)

num_actions = env.action_space.n  # 2
num_hidden_units = 128

models = [ActorCritic(num_actions, num_hidden_units, name="AC{}".format(i)) for i in range(num_agents)]

2021-10-07 11:32:06.016560: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [6]:
## The reward includes a step reward and a task reward (score).
## Warning: the number of task is hardcode into the env_step function. To improve in future.

# Wrap OpenAI Gym's `env.step` call as an operation in a TensorFlow function.
# This would allow it to be included in a callable TensorFlow graph.

def env_step(state: np.ndarray, action: np.ndarray) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
  """Returns state, reward and done flag given an action."""

  state_new, step_reward, done, _ = env.step(action)
    
  ## Get a one-off reward when reaching the position threshold for the first time.   
  task.update(state_new[0])
  if task.check() == 1:
    np.append(state_new, -1.0)
    task_reward = int(one_off_reward)
  elif task.check() == 0:
    np.append(state_new, 0.0)
    task_reward = 0
  else:
    np.append(state_new, -1.0)
    task_reward = 0
    
  return (state.astype(np.float32), 
          #np.array(step_reward, np.int32),
          np.array([step_reward, task_reward], np.int32), 
          np.array(done, np.int32))

def tf_env_step(state: tf.Tensor, action: tf.Tensor) -> List[tf.Tensor]:
  #return tf.numpy_function(env_step, [action],
  return tf.numpy_function(env_step, [state, action],
                           [tf.float32, tf.int32, tf.int32])

In [7]:
def run_episode(
    initial_state: tf.Tensor,  
    model: tf.keras.Model, 
    max_steps: int) -> Tuple[tf.Tensor, tf.Tensor, tf.Tensor, tf.Tensor]:
  """Runs a single episode to collect training data."""

  action_probs = tf.TensorArray(dtype=tf.float32, size=0, dynamic_size=True)
  values = tf.TensorArray(dtype=tf.float32, size=0, dynamic_size=True)
  rewards = tf.TensorArray(dtype=tf.int32, size=0, dynamic_size=True)
    
  initial_state_shape = initial_state.shape
  state = initial_state
    
  for t in tf.range(max_steps):
    # Convert state into a batched tensor (batch size = 1)
    state1 = tf.expand_dims(state, 0)

    # Run the model and to get action probabilities and critic value
    action_logits_t, value = model(state1)

    # Sample next action from the action probability distribution
    action = tf.random.categorical(action_logits_t, 1)[0, 0]
    action_probs_t = tf.nn.softmax(action_logits_t)

    # Store critic values
    values = values.write(t, tf.squeeze(value))

    # Store log probability of the action chosen
    action_probs = action_probs.write(t, action_probs_t[0, action])

    # Apply action to the environment to get next state and reward
    state, reward, done = tf_env_step(state, action)
    state.set_shape(initial_state_shape)

    # Store reward
    rewards = rewards.write(t, reward)

    if tf.cast(done, tf.bool):
      break

  action_probs = action_probs.stack()
  values = values.stack()
  rewards = rewards.stack()
  
  ## Reset the task score at the end of each episode.
  task.reset()

  return action_probs, values, rewards

In [8]:
## Change "standardize" to "False", and 
## Instantiate "discounted_sum" to a list "[0.0]*(num_tasks+1)".

def get_expected_return(
    rewards: tf.Tensor, 
    gamma: float, 
    standardize: bool = False) -> tf.Tensor:
  """Compute expected returns per timestep."""

  n = tf.shape(rewards)[0]
  returns = tf.TensorArray(dtype=tf.float32, size=n)

  # Start from the end of `rewards` and accumulate reward sums
  # into the `returns` array
  rewards = tf.cast(rewards[::-1], dtype=tf.float32)
  #discounted_sum = tf.constant(0.0)
  discounted_sum = tf.constant([0.0]*(num_tasks+1))
  discounted_sum_shape = discounted_sum.shape
  for i in tf.range(n):
    reward = rewards[i]
    discounted_sum = reward + gamma * discounted_sum
    discounted_sum.set_shape(discounted_sum_shape)
    returns = returns.write(i, discounted_sum)
  returns = returns.stack()[::-1]

  if standardize:
    returns = ((returns - tf.math.reduce_mean(returns)) / 
               (tf.math.reduce_std(returns) + eps))

  return returns

In [9]:
## Some auxiliary functions for defining the "compute_loss" function.
mu = 1.0 / num_agents # fixed even probability of allocating each task to each agent 
lam = 1.0 
chi = 1.0
c = step_rew0  
e = task_prob0 * one_off_reward # task reward threshold

def df(x: tf.Tensor) -> tf.Tensor:
  """Threshold '<=c' is used as running rewards (not costs) are considered."""
  if x <= c:
    return 2*(x-c)
  else:
    return tf.convert_to_tensor(0.0)

def dh(x: tf.Tensor) -> tf.Tensor:
  if x <= e:
    return 2*(x-e)
  else:
    return tf.convert_to_tensor(0.0)

"""
[TO-FIX] Intend to implement the following derivative for the KL loss but get an error.
def dh(x: tf.Tensor) -> tf.Tensor:
  if x <= e and x > 0:
    return tf.math.log(x/e) - tf.math.log((1-x)/(1-e))
  else:
    return tf.convert_to_tensor(0.0)
"""

def compute_H(X: tf.Tensor, Xi: tf.Tensor) -> tf.Tensor:
  _, y = X.get_shape()
  ###Try to use tf.TensorArray to implement H but get an error.!!!
  H = [lam * df(Xi[0])]
  for j in range(1,y):
    H.append(chi * dh(tf.math.reduce_sum(mu * X[:,j])) * mu)
  return tf.expand_dims(tf.convert_to_tensor(H), 1)

In [10]:
## The compute_loss function (with the above aux definitions) implements our loss function
huber_loss = tf.keras.losses.Huber(reduction=tf.keras.losses.Reduction.SUM)

def compute_loss(
    action_probs: tf.Tensor,  
    values: tf.Tensor,  
    returns: tf.Tensor,
    ini_value: tf.Tensor,
    ini_values_i: tf.Tensor) -> tf.Tensor:
  """Computes the combined actor-critic loss."""

  #advantage = returns - values
  H = compute_H(ini_value, ini_values_i)
  advantage =  tf.matmul(returns - values, H)
  action_log_probs = tf.math.log(action_probs)
  actor_loss = tf.math.reduce_sum(action_log_probs * advantage)

  critic_loss = huber_loss(values, returns)
    
  #print(f'shape of action_log_probs:, {action_log_probs.get_shape()}')
  #print(f'shape of H:, {H.get_shape()}')
  #print(f'shape of advantage:, {advantage.get_shape()}')
  #print(f'shape of actor_loss:, {actor_loss.get_shape()}')
  #print(f'shape of critic_loss:, {critic_loss.get_shape()}')

  return actor_loss + critic_loss

In [11]:
## Have to use a smaller learning_rate to make the training convergent
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001) #0.01


## Comment out '@tf.function'. 
## This annotation gives me an incorrect (non-reasonable) result.
## Need to figure out whether it is possible to 
##  change our customerised implementation in order to use this feature.
##@tf.function
def train_step0(
    models: List[tf.keras.Model], 
    optimizer: tf.keras.optimizers.Optimizer, 
    gamma: float, 
    max_steps_per_episode: int) -> tf.Tensor:
  """Runs a model training step."""
  
  num_model = len(models)
  action_probs_l = []
  values_l = []
  rewards_l = []
  returns_l = []

  with tf.GradientTape() as tape:
        
    for i in range(num_model):
        
      ## add the location into the state 
      initial_state = env.reset()
      np.append(initial_state, 0.0)
      initial_state = tf.constant(env.reset(), dtype=tf.float32)

      # Run the model for one episode to collect training data
      action_probs, values, rewards = run_episode(
        initial_state, models[i], max_steps_per_episode) 

      # Calculate expected returns
      returns = get_expected_return(rewards, gamma)
      
      action_probs_l.append(action_probs)
      values_l.append(values)
      rewards_l.append(rewards)
      returns_l.append(returns)
    
    ini_values = tf.convert_to_tensor([x[0,:] for x in values_l])
    loss_l = []
    for i in range(num_model):

      # Convert training data to appropriate TF tensor shapes
      action_probs = tf.expand_dims(action_probs_l[i], 1)
      ## Don't need to convert the shapes of values and returns from our networks
      # action_probs, values, returns = [
      #  tf.expand_dims(x, 1) for x in [action_probs_l[i], values_l[i], returns_l[i]]]
      
      values = values_l[i]
      returns = returns_l[i]

      # Calculating loss values to update our network 
      ini_values_i = ini_values[i,:] 
      loss = compute_loss(action_probs, values, returns, ini_values, ini_values_i)
      loss_l.append(loss)
     
      #print(f'ini_values for model#{i}: {ini_values_i}')
      #print(f'loss value for model#{i}: {loss}')
      #print(f'returns for model#{i}: {returns[0]}')
  
  # Compute the gradients from the loss vector
  vars_l = [m.trainable_variables for m in models]
  grads_l = tape.gradient(loss_l, vars_l)

  # Apply the gradients to the model's parameters
  grads_l_f = [x for y in grads_l for x in y]
  vars_l_f = [x for y in vars_l for x in y]
  optimizer.apply_gradients(zip(grads_l_f, vars_l_f))

  episode_reward_l = [tf.math.reduce_sum(rewards_l[i]) for i in range(num_agents)]
  
  ## For convenience, just return the first episode_reward to the console. 
  ## To improve the 'tqdm.trange' code (below) in future.
  return episode_reward_l[0], ini_values

In [12]:
%%time

min_episodes_criterion = 100
max_episodes = 1000 #10000
max_steps_per_episode = 50 #1000

# Cartpole-v0 is considered solved if average reward is >= 195 over 100 
# consecutive trials
reward_threshold = 195
running_reward = 0

## No discount
# Discount factor for future rewards
gamma = 1.00 #0.99

# Keep last episodes reward
episodes_reward: collections.deque = collections.deque(maxlen=min_episodes_criterion)

with tqdm.trange(max_episodes) as t:
  for i in t:
    #initial_state = tf.constant(env.reset(), dtype=tf.float32)
    #episode_reward = int(train_step(
    #    initial_state, models, optimizer, gamma, max_steps_per_episode))
    episode_reward, ini_values = train_step0(
        models, optimizer, gamma, max_steps_per_episode)
    
    episode_reward = int(episode_reward)

    episodes_reward.append(episode_reward)
    running_reward = statistics.mean(episodes_reward)

    t.set_description(f'Episode {i}')
    t.set_postfix(
        episode_reward=episode_reward, running_reward=running_reward)

    # Show average episode reward every 10 episodes
    if i % 20 == 0:
        for k in range(num_agents):
          print(f'values at the initial state for model#{k}: {ini_values[k]}')
          #pass # print(f'Episode {i}: average reward: {avg_reward}')

    if running_reward > reward_threshold and i >= min_episodes_criterion:  
        break

print(f'\nSolved at episode {i}: average reward: {running_reward:.2f}!')

Episode 1:   0%|             | 1/1000 [00:00<02:49,  5.88it/s, episode_reward=19, running_reward=21]

values at the initial state for model#0: [-0.00256066  0.00394337]
values at the initial state for model#1: [ 0.00505149 -0.00811187]


Episode 21:   2%|▏        | 22/1000 [00:02<01:55,  8.45it/s, episode_reward=29, running_reward=27.4]

values at the initial state for model#0: [0.09545043 0.03662683]
values at the initial state for model#1: [0.07616318 0.02120497]


Episode 41:   4%|▍        | 42/1000 [00:05<02:12,  7.24it/s, episode_reward=49, running_reward=28.6]

values at the initial state for model#0: [0.26457986 0.10558325]
values at the initial state for model#1: [0.2251103  0.08345726]


Episode 61:   6%|▌        | 62/1000 [00:07<01:41,  9.23it/s, episode_reward=22, running_reward=27.9]

values at the initial state for model#0: [0.44333014 0.23096271]
values at the initial state for model#1: [0.42735445 0.18248394]


Episode 81:   8%|▋        | 81/1000 [00:10<02:01,  7.58it/s, episode_reward=21, running_reward=28.3]

values at the initial state for model#0: [0.6870237  0.38878995]
values at the initial state for model#1: [0.6863609 0.3827847]


Episode 101:  10%|▋      | 102/1000 [00:13<01:57,  7.65it/s, episode_reward=31, running_reward=28.4]

values at the initial state for model#0: [0.99241644 0.6302023 ]
values at the initial state for model#1: [1.0457394 0.5986865]


Episode 121:  12%|▊      | 122/1000 [00:15<01:49,  8.02it/s, episode_reward=18, running_reward=29.6]

values at the initial state for model#0: [1.4981639 1.0079836]
values at the initial state for model#1: [1.5420327 0.9719183]


Episode 141:  14%|▉      | 142/1000 [00:18<01:52,  7.63it/s, episode_reward=17, running_reward=28.2]

values at the initial state for model#0: [1.9467208 1.3807025]
values at the initial state for model#1: [2.089262  1.4075079]


Episode 162:  16%|█▏     | 162/1000 [00:21<01:43,  8.12it/s, episode_reward=19, running_reward=29.1]

values at the initial state for model#0: [2.5445607 1.8733883]
values at the initial state for model#1: [2.639825  1.9014337]


Episode 181:  18%|█▋       | 182/1000 [00:23<02:07,  6.40it/s, episode_reward=41, running_reward=29]

values at the initial state for model#0: [3.095605 2.366052]
values at the initial state for model#1: [3.398965  2.4790869]


Episode 201:  20%|█▍     | 202/1000 [00:26<01:35,  8.35it/s, episode_reward=40, running_reward=29.2]

values at the initial state for model#0: [3.7556157 2.9342217]
values at the initial state for model#1: [4.1455865 3.0917428]


Episode 221:  22%|█▌     | 222/1000 [00:28<01:37,  7.95it/s, episode_reward=17, running_reward=28.4]

values at the initial state for model#0: [4.4885097 3.5957696]
values at the initial state for model#1: [4.911976  3.7417238]


Episode 241:  24%|█▋     | 242/1000 [00:31<01:45,  7.19it/s, episode_reward=11, running_reward=29.8]

values at the initial state for model#0: [5.335581 4.339545]
values at the initial state for model#1: [5.605027 4.406044]


Episode 261:  26%|█▊     | 262/1000 [00:33<01:32,  7.94it/s, episode_reward=31, running_reward=28.9]

values at the initial state for model#0: [5.9752126 4.844207 ]
values at the initial state for model#1: [6.5111465 5.085073 ]


Episode 281:  28%|█▉     | 282/1000 [00:36<01:27,  8.21it/s, episode_reward=16, running_reward=28.5]

values at the initial state for model#0: [6.7773743 5.4806714]
values at the initial state for model#1: [7.284358 5.725028]


Episode 300:  30%|██     | 301/1000 [00:38<01:24,  8.23it/s, episode_reward=29, running_reward=28.4]

values at the initial state for model#0: [7.642619  6.0896263]
values at the initial state for model#1: [8.278649  6.3955555]


Episode 321:  32%|██▎    | 322/1000 [00:41<01:20,  8.38it/s, episode_reward=28, running_reward=27.7]

values at the initial state for model#0: [8.264267 6.371301]
values at the initial state for model#1: [9.186085  6.8859615]


Episode 341:  34%|██▍    | 342/1000 [00:44<01:27,  7.52it/s, episode_reward=47, running_reward=27.5]

values at the initial state for model#0: [8.966725 6.681676]
values at the initial state for model#1: [9.99425  7.232072]


Episode 361:  36%|██▌    | 362/1000 [00:46<01:22,  7.78it/s, episode_reward=49, running_reward=28.1]

values at the initial state for model#0: [9.927776  7.0238786]
values at the initial state for model#1: [10.806014   7.4932666]


Episode 382:  38%|██▋    | 383/1000 [00:49<01:04,  9.55it/s, episode_reward=12, running_reward=27.9]

values at the initial state for model#0: [10.61826   7.104053]
values at the initial state for model#1: [11.122501  7.401661]


Episode 401:  40%|██▊    | 402/1000 [00:51<01:10,  8.52it/s, episode_reward=33, running_reward=27.9]

values at the initial state for model#0: [11.174941   7.1280923]
values at the initial state for model#1: [11.415193  7.311904]


Episode 422:  42%|██▉    | 423/1000 [00:54<01:01,  9.43it/s, episode_reward=24, running_reward=28.8]

values at the initial state for model#0: [11.82047   7.160266]
values at the initial state for model#1: [11.891487   7.2982883]


Episode 441:  44%|███▉     | 442/1000 [00:56<01:21,  6.82it/s, episode_reward=26, running_reward=29]

values at the initial state for model#0: [12.15732    7.1057777]
values at the initial state for model#1: [12.377342   7.3188143]


Episode 461:  46%|███▏   | 462/1000 [00:59<01:02,  8.56it/s, episode_reward=28, running_reward=28.9]

values at the initial state for model#0: [12.623369   7.0621448]
values at the initial state for model#1: [12.869357   7.2421885]


Episode 481:  48%|███▎   | 482/1000 [01:02<01:17,  6.69it/s, episode_reward=34, running_reward=29.9]

values at the initial state for model#0: [13.003497  7.077408]
values at the initial state for model#1: [13.008285   7.1705284]


Episode 501:  50%|████▌    | 502/1000 [01:04<01:02,  7.99it/s, episode_reward=24, running_reward=30]

values at the initial state for model#0: [13.429703  6.943088]
values at the initial state for model#1: [13.737362   7.1550636]


Episode 521:  52%|███▋   | 522/1000 [01:07<01:13,  6.47it/s, episode_reward=21, running_reward=29.8]

values at the initial state for model#0: [13.667142   6.7700133]
values at the initial state for model#1: [13.928774  7.006451]


Episode 540:  54%|███▊   | 541/1000 [01:10<01:10,  6.48it/s, episode_reward=46, running_reward=30.4]

values at the initial state for model#0: [14.119379   7.0164065]
values at the initial state for model#1: [13.953038  7.049357]


Episode 561:  56%|███▉   | 562/1000 [01:14<01:05,  6.72it/s, episode_reward=21, running_reward=31.8]

values at the initial state for model#0: [14.644855  7.36516 ]
values at the initial state for model#1: [14.083061   7.2496037]


Episode 581:  58%|████   | 582/1000 [01:17<01:11,  5.81it/s, episode_reward=20, running_reward=31.9]

values at the initial state for model#0: [14.919836  7.20123 ]
values at the initial state for model#1: [14.019248  7.027171]


Episode 600:  60%|█████▍   | 601/1000 [01:20<00:54,  7.35it/s, episode_reward=33, running_reward=32]

values at the initial state for model#0: [15.025076   6.9337616]
values at the initial state for model#1: [14.188987   6.8432226]


Episode 621:  62%|█████▌   | 622/1000 [01:23<00:53,  7.09it/s, episode_reward=24, running_reward=32]

values at the initial state for model#0: [14.792558  6.810871]
values at the initial state for model#1: [14.653498   6.9139433]


Episode 641:  64%|████▍  | 642/1000 [01:26<00:43,  8.31it/s, episode_reward=27, running_reward=30.8]

values at the initial state for model#0: [14.602512   6.6130404]
values at the initial state for model#1: [14.931447   6.8231187]


Episode 661:  66%|████▋  | 662/1000 [01:28<00:42,  7.87it/s, episode_reward=38, running_reward=29.4]

values at the initial state for model#0: [14.421197  6.488619]
values at the initial state for model#1: [15.138824   6.7897077]


Episode 681:  68%|████▊  | 682/1000 [01:31<00:39,  8.15it/s, episode_reward=20, running_reward=28.1]

values at the initial state for model#0: [14.290456  6.493517]
values at the initial state for model#1: [15.160047  6.843665]


Episode 701:  70%|████▉  | 702/1000 [01:33<00:36,  8.08it/s, episode_reward=18, running_reward=27.7]

values at the initial state for model#0: [14.067087  6.643815]
values at the initial state for model#1: [15.164185   7.0896025]


Episode 721:  72%|█████  | 722/1000 [01:35<00:38,  7.25it/s, episode_reward=26, running_reward=27.2]

values at the initial state for model#0: [13.875162   6.6361885]
values at the initial state for model#1: [15.173551   7.1667314]


Episode 741:  74%|██████▋  | 742/1000 [01:38<00:29,  8.71it/s, episode_reward=21, running_reward=27]

values at the initial state for model#0: [13.655541   6.5967255]
values at the initial state for model#1: [15.106042   7.1550817]


Episode 761:  76%|██████▊  | 762/1000 [01:40<00:28,  8.30it/s, episode_reward=20, running_reward=27]

values at the initial state for model#0: [13.888246   6.6174707]
values at the initial state for model#1: [15.127593   7.0833616]


Episode 781:  78%|█████▍ | 782/1000 [01:43<00:25,  8.41it/s, episode_reward=24, running_reward=27.4]

values at the initial state for model#0: [13.9094515  6.5291166]
values at the initial state for model#1: [15.050762  6.992025]


Episode 801:  80%|█████▌ | 802/1000 [01:45<00:26,  7.55it/s, episode_reward=27, running_reward=27.5]

values at the initial state for model#0: [13.884877   6.4417157]
values at the initial state for model#1: [15.145258   7.0390997]


Episode 821:  82%|█████▊ | 822/1000 [01:48<00:28,  6.35it/s, episode_reward=26, running_reward=28.9]

values at the initial state for model#0: [14.116487   6.5390687]
values at the initial state for model#1: [15.112576  7.068258]


Episode 841:  84%|█████▉ | 842/1000 [01:51<00:25,  6.14it/s, episode_reward=29, running_reward=29.5]

values at the initial state for model#0: [14.393679   6.6225595]
values at the initial state for model#1: [15.018619  6.977735]


Episode 861:  86%|██████ | 862/1000 [01:54<00:20,  6.82it/s, episode_reward=25, running_reward=29.5]

values at the initial state for model#0: [14.478207   6.6866546]
values at the initial state for model#1: [15.025182   7.0251617]


Episode 881:  88%|██████▏| 882/1000 [01:57<00:16,  7.28it/s, episode_reward=49, running_reward=29.4]

values at the initial state for model#0: [14.246804   6.4539127]
values at the initial state for model#1: [15.014981   6.8749294]


Episode 900:  90%|██████▎| 901/1000 [01:59<00:16,  5.91it/s, episode_reward=21, running_reward=29.9]

values at the initial state for model#0: [14.131335  6.534855]
values at the initial state for model#1: [14.906793   6.9316015]


Episode 921:  92%|██████▍| 922/1000 [02:03<00:09,  7.90it/s, episode_reward=23, running_reward=29.5]

values at the initial state for model#0: [14.4303665  6.6874723]
values at the initial state for model#1: [14.775463  6.996707]


Episode 940:  94%|██████▌| 941/1000 [02:05<00:09,  6.44it/s, episode_reward=59, running_reward=29.8]

values at the initial state for model#0: [14.425964  6.68997 ]
values at the initial state for model#1: [14.419263   6.8500514]


Episode 960:  96%|██████▋| 961/1000 [02:08<00:05,  7.58it/s, episode_reward=28, running_reward=30.1]

values at the initial state for model#0: [14.557999  6.864701]
values at the initial state for model#1: [14.131088  6.857766]


Episode 981:  98%|██████▊| 981/1000 [02:11<00:02,  7.06it/s, episode_reward=23, running_reward=31.2]

values at the initial state for model#0: [14.837031  7.10012 ]
values at the initial state for model#1: [14.359616  7.104296]


Episode 999: 100%|██████| 1000/1000 [02:14<00:00,  7.45it/s, episode_reward=17, running_reward=30.6]


Solved at episode 999: average reward: 30.57!
CPU times: user 2min 13s, sys: 1.57 s, total: 2min 14s
Wall time: 2min 14s





### About the Experiment Outcome
The outcome data makes sense, because we can observe the following convergeces for both models:

- The two intial **step reward** cirtic values converge to a value below the threshold `step_rew0 = 15`, and 
- The two intial **task reward** cirtic values converge to a value below the threshold `task_prob0 * one_off_reward = 8`. 

This experiment setting use a position parameter `cart_pos = 0.10` and the maximum step per episode `max_steps_per_episode = 50`

In [13]:
for i in range(num_agents):
    print(models[i].summary())

Model: "AC0"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                multiple                  640       
_________________________________________________________________
dense_1 (Dense)              multiple                  258       
_________________________________________________________________
dense_2 (Dense)              multiple                  258       
Total params: 1,156
Trainable params: 1,156
Non-trainable params: 0
_________________________________________________________________
None
Model: "AC1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_3 (Dense)              multiple                  640       
_________________________________________________________________
dense_4 (Dense)              multiple                  258       
___________________________________________

In [14]:
#models[0].trainable_weights

In [15]:
#models[-1].trainable_weights

In [16]:
#from IPython import get_ipython; get_ipython().magic('reset -sf')