In [1]:
import gym
import tensorflow as tf
from tensorflow import keras
import numpy as np
from collections import deque

env = gym.make("CartPole-v1")

  DESCRIPTOR = _descriptor.FileDescriptor(
  _descriptor.FieldDescriptor(
  _TENSORSHAPEPROTO_DIM = _descriptor.Descriptor(
  DESCRIPTOR = _descriptor.FileDescriptor(
  _descriptor.EnumValueDescriptor(
  _DATATYPE = _descriptor.EnumDescriptor(
  DESCRIPTOR = _descriptor.FileDescriptor(
  _descriptor.FieldDescriptor(
  _RESOURCEHANDLEPROTO_DTYPEANDSHAPE = _descriptor.Descriptor(
  DESCRIPTOR = _descriptor.FileDescriptor(
  _descriptor.FieldDescriptor(
  _TENSORPROTO = _descriptor.Descriptor(
  DESCRIPTOR = _descriptor.FileDescriptor(
  _descriptor.FieldDescriptor(
  _ATTRVALUE_LISTVALUE = _descriptor.Descriptor(
  'nearest': pil_image.NEAREST,
  'bilinear': pil_image.BILINEAR,
  'bicubic': pil_image.BICUBIC,
  if hasattr(pil_image, 'HAMMING'):
  if hasattr(pil_image, 'BOX'):
  if hasattr(pil_image, 'LANCZOS'):


In [2]:
from typing import Dict, Tuple
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import layers
from agent import Agent
from state import State
from action import Action
from variables import GAMMA, SIGMA

class BaselineActorCriticAgent(Agent):
    def __init__(self, num_actions, optimizer, discount:float=GAMMA) -> None:
        super().__init__(num_actions, optimizer)
        self.num_actions = num_actions
        self.optimizer = optimizer
        self.discount = discount
        self.dense1 = layers.Dense(64)
        self.dense2 = layers.Dense(64)
        self.actor  = layers.Dense(self.num_actions)        # Produce logits
        self.critic = layers.Dense(1)                      # Produce the state-value directly

    def call(self, inputs: tf.Tensor) -> Tuple[tf.Tensor, tf.Tensor]:
        x = self.dense2(self.dense1(inputs))
        # Then we produce the policy values
        action_logits = self.actor(x)                    # 1xnum_actions
        action_probs  = tf.nn.softmax(action_logits)     # 1xnum_actions probabilities
        # Avoid producing a tensor containing probability 0 for some actions.
        action_probs = tf.clip_by_value(action_probs, 1e-10, 1.0)
        # ... and the state value.
        state_value = self.critic(x)                    # 1x1
        return action_logits, action_probs, state_value

    def choose_action(self, state:np.ndarray) -> Tuple:
        action_logits, action_probs, state_value = self(
            tf.expand_dims(tf.cast(state, tf.float32), axis=0))
        # Sample from the actions probability distribution
        action = tf.random.categorical(action_logits, 1)
        return {
            'action': action.numpy()[0,0],
            'policy': action_probs[0],
            'value' : state_value[0]
        }

In [3]:
model = BaselineActorCriticAgent(2, Adam(learning_rate=1e-2, clipnorm=40.0))

In [4]:
def play_one_step(env, state):
    a_info = model.choose_action(state)
    next_state, reward, done, _ = env.step(a_info['action'])
    return (state, a_info, reward, next_state, done)

In [5]:
def play_episode(env, init_state):
    ep_steps = 0
    episode_buffer = deque([], maxlen=200)
    state = init_state
    for step in tf.range(200):
        ep_steps += 1
        state, a_info, reward, next_state, done = play_one_step(env, state)
        episode_buffer.append((state, a_info, reward, next_state, done))
        if done:
            break
        state = next_state
    states, actions, rewards, next_states, dones = (
        np.array([experience[i] for experience in list(episode_buffer)]) 
        for i in range(5))
    # Compute returns
    returns = []
    discounted_sum = tf.constant(0.0)
    for i in tf.range(ep_steps-1, -1, -1):
        discounted_sum = rewards[i] + 0.99 * discounted_sum
        returns.append(discounted_sum)
    returns = np.stack(returns[::-1])
    returns = (returns - tf.math.reduce_mean(returns)) / (tf.math.reduce_std(returns))
    return states, actions, rewards, returns, next_states, dones

In [6]:
def training_step(env, initial_state):
    # We open a GradientTape because we want to reverse these operations to obtain the gradient
    # of the loss with respect to the model's parameter
    with tf.GradientTape() as tape:
        # 1) Get the experience playing the episode
        states, actions, rewards, returns, next_states, dones = play_episode(env, initial_state)
        v_st_pred = tf.squeeze(tf.stack([a['value'] for a in actions]))
        a_probs = tf.stack([a['policy'] for a in actions])
        a_indices = tf.stack([a['action'] for a in actions])
        a_probs = tf.gather(a_probs, a_indices, batch_dims=1)
        a_log_probs = tf.math.log(a_probs)
        # Compute delta
        delta = returns - v_st_pred
        # Actor loss
        actor_loss = -tf.reduce_sum(tf.expand_dims(delta, axis=-1)*a_log_probs)
        # Critic loss
        critic_loss = tf.keras.losses.Huber(reduction=tf.keras.losses.Reduction.SUM)(v_st_pred, returns)
        # Entropy loss
        # entropy_loss = -tf.reduce_sum(a_log_probs*a_probs)
        # Total loss
        loss = tf.reduce_sum(actor_loss + critic_loss) #+ 0.01*entropy_loss)
    # 9) Obtain the gradient of the loss with respect to the model's parameters
    grads = tape.gradient(loss, model.trainable_variables)
    # 10) Apply the update
    model.optimizer.apply_gradients(zip(grads, model.trainable_variables))
    return rewards

In [7]:
from tqdm import trange

rewards_mean = 0
with trange(800) as t:
    for ep_num in t:
        initial_state = env.reset()
        rewards = training_step(env, initial_state)
        episode_reward = sum(rewards)
        rewards_mean = rewards_mean + (episode_reward-rewards_mean)/(ep_num+1)
        t.set_description(f'Episode reward: {episode_reward:.2f}, mean reward: {rewards_mean:.2f}')
        
env.close()

Episode reward: 17.00, mean reward: 20.43: 100%|██████████| 800/800 [01:35<00:00,  8.41it/s] 


With curiosity

In [11]:
model = BaselineActorCriticAgent(2, Adam(learning_rate=1e-2, clipnorm=40.0))

In [12]:
from keras import losses, Model
from keras.layers import Layer
from variables import *

class ICM(Model):
    def __init__(self, num_actions, optimizer, beta=BETA, eta=ETA, *args, **kwargs) -> None:
        super().__init__(*args, **kwargs)
        self.num_actions = num_actions
        self.optimizer = optimizer
        self.beta = beta                                # Weight of the forward model loss against the inverse model loss
        self.eta = eta                                  # Scaling factor for the intrinsic reward signal
        self.encoding_layer = EncodingLayer()
        self.forward_model  = ForwardModel(num_actions)
        self.inverse_model  = InverseModel(num_actions)

    def change_scaling_factor(self, new_eta) -> None:
        '''
        Function that allows dynamic changes to the scaling factor.
        '''
        self.eta = new_eta

    def call(self, inputs, training=False) -> tf.Tensor:
        # Inputs are the state St, action At and state St+1
        # States are [1,42,42,4] tensors, while action At is a [1,num_actions] tensor
        st, at, st1 = inputs
        # Computing state encodings
        e_st, e_st1 = self.encoding_layer((st, st1))
        # Predict the encoding of state st1 and the action.
        pred_e_st1 = self.forward_model((at, e_st))
        pred_at = self.inverse_model((e_st, e_st1))
        if training:
            # We compute the loss of the ICM. It's a composite loss, because we have two 
            # communicating modules:
            # - The loss of the forward model is a regression loss between the 
            #   ground truth encoding and the predicted one
            # - The loss of the inverse model is a cross-entropy loss between the
            #   ground truth action probability distribution and the predicted one.
            loss_inverse = losses.categorical_crossentropy(at, pred_at)
            loss_forward = losses.huber(e_st1, pred_e_st1, delta=1.0)
            loss_value = (1-self.beta)*tf.reduce_sum(loss_inverse) + self.beta*tf.reduce_sum(loss_forward)
            # Use the add_loss API to retrieve this value as a loss to minimize later
            self.add_loss(ICM_LW*loss_value)
        # Finally, compute the output (intrinsic reward)
        # ri = self.eta/2*tf.norm(pred_e_st1 - e_st1)
        ri = tf.math.minimum(CLIP_RE, self.eta/2*tf.norm(pred_e_st1 - e_st1))
        return ri


class EncodingLayer(Layer):
    '''
    Utility layer for computing the encodings of the states, separated from the rest since 
    encodings are shared between the inverse and forward models.
    '''
    def __init__(self) -> None:
        super().__init__()
        self.dense   = layers.Dense(288)
        self.dropout = layers.Dropout(0.2)
        self.flatten = layers.Flatten()
    
    def call(self, inputs) -> Tuple[tf.Tensor, tf.Tensor]:
        # Inputs are the states St and St+1, [1, 42, 42, 4] tensors
        st, st1 = inputs
        # Compute encoding of state st and st1
        # 1x288 <- 1x3x3x32 <- 1x6x6x32 <- 1x11x11x32 <- 1x21x21x32 <- 1x42x42x4
        e_st  = self.dense(st)
        e_st1 = self.dense(st1)
        return e_st, e_st1


class ForwardModel(Layer):
    '''
    The forward model of the ICM takes as input the action At (one-hot encoded)
    and the encoding of the state St (`e(St)`). It tries to predict the encoding
    of state St+1 (`pred[e(St+1)]`)
    '''
    def __init__(self, num_actions, *args, **kwargs) -> None:
        super().__init__(*args, **kwargs)
        self.num_actions = num_actions
        self.concat = layers.Concatenate(axis=1)
        self.dense1 = layers.Dense(128, activation='relu')                                                           # Original is 256
        self.dense2 = layers.Dense(288)

    def call(self, inputs) -> tf.Tensor:
        # Inputs: the action At and the encoding of the state e(St)
        at, e_st = inputs
        # at is [1, num_actions]
        # enc_st is [1, 288]
        x = self.concat([at, e_st])                 # [1, num_actions + 288]
        pred_e_st1 = self.dense2(self.dense1(x))    # [1, 288]
        return pred_e_st1


class InverseModel(Layer):
    '''
    The inverse model of the ICM takes as input the encoding of states St and St+1
    `e(St)` and `e(St+1)` and tries to predict the action (`pred[At]`).
    '''
    def __init__(self, num_actions, *args, **kwargs) -> None:
        super().__init__(*args, **kwargs)
        self.num_actions = num_actions
        self.concat  = layers.Concatenate(axis=1)
        self.dense1  = layers.Dense(128, activation='relu')                                                          # Original is 256
        self.dense2  = layers.Dense(self.num_actions, activation='softmax')

    def call(self, inputs) -> tf.Tensor:
        e_st, e_st1 = inputs
        # Concatenate the encodings
        e_states = self.concat([e_st, e_st1])             # [1, 288*2]
        # Dense layers for action prediction
        pred_at = self.dense2(self.dense1(e_states))      # [1, num_actions], probability distribution
        return pred_at


In [20]:
curiosity_model = ICM(2, optimizer=keras.optimizers.Adam(learning_rate=1e-2))

In [21]:
intrinsic_rewards = []

def play_one_step_with_curiosity(env, state):
    a_info = model.choose_action(state)
    next_state, reward, done, _ = env.step(a_info['action'])
    intrinsic_reward = curiosity_model(
        (tf.cast(tf.expand_dims(state, axis=0     ), tf.float32), 
         tf.cast(tf.expand_dims(tf.one_hot(a_info['action'], depth=2), axis=0), tf.float32), 
         tf.cast(tf.expand_dims(next_state, axis=0), tf.float32)),
         training=True
    )
    reward += intrinsic_reward
    intrinsic_rewards.append(intrinsic_reward)
    return (state, a_info, reward, next_state, done)

def play_episode_with_curiosity(env, init_state):
    ep_steps = 0
    episode_buffer = deque([], maxlen=200)
    state = init_state
    for step in tf.range(200):
        ep_steps += 1
        state, a_info, reward, next_state, done = play_one_step_with_curiosity(env, state)
        episode_buffer.append((state, a_info, reward, next_state, done))
        if done:
            break
        state = next_state
    states, actions, rewards, next_states, dones = (
        np.array([experience[i] for experience in list(episode_buffer)]) 
        for i in range(5))
    # Compute returns
    returns = []
    discounted_sum = tf.constant(0.0)
    for i in tf.range(ep_steps-1, -1, -1):
        discounted_sum = rewards[i] + 0.99 * discounted_sum
        returns.append(discounted_sum)
    returns = np.stack(returns[::-1])
    returns = (returns - tf.math.reduce_mean(returns)) / (tf.math.reduce_std(returns))
    return states, actions, rewards, returns, next_states, dones

def training_step_with_curiosity(env, initial_state):
    # We open a GradientTape because we want to reverse these operations to obtain the gradient
    # of the loss with respect to the model's parameter
    with tf.GradientTape(persistent=True) as tape:
        # 1) Get the experience playing the episode
        states, actions, rewards, returns, next_states, dones = play_episode_with_curiosity(env, initial_state)
        v_st_pred = tf.squeeze(tf.stack([a['value'] for a in actions]))
        a_probs = tf.stack([a['policy'] for a in actions])
        a_indices = tf.stack([a['action'] for a in actions])
        a_probs = tf.gather(a_probs, a_indices, batch_dims=1)
        a_log_probs = tf.math.log(a_probs)
        # Compute delta
        delta = returns - v_st_pred
        # Actor loss
        actor_loss = -tf.reduce_sum(tf.expand_dims(delta, axis=-1)*a_log_probs)
        # Critic loss
        critic_loss = tf.keras.losses.Huber(reduction=tf.keras.losses.Reduction.SUM)(v_st_pred, returns)
        # Entropy loss
        # entropy_loss = -tf.reduce_sum(a_log_probs*a_probs)
        # Total loss
        loss = tf.reduce_sum(actor_loss + critic_loss) #+ 0.01*entropy_loss)
        # Get curiosity loss
        intrinsic_loss = tf.reduce_sum(curiosity_model.losses)
        total_loss = loss + intrinsic_loss
    grads = tape.gradient(total_loss, model.trainable_variables)
    model.optimizer.apply_gradients(zip(grads, model.trainable_variables))
    grads_curiosity = tape.gradient(total_loss, curiosity_model.trainable_variables)
    curiosity_model.optimizer.apply_gradients(zip(grads_curiosity, curiosity_model.trainable_variables))
    del tape
    return rewards

In [22]:
from tqdm import trange

rewards_mean = 0
with trange(800) as t:
    for ep_num in t:
        initial_state = env.reset()
        rewards = training_step_with_curiosity(env, initial_state)
        episode_reward = sum(rewards)
        intrinsic_reward = sum(intrinsic_rewards)
        intrinsic_rewards.clear()
        rewards_mean = rewards_mean + (episode_reward-rewards_mean)/(ep_num+1)
        t.set_description(f'Episode reward: {episode_reward:.2f}, mean reward: {rewards_mean:.2f}, intrinsic: {intrinsic_reward:.2f}')
        
env.close()

Episode reward: 9.41, mean reward: 9.73, intrinsic: 0.41:  58%|█████▊    | 462/800 [01:02<00:45,  7.40it/s] 


KeyboardInterrupt: 