In [0]:
!pip install tensorflow==2.0.0-rc0
!pip install gym

In [0]:
import tensorflow as tf
import gym
import numpy as np
import scipy.signal

In [0]:
def mlp(ob_space, hidden_sizes=(32,), activation='tanh'):
  """Build a fully connected Keras network for observations from `ob_space`.

  Args:
    ob_space: object exposing a `shape` tuple; it fixes the per-sample
      input shape of the network.
    hidden_sizes: layer widths. Every entry but the last becomes a Dense
      layer using `activation`; the last entry is the width of the final
      Dense layer, which has no activation.
    activation: activation passed to every layer except the final one.

  Returns:
    A built `tf.keras.Sequential` model.
  """
  # All but the last width are activated hidden layers.
  stack = [tf.keras.layers.Dense(units=width, activation=activation)
           for width in hidden_sizes[:-1]]
  # The output layer is left linear.
  stack.append(tf.keras.layers.Dense(units=hidden_sizes[-1]))
  net = tf.keras.Sequential(stack)
  # Leading None is the (variable) batch dimension.
  net.build(input_shape=(None,) + ob_space.shape)
  return net

class PPOAgent():
  """Actor-critic agent for environments with a discrete action space.

  Holds two separate MLPs built by `mlp`: an actor that outputs one logit per
  action and a critic that outputs a single value per observation.
  """

  def __init__(self, ob_space, ac_space, hidden_sizes=(64, 64), activation='tanh'):
    """Create the actor and critic networks.

    Args:
      ob_space: observation space; forwarded to `mlp` to fix the input shape.
      ac_space: action space; its `n` attribute is used as the number of actions.
      hidden_sizes: widths of the hidden layers shared by both networks.
      activation: activation for the hidden layers of both networks.
    """
    self.act_dim = ac_space.n
    # Forward `activation` so the constructor argument actually takes effect
    # (it used to be accepted but ignored).
    self.actor_mlp = mlp(ob_space=ob_space,
                         hidden_sizes=list(hidden_sizes)+[self.act_dim],
                         activation=activation)
    self.critic_mlp = mlp(ob_space=ob_space,
                          hidden_sizes=list(hidden_sizes)+[1],
                          activation=activation)

  @tf.function
  def __call__(self, observations):
    """Sample actions for a batch of observations.

    Returns:
      A tuple `(pi, logp_pi, vf)`: the sampled action per row, the
      log-probability of each sampled action, and the critic output
      (not squeezed, so it keeps its trailing axis of size 1).
    """
    logits = self.actor_mlp(observations)
    logp_all = tf.nn.log_softmax(logits)
    # One sample per row; squeeze drops the num_samples axis.
    pi = tf.squeeze(tf.random.categorical(logits, num_samples=1, seed=0), axis=1)
    # The one-hot mask picks out the log-probability of the sampled action.
    logp_pi = tf.reduce_sum(tf.one_hot(pi, depth=self.act_dim) * logp_all, axis=1)
    vf = self.critic_mlp(observations)
    return pi, logp_pi, vf

  @tf.function
  def get_logp(self, observations, actions):
    """Return the log-probability of each given action under the current actor."""
    logits = self.actor_mlp(observations)
    logp_all = tf.nn.log_softmax(logits)
    return tf.reduce_sum(tf.one_hot(actions, depth=self.act_dim) * logp_all, axis=1)

  @tf.function
  def get_v(self, observations):
    """Return the critic's value estimates with the trailing size-1 axis removed."""
    return tf.squeeze(self.critic_mlp(observations), axis=1)

In [0]:
def discount_cumsum(x, discount):
    """Return the discounted cumulative sums of `x` along axis 0.

    Element t of the result is x[t] + discount * x[t+1] + discount**2 * x[t+2] + ...

    Args:
        x: sequence or array of values, indexed by time along axis 0.
        discount: per-step discount factor.
    """
    backwards = x[::-1]
    # On the reversed sequence the sum is the recursion y[n] = x[n] + discount * y[n-1],
    # which is exactly this IIR filter.
    accumulated = scipy.signal.lfilter([1], [1, float(-discount)], backwards, axis=0)
    return accumulated[::-1]

class PPOBuffer:
    """Fixed-size store for one epoch of on-policy experience.

    Transitions are appended with `store`; each trajectory (or trajectory
    fragment) is closed with `finish_path`, which fills in GAE-Lambda
    advantages and rewards-to-go; `get` returns the whole epoch once the
    buffer is full and resets the write pointer.
    """

    def __init__(self, ob_space, ac_space, size, gamma=0.99, lam=0.97):
        """Allocate the buffers.

        Args:
            ob_space: object with `shape` and `dtype`; defines the observation buffer.
            ac_space: object with `shape` and `dtype`; defines the action buffer.
            size: number of transitions the buffer holds.
            gamma: discount factor.
            lam: GAE-Lambda factor.
        """
        self.obs_buf = np.zeros((size,) + ob_space.shape, dtype=ob_space.dtype)
        self.act_buf = np.zeros((size,) + ac_space.shape, dtype=ac_space.dtype)
        self.adv_buf = np.zeros(size, dtype=np.float32)
        self.rew_buf = np.zeros(size, dtype=np.float32)
        self.ret_buf = np.zeros(size, dtype=np.float32)
        self.val_buf = np.zeros(size, dtype=np.float32)
        self.logp_buf = np.zeros(size, dtype=np.float32)
        self.gamma, self.lam = gamma, lam
        # ptr: next free slot; path_start_idx: first slot of the open trajectory.
        self.ptr, self.path_start_idx, self.max_size = 0, 0, size

    def store(self, obs, act, rew, val, logp):
        """Append one transition. The buffer must not be full."""
        assert self.ptr < self.max_size
        self.obs_buf[self.ptr] = obs
        self.act_buf[self.ptr] = act
        self.rew_buf[self.ptr] = rew
        self.val_buf[self.ptr] = val
        self.logp_buf[self.ptr] = logp
        self.ptr += 1

    def finish_path(self, last_val=0):
        """Close the currently open trajectory.

        Computes advantages and rewards-to-go for the slots written since the
        previous call (or since the last `get`).

        Args:
            last_val: value appended after the last reward and the last value
                estimate, i.e. the bootstrap for whatever follows the final
                stored step.
        """
        path_slice = slice(self.path_start_idx, self.ptr)
        rews = np.append(self.rew_buf[path_slice], last_val)
        vals = np.append(self.val_buf[path_slice], last_val)
        # the next two lines implement GAE-Lambda advantage calculation
        deltas = rews[:-1] + self.gamma * vals[1:] - vals[:-1]
        self.adv_buf[path_slice] = discount_cumsum(deltas, self.gamma * self.lam)
        # the next line computes rewards-to-go, to be targets for the value function
        self.ret_buf[path_slice] = discount_cumsum(rews, self.gamma)[:-1]
        self.path_start_idx = self.ptr

    def get(self):
        """Return the full epoch of data and reset the buffer for reuse.

        Advantages are normalized to zero mean and unit standard deviation.

        Returns:
            `[obs_buf, act_buf, adv_buf, ret_buf, logp_buf]`.
        """
        assert self.ptr == self.max_size    # buffer has to be full before you can get
        self.ptr, self.path_start_idx = 0, 0
        # the next lines implement the advantage normalization trick
        adv_mean = np.mean(self.adv_buf)
        # Clamp the divisor: when every advantage is identical the std is 0 and
        # an unguarded division would fill the buffer with NaNs. Results are
        # unchanged whenever the std exceeds the floor.
        adv_std = max(np.std(self.adv_buf), 1e-8)
        self.adv_buf = (self.adv_buf - adv_mean) / adv_std
        return [self.obs_buf, self.act_buf, self.adv_buf,
                self.ret_buf, self.logp_buf]

In [0]:
def update(obs, acs, advs, rets, logp_olds, agent, opt_pi, opt_v,
           train_pi_iters=80, train_v_iters=80, clip_ratio=0.2, target_kl=0.01):
  """Run one round of policy and value-function optimization on a batch.

  Args:
    obs: batch of observations passed to the agent.
    acs: actions taken for `obs`.
    advs: advantage estimate per sample.
    rets: value-function regression target per sample.
    logp_olds: log-probabilities of `acs` recorded when the data was collected.
    agent: object providing `actor_mlp`, `critic_mlp`, `get_logp` and `get_v`.
    opt_pi: optimizer applied to the actor weights.
    opt_v: optimizer applied to the critic weights.
    train_pi_iters: maximum number of policy gradient steps.
    train_v_iters: number of value-function gradient steps.
    clip_ratio: clipping range used in the surrogate objective.
    target_kl: policy steps stop early once the KL estimate exceeds
      1.5 times this value.

  Returns:
    `(pi_loss, v_loss)` as NumPy values: the losses computed in the last
    executed iteration of each loop.
  """
  actor_weights = agent.actor_mlp.trainable_weights
  critic_weights = agent.critic_mlp.trainable_weights
  # NOTE(review): both nested tf.functions are re-created on every call to
  # `update`, so they are presumably re-traced each time — confirm if this
  # becomes a performance concern.
  @tf.function
  def update_pi():
    pi_loss = 0.
    for i in tf.range(train_pi_iters):
      with tf.GradientTape() as tape:
        logp = agent.get_logp(obs, acs)
        # Probability ratio between the current policy and the data-collecting one.
        ratio = tf.exp(logp - logp_olds)
        # Clipped bound: (1+clip) * adv for positive advantages, (1-clip) * adv otherwise.
        min_adv = tf.where(advs > 0, (1+clip_ratio)*advs, (1-clip_ratio)*advs)
        pi_loss = -tf.reduce_mean(tf.minimum(ratio * advs, min_adv))
      grads = tape.gradient(pi_loss, actor_weights)
      opt_pi.apply_gradients(zip(grads, actor_weights))
      # Sample-based KL estimate; it uses `logp` evaluated before this
      # iteration's gradient step was applied.
      kl = tf.reduce_mean(logp_olds - logp)
      if kl > 1.5 * target_kl:
        break
    return pi_loss

  @tf.function
  def update_v():
    v_loss = 0.
    for i in tf.range(train_v_iters):
      with tf.GradientTape() as tape:
        v = agent.get_v(obs)
        # Mean squared error against the return targets.
        v_loss = tf.reduce_mean((rets - v)**2)
      grads = tape.gradient(v_loss, critic_weights)
      opt_v.apply_gradients(zip(grads, critic_weights))
    return v_loss

  return update_pi().numpy(), update_v().numpy()

In [0]:
def ppo(env_name='CartPole-v1', steps_per_epoch=4000, epochs=50, pi_lr=3e-4, vf_lr=1e-3):
  """Train a PPO agent on a Gym environment with a discrete action space.

  Each epoch collects `steps_per_epoch` environment steps with the current
  policy, then calls `update` once on the collected batch and prints the
  average return of the episodes that finished during that epoch.

  Args:
    env_name: id passed to `gym.make`.
    steps_per_epoch: number of environment steps (and buffer size) per epoch.
    epochs: number of collect/update rounds.
    pi_lr: learning rate of the policy optimizer.
    vf_lr: learning rate of the value-function optimizer.

  Returns:
    `(agent, env)`: the trained agent and the environment it was trained on.
  """
  env = gym.make(env_name)
  ob_space = env.observation_space
  ac_space = env.action_space
  agent = PPOAgent(ob_space, ac_space)
  opt_pi, opt_v = tf.optimizers.Adam(pi_lr), tf.optimizers.Adam(vf_lr)
  # Experience PPO buffer from SpinningUp.
  buf = PPOBuffer(ob_space, ac_space, steps_per_epoch)
  o, ep_ret = env.reset(), 0

  # Main loop: collect experience in env and update/log each epoch
  for epoch in range(epochs):
    Ep_Ret = []
    for t in range(steps_per_epoch):
      pi, logp_pi, vf = agent(o.reshape(1, -1))
      a = pi.numpy()[0]
      logp_t = logp_pi.numpy()[0]
      v_t = vf.numpy()[0, 0]  # critic output is (1, 1); store a scalar

      # Step first so the reward stored with (o, a) is the one that action
      # produced, not the reward left over from the previous step.
      next_o, r, d, _ = env.step(a)
      buf.store(o, a, r, v_t, logp_t)
      o = next_o
      ep_ret += r

      if d or t == steps_per_epoch - 1:
        # A finished episode has nothing left to collect, so bootstrap with 0;
        # a trajectory cut off by the epoch boundary bootstraps from the critic.
        last_val = 0 if d else agent.get_v(o.reshape(1, -1)).numpy()[0]
        buf.finish_path(last_val)
        if d:
          # Only completed episodes count towards the logged return.
          Ep_Ret.append(ep_ret)
        o, ep_ret = env.reset(), 0

    obs, acs, advs, rets, logp_olds = buf.get()
    update(obs, acs, advs, rets, logp_olds, agent, opt_pi, opt_v)
    # No episode may have finished within the epoch; avoid np.mean([]).
    avg_ret = np.mean(Ep_Ret) if Ep_Ret else float('nan')
    print('epoch {}, avg episode return {}'.format(epoch, avg_ret))

  return agent, env

In [0]:
# Train with ppo()'s default arguments; keep the returned agent and environment
# for the evaluation cell below.
agent, env = ppo()

In [0]:
# Roll out the trained agent for 20 episodes and print each episode's return.
obs = env.reset()
reward = 0  # return accumulated in the current episode
epoch = 0   # number of finished episodes
while epoch < 20:
  # PPOAgent samples actions through __call__; it defines no
  # `get_pi_logpi_vf` method, so calling that raised AttributeError.
  action, _, _ = agent(obs.reshape(1, -1))
  obs, r, d, _ = env.step(action.numpy()[0])
  reward += r
  # env.render()
  if d:
    print('episode reward {}'.format(reward))
    reward = 0
    epoch += 1
    obs = env.reset()

In [0]:
# Scratch cell: creates a fresh CartPole-v1 environment and rebinds the global `env`.
env = gym.make('CartPole-v1')

In [0]:
# Display the shape of the environment's observation space.
env.observation_space.shape

In [0]:
# Display `n` of the action space (PPOAgent uses this value as its action dimension).
env.action_space.n