[View in Colaboratory](https://colab.research.google.com/github/ysulsky/notebooks/blob/master/DDPG.ipynb)

## Following https://arxiv.org/abs/1509.02971

---
**Bellman Equation for Q-functions, with a deterministic policy**

$Q(s_t, a_t | \theta_Q) = y_t$

where:

$y_t = E_{r_t,s_{t+1}}\big[r_t + \gamma Q(s_{t+1}, a_{t+1} | \theta'_Q)\big]$ \\
$r_t \sim R(s_t, a_t)$ \\
$s_{t + 1} \sim T(s_t, a_t)$ \\
$a_{t + 1} = \pi(s_{t + 1} | \theta'_\pi)$

---
**Critic Objective**

$L_Q(s_t, a_t, \theta_Q) = (Q(s_t, a_t | \theta_Q) - y_t)^2$

---
**Policy Objective**

$L_\pi(s_t, \theta_Q, \theta_\pi) = -Q(s_t, \pi(s_t | \theta_\pi) | \theta_Q)$

---
**Target Update Rule**

$\theta'_Q \leftarrow \tau \cdot \theta_Q + (1 - \tau) \cdot \theta'_Q$ \\
$\theta'_\pi \leftarrow \tau \cdot \theta_\pi + (1 - \tau) \cdot \theta'_\pi$ \\

where $\tau \ll 1$

In [1]:
!pip install gym

Collecting gym
  Downloading gym-0.10.5.tar.gz (1.5MB)
[K    100% |████████████████████████████████| 1.5MB 648kB/s 
Collecting pyglet>=1.2.0 (from gym)
  Downloading pyglet-1.3.1-py2.py3-none-any.whl (1.0MB)
[K    100% |████████████████████████████████| 1.0MB 996kB/s 
Building wheels for collected packages: gym
  Running setup.py bdist_wheel for gym ... [?25l- \ | / done
[?25h  Stored in directory: /content/.cache/pip/wheels/f8/88/f2/22e53080a462567706fad31295462941b3b06b16b51c3ab3e1
Successfully built gym
Installing collected packages: pyglet, gym
Successfully installed gym-0.10.5 pyglet-1.3.1


In [0]:
#@title Imports

import numpy as np
import tensorflow as tf
import gym

In [0]:
#@title Replay Buffer

class ReplayBuffer(object):
  def __init__(self, max_size=10000):
    self._arr = []
    self._max_size = max_size
    self._start = 0
    
  def record_transition(self, transition):
    if len(self._arr) < self._max_size:
      self._arr.append(transition)
    else:
      self._arr[self._start] = transition
      self._start = (self._start + 1) % self._max_size
      
  def sample_batch(self, batch_size):
    # np.random.choice doesn't work here because self._arr isn't 1-D
    return [self._arr[i] 
            for i in np.random.randint(0, len(self._arr), size=batch_size)]

In [0]:
#@title Wrapper around Gym and ReplayBuffer to operate in the TF graph.

class Env(object):
  def __init__(self, name, reward_decay=0.99):
    self._env = gym.make(name)
    self._replay = ReplayBuffer()
    self._obs = self._env.reset()
    self._state_dtype = self.observation_space.dtype
    self._state_shape = self.observation_space.shape
    self._total_rewards = 0.
    self._num_episodes = 0
    self._current_episode_total = 0
    self._current_timestep = 0
    self._reward_decay = reward_decay

  
  def mean_episodic_reward(self):
    return ((self._total_rewards + self._current_episode_total) /
            (self._num_episodes + 1.))

  
  @property
  def reward_decay(self):
    return self._reward_decay

  
  @property
  def action_space(self):
    return self._env.action_space

  
  @property
  def observation_space(self):
    return self._env.observation_space
  

  def _py_observe(self):
    return np.asarray(self._obs, dtype=self.observation_space.dtype)
    
  
  def observe(self):
    state, = tf.py_func(self._py_observe, [], [self._state_dtype], 
                        name="observe")
    state.set_shape(self._state_shape)
    return tf.expand_dims(state, 0)  # Add a batch dimension.

  
  def _py_step(self, action):
    state = self._py_observe()
    self._obs, reward, done, _ = self._env.step(action)
    self._current_episode_total += (
        reward * self._reward_decay ** self._current_timestep)
    self._current_timestep += 1
    reward = np.asarray(reward, dtype=np.float32)
    next_state = self._py_observe()
    ret = (reward, next_state)
    if done:
      self._total_rewards += self._current_episode_total
      self._num_episodes += 1
      self._current_episode_total = 0
      self._obs = self._env.reset()
    return ret


  def step(self, action):
    action = tf.squeeze(action, axis=0)  # Batch dim must be 1.
    reward, state = tf.py_func(self._py_step,
                               [action], [self._state_dtype, tf.float32],
                               name="step")
    reward.set_shape(())
    state.set_shape(self._state_shape)
    return tf.expand_dims(reward, axis=0), tf.expand_dims(state, axis=0)
  
  
  def _py_record_transition_and_sample_minibatch(
      self, state, action, reward, next_state, batch_size):
    for transition in zip(state, action, reward, next_state):
      self._replay.record_transition(transition)
    state = np.zeros((batch_size,) + state.shape[1:], dtype=state.dtype)
    action = np.zeros((batch_size,) + action.shape[1:], dtype=action.dtype)
    reward = np.zeros((batch_size,), dtype=reward.dtype)
    next_state = np.zeros((batch_size,) + next_state.shape[1:],
                          dtype=next_state.dtype)
    for i, transition in enumerate(self._replay.sample_batch(batch_size)):
      state[i], action[i], reward[i], next_state[i] = transition
    return (state, action, reward, next_state)
  
  
  def record_transition_and_sample_minibatch(
      self, state, action, reward, next_state, batch_size):
    state_shape, action_shape, next_state_shape = (
        state.shape, action.shape, next_state.shape)
    state, action, reward, next_state = tf.py_func(
        self._py_record_transition_and_sample_minibatch,
        [state, action, reward, next_state, batch_size],
        [state.dtype, action.dtype, reward.dtype, next_state.dtype])
    state.set_shape([None] + state_shape[1:].as_list())
    action.set_shape([None] + action_shape[1:].as_list())
    reward.set_shape([None])
    next_state.set_shape([None] + next_state_shape[1:].as_list())
    return state, action, reward, next_state
    

In [0]:
#@title Network Definitions

# Hyper-parameters, from https://arxiv.org/abs/1509.02971.
policy_hidden_layers = (400, 300)
critic_hidden_layers = (400, 300)
use_replay_buffer = True
batch_size = 64  # Only used with replay buffers, otherwise batch_size = 1
target_weight_decay = (1. - 0.001)
policy_learning_rate = 1e-4
critic_learning_rate = 1e-2
policy_l2_regularization = 0.
critic_l2_regularization = 1e-2
exploration_stddev = 0.01  # Unlike the paper, we use simple Gaussian noise.


def make_mlp(input_, layer_sizes, l2_reg=0., name=None):
  with tf.name_scope(name, "MLP", [input_]):
    net = input_
    kernel_regularizer = tf.contrib.layers.l2_regularizer(l2_reg)
    for i, layer_size in enumerate(layer_sizes):
      if i + 1 == len(layer_sizes):
        name = "final"
        activation_fn=None
      else:
        name = "hidden_{}".format(i + 1)
        activation_fn=tf.nn.relu
      with tf.name_scope(name):
        net = tf.layers.dense(net, layer_size, 
                              kernel_regularizer=kernel_regularizer,
                              activation=activation_fn)
    return net


def make_critic(state, action, name=None):
  with tf.name_scope(name, "Critic", [state, action]):
    with tf.name_scope("input"):
      input_ = tf.concat([state, action], axis=-1)
    return tf.squeeze(make_mlp(input_, critic_hidden_layers + (1,),
                               l2_reg=critic_l2_regularization),
                      axis=-1, name="value_estimate")
  

def make_policy(state, action_space, name=None):
  with tf.name_scope(name, "Policy", [state]):
    unscaled = make_mlp(state, policy_hidden_layers + action_space.shape,
                        l2_reg=policy_l2_regularization, name="unscaled")
    high = tf.constant(action_space.high)
    low = tf.constant(action_space.low)
    return (0.5 + 0.5 * tf.tanh(unscaled)) * (high - low) + low
            

def record_and_sample(env, policy_network):
  state = env.observe()
  action = policy_network(state)
  reward, next_state = env.step(action)
  if not use_replay_buffer:
    return state, action, reward, next_state
  return env.record_transition_and_sample_minibatch(
      state, action, reward, next_state, batch_size)


def build_graph(env, training=True):
  policy_network = tf.make_template("Policy", make_policy,
                                    action_space=env.action_space)
  critic_network = tf.make_template("Critic", make_critic)
  ret = dict(policy_network=policy_network,
             critic_network=critic_network)
  
  if not training:
    return ret
  
  def _explore_policy(state):
    action = policy_network(state)
    return action + tf.random_normal(shape=tf.shape(action),
                                     stddev=exploration_stddev)
  
  state, action, reward, next_state = record_and_sample(env, _explore_policy)
  
  policy_loss = -tf.reduce_mean(
      critic_network(state, policy_network(state)), axis=0)
  policy_loss += tf.losses.get_regularization_loss("Policy")
  
  target_policy_network = tf.make_template("TargetPolicy", make_policy,
                                           action_space=env.action_space)
  target_critic_network = tf.make_template("TargetCritic", make_critic)
  next_target_action = target_policy_network(next_state)
  next_target_value = target_critic_network(next_state, next_target_action)
  next_target = reward + env.reward_decay * next_target_value

  critic_loss = tf.reduce_mean(
      tf.squared_difference(critic_network(state, action), next_target), axis=0)
  critic_loss += tf.losses.get_regularization_loss("Critic")
  
  get_vars = lambda s: tf.contrib.framework.get_trainable_variables(scope=s)
  policy_vars = get_vars("Policy")
  critic_vars = get_vars("Critic")
  target_policy_vars = get_vars("TargetPolicy")
  target_critic_vars = get_vars("TargetCritic")
  assert all([policy_vars, critic_vars, target_policy_vars, target_critic_vars])
  
  update_target_ops = []
  init_target_ops = []
  for target_var, orig_var in zip(target_policy_vars + target_critic_vars,
                                  policy_vars + critic_vars):
    update_target_ops.append(
        tf.assign(target_var, 
                  target_var * target_weight_decay + 
                  orig_var * (1. - target_weight_decay)))
    init_target_ops.append(tf.assign(target_var, orig_var))   
  init_target_networks = tf.group(*init_target_ops) 
  train_policy = (tf.train.AdamOptimizer(policy_learning_rate)
                  .minimize(policy_loss, 
                            global_step=tf.train.get_or_create_global_step(),
                            var_list=policy_vars))
  train_critic = (tf.train.AdamOptimizer(critic_learning_rate)
                  .minimize(critic_loss, var_list=critic_vars))
  with tf.control_dependencies([train_policy, train_critic]):
    train_op = tf.group(*update_target_ops)  

  ret.update(dict(
      policy_loss=policy_loss,
      critic_loss=critic_loss,
      init_target_networks=init_target_networks,
      train_op=train_op,
  ))
  
  return ret

In [0]:
!rm -rf /tmp/ddpg_pendulum

In [69]:
#@title Train

checkpoint_dir = "/tmp/ddpg_pendulum"
graph = tf.Graph()

with graph.as_default():
  env = Env('Pendulum-v0')
  init_vars_op = tf.global_variables_initializer()
  endpoints = build_graph(env)
  with tf.train.MonitoredTrainingSession(
      checkpoint_dir=checkpoint_dir, log_step_count_steps=10000) as sess:
    sess.run(init_vars_op)
    sess.run(endpoints["init_target_networks"])
    for step in range(100000):
      critic_loss_val, policy_loss_val, _ = sess.run(
          [endpoints["critic_loss"],
           endpoints["policy_loss"],
           endpoints["train_op"]])
      if step % 10000 == 0:
        print("critic loss: {}, policy loss: {}".format(
            critic_loss_val, policy_loss_val))
        print("mean episodic reward: {}".format(env.mean_episodic_reward()))

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
INFO:tensorflow:Scale of 0 disables regularizer.
INFO:tensorflow:Scale of 0 disables regularizer.
INFO:tensorflow:Scale of 0 disables regularizer.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 0 into /tmp/ddpg_pendulum/model.ckpt.
critic loss: 2.613407611846924, policy loss: 0.24113045632839203
mean episodic reward: -0.9461334090949051
INFO:tensorflow:global_step/sec: 146.291
critic loss: 0.646467924118042, policy loss: 28.840438842773438
mean episodic reward: -0.2378833737514516
INFO:tensorflow:global_step/sec: 146.166
critic loss: 0.3349519371986389, policy loss: 24.854310989379883
mean episodic reward: -0.12011932733

In [79]:
#@title Eval

env = gym.make('Pendulum-v0')
graph = tf.Graph()
with graph.as_default():
  endpoints = build_graph(env, training=False)
  policy_network = endpoints['policy_network']
  critic_network = endpoints['critic_network']
  
  state = env.reset()
  state_ph = tf.placeholder(tf.float32, shape=(None,) + state.shape)
  
  action = policy_network(state_ph)
  value = critic_network(state_ph, action)

  with tf.train.MonitoredSession(
      session_creator=tf.train.ChiefSessionCreator(
          checkpoint_dir=checkpoint_dir)) as sess:
    done = False
    undiscounted_reward = 0
    num_steps = 0
    while not done:
      action_val, value_val = sess.run(
          [action, value], feed_dict={
              state_ph: np.asarray(state[np.newaxis], dtype=np.float32)})
      state, reward, done, _ = env.step(action_val[0])
      undiscounted_reward += reward
      num_steps += 1
      # print("State:", state, "Action", action_val, "Value:", value_val)
  
  print("Done. Mean reward: {} ({} steps)".format(
      undiscounted_reward / num_steps, num_steps))

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
INFO:tensorflow:Scale of 0 disables regularizer.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /tmp/ddpg_pendulum/model.ckpt-100000
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
Done. Mean reward: -2.649027388043568 (200 steps)
