In [0]:
!git clone -b experimental https://git@github.com/wenkesj/alchemy.git ~/alchemy
!(cd ~/alchemy; pip install .)
!git clone https://github.com/openai/gym.git
!(cd ~/gym; pip install .)

## Imports

In [0]:
import gym
import numpy as np
import tensorflow as tf
import alchemy as ay

## Environment

In [3]:
env = gym.make('CartPole-v0')
action_value_shape = [2]
action_value_dtype = tf.float32
stream = ay.rl.SimpleReplayStream.from_gym_env(
    env, action_value_shape, action_value_dtype)

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m




## Graph

### `ay.rl.ReplayDataset`

In [0]:
hparams = tf.contrib.training.HParams(
    learning_rate=1e-3,
    initial_exploration=.5,
    discount=.99,
    exploration_decay_steps=2000,
    exploration_decay_rate=.99,
    max_sequence_length=200,
    num_episodes=2000,
    report_episode=50)


replay_dataset = ay.rl.ReplayDataset(
    stream, max_sequence_length=hparams.max_sequence_length)
replay_op = replay_dataset.make_one_shot_iterator().get_next()

"""
TODO(wenkesj): Add this to `alchemy`, as a sort-of "boostrapping".

>>> replay_op_with_advantage = ay.rl.add_advantage(replay_op)
"""

sequence_length = tf.squeeze(replay_op.sequence_length, 0)
discounted_reward_op = ay.rl.discount_rewards(
    replay_op.reward, 
    sequence_length=sequence_length,
    max_sequence_length=hparams.max_sequence_length, 
    discount=hparams.discount)

baseline_op = tf.cumsum(discounted_reward_op, reverse=False) / tf.cast(
    sequence_length, discounted_reward_op.dtype)
advantage_op = discounted_reward_op - baseline_op

### Model

In [0]:
def cart_pole_model_fn(state):
  hidden = tf.layers.dense(state, 16, activation=tf.nn.relu, use_bias=False)
  logits = tf.layers.dense(state, 2, use_bias=False)
  return logits

def cart_pole_policy_fn(action_values, exploration, deterministic):
  shape = tf.shape(action_values)
  deterministic_action = lambda: tf.argmax(
      action_values, axis=-1, output_type=stream.action_dtype)
  return tf.cond(
      deterministic, 
      deterministic_action, 
      lambda: tf.cond(
          exploration < tf.random_uniform([]), 
          deterministic_action,
          lambda: tf.argmax(tf.random_uniform(shape), axis=-1, output_type=stream.action_dtype)))

Define the necessary placeholders

In [0]:
state_ph = tf.placeholder(
    stream.state_dtype, 
    [None] + list(stream.state_shape),
    name='state') # previous states
action_ph = tf.placeholder(
    stream.action_dtype, 
    [None] + list(stream.action_shape), 
    name='action') # previous actions
advantage_ph = tf.placeholder(
    stream.reward_dtype,
    [], name='advantage')
sequence_length_ph = tf.placeholder(
    tf.int32, [], name='sequence_length')
deterministic_ph = tf.placeholder(
    tf.bool, [],
    name='deterministic') # deterministic|epsilon actions

### Epsilon-Greedy Policy

In [0]:
global_step = tf.train.get_or_create_global_step()

exploration_op = tf.train.exponential_decay(
    hparams.initial_exploration, 
    global_step, 
    hparams.exploration_decay_steps, 
    hparams.exploration_decay_rate)

action_values_op = cart_pole_model_fn(state_ph)
action_op = cart_pole_policy_fn(action_values_op, exploration_op, deterministic_ph)

### Loss

In [0]:
loss_op = tf.losses.sparse_softmax_cross_entropy(
    action_ph, action_values_op, reduction=tf.losses.Reduction.NONE)
loss_op = tf.reduce_sum(
    tf.reduce_sum(loss_op, axis=-1) * advantage_ph) / tf.cast(
        sequence_length_ph, loss_op.dtype)

optimizer = tf.train.AdamOptimizer(
    learning_rate=hparams.learning_rate)

train_op = optimizer.minimize(loss_op)

## Training and Evaluating

In [0]:
def run_episode(env, sess):
  experiences = []
  next_state = env.reset()
  while True:
    state = next_state
    action_values, action = sess.run(
        (action_values_op, action_op), 
        feed_dict={
            state_ph: [state], 
            deterministic_ph: False
        })

    next_state, reward, terminal, _ = env.step(np.squeeze(action, 0))
    experiences.append(ay.rl.Experience(
        state, next_state, action, action_values, reward, terminal))
    if terminal:
      break
  return experiences

In [10]:
with tf.Session() as sess:
  sess.run(tf.global_variables_initializer())

  for episode in range(hparams.num_episodes):
    experiences = run_episode(env, sess)
    stream.write(
        ay.rl.Replay(
            *zip(*experiences), sequence_length=len(experiences)))

    
    replay, advantage = sess.run((replay_op, advantage_op))
    for t in range(replay.sequence_length[0]):
      _, loss = sess.run(
          (train_op, loss_op), 
          feed_dict={
              state_ph: [replay.state[t]],
              action_ph: [replay.action[t]], 
              advantage_ph: advantage[t],
              deterministic_ph: True,
              sequence_length_ph: replay.sequence_length[0],
          })

    if (episode + 1) % hparams.report_episode == 0:
      experiences = run_episode(env, sess)
      replay = ay.rl.Replay(
            *zip(*experiences), sequence_length=len(experiences))
      print('reward = {}, episode = {}'.format(sum(replay.reward), episode + 1))


reward = 99.0, episode = 50
reward = 40.0, episode = 100
reward = 43.0, episode = 150
reward = 124.0, episode = 200
reward = 158.0, episode = 250
reward = 20.0, episode = 300
reward = 112.0, episode = 350


KeyboardInterrupt: ignored