In [1]:
import os
import random
import time

import numpy as np
import tensorflow as tf

import tensorflow.contrib.layers
import gym

In [2]:
# Import the local helper module and reload it so that edits to it are picked
# up without restarting the kernel.
# NOTE(review): the bare `reload` builtin exists only on Python 2.
import tools
reload(tools)

<module 'tools' from 'tools.pyc'>

In [3]:
class ToTheRight(gym.Env):
    """Toy one-dimensional environment that rewards walking to the right.

    The position starts at 0 and the episode is reported as done once the
    position reaches ``num``.

    Actions:
        2            -- move one step right, reward +1
        1            -- stay in place, reward -0.1
        anything else -- move one step left (never below 0), reward -1
    """

    def __init__(self, num):
        """Create the environment; ``num`` is the position that ends an episode."""
        self.num = num
        self.state = 0.
        self.action_space = gym.spaces.Discrete(3)
        self.observation_space = gym.spaces.Box(0., float(num), shape=(1,))
        self.reward_range = (-1., 1.)

    def _step(self, a):
        """Apply action ``a`` and return ``(observation, reward, done, info)``."""
        if a == 2:
            r = 1.
            self.state += 1
        elif a == 1:
            r = -0.1
        else:
            r = -1.
            self.state = max(0., self.state - 1)
        # The info element must be a dict: Gym wrappers index into it, and the
        # previous `None` made such lookups fail.
        return np.array([self.state]), r, self.state >= self.num, {}

    def _reset(self):
        """Move back to position 0 and return the initial observation."""
        self.state = 0.
        return np.array([self.state])

In [134]:
# NOTE(review): a later cell rebinds `env` to ToTheRight(10); whichever of the
# two cells ran last decides which environment the graph is built for.
env = gym.make('LunarLander-v2')

[2017-03-15 08:06:33,103] Making new env: LunarLander-v2


In [6]:
# NOTE(review): this rebinds `env`, replacing the LunarLander-v2 environment
# created in the previous cell when the notebook is run top to bottom.
env = ToTheRight(10)

In [5]:
def GenerateExperience(env, policy, rollout_len, gamma, step_callback, stats_callback):
    """Drive ``env`` with ``policy`` and hand n-step rollouts to a callback.

    Each rollout collects up to ``rollout_len`` transitions (fewer if the
    episode ends).  For every transition in the rollout the callback receives:
    the start state, the action, the discounted reward accumulated until the
    end of the rollout, the state reached at the end of the rollout, and the
    accumulated discount (0 if the episode ended inside the rollout).

    Args:
        env: object exposing ``reset()`` and ``step(action)``.
        policy: callable mapping a state to an action.
        rollout_len: maximum number of transitions per rollout.
        gamma: per-step discount factor.
        step_callback: called once per rollout as
            ``step_callback(states, actions, returns, last_states, discounts, 100)``;
            a falsy return value stops the generator.
        stats_callback: called as ``stats_callback(episode_reward, episode_length)``
            whenever an episode finishes.
    """
    total_reward, total_steps = 0., 0.
    current = env.reset()
    keep_going = True
    while keep_going:
        states, actions, rewards, discounts = [], [], [], []
        terminal = False
        while len(states) < rollout_len and not terminal:
            act = policy(current)
            nxt, rwd, terminal, _ = env.step(act)

            states.append(current)
            actions.append(act)
            rewards.append(rwd)
            # A terminal transition must not bootstrap from its successor.
            discounts.append(0. if terminal else gamma)

            total_reward += rwd
            total_steps += 1
            current = nxt

        # Walk the rollout backwards to accumulate returns and discounts.
        count = len(states)
        returns = [0.] * count
        cum_discounts = [0.] * count
        acc_return, acc_discount = 0., 1.
        for idx in range(count - 1, -1, -1):
            acc_return = rewards[idx] + discounts[idx] * acc_return
            acc_discount *= discounts[idx]
            returns[idx] = acc_return
            cum_discounts[idx] = acc_discount
        # Every transition bootstraps from the state that ended the rollout
        # (captured before a possible reset below).
        last_states = [current] * count

        if terminal:
            current = env.reset()
            stats_callback(total_reward, total_steps)
            total_reward, total_steps = 0., 0.

        keep_going = step_callback(np.array(states), np.array(actions),
                                   np.array(returns), np.array(last_states),
                                   np.array(cum_discounts), 100)

In [6]:
def CartPoleQNetwork(state, num_actions=None):
    """Build a two-headed network: action logits and a state value.

    Two 32-unit ELU hidden layers (L2-regularized weights) feed two linear
    heads that share the second hidden layer.  Variable scopes are
    'hidden1', 'hidden2', 'value' and 'logits', so the function can be called
    again under ``reuse=True`` to share weights.

    Args:
        state: float tensor of observations (assumed batch-major -- TODO confirm).
        num_actions: width of the logits head.  When omitted it falls back to
            ``env.action_space.n`` of the notebook-global ``env``, which is
            what the function always used before this parameter existed.

    Returns:
        Tuple ``(logits, value)``; ``value`` keeps a trailing dimension of 1.
    """
    if num_actions is None:
        # Backward-compatible default: read the action count from the global env.
        num_actions = env.action_space.n

    hidden1 = tf.contrib.layers.fully_connected(
        state, 32,
        activation_fn=tf.nn.elu,
        weights_initializer=tf.truncated_normal_initializer(stddev=0.1),
        weights_regularizer=tf.contrib.layers.l2_regularizer(0.001),
        biases_initializer=tf.zeros_initializer(),
        scope='hidden1')
    hidden2 = tf.contrib.layers.fully_connected(
        hidden1, 32,
        activation_fn=tf.nn.elu,
        weights_initializer=tf.truncated_normal_initializer(stddev=0.1),
        weights_regularizer=tf.contrib.layers.l2_regularizer(0.001),
        biases_initializer=tf.zeros_initializer(),
        scope='hidden2')

    # Unlike the hidden layers, the two output heads carry no weight regularizer.
    value = tf.contrib.layers.linear(
        hidden2, 1,
        weights_initializer=tf.truncated_normal_initializer(stddev=0.01),
        biases_initializer=tf.zeros_initializer(),
        scope='value')
    logits = tf.contrib.layers.linear(
        hidden2, num_actions,
        weights_initializer=tf.truncated_normal_initializer(stddev=0.01),
        biases_initializer=tf.zeros_initializer(),
        scope='logits')

    return logits, value

In [17]:
# Maximum number of environment steps per rollout handed to GenerateExperience.
ROLLOUT_LEN = 1
# Per-step discount factor handed to GenerateExperience.
GAMMA = 0.95
# Scale applied to the log-probability term in policy_error / value_error
# (presumably an entropy temperature -- TODO confirm).
TAU = 0.1

In [135]:
# Start from an empty default graph so the cells below can be re-run without
# colliding with variables created by a previous run.
tf.reset_default_graph()

In [136]:
# Graph inputs for a batch of transitions; the leading None dimension is the
# batch size and the observation shape is taken from the global `env`.
# `reward` and `gamma` are fed the accumulated return and accumulated discount
# that GenerateExperience computes per transition (see Step).
state = tf.placeholder(tf.float32, shape=[None] + list(env.observation_space.shape), name='state')
action = tf.placeholder(tf.int32, shape=[None], name='action')
reward = tf.placeholder(tf.float32, shape=[None], name='reward')
state1 = tf.placeholder(tf.float32, shape=[None] + list(env.observation_space.shape), name='state1')
gamma = tf.placeholder(tf.float32, shape=[None], name='gamma')

In [137]:
# Build the network once for the current state and once more, sharing the same
# weights (reuse=True), for the bootstrap state.
with tf.variable_scope('model', reuse=False):
    logits, value = CartPoleQNetwork(state)
with tf.variable_scope('model', reuse=True):
    _, value1 = CartPoleQNetwork(state1)

# Drop only the trailing unit dimension of the value head.  A bare tf.squeeze
# would also remove the batch dimension whenever a batch holds a single row,
# and it leaves the static shape unknown.
value = tf.squeeze(value, axis=[1])
value1 = tf.squeeze(value1, axis=[1])

In [138]:
# Sample one action from the logits.  The reshape to a scalar means this op
# can only be evaluated when exactly one state is fed.
policy = tf.reshape(tf.multinomial(logits, 1), [])
# Action probabilities for every row of the batch.
probs = tf.nn.softmax(logits)

In [139]:
# Bootstrapped error: reward + gamma * V(state1) - V(state).  No gradient flows
# through the bootstrap value V(state1).
advantage = reward - (value - gamma * tf.stop_gradient(value1))
# presumably the log-probability of the action actually taken; tools.Select is
# defined outside this notebook -- TODO confirm.
G = tools.Select(tf.nn.log_softmax(logits), action)

In [140]:
# Per-sample policy residual: the advantage (treated as a constant) scaled by
# 1/TAU, minus G.  Only G receives gradient from this term.
policy_error = tf.stop_gradient(advantage) / TAU - G

In [141]:
# Per-sample value residual: the advantage minus TAU * G, with G treated as a
# constant so that this term only trains the value estimate.
value_error = advantage - TAU * tf.stop_gradient(G)

In [142]:
# Entropy of the action distribution, one value per batch row (monitoring only;
# it is not part of `loss`).
entropy = -tf.reduce_sum(probs * tf.nn.log_softmax(logits), axis=1)

In [143]:
# Batch mean of the value residual (weighted 0.5) plus the policy residual,
# each passed through tools.HuberLoss with argument 10 (helper defined outside
# this notebook; presumably the Huber threshold -- TODO confirm).
# NOTE(review): the L2 weight regularizers declared in CartPoleQNetwork are
# not added here, so they appear to have no effect on training -- confirm
# whether that is intended.
loss = tf.reduce_mean(0.5 * tools.HuberLoss(value_error, 10) + tools.HuberLoss(policy_error, 10))

In [144]:
# Summaries for monitoring training.
# NOTE(review): some tags do not obviously match the tensor they log:
# 'Scalars/Value_Loss' logs mean(advantage), 'Scalars/Policy_Loss' logs mean(G)
# and 'Scalars/TD_Error' logs mean(value_error).  Confirm the labelling.
tf.summary.histogram('Monitor/TD_Error', advantage)
tf.summary.histogram('Monitor/Q', value)
tf.summary.histogram('Monitor/Logits', logits)
tf.summary.histogram('Monitor/Probs', probs)
tf.summary.scalar("Scalars/Q", tf.reduce_mean(value))
tf.summary.scalar("Scalars/Total_Loss", loss)
tf.summary.scalar("Scalars/Value_Loss", tf.reduce_mean(advantage))
tf.summary.scalar("Scalars/Policy_Loss", tf.reduce_mean(G))

tf.summary.scalar("Scalars/TD_Error", tf.reduce_mean(value_error))
tf.summary.scalar("Scalars/Entropy", tf.reduce_mean(entropy))
# Batch mean of the largest / smallest action probability per row.
tf.summary.scalar("Scalars/Prob/Max", 
                  tf.reduce_mean(tf.reduce_max(probs, axis=1)))
tf.summary.scalar("Scalars/Prob/Min", 
                  tf.reduce_mean(tf.reduce_min(probs, axis=1)))

# NOTE(review): summarize_activations() is called again in a later cell; the
# recorded output `[]` suggests it creates no summaries here.
tf.contrib.layers.summarize_activations()

[]

In [145]:
# Adam with learning rate 1e-4 over the trainable variables created under the
# 'model' scope.
optimizer = tf.train.AdamOptimizer(0.0001)
variables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, 'model')
grads = optimizer.compute_gradients(loss, variables)
# tools.ClipGradient is defined outside this notebook; presumably 100. is the
# clipping threshold -- TODO confirm.
grads = tools.ClipGradient(grads, 100.)

In [146]:
# Non-trainable counter of optimizer steps; it is passed to apply_gradients
# and used as the step index for written summaries.
global_step = tf.Variable(0, name='global_step', trainable=False)

In [147]:
# Single op that applies the (clipped) gradients and advances global_step.
train_op = tf.group(optimizer.apply_gradients(grads, global_step))

In [148]:
# Add summaries for the global-variables collection, then merge every summary
# registered so far into one op.
tf.contrib.layers.summarize_collection(tf.GraphKeys.GLOBAL_VARIABLES)
# NOTE(review): summarize_activations() was already called in the summaries cell.
tf.contrib.layers.summarize_activations()
summary_op = tf.summary.merge_all()

In [149]:
# Close the session left over from a previous run of this cell, if there is
# one.  On a fresh kernel `sess` does not exist yet; the unguarded
# sess.close() raised NameError there and broke Restart & Run All.
try:
    sess.close()
except NameError:
    pass
sess = tf.InteractiveSession()

In [150]:
# Initialize every global variable (network weights, optimizer state,
# global_step) in the current session.
sess.run(tf.global_variables_initializer())

In [151]:
# Event-file writer for summaries.
# NOTE(review): the log directory is hard-coded to a LunarLander path
# regardless of which environment `env` currently refers to.
writer = tf.summary.FileWriter('log2/LunarLander/pcl/')
writer.add_graph(tf.get_default_graph())

In [152]:
# Transitions buffered between training steps; filled and emptied by Step.
batch = {'s': [], 'a': [], 'r': [], 'g': [], 's1': []}

def Step(ss, aa, rr, ss1, gg, _):
    """Buffer one rollout and run a training step once 32 transitions are stored.

    Args:
        ss: array of start states, one row per transition of the rollout.
        aa: array of actions taken.
        rr: array of accumulated discounted rewards.
        ss1: array of bootstrap states, one row per transition.
        gg: array of accumulated discounts.
        _: extra value passed by GenerateExperience; ignored.

    Returns:
        Always True, so the experience generator never stops on its own.
    """
    # Keep every transition of the rollout.  Storing only aa[0]/rr[0]/gg[0]
    # while stacking all rows of ss/ss1 produced mismatched feed shapes for
    # rollouts longer than one step; for one-step rollouts this is identical.
    batch['s'].append(ss)
    batch['a'].extend(aa)
    batch['r'].extend(rr)
    batch['g'].extend(gg)
    batch['s1'].append(ss1)

    if len(batch['a']) >= 32:
        step = sess.run(global_step)
        feed = {state: np.vstack(batch['s']),
                action: batch['a'],
                reward: batch['r'],
                state1: np.vstack(batch['s1']),
                gamma: batch['g']}

        if step > 1 and step % 500 == 0:
            # Every 500th step also evaluates and writes the merged summaries.
            sumr, _ = sess.run([summary_op, train_op], feed)
            writer.add_summary(sumr, step)
        else:
            sess.run(train_op, feed)

        # Start a fresh buffer for the next training batch.
        for key in list(batch):
            batch[key] = []

    return True

In [153]:
def Stats(episode_rew, episode_len):
    """Write the reward and length of a finished episode to the summary writer.

    The summary is indexed by the current value of global_step.
    """
    reward_value = tf.Summary.Value(tag='Env/Reward', simple_value=episode_rew)
    length_value = tf.Summary.Value(tag='Env/Length', simple_value=episode_len)
    episode_summary = tf.Summary(value=[reward_value, length_value])
    current_step = sess.run(global_step)
    writer.add_summary(episode_summary, current_step)

In [154]:
def Policy(obs):
    """Sample an action for a single observation using the current network."""
    # The graph expects a batch, so add a leading batch dimension of one.
    batched_obs = np.expand_dims(obs, 0)
    feed = {state: batched_obs}
    return sess.run(policy, feed)

In [155]:
# Run training.  Step always returns True, so this loops until the kernel is
# interrupted.
GenerateExperience(env, Policy, ROLLOUT_LEN, GAMMA, Step, Stats)

KeyboardInterrupt: 

In [1325]:
# NOTE(review): `buf` is not defined anywhere in the visible notebook, so this
# cell relies on leftover kernel state and fails on a fresh kernel.
buf.inserted, buf.buffer_size

(26719, 262144)