In [2]:
%matplotlib inline

import gym
import itertools
import matplotlib
import numpy as np
import sys
import tensorflow as tf
import collections


from cliff_walking import CliffWalkingEnv

matplotlib.style.use('ggplot')

In [3]:
env = CliffWalkingEnv()

In [12]:
class PolicyEstimator():
    def __init__(self, learning_rate=0.01, scope="policy_estimator"):
        with tf.variable_scope(scope):
            self.state = tf.placeholder(tf.int32, [], "state")
            self.action = tf.placeholder(dtype=tf.int32, name='action')
            self.target = tf.placeholder(dtype=tf.float32, name="target")
            
            state_one_hot = tf.one_hot(self.state, int(env.observation_space.n))
            self.output_layer = tf.contrib.layers.fully_connected(
                inputs = tf.expand_dims(state_one_hot, 0),
                num_outputs = env.action_space.n,
                activation_fn = None,
                weights_initializer = tf.zeros_initializer
            )
            
            self.action_probs = tf.squeeze(tf.nn.softmax(self.output_layer))
            self.picked_action_prob = tf.gather(self.action_probs, self.action)
            
            self.loss = -tf.log(self.picked_action_prob) * self.target
            
            self.optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
            self.train_op = self.optimizer.minimize(
                self.loss, global_step = tf.contrib.framework.get_global_step())
    def predict(self, state, sess=None):
        sess = sess or tf.get_default_session()
        return sess.run(self.action_probs, {self.state: state})
    def update(self, state, target, action, sess=None):
        sess = sess or tf.get_default_session()
        feed_dict = {self.state: state, self.target: target, self.action: action}
        _, loss = sess.run([self.train_op, self.loss], feed_dict)
        return loss

In [18]:
class ValueEstimator():
    def __init__(self, learning_rate=0.1, scope='value_estimator'):
        with tf.variable_scope(scope):
            self.state = tf.placeholder(tf.int32, [], "state")
            self.target = tf.placeholder(dtype=tf.float32, name='target')
            
            # This is just a table look up estimator
            
            state_one_hot = tf.one_hot(self.state, int(env.observation_space.n))
            self.output_layer = tf.contrib.layers.fully_connected(
                inputs = tf.expand_dims(state_one_hot, 0),
                num_outputs = 1, 
                activation_fn = None,
                weights_initializer = tf.zeros_initializer
            )
            
            self.value_estimate = tf.squeeze(self.output_layer)
            self.loss = tf.squared_difference(self.value_estimate, self.target)
            
            self.optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
            self.train_op = self.optimizer.minimize(
                self.loss, global_step = tf.contrib.framework.get_global_step())
            
    def predict(self, state, sess=None):
        sess = sess or tf.get_default_session()
        return sess.run(self.value_estimate, {self.state: state})
    
    def update(self, state, target, sess = None):
        sess = sess or tf.get_default_session()
        feed_dict = {self.state: state, self.target: target}
        _, loss = sess.run([self.train_op, self.loss], feed_dict)
        return loss

In [19]:
import plotting

In [20]:
def reinforce(env, estimator_policy, estimator_value, num_episodes, discount_factor=1.0):
    Transistion = collections.namedtuple("Transition", ["state", 'action', 'reward', 'next_state', 'done'])
    stats = plotting.EpisodeStats(
        episode_lengths=np.zeros(num_episodes),
        episode_rewards = np.zeros(num_episodes)
    )
    for i_episode in range(num_episodes):
        state = env.reset()
        episode = []
        
        for t in itertools.count():
            action_probs = estimator_policy.predict(state)
            action = np.random.choice(np.arange(len(action_probs)), p=action_probs)
            next_state, reward, done, _ = env.step(action)
            
            episode.append(Transistion(state=state, action=action, reward=reward,next_state=next_state, done=done))
            
            stats.episode_rewards[i_episode] += reward
            stats.episode_lengths[i_episode] = t
            
            print("\rStep {} @ Episode {}/{} ({})".format(
                    t, i_episode + 1, num_episodes, stats.episode_rewards[i_episode - 1]))
            if done:
                break
            
            state = next_state
            
        for t, transition in enumerate(episode):
            total_return = sum(discount_factor**i * t.reward for i, t in enumerate(episode[t:]))
            estimator_value.update(transition.state, total_return)
            baseline_value = estimator_value.predict(transition.state)
            advantage = total_return - baseline_value
            
            estimator_policy.update(transition.state, advantage, transition.action)
            
    

In [21]:
tf.reset_default_graph()

global_step = tf.Variable(0, name='global_step', trainable=False)
policy_estimator = PolicyEstimator()
value_estimator = ValueEstimator()

with tf.Session() as sess:
    sess.run(tf.initialize_all_variables())
    stats = reinforce(env, policy_estimator, value_estimator, 2000, discount_factor=1.0)

Instructions for updating:
Use `tf.global_variables_initializer` instead.


[2017-02-12 08:28:57,463] From <ipython-input-21-e8f4ccc15ed2>:8 in <module>.: initialize_all_variables (from tensorflow.python.ops.variables) is deprecated and will be removed after 2017-03-02.
Instructions for updating:
Use `tf.global_variables_initializer` instead.


Step 0 @ Episode 1/2000 (0.0)
Step 0 @ Episode 2/2000 (-100.0)
Step 0 @ Episode 3/2000 (-100.0)
Step 1 @ Episode 3/2000 (-100.0)
Step 2 @ Episode 3/2000 (-100.0)
Step 3 @ Episode 3/2000 (-100.0)
Step 4 @ Episode 3/2000 (-100.0)
Step 5 @ Episode 3/2000 (-100.0)
Step 6 @ Episode 3/2000 (-100.0)
Step 7 @ Episode 3/2000 (-100.0)
Step 8 @ Episode 3/2000 (-100.0)
Step 9 @ Episode 3/2000 (-100.0)
Step 10 @ Episode 3/2000 (-100.0)
Step 11 @ Episode 3/2000 (-100.0)
Step 12 @ Episode 3/2000 (-100.0)
Step 13 @ Episode 3/2000 (-100.0)
Step 14 @ Episode 3/2000 (-100.0)
Step 0 @ Episode 4/2000 (-114.0)
Step 1 @ Episode 4/2000 (-114.0)
Step 2 @ Episode 4/2000 (-114.0)
Step 3 @ Episode 4/2000 (-114.0)
Step 4 @ Episode 4/2000 (-114.0)
Step 5 @ Episode 4/2000 (-114.0)
Step 6 @ Episode 4/2000 (-114.0)
Step 0 @ Episode 5/2000 (-106.0)
Step 1 @ Episode 5/2000 (-106.0)
Step 2 @ Episode 5/2000 (-106.0)
Step 3 @ Episode 5/2000 (-106.0)
Step 4 @ Episode 5/2000 (-106.0)
Step 5 @ Episode 5/2000 (-106.0)
Step 0 @

In [22]:
plotting.plot_episode_stats(stats, smoothing_window=25)

AttributeError: 'NoneType' object has no attribute 'episode_lengths'

<matplotlib.figure.Figure at 0x10bf00fd0>