In [26]:
# Automatically reload changes to external code: in mode 2, IPython reloads all
# imported modules (except those excluded with %aimport) before each cell is
# executed, so edits to external .py files take effect without a kernel restart.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In this assignment, you will solve a classic control problem, CartPole, using policy gradient methods.

First, you will implement the "vanilla" policy gradient method, i.e., a method that repeatedly computes **unbiased** estimates $\hat{g}$ of $\nabla_{\theta} E[\sum_t R_t]$ and takes gradient ascent steps $\theta \rightarrow \theta + \epsilon \hat{g}$ so as to increase the total reward collected in each episode. To make sure our code can solve multiple MDPs with different policy parameterizations, the provided code follows an object-oriented design and represents the MDP and the policy as classes.

The following code constructs an instance of the MDP, and then prints its documentation.

In [33]:
import gym
import tensorflow as tf
import numpy as np
from policy_gradient import util
from policy_gradient.policy import CategoricalPolicy
from policy_gradient.baselines.linear_feature_baseline import LinearFeatureBaseline

# Seed every source of randomness we control so that a Restart & Run All
# reproduces the same training run.
SEED = 0
np.random.seed(SEED)
tf.set_random_seed(SEED)

# CartPole-v0 is an MDP with a continuous observation space and a finite action space.
# In this environment, a pendulum is attached by an un-actuated joint to a cart,
# and the goal is to prevent it from falling over. You can apply a force of +1 or -1 to the cart.
# A reward of +1 is provided for every timestep that the pendulum remains upright.
# To visualize CartPole-v0, please see https://gym.openai.com/envs/CartPole-v0

env = gym.make('CartPole-v0')
# Gym environments keep their own random number generator, which the numpy
# and TensorFlow seeds above do not touch; seed it explicitly as well.
env.seed(SEED)

INFO:gym.envs.registration:Making new env: CartPole-v0
[2016-10-02 21:40:03,941] Making new env: CartPole-v0


## Problem 1: construct a neural network to represent policy 

In [34]:
# Session that is handed to the policy below and used to initialise variables.
sess = tf.Session()

# Construct a neural network to represent the policy, which maps an observed state to an action.
# presumably flatten_space returns the flattened size of a gym space
# (observation dimension / number of actions) -- TODO confirm in policy_gradient.util
in_dim = util.flatten_space(env.observation_space)
out_dim = util.flatten_space(env.action_space)
hidden_dim = 8

# Adam with a fixed learning rate; the optimizer is passed to the policy
# rather than used directly in this notebook.
opt = tf.train.AdamOptimizer(learning_rate=0.01)
policy = CategoricalPolicy(in_dim, out_dim, hidden_dim, opt, sess)

# Run variable initialisation only after the policy has been constructed
# (assumes CategoricalPolicy creates its variables in its constructor -- confirm).
# NOTE(review): tf.initialize_all_variables is deprecated from TensorFlow 0.12 on
# in favour of tf.global_variables_initializer; keep as-is for the TF version in use.
sess.run(tf.initialize_all_variables())

## Problem 2: implement policy gradient computation


In [35]:
class PolicyOptimizer(object):
    """Training loop for a policy-gradient agent.

    Every iteration samples ``n_episode`` trajectories from ``env`` with the
    current ``policy``, converts their rewards into per-step advantages
    (optionally subtracting a baseline prediction) and passes the batch to
    ``policy.train``.

    Args:
        env: environment exposing ``reset()`` and ``step(action)``; ``step``
            must return a 4-tuple ``(observation, reward, done, info)``.
        policy: object exposing ``act(observation)`` and
            ``train(observations, actions, advantages)``.
        baseline: object exposing ``predict(path)`` and ``fit(paths)``, or
            ``None`` to train without a baseline.
        n_iter: maximum number of training iterations.
        n_episode: number of trajectories sampled per iteration.
        path_length: maximum number of steps in one trajectory.
        discount_rate: discount factor handed to ``util.discount_cumsum``.
    """

    def __init__(self, env, policy, baseline, n_iter, n_episode, path_length,
        discount_rate=.99):

        self.policy = policy
        self.baseline = baseline
        self.env = env
        self.n_iter = n_iter
        self.n_episode = n_episode
        self.path_length = path_length
        self.discount_rate = discount_rate

    def sample_path(self):
        """Roll out one trajectory with the current policy.

        The rollout stops when the environment reports ``done`` or after
        ``path_length`` steps, whichever comes first.

        Returns:
            dict with numpy arrays under the keys ``observations``,
            ``actions`` and ``rewards``, one entry per executed step.
        """
        obs = []
        actions = []
        rewards = []
        ob = self.env.reset()

        for _ in range(self.path_length):
            # The policy is fed a batch of one observation (shape (1, -1)).
            a = self.policy.act(ob.reshape(1, -1))
            next_ob, r, done, _ = self.env.step(a)
            # Record the observation the action was chosen from, not the
            # one that resulted from it.
            obs.append(ob)
            actions.append(a)
            rewards.append(r)
            ob = next_ob
            if done:
                break

        return dict(
            observations=np.array(obs),
            actions=np.array(actions),
            rewards=np.array(rewards),
        )

    def process_paths(self, paths):
        """Annotate trajectories with returns/advantages and flatten them.

        Each path dict in ``paths`` is modified in place: the keys
        ``returns``, ``baselines`` and ``advantages`` are added. Advantages
        are normalised per path.

        Args:
            paths: list of dicts as produced by ``sample_path``.

        Returns:
            dict with the ``observations``, ``actions``, ``rewards`` and
            ``advantages`` of all paths concatenated along the first axis.
        """
        for p in paths:
            if self.baseline is not None:
                b = self.baseline.predict(p)
            else:
                b = 0

            r = util.discount_cumsum(p["rewards"], self.discount_rate)
            a = r - b

            p["returns"] = r
            p["baselines"] = b
            # The small epsilon keeps the division finite when all advantages
            # in a path are equal (e.g. a one-step path).
            p["advantages"] = (a - a.mean()) / (a.std() + 1e-8) # normalize
            # TODO: Use the following line to compute advantage and compare
            # how we can reduce variance by adding baseline and why it helps

            #p["advantages"] = a

        #print("Rewards variance: {}".format(np.var(p["returns"])))
        #print("Advantages variance: {}".format(np.var(p["advantages"])))

        obs = np.concatenate([ p["observations"] for p in paths ])
        actions = np.concatenate([ p["actions"] for p in paths ])
        rewards = np.concatenate([ p["rewards"] for p in paths ])
        advantages = np.concatenate([ p["advantages"] for p in paths ])

        return dict(
            observations=obs,
            actions=actions,
            rewards=rewards,
            advantages=advantages,
        )

    def train(self):
        """Run up to ``n_iter`` policy updates, stopping early once solved.

        Prints the average undiscounted return of every iteration. Training
        stops as soon as that average reaches 195; otherwise the baseline
        (if any) is refitted on the freshly sampled paths before the next
        iteration.
        """
        for i in range(1, self.n_iter + 1):
            paths = []
            for _ in range(self.n_episode):
                paths.append(self.sample_path())
            data = self.process_paths(paths)
            self.policy.train(data["observations"], data["actions"], data["advantages"])
            avg_return = np.mean([sum(p["rewards"]) for p in paths])
            print("Iteration {}: Average Return = {}".format(i, avg_return))

            # CartPole-v0 defines "solving" as getting average reward of 195.0 over 100 consecutive trials.
            if avg_return >= 195:
                # Report the number of episodes actually sampled rather than
                # assuming 100 episodes per iteration.
                print("Solve at {} iterations, which equals {} episodes.".format(i, i * self.n_episode))
                break

            if self.baseline is not None:
                self.baseline.fit(paths)

In [36]:
# Hyper-parameters of the training run.
n_iter = 200           # upper bound on the number of policy updates
n_episode = 100        # trajectories sampled for every update
path_length = 200      # step cap for a single trajectory
discount_rate = 0.99   # discount factor applied to future rewards

# Baseline whose predictions are subtracted from the returns.
baseline = LinearFeatureBaseline(env.spec)

po = PolicyOptimizer(
    env=env,
    policy=policy,
    baseline=baseline,
    n_iter=n_iter,
    n_episode=n_episode,
    path_length=path_length,
    discount_rate=discount_rate,
)

# Run the optimisation loop; progress is printed once per iteration.
po.train()

Iteration 1: Average Return = 29.2
Iteration 2: Average Return = 33.91
Iteration 3: Average Return = 30.96
Iteration 4: Average Return = 36.51
Iteration 5: Average Return = 36.93
Iteration 6: Average Return = 39.19
Iteration 7: Average Return = 35.45
Iteration 8: Average Return = 37.69
Iteration 9: Average Return = 40.46
Iteration 10: Average Return = 43.58
Iteration 11: Average Return = 41.57
Iteration 12: Average Return = 48.54
Iteration 13: Average Return = 44.71
Iteration 14: Average Return = 44.71
Iteration 15: Average Return = 43.5
Iteration 16: Average Return = 46.52
Iteration 17: Average Return = 48.36
Iteration 18: Average Return = 52.09
Iteration 19: Average Return = 53.75
Iteration 20: Average Return = 51.94
Iteration 21: Average Return = 50.26
Iteration 22: Average Return = 52.48
Iteration 23: Average Return = 51.79
Iteration 24: Average Return = 55.16
Iteration 25: Average Return = 52.25
Iteration 26: Average Return = 56.65
Iteration 27: Average Return = 56.35
Iteration 28

In [None]:
## Problem 3: implement baseline
