# Vanilla Policy Gradient Agent (Cartpole Task)

In [1]:
import gym
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
%matplotlib inline

  from ._conv import register_converters as _register_converters


In [2]:
# --- Training hyperparameters -------------------------------------------------
gamma = 0.99          # reward discount factor
h_size = 8            # size of the hidden layer in the network
update_frequency = 5  # Number of episodes to complete before updating network parameters
total_episodes = 5000 # Number of episodes to train for
max_ep = 999          # Upper bound on the number of actions taken in one episode

# --- Environment --------------------------------------------------------------
# CartPole gives a reward of +1 for every step taken, including the step on
# which the pole falls, so an episode's total reward equals its length. The
# episode is over when the environment reports `done`, or after `max_ep`
# actions, whichever comes first.
# NOTE(review): CartPole-v0 is normally registered with a 200-step time limit,
# which would make `max_ep = 999` a non-binding safety cap -- confirm against
# the installed gym version.
env = gym.make('CartPole-v0')

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m


### The policy-based agent

In [3]:
def discount_rewards(r, discount=None):
    """Compute the discounted return-to-go for every step of one episode.

    For rewards ``r[0], ..., r[T-1]`` the result satisfies
    ``out[t] = r[t] + discount * out[t + 1]`` with ``out[T] = 0``, i.e.
    ``out[t] = sum_k discount**k * r[t + k]``.

    Args:
        r: 1-D sequence of per-step rewards (list, tuple or ndarray).
            Integer and object-dtype input is converted to float64 so the
            discounted values are not truncated or stored as Python objects;
            floating-point arrays keep their dtype.
        discount: Discount factor. When omitted, the notebook-level ``gamma``
            hyperparameter is used.

    Returns:
        A new 1-D floating-point NumPy array with the same length as ``r``.
        The input is never modified.
    """
    if discount is None:
        discount = gamma  # notebook-level hyperparameter
    rewards = np.asarray(r)
    if not np.issubdtype(rewards.dtype, np.floating):
        # zeros_like() below inherits this dtype; an int dtype would silently
        # truncate every discounted value.
        rewards = rewards.astype(np.float64)
    discounted_r = np.zeros_like(rewards)
    running_add = 0.0
    # Walk backwards so each step reuses the already-discounted tail
    for t in reversed(range(rewards.size)):
        running_add = running_add * discount + rewards[t]
        discounted_r[t] = running_add
    return discounted_r

In [4]:
class agent():
    """Softmax policy network with one hidden layer, plus its training ops.

    Everything is added to the current default TensorFlow graph.

    Args:
        lr: learning rate handed to the Adam optimizer.
        s_size: length of one state/observation vector (4 for CartPole; can
            be checked with `print(env.observation_space)`).
        a_size: number of discrete actions.
        h_size: number of units in the hidden layer.

    Attributes:
        state_in: float32 placeholder for a batch of states, [None, s_size].
        output: action probabilities for each state, [None, a_size].
        chosen_action: index of the most probable action for each state.
        reward_holder: float32 placeholder, one (discounted) reward per step.
        action_holder: int32 placeholder, the action taken at each step.
        loss: -mean(log(probability of the taken action) * reward).
        gradients: gradient of `loss` wrt every trainable variable.
        gradient_holders: one placeholder per trainable variable through
            which (accumulated) gradients are fed back in.
        update_batch: op applying the fed gradients with Adam.
    """
    def __init__(self, lr, s_size, a_size, h_size):
        # ---- Forward pass: state -> hidden (ReLU) -> action probabilities ----
        self.state_in = tf.placeholder(shape = [None, s_size], dtype = tf.float32)  # [None, s_size]
        hidden_layer = tf.contrib.layers.fully_connected(                           # [None, h_size]
            self.state_in,
            h_size,
            biases_initializer = None,
            activation_fn = tf.nn.relu
        )
        self.output = tf.contrib.layers.fully_connected(                            # [None, a_size]
            hidden_layer,
            a_size,
            activation_fn = tf.nn.softmax,
            biases_initializer = None
        )
        self.chosen_action = tf.argmax(self.output, 1)

        # ---- Loss: the rewards and the actions actually taken are fed in ----
        self.reward_holder = tf.placeholder(shape = [None], dtype = tf.float32)
        self.action_holder = tf.placeholder(shape = [None], dtype = tf.int32)
        # Position of each taken action inside the flattened output:
        # row * number_of_columns + action
        row_offsets = tf.range(0, tf.shape(self.output)[0])*tf.shape(self.output)[1]
        self.indexes = row_offsets + self.action_holder
        flat_output = tf.reshape(self.output, [-1])
        self.responsible_outputs = tf.gather(flat_output, self.indexes)
        log_probabilities = tf.log(self.responsible_outputs)
        self.loss = -tf.reduce_mean(log_probabilities*self.reward_holder)

        # ---- Training: gradients are computed here but applied separately ----
        # All variables created with `trainable = True` in the default graph
        trainable_vars = tf.trainable_variables()
        # One feed-in slot per variable so the caller can pass in gradients
        # it has accumulated itself
        self.gradient_holders = [
            tf.placeholder(tf.float32, name = str(idx)+'_holder')
            for idx in range(len(trainable_vars))
        ]
        # Gradient of the loss wrt each trainable variable
        self.gradients = tf.gradients(self.loss, trainable_vars)
        adam = tf.train.AdamOptimizer(learning_rate = lr)
        self.update_batch = adam.apply_gradients(zip(self.gradient_holders, trainable_vars))
            
            

### Training the agent

In [5]:
# Network input width: length of the first axis of the environment's observation space
s_size = env.observation_space.shape[0]
# Network output width: number of discrete actions in the environment's action space
a_size = env.action_space.n

In [6]:
# Clear the default graph first: the agent collects every trainable variable in
# the default graph, so re-running this cell without a reset would mix in the
# variables of a previously built network.
tf.reset_default_graph()
myAgent = agent(lr = 1e-2, s_size = s_size, a_size = a_size, h_size = h_size)

Instructions for updating:
Use the retry module or similar alternatives.


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


In [7]:
# Train the agent: play `total_episodes` episodes, accumulate the policy
# gradient of every episode, and apply the accumulated gradients once every
# `update_frequency` episodes.
init = tf.global_variables_initializer()
with tf.Session() as sess:
    sess.run(init)
    total_reward = []  # undiscounted total reward of each episode
    total_length = []  # number of steps taken in each episode

    # Gradient accumulator: one zero-filled array per trainable variable. The
    # variables' current values are fetched only to obtain matching shapes.
    gradBuffer = [np.zeros_like(value) for value in sess.run(tf.trainable_variables())]

    for i in range(total_episodes):
        s = env.reset()
        running_reward = 0
        # Episode history as parallel lists (oldest step first). Keeping the
        # columns separate gives each one a proper numeric dtype instead of a
        # ragged object array of mixed arrays and scalars.
        states, actions, rewards = [], [], []
        for j in range(max_ep):
            # Sample an action from the probabilities the network outputs
            a_dist = sess.run(myAgent.output, feed_dict = {myAgent.state_in: [s]})
            a = np.random.choice(a_dist.shape[1], p = a_dist[0])

            s1, r, d, _ = env.step(a)  # Take the action; get next state, reward and done flag
            states.append(s)
            actions.append(a)
            rewards.append(r)
            s = s1  # Update currently observed state
            running_reward += r  # Update running total of rewards
            if d:  # The environment ended the episode
                break

        # An episode cut off by `max_ep` is processed like any other instead
        # of being silently dropped. `rewards` is only empty if max_ep < 1.
        if rewards:
            feed_dict = {
                # Discounted return-to-go for each time step
                myAgent.reward_holder: discount_rewards(np.array(rewards, dtype = np.float64)),
                myAgent.action_holder: np.array(actions),  # action history
                myAgent.state_in: np.vstack(states)         # state history
            }
            grads = sess.run(myAgent.gradients, feed_dict = feed_dict)
            for idx, grad in enumerate(grads):
                gradBuffer[idx] += grad  # Accumulate gradients

            # `i` is zero-based, so `i + 1` episodes have been completed
            if (i + 1) % update_frequency == 0:
                # Use the accumulated gradients to update the network, then reset them
                feed_dict = dict(zip(myAgent.gradient_holders, gradBuffer))
                sess.run(myAgent.update_batch, feed_dict = feed_dict)
                for accumulated in gradBuffer:
                    accumulated.fill(0)  # Zero out gradients
            total_reward.append(running_reward)
            total_length.append(len(rewards))  # step count, not the last zero-based index

        # Report the mean total reward over the most recent (up to) 100 episodes
        if i % 100 == 0:
            print("Ep. %d)  Total reward = %.3f" % (i, np.mean(total_reward[-100:])))

Ep. 0)  Total reward = 19.000
Ep. 100)  Total reward = 30.030
Ep. 200)  Total reward = 34.130
Ep. 300)  Total reward = 45.450
Ep. 400)  Total reward = 62.470
Ep. 500)  Total reward = 91.060
Ep. 600)  Total reward = 103.680
Ep. 700)  Total reward = 122.820
Ep. 800)  Total reward = 154.970
Ep. 900)  Total reward = 168.060
Ep. 1000)  Total reward = 186.140
Ep. 1100)  Total reward = 190.470
Ep. 1200)  Total reward = 190.990
Ep. 1300)  Total reward = 192.280
Ep. 1400)  Total reward = 188.910
Ep. 1500)  Total reward = 195.500
Ep. 1600)  Total reward = 195.600
Ep. 1700)  Total reward = 193.070
Ep. 1800)  Total reward = 195.870
Ep. 1900)  Total reward = 196.720
Ep. 2000)  Total reward = 194.350
Ep. 2100)  Total reward = 188.780
Ep. 2200)  Total reward = 189.490
Ep. 2300)  Total reward = 196.130
Ep. 2400)  Total reward = 197.170
Ep. 2500)  Total reward = 199.560
Ep. 2600)  Total reward = 199.380
Ep. 2700)  Total reward = 199.900
Ep. 2800)  Total reward = 197.510
Ep. 2900)  Total reward = 195.99