# Simple Reinforcement Learning using Policy Gradients

Import Libraries

In [1]:
import tensorflow as tf
import tensorflow.contrib.slim as slim
import numpy as np
import gym
import matplotlib.pyplot as plt
%matplotlib inline

# Python 2/3 compatibility shim: on Python 2 `xrange` already exists and is
# simply re-bound; on Python 3 the lookup raises NameError and `range` (which
# is lazy there) is used instead. Only NameError is caught so that unrelated
# failures (e.g. KeyboardInterrupt) are not silently swallowed.
try:
    xrange = xrange
except NameError:
    xrange = range

Load OpenAI's CartPole Gym environment 

In [2]:
# Create the CartPole-v0 environment. `env` is the handle the training cell
# later calls reset(), render() and step() on.
env = gym.make('CartPole-v0')

[2018-02-09 21:19:52,895] Making new env: CartPole-v0


Before letting our agent learn a policy, we define how rewards are computed. The reward function below takes into account not only the reward received immediately after an action but also a discounted share of the rewards that follow it. Crediting actions this way helps the agent achieve its goal over time rather than only on the next step.

In [3]:
gamma = 0.99  # default discount factor applied to future rewards

def discount_rewards(r, discount=None):
    """ Compute the discounted return for every step of one episode.

        For each index t the result is
            r[t] + g * r[t+1] + g**2 * r[t+2] + ...
        where g is the discount factor.

        r: 1-D sequence of per-step rewards (list or array, any numeric dtype)
        discount: discount factor g; when None the module-level `gamma` is
                  used (looked up at call time)
        return value : 1-D float array of discounted rewards, same length as r
    """
    rewards = np.asarray(r)
    # zeros_like() copies the input dtype, so integer (or object) rewards would
    # have their fractional discounted values truncated. Promote anything that
    # is not already floating point to float64; float inputs keep their dtype.
    if not np.issubdtype(rewards.dtype, np.floating):
        rewards = rewards.astype(np.float64)

    factor = gamma if discount is None else discount
    discounted_r = np.zeros_like(rewards)
    running_add = 0.0
    # Walk backwards so each step's return is built from the one after it.
    for t in reversed(range(rewards.size)):
        running_add = running_add * factor + rewards[t]
        discounted_r[t] = running_add
    return discounted_r

In [4]:
class agent():
    """Policy network together with the TensorFlow ops used to train it.

    Constructing an instance adds to the default graph: a two-layer
    feed-forward network mapping a state to a softmax distribution over
    actions, a policy-gradient loss, the gradients of that loss, and an op
    that applies externally supplied gradients with Adam.
    """
    def __init__(self, lr, s_size, a_size, h_size):
        """
            Build the network, the loss and the gradient/update ops.

            lr: learning rate (passed to the Adam optimizer),
            s_size: size of the states (width of the `state_in` placeholder),
            a_size: number of actions (width of the softmax output),
            h_size: number of neurons in the hidden layer
        """
        # Feed-forward network which takes a batch of states as input and outputs
        # a softmax distribution over the possible actions. Both layers are
        # created without bias terms (biases_initializer=None).
        self.state_in = tf.placeholder(shape=[None, s_size], dtype=tf.float32)
        hidden = slim.fully_connected(self.state_in, h_size, biases_initializer=None, activation_fn=tf.nn.relu)
        self.output = slim.fully_connected(hidden, a_size, biases_initializer=None, activation_fn=tf.nn.softmax)
        # Index of the highest-probability action for each input row.
        self.chosen_action = tf.argmax(self.output, 1)
        
        # Computes the loss used to train the above neural network.
        # reward_holder / action_holder are fed one value per row of state_in.
        self.reward_holder = tf.placeholder(shape=[None], dtype=tf.float32)
        self.action_holder = tf.placeholder(shape=[None], dtype=tf.int32)
        
        # Gather the output probability of the action that was actually taken in
        # each row: row i of the flattened output starts at i * (number of
        # columns), so adding the action index selects output[i, action_i].
        self.indexes = tf.range(0, tf.shape(self.output)[0]) * tf.shape(self.output)[1] + self.action_holder
        self.responsible_outputs = tf.gather(tf.reshape(self.output, [-1]), self.indexes)
        
        # Loss function: negative mean of log(probability of the taken action)
        # weighted by the fed reward.
        # NOTE(review): no clipping is applied before tf.log, so a probability of
        # exactly 0 would produce -inf here.
        self.loss = -tf.reduce_mean(tf.log(self.responsible_outputs) * self.reward_holder)
        
        # NOTE(review): tf.trainable_variables() is not scoped to this instance;
        # presumably it only contains this network's weights because the
        # training cell resets the default graph before constructing the agent.
        tvars = tf.trainable_variables()
        # One float32 placeholder per trainable variable, named "<idx>_holder",
        # through which externally accumulated gradients are fed back in.
        self.gradient_holders = []
        for idx, var in enumerate(tvars):
            placeholder = tf.placeholder(tf.float32, name=str(idx)+'_holder')
            self.gradient_holders.append(placeholder)
        
        # Gradients of the loss with respect to each trainable variable, in the
        # same order as gradient_holders.
        self.gradients = tf.gradients(self.loss, tvars)
        
        # The gradients are applied in batches to update the network: update_batch
        # applies whatever is fed into gradient_holders, not self.gradients directly.
        optimizer = tf.train.AdamOptimizer(learning_rate=lr)
        self.update_batch = optimizer.apply_gradients(zip(self.gradient_holders, tvars))

Training Agent

In [5]:
tf.reset_default_graph()

#Loading the agent
myAgent = agent(lr=1e-2, s_size=4, a_size=2, h_size=8)

#Total number of games played by the agent
num_episodes = 5000
#Upper bound on the number of steps taken in a single episode
max_ep = 3000
#Gradients are accumulated across episodes and applied every `update_frequency` episodes
update_frequency = 5

init = tf.global_variables_initializer()

#Launch the tensorflow graph
with tf.Session() as sess:
    sess.run(init)
    total_reward = []
    total_length = []

    #Gradient accumulator: one zero array per trainable variable, shaped like the variable
    gradBuffer = [np.zeros_like(var) for var in sess.run(tf.trainable_variables())]

    for i in range(num_episodes):
        s = env.reset()
        running_reward = 0
        #Trajectory of the current episode, kept as separate homogeneous lists.
        #(Packing states and scalars into one np.array gives a ragged object
        #array, which newer NumPy versions refuse to build.)
        ep_states, ep_actions, ep_rewards = [], [], []

        for j in range(max_ep):
            env.render()
            #Probabilistically pick an action given our network outputs.
            a_dist = sess.run(myAgent.output, feed_dict={myAgent.state_in: [s]})
            #Sample the action *index* directly. Sampling a probability value and
            #looking it up with argmax(a_dist == value) always returns the first
            #index whenever two actions have the same probability.
            a = np.random.choice(len(a_dist[0]), p=a_dist[0])

            s1, r, d, _ = env.step(a)
            ep_states.append(s)
            ep_actions.append(a)
            ep_rewards.append(r)
            s = s1
            running_reward += r
            if d:
                #Update the network: compute this episode's gradients using the
                #discounted rewards and add them to the accumulator.
                feed_dict = {
                    myAgent.reward_holder: discount_rewards(np.array(ep_rewards, dtype=np.float64)),
                    myAgent.action_holder: np.array(ep_actions, dtype=np.int32),
                    myAgent.state_in: np.vstack(ep_states),
                }
                grads = sess.run(myAgent.gradients, feed_dict=feed_dict)
                for idx, grad in enumerate(grads):
                    gradBuffer[idx] += grad

                if i % update_frequency == 0 and i != 0:
                    #Apply the accumulated gradients, then reset the accumulator.
                    feed_dict = dict(zip(myAgent.gradient_holders, gradBuffer))
                    sess.run(myAgent.update_batch, feed_dict=feed_dict)
                    #zeros_like (rather than grad * 0) so a NaN/inf gradient
                    #cannot survive the reset.
                    for ix, grad in enumerate(gradBuffer):
                        gradBuffer[ix] = np.zeros_like(grad)

                total_reward.append(running_reward)
                #j is the zero-based index of the final step, so the episode
                #lasted j + 1 steps.
                total_length.append(j + 1)
                break

        #Update our scoreboard: mean reward over the last (up to) 100 finished episodes
        if i % 100 == 0:
            print(np.mean(total_reward[-100:]))
        
                

Tensor("add:0", shape=(?,), dtype=int32)


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


37.0
31.21
33.41
41.64
43.65
49.77
79.53
113.96
132.89
165.09
171.81
177.06
184.69
190.57
192.1
187.08
163.77
167.99
190.36
190.98
171.16
181.65
178.58
172.16
163.93
166.04
181.54
176.73
156.16
173.32
178.99
184.49
180.99
177.06
185.13
169.97
160.52
167.23
174.57
171.82
170.18
172.81
175.53
193.26
198.71
199.07
198.48
196.77
197.47
191.81
