In [6]:
import tensorflow as tf
import numpy as np
import gym

import matplotlib.pyplot as plt

%matplotlib inline

In [28]:
# Python 2/3 compatibility shim: Python 3 has no `xrange` builtin, so the
# lookup raises NameError and we alias it to `range` instead.
try:
    xrange = xrange
except NameError:
    # Only the missing-name case is expected here; anything else should surface.
    xrange = range

In [7]:
# Create the Gym environment the agent is trained on (CartPole-v0).
env = gym.make('CartPole-v0')

In [8]:
# Network dimensions taken from the environment: the length of the first axis
# of the observation space and the number of discrete actions.
observation_size, action_size = (
    env.observation_space.shape[0],
    env.action_space.n,
)

In [59]:
class vanillaPG():
    """Policy-gradient agent with a small fully connected softmax policy (TF1 graph mode).

    Constructing an instance adds the policy network and the training ops to
    the current default graph.

    Args:
        lr: learning rate passed to the Adam optimizer.
        observation_size: number of features in one observation.
        action_size: number of discrete actions (width of the softmax output).
        layers: sequence of hidden-layer widths, one ReLU layer per entry
            (e.g. ``[32, 32]``).

    Attributes:
        obs_tfp: float32 placeholder of shape ``[None, observation_size]``.
        output: softmax action probabilities, one row per observation.
        chosen_action: argmax of ``output`` along axis 1.
        reward_tfp: float32 placeholder ``[None]`` for per-step returns.
        action_tfp: int32 placeholder ``[None]`` for the actions taken.
        gradients: gradients of the loss w.r.t. ``tf.trainable_variables()``.
        gradient_holders: one placeholder per trainable variable, used to feed
            externally accumulated gradients.
        update_batch: op applying the fed gradients with Adam.
    """

    # Added to probabilities before the log so a probability that underflows
    # to 0 cannot produce -inf / NaN in the loss and its gradients.
    _LOG_EPS = 1e-10

    def __init__(self, lr, observation_size, action_size, layers):
        self.lr = lr
        self.observation_size = observation_size
        self.action_size = action_size
        self.layers = layers

        self.obs_tfp, self.output, self.chosen_action = self.buildNet()

        self.reward_tfp = tf.placeholder(tf.float32, [None])
        self.action_tfp = tf.placeholder(tf.int32, [None])

        self.gradients, self.update_batch, self.gradient_holders = self.train()

    def buildNet(self):
        """Build the policy network.

        Creates one ReLU layer per entry of ``self.layers`` followed by a
        softmax output layer. Weights are named ``w1``, ``w2``, ... in creation
        order (so two hidden layers yield ``w1``, ``w2``, ``w3``).

        Returns:
            Tuple ``(obs, output, chosen_action)``: the observation
            placeholder, the softmax probabilities and their argmax.
        """
        obs = tf.placeholder(tf.float32, [None, self.observation_size])

        hidden = obs
        in_size = self.observation_size
        for layer_no, out_size in enumerate(self.layers, 1):
            w = tf.get_variable('w%d' % layer_no, [in_size, out_size])
            # Biases start at zero: unit-variance random biases can push the
            # softmax close to saturation before any training has happened.
            b = tf.Variable(tf.zeros([out_size]))
            hidden = tf.nn.relu(tf.matmul(hidden, w) + b)
            in_size = out_size

        w_out = tf.get_variable('w%d' % (len(self.layers) + 1),
                                [in_size, self.action_size])
        b_out = tf.Variable(tf.zeros([self.action_size]))
        output = tf.nn.softmax(tf.matmul(hidden, w_out) + b_out)

        chosen_action = tf.argmax(output, 1)

        return obs, output, chosen_action

    def train(self):
        """Build the loss, its gradients and the gradient-application op.

        The loss is ``-mean(log(pi(a_t | s_t)) * reward_t)`` over the fed batch.

        Returns:
            Tuple ``(gradients, update_batch, gradient_holders)``.
        """
        # Flatten the [batch, actions] probability matrix and pick, for each
        # row, the entry belonging to the action that was actually taken.
        batch_size = tf.shape(self.output)[0]
        n_actions = tf.shape(self.output)[1]
        indexes = tf.range(0, batch_size) * n_actions + self.action_tfp
        responsible_output = tf.gather(tf.reshape(self.output, [-1]), indexes)

        log_prob = tf.log(responsible_output + self._LOG_EPS)
        loss = -tf.reduce_mean(log_prob * self.reward_tfp)

        # NOTE: this collects every trainable variable in the default graph,
        # not only the ones created by this instance.
        tvars = tf.trainable_variables()

        # Shaped placeholders make a mis-shaped accumulated gradient fail at
        # feed time rather than inside the optimizer.
        gradient_holders = [
            tf.placeholder(tf.float32, shape=var.get_shape(),
                           name=str(idx)+'_holder')
            for idx, var in enumerate(tvars)
        ]

        gradients = tf.gradients(loss, tvars)

        optimizer = tf.train.AdamOptimizer(learning_rate=self.lr)
        update_batch = optimizer.apply_gradients(zip(gradient_holders, tvars))

        return gradients, update_batch, gradient_holders


In [60]:
gamma = 0.99  # Default discount factor applied to future rewards.

def discount_rewards(r, discount_factor=None):
    """Compute the discounted return for every step of a reward sequence.

    For each index ``t`` the result is
    ``r[t] + d * r[t+1] + d**2 * r[t+2] + ...`` where ``d`` is the discount
    factor.

    Args:
        r: 1D sequence of per-step rewards (array or list; integer, float or
            object dtype holding numbers).
        discount_factor: discount to apply; when ``None`` the module-level
            ``gamma`` is used.

    Returns:
        float64 NumPy array with the same length as ``r``.
    """
    if discount_factor is None:
        discount_factor = gamma

    # Always accumulate in float: with np.zeros_like on an integer input the
    # discounted values would be truncated on assignment.
    rewards = np.asarray(r, dtype=np.float64)
    discounted_r = np.zeros_like(rewards)

    running_add = 0.0
    # Walk backwards so each step reuses the already-discounted tail.
    for t in reversed(range(rewards.size)):
        running_add = running_add * discount_factor + rewards[t]
        discounted_r[t] = running_add
    return discounted_r

In [61]:
tf.reset_default_graph()

myAgent = vanillaPG(1e-2, observation_size, action_size, [32, 32])


total_episodes = 5000  # Total number of episodes to train the agent on.
max_ep = 999           # Upper bound on the number of steps in one episode.
update_frequency = 5   # Apply the accumulated gradients every N episodes.

init = tf.global_variables_initializer()

# Launch the tensorflow graph
with tf.Session() as sess:
    sess.run(init)
    total_reward = []
    total_lenght = []

    # One zero-filled accumulator per trainable variable, in the same order as
    # myAgent.gradients / myAgent.gradient_holders.
    gradBuffer = [np.zeros_like(var) for var in sess.run(tf.trainable_variables())]

    for i in range(total_episodes):
        s = env.reset()
        running_reward = 0
        # Keep states, actions and rewards in separate lists; packing them into
        # one np.array would create a ragged object array.
        ep_states, ep_actions, ep_rewards = [], [], []

        for j in range(max_ep):
            # Sample an action index from the policy's probability distribution.
            a_dist = sess.run(myAgent.output, feed_dict={myAgent.obs_tfp: [s]})
            a = np.random.choice(len(a_dist[0]), p=a_dist[0])

            # Advance the environment by one step with the sampled action.
            s1, r, d, _ = env.step(a)
            ep_states.append(s)
            ep_actions.append(a)
            ep_rewards.append(r)
            s = s1
            running_reward += r
            if d:
                break

        # The episode is over, either because the environment reported done
        # or because max_ep steps were taken; both cases are trained on.
        if ep_rewards:
            feed_dict = {
                myAgent.reward_tfp: discount_rewards(np.array(ep_rewards, dtype=np.float64)),
                myAgent.action_tfp: ep_actions,
                myAgent.obs_tfp: np.vstack(ep_states),
            }
            grads = sess.run(myAgent.gradients, feed_dict=feed_dict)
            for idx, grad in enumerate(grads):
                gradBuffer[idx] += grad

            if i % update_frequency == 0 and i != 0:
                # Apply the gradients accumulated since the last update, then
                # clear the accumulators.
                feed_dict = dict(zip(myAgent.gradient_holders, gradBuffer))
                _ = sess.run(myAgent.update_batch, feed_dict=feed_dict)
                for ix, grad in enumerate(gradBuffer):
                    # zeros_like (rather than grad * 0) also clears NaN/inf.
                    gradBuffer[ix] = np.zeros_like(grad)

            total_reward.append(running_reward)
            # j is the zero-based index of the last step taken in the episode.
            total_lenght.append(j)

        # Report the mean reward of the last (up to) 100 episodes.
        if i % 100 == 0:
            print(np.mean(total_reward[-100:]))

13.0
9.51
9.36
9.44
9.4
9.28
9.27
9.43
9.32
9.42
9.41
9.21
9.36
9.32
9.24
9.33
9.27
9.23
9.41
9.32
9.33
9.4
9.33
9.45
9.3
9.28
9.37
9.4
9.39
9.45
9.42
9.41
9.4
9.45
9.41
9.21
9.43
9.36
9.26


KeyboardInterrupt: 