# Cart pole balancing with policy gradient

In [1]:
import tensorflow as tf
print(tf.__version__)

2.5.0


In [2]:
import warnings 
warnings.filterwarnings('ignore')
import numpy as np
import gym

In [12]:
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()

Instructions for updating:
non-resource variables are not supported in the long term


In [13]:
env = gym.make('CartPole-v0')

In [14]:
state_shape = env.observation_space.shape[0]

In [15]:
state_shape

4

In [16]:
num_actions = env.action_space.n

In [17]:
num_actions

2

## Computing discounted and normalized reward

In [18]:
gamma = 0.95

In [19]:
def discount_and_normalize_rewards(episode_rewards):
    
    # initialize an array for storing the discounted reward
    discounted_rewards = np.zeros_like(episode_rewards)
    
    # compute the discounted reward
    reward_to_go = 0.0
    for i in reversed(range(len(episode_rewards))):
        reward_to_go = reward_to_go * gamma + episode_rewards[i]
        discounted_rewards[i] = reward_to_go
        
    # normalize and return the reward 
    discounted_rewards = discounted_rewards - np.mean(discounted_rewards)
    discounted_rewards = discounted_rewards / np.std(discounted_rewards)
    
    return discounted_rewards

## Building the policy work

In [20]:
state_ph = tf.placeholder(tf.float32, [None, state_shape], name="state_ph")

In [21]:
action_ph = tf.placeholder(tf.int32, [None, num_actions], name="action_ph")

In [22]:
discounted_rewards_ph = tf.placeholder(tf.float32, [None,], name="discounted_rewards")

In [23]:
layer1 = tf.layers.dense(state_ph, units=32, activation=tf.nn.relu)

In [24]:
layer2 = tf.layers.dense(layer1, units=num_actions)

In [25]:
prob_dist = tf.nn.softmax(layer2)

In [26]:
neg_log_policy = tf.nn.softmax_cross_entropy_with_logits_v2(logits=layer2, labels=action_ph)

In [27]:
loss = tf.reduce_mean(neg_log_policy * discounted_rewards_ph)

In [28]:
train = tf.train.AdamOptimizer(0.01).minimize(loss)

## Training the network

In [29]:
num_iterations = 1000

In [32]:
# start the tensorflow session
with tf.Session() as sess:
    
    # initialize all the tensorflow variables
    sess.run(tf.global_variables_initializer())
    
    # for every iteration 
    for i in range(num_iterations):
        
        # initialize an empty list for storing the states, actions and rewards obtained in the episode
        episode_states, episode_actions, episode_rewards = [], [], []
        
        # set the done to False
        done = False
        
        # initialize the state by resetting the environment
        state = env.reset()
        
        # initialize the return 
        Return = 0
        
        # while the episode is not ever
        while not done:
            
            # reshape the state
            state = state.reshape([1, 4])
            
            # feed the state to the policy network. Then, network returns the probability distribution
            # over the action space as output which becomes our stochastic policy
            pi = sess.run(prob_dist, feed_dict={state_ph: state})
            
            # now we select an action using this stochastic policy
            a = np.random.choice(range(pi.shape[1]), p= pi.ravel())
            
            # perform the selected action
            next_state, reward, done, info = env.step(a)
            
            # render the environment
            env.render()
            
            # update the return
            Return = Return + reward
            
            # one-hot encode the action
            action = np.zeros(num_actions)
            action[a] = 1
            
            # store the state, action and reward into their respective list
            episode_states.append(state)
            episode_actions.append(action)
            episode_rewards.append(reward)
            
            
            # update the state to the next state
            state = next_state
            
        # compute the discounted and normalized reward
        discounted_rewards = discount_and_normalize_rewards(episode_rewards)
        
        # define the feed dictionary 
        feed_dict = {state_ph: np.vstack(np.array(episode_states)),
                     action_ph: np.vstack(np.array(episode_actions)),
                     discounted_reward_ph: discounted_rewards}
        
        # train the network
        loss_, _ = sess.run([loss, train], feed_dict=feed_dict)
        
        # print the return for every 10 iteration
        if i%10==0:
            print("Iteration:{}, Return: {}".format(i, Return))

MissingFunctionException: glCreateShader is not exported by the available OpenGL driver.  OpenGL 2.0 is required for this functionality.