# Policy Gradient

In this notebook, we will implement an RL algorithm with policy gradient to play Cartpole.

Code is from https://github.com/simoninithomas/Deep_reinforcement_learning_Course/blob/master/Policy%20Gradients/Cartpole/Cartpole%20REINFORCE%20Monte%20Carlo%20Policy%20Gradients.ipynb

In [1]:
import tensorflow as tf
import numpy as np
import gym


env = gym.make('CartPole-v0')
# Policy gradient has high variance, seed for reproducability
env.seed(1)

## ENVIRONMENT Hyperparameters
state_size = 4
action_size = env.action_space.n

## TRAINING Hyperparameters
max_episodes = 300
learning_rate = 0.01
gamma = 0.95 # Discount rate

  from ._conv import register_converters as _register_converters


In [2]:
env.observation_space.sample()

array([4.6860966e-01, 1.4645028e+38, 8.6090848e-02, 3.0545910e+37],
      dtype=float32)

In [3]:
def discount_and_normalize_rewards(episode_rewards):
    discounted_episode_rewards = np.zeros_like(episode_rewards)
    cumulative = 0.0
    for i in reversed(range(len(episode_rewards))):
        cumulative = cumulative * gamma + episode_rewards[i]
        discounted_episode_rewards[i] = cumulative
    
    mean = np.mean(discounted_episode_rewards)
    std = np.std(discounted_episode_rewards)
    discounted_episode_rewards = (discounted_episode_rewards - mean) / (std)
    
    return discounted_episode_rewards

discount_and_normalize_rewards([2,3,4,5,6])

array([ 1.07863874,  0.83349357,  0.34320324, -0.63737744, -1.61795811])

In [4]:
with tf.name_scope("inputs"):
    input_ = tf.placeholder(tf.float32, [None, state_size], name="input_")
    actions = tf.placeholder(tf.int32, [None, action_size], name="actions")
    discounted_episode_rewards_ = tf.placeholder(tf.float32, [None,], name="discounted_episode_rewards")
    
    # Add this placeholder for having this variable in tensorboard
    mean_reward_ = tf.placeholder(tf.float32 , name="mean_reward")

    with tf.name_scope("fc1"):
        fc1 = tf.contrib.layers.fully_connected(inputs = input_,
                                                num_outputs = 10,
                                                activation_fn=tf.nn.relu,
                                                weights_initializer=tf.contrib.layers.xavier_initializer())

    with tf.name_scope("fc2"):
        fc2 = tf.contrib.layers.fully_connected(inputs = fc1,
                                                num_outputs = action_size,
                                                activation_fn= tf.nn.relu,
                                                weights_initializer=tf.contrib.layers.xavier_initializer())
    
    with tf.name_scope("fc3"):
        fc3 = tf.contrib.layers.fully_connected(inputs = fc2,
                                                num_outputs = action_size,
                                                activation_fn= None,
                                                weights_initializer=tf.contrib.layers.xavier_initializer())

    with tf.name_scope("softmax"):
        action_distribution = tf.nn.softmax(fc3)

    with tf.name_scope("loss"):
        # tf.nn.softmax_cross_entropy_with_logits computes the cross entropy of the result after applying the softmax function
        # If you have single-class labels, where an object can only belong to one class, you might now consider using 
        # tf.nn.sparse_softmax_cross_entropy_with_logits so that you don't have to convert your labels to a dense one-hot array. 
        neg_log_prob = tf.nn.softmax_cross_entropy_with_logits_v2(logits = fc3, labels = actions)
        loss = tf.reduce_mean(neg_log_prob * discounted_episode_rewards_) 
        
    
    with tf.name_scope("train"):
        train_opt = tf.train.AdamOptimizer(learning_rate).minimize(loss)
        
# Setup TensorBoard Writer
writer = tf.summary.FileWriter(".")

## Losses
tf.summary.scalar("Loss", loss)

## Reward mean
tf.summary.scalar("Reward_mean", mean_reward_)

write_op = tf.summary.merge_all()

allRewards = []
total_rewards = 0
maximumRewardRecorded = 0
episode = 0
episode_states, episode_actions, episode_rewards = [],[],[]

saver = tf.train.Saver()

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    
    for episode in range(max_episodes):
        
        episode_rewards_sum = 0

        # Launch the game
        state = env.reset()
           
        while True:
            
            # Choose action a, remember WE'RE NOT IN A DETERMINISTIC ENVIRONMENT, WE'RE OUTPUT PROBABILITIES.
            action_probability_distribution = sess.run(action_distribution, feed_dict={input_: state.reshape([1,4])})
            
            action = np.random.choice(range(action_probability_distribution.shape[1]), p=action_probability_distribution.ravel()) # select action w.r.t the actions prob
            
            
            
            # Perform a
            new_state, reward, done, info = env.step(action)

            # Store s, a, r
            episode_states.append(state)
                        
            # For actions because we output only one (the index) we need 2 (1 is for the action taken)
            # We need [0., 1.] (if we take right) not just the index
            action_ = np.zeros(action_size)
            action_[action] = 1
            
            episode_actions.append(action_)
            
            episode_rewards.append(reward)
            if done:
                # Calculate sum reward
                episode_rewards_sum = np.sum(episode_rewards)
                
                allRewards.append(episode_rewards_sum)
                
                total_rewards = np.sum(allRewards)
                
                # Mean reward
                mean_reward = np.divide(total_rewards, episode+1)
                
                
                maximumRewardRecorded = np.amax(allRewards)
                
                print("==========================================")
                print("Episode: ", episode)
                print("Reward: ", episode_rewards_sum)
                print("Mean Reward", mean_reward)
                print("Max reward so far: ", maximumRewardRecorded)
                
                # Calculate discounted reward
                discounted_episode_rewards = discount_and_normalize_rewards(episode_rewards)
                                
                # Feedforward, gradient and backpropagation
                loss_, _ = sess.run([loss, train_opt], feed_dict={input_: np.vstack(np.array(episode_states)),
                                                                 actions: np.vstack(np.array(episode_actions)),
                                                                 discounted_episode_rewards_: discounted_episode_rewards 
                                                                })
                
 
                                                                 
                # Write TF Summaries
                summary = sess.run(write_op, feed_dict={input_: np.vstack(np.array(episode_states)),
                                                                 actions: np.vstack(np.array(episode_actions)),
                                                                 discounted_episode_rewards_: discounted_episode_rewards,
                                                                    mean_reward_: mean_reward
                                                                })
                
               
                writer.add_summary(summary, episode)
                writer.flush()
                
            
                
                # Reset the transition stores
                episode_states, episode_actions, episode_rewards = [],[],[]
                
                break
            
            state = new_state
        
        # Save Model
        if episode % 100 == 0:
            saver.save(sess, "./models/model.ckpt")
            print("Model saved")

Episode:  0
Reward:  20.0
Mean Reward 20.0
Max reward so far:  20.0
Model saved
Episode:  1
Reward:  35.0
Mean Reward 27.5
Max reward so far:  35.0
Episode:  2
Reward:  21.0
Mean Reward 25.333333333333332
Max reward so far:  35.0
Episode:  3
Reward:  31.0
Mean Reward 26.75
Max reward so far:  35.0
Episode:  4
Reward:  25.0
Mean Reward 26.4
Max reward so far:  35.0
Episode:  5
Reward:  27.0
Mean Reward 26.5
Max reward so far:  35.0
Episode:  6
Reward:  25.0
Mean Reward 26.285714285714285
Max reward so far:  35.0
Episode:  7
Reward:  20.0
Mean Reward 25.5
Max reward so far:  35.0
Episode:  8
Reward:  15.0
Mean Reward 24.333333333333332
Max reward so far:  35.0
Episode:  9
Reward:  27.0
Mean Reward 24.6
Max reward so far:  35.0
Episode:  10
Reward:  34.0
Mean Reward 25.454545454545453
Max reward so far:  35.0
Episode:  11
Reward:  13.0
Mean Reward 24.416666666666668
Max reward so far:  35.0
Episode:  12
Reward:  16.0
Mean Reward 23.76923076923077
Max reward so far:  35.0
Episode:  13
Rewa

Episode:  75
Reward:  26.0
Mean Reward 28.32894736842105
Max reward so far:  71.0
Episode:  76
Reward:  18.0
Mean Reward 28.194805194805195
Max reward so far:  71.0
Episode:  77
Reward:  12.0
Mean Reward 27.987179487179485
Max reward so far:  71.0
Episode:  78
Reward:  21.0
Mean Reward 27.89873417721519
Max reward so far:  71.0
Episode:  79
Reward:  19.0
Mean Reward 27.7875
Max reward so far:  71.0
Episode:  80
Reward:  46.0
Mean Reward 28.012345679012345
Max reward so far:  71.0
Episode:  81
Reward:  29.0
Mean Reward 28.024390243902438
Max reward so far:  71.0
Episode:  82
Reward:  77.0
Mean Reward 28.6144578313253
Max reward so far:  77.0
Episode:  83
Reward:  14.0
Mean Reward 28.44047619047619
Max reward so far:  77.0
Episode:  84
Reward:  27.0
Mean Reward 28.423529411764704
Max reward so far:  77.0
Episode:  85
Reward:  88.0
Mean Reward 29.11627906976744
Max reward so far:  88.0
Episode:  86
Reward:  37.0
Mean Reward 29.20689655172414
Max reward so far:  88.0
Episode:  87
Reward:  

Episode:  144
Reward:  200.0
Mean Reward 38.206896551724135
Max reward so far:  200.0
Episode:  145
Reward:  12.0
Mean Reward 38.02739726027397
Max reward so far:  200.0
Episode:  146
Reward:  78.0
Mean Reward 38.29931972789116
Max reward so far:  200.0
Episode:  147
Reward:  31.0
Mean Reward 38.25
Max reward so far:  200.0
Episode:  148
Reward:  23.0
Mean Reward 38.147651006711406
Max reward so far:  200.0
Episode:  149
Reward:  55.0
Mean Reward 38.26
Max reward so far:  200.0
Episode:  150
Reward:  69.0
Mean Reward 38.4635761589404
Max reward so far:  200.0
Episode:  151
Reward:  78.0
Mean Reward 38.723684210526315
Max reward so far:  200.0
Episode:  152
Reward:  38.0
Mean Reward 38.71895424836601
Max reward so far:  200.0
Episode:  153
Reward:  50.0
Mean Reward 38.79220779220779
Max reward so far:  200.0
Episode:  154
Reward:  89.0
Mean Reward 39.116129032258065
Max reward so far:  200.0
Episode:  155
Reward:  124.0
Mean Reward 39.66025641025641
Max reward so far:  200.0
Episode:  1

Episode:  209
Reward:  161.0
Mean Reward 51.766666666666666
Max reward so far:  200.0
Episode:  210
Reward:  13.0
Mean Reward 51.58293838862559
Max reward so far:  200.0
Episode:  211
Reward:  88.0
Mean Reward 51.75471698113208
Max reward so far:  200.0
Episode:  212
Reward:  47.0
Mean Reward 51.732394366197184
Max reward so far:  200.0
Episode:  213
Reward:  190.0
Mean Reward 52.37850467289719
Max reward so far:  200.0
Episode:  214
Reward:  140.0
Mean Reward 52.78604651162791
Max reward so far:  200.0
Episode:  215
Reward:  161.0
Mean Reward 53.28703703703704
Max reward so far:  200.0
Episode:  216
Reward:  55.0
Mean Reward 53.294930875576036
Max reward so far:  200.0
Episode:  217
Reward:  176.0
Mean Reward 53.857798165137616
Max reward so far:  200.0
Episode:  218
Reward:  164.0
Mean Reward 54.36073059360731
Max reward so far:  200.0
Episode:  219
Reward:  200.0
Mean Reward 55.02272727272727
Max reward so far:  200.0
Episode:  220
Reward:  137.0
Mean Reward 55.39366515837104
Max re

Episode:  274
Reward:  177.0
Mean Reward 75.84363636363636
Max reward so far:  200.0
Episode:  275
Reward:  200.0
Mean Reward 76.29347826086956
Max reward so far:  200.0
Episode:  276
Reward:  200.0
Mean Reward 76.74007220216606
Max reward so far:  200.0
Episode:  277
Reward:  200.0
Mean Reward 77.18345323741008
Max reward so far:  200.0
Episode:  278
Reward:  200.0
Mean Reward 77.6236559139785
Max reward so far:  200.0
Episode:  279
Reward:  200.0
Mean Reward 78.06071428571428
Max reward so far:  200.0
Episode:  280
Reward:  200.0
Mean Reward 78.49466192170819
Max reward so far:  200.0
Episode:  281
Reward:  194.0
Mean Reward 78.90425531914893
Max reward so far:  200.0
Episode:  282
Reward:  200.0
Mean Reward 79.3321554770318
Max reward so far:  200.0
Episode:  283
Reward:  200.0
Mean Reward 79.75704225352112
Max reward so far:  200.0
Episode:  284
Reward:  180.0
Mean Reward 80.10877192982456
Max reward so far:  200.0
Episode:  285
Reward:  200.0
Mean Reward 80.52797202797203
Max rewa

In [5]:
tf.__version__

'1.8.0'

In [6]:
env.action_space.sample()

1