# Cart Pole with Policy Gradient

[reference](https://medium.com/@awjuliani/super-simple-reinforcement-learning-tutorial-part-2-ded33892c724)

[ *policy based agent*, *cart pole game*, *delayed reward* ]

Also try the [Vanilla Policy Gradient with a 2-neuron action space](https://github.com/awjuliani/DeepRL-Agents/blob/master/Vanilla-Policy.ipynb)

In [1]:
import tensorflow as tf
import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline

import math

In [2]:
import gym
# Create the CartPole-v0 environment instance used by the rest of the notebook.
env = gym.make("CartPole-v0")

[2016-12-20 16:33:20,223] Making new env: CartPole-v0


In [3]:
# Hyperparameters for the policy network and its training.
# Layer sizes:
num_input, num_actions = 4, 2  # 4 observation values in; 2 actions (left/right)
num_hidden = 10                # units in the single hidden layer
# Optimisation settings:
lr, discount = 0.01, 0.99      # learning rate; discount (presumably the reward
                               # discount factor -- it is consumed outside this cell)
batch_size = 50                # presumably episodes per update -- confirm where it is used

In [7]:
tf.reset_default_graph()
# Policy network: observations -> ReLU hidden layer -> one sigmoid output.
# Neither layer has a bias term; W1 and W2 are the only variables defined here.

# Observations fed as input; shape [None, num_input], so any number of rows.
obs = tf.placeholder(shape=[None,num_input],dtype=tf.float32)
W1 = tf.get_variable("W1", shape=[num_input, num_hidden],
           initializer=tf.contrib.layers.xavier_initializer()) # Xavier (Glorot) initialization
h1 = tf.nn.relu(tf.matmul(obs,W1))
W2 = tf.get_variable("W2", shape=[num_hidden, 1],
           initializer=tf.contrib.layers.xavier_initializer()) 
# One value in (0, 1) per observation row, shape [None, 1].
action_prob = tf.nn.sigmoid(tf.matmul(h1,W2))

# Snapshot the trainable variables so their gradients can be computed now and
# applied later. At this point the graph holds W1 and W2 (in that order); the
# positional pairing with [w1grad, w2grad] further down relies on this.
train_vars = tf.trainable_variables()

# Training inputs.
# action_prob_ has the same [None, 1] shape as action_prob. From the loglik
# expression below it acts as a label: when it is 1 the term reduces to
# log(1 - action_prob), when it is 0 it reduces to log(action_prob).
# (assumes it is fed only 0/1 values -- confirm against the feeding code)
action_prob_ = tf.placeholder(shape=[None,1], dtype=tf.float32)
# Reward signal that weights the log-likelihood.
# NOTE(review): shape=[1] means one value is broadcast over every row of
# loglik; a per-step array of shape [N, 1] would be rejected when fed.
# Confirm this matches what the training loop feeds.
advantages = tf.placeholder(shape=[1], dtype=tf.float32)
# NOTE(review): the argument of tf.log reaches 0 if the sigmoid saturates,
# which would yield -inf; no epsilon guard is applied here.
loglik = tf.log(action_prob_*(action_prob_ - action_prob) + (1 - action_prob_)*(action_prob_ + action_prob))
loss = -tf.reduce_mean(loglik * advantages) # minimizing this raises the advantage-weighted log-likelihood,
#   i.e. makes actions associated with high reward more likely

# Symbolic partial derivatives of loss w.r.t. each trainable variable
# (a list with one gradient tensor per entry of train_vars).
gradients = tf.gradients(loss, train_vars)

# Gradients are not applied directly: they are fed back in through these
# placeholders. Per the original note, a series of gradients is collected from
# multiple episodes and then applied, to account for noise in the reward signal.
# (the accumulation itself happens outside this section)
w1grad = tf.placeholder(tf.float32)
w2grad = tf.placeholder(tf.float32)
wgrad = [w1grad, w2grad]
# Op that applies the fed gradients to train_vars with Adam; the zip pairs
# them by position, so w1grad -> train_vars[0] and w2grad -> train_vars[1].
update_grad = tf.train.AdamOptimizer(learning_rate=lr).apply_gradients(zip(wgrad,train_vars))