# Balancing a Pole on a Cart

### Using a Basic Policy Algorithm

In [4]:
import gym
import numpy as np

def basic_policy(obs):
    """Hard-coded policy driven only by the pole angle (``obs[2]``).

    Args:
        obs: Indexable observation; only element 2 is read.

    Returns:
        0 when ``obs[2]`` is negative, otherwise 1.
    """
    return 0 if obs[2] < 0 else 1

# Evaluate the hard-coded policy over 500 episodes and report summary
# statistics of the per-episode totals.
env    = gym.make('CartPole-v0')
totals = []  # one entry per episode: the sum of the rewards returned by env.step
try:
    for episode in range(500):
        episode_rewards = 0
        obs = env.reset()

        for _ in range(1000):
            env.render()
            # The chosen action is passed straight to env.step instead of being
            # stored in a module-level `action` variable: the neural-network cell
            # defines a TensorFlow op with that name, and rebinding it here would
            # break the training cell if this cell is re-run afterwards.
            obs, reward, done, _ = env.step(basic_policy(obs))
            episode_rewards += reward
            if done: break

        totals.append(episode_rewards)
finally:
    # Always release the environment (and its render window), even when an
    # episode raises or the run is interrupted.
    env.close()

# NOTE(review): `totals` holds summed rewards; the "seconds"/"Time" labels are
# kept unchanged — confirm the intended unit.
print("Results (in seconds)")
print("--------------------")
print("Mean Time: %.2f" % np.mean(totals))
print("Std Time: %.2f" % np.std(totals))
print("Min Time: %.2f" % np.min(totals))
print("Max Time: %.2f" % np.max(totals))

Results (in seconds)
--------------------
Mean Time: 42.47
Std Time: 8.53
Min Time: 24.00
Max Time: 66.00


### Using a Neural Network Policy Algorithm

In [1]:
import tensorflow as tf #TensorFlow version 1.13

# --- Hyperparameters -------------------------------------------------------
n_inputs  = 4
n_hidden  = 4
n_outputs = 1
learning_rate = 0.01

initializer = tf.contrib.layers.variance_scaling_initializer()

# --- Policy network: one ELU hidden layer, one sigmoid output --------------
X = tf.placeholder(tf.float32, shape = [None, n_inputs])
hidden = tf.layers.dense(
    X, n_hidden,
    activation = tf.nn.elu,
    kernel_initializer = initializer)
logits = tf.layers.dense(
    hidden, n_outputs,
    kernel_initializer = initializer)
outputs = tf.nn.sigmoid(logits)

# --- Action sampling -------------------------------------------------------
# Column 0 carries `outputs`, column 1 carries `1 - outputs`; the sampled
# column index is the action.
p_left_and_right = tf.concat(axis = 1, values = [outputs, 1 - outputs])
action = tf.multinomial(tf.log(p_left_and_right), num_samples = 1)

# --- Loss ------------------------------------------------------------------
# The sampled action is treated as the target: y is 1 when action index 0 was
# drawn and 0 when index 1 was drawn.
y = 1. - tf.to_float(action)
cross_entropy = tf.nn.sigmoid_cross_entropy_with_logits(labels = y, logits = logits)

# --- Gradients: computed here, applied later through placeholders ----------
optimizer = tf.train.AdamOptimizer(learning_rate)
grads_and_vars = optimizer.compute_gradients(cross_entropy)
gradients = [pair[0] for pair in grads_and_vars]

# One placeholder per gradient tensor (same shape), so externally computed
# gradient values can be fed in and applied to the matching variable.
gradient_placeholders = [
    tf.placeholder(tf.float32, shape = gradient.get_shape())
    for gradient in gradients
]
grads_and_vars_feed = [
    (placeholder, pair[1])
    for placeholder, pair in zip(gradient_placeholders, grads_and_vars)
]

training_op = optimizer.apply_gradients(grads_and_vars_feed)

init = tf.global_variables_initializer()


For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
If you depend on functionality not listed there, please file an issue.

Instructions for updating:
Use keras.layers.dense instead.
Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use tf.random.categorical instead.
Instructions for updating:
Use tf.cast instead.


In [2]:
def discount_rewards(rewards, discount_rate):
    """Compute the discounted sum of future rewards for every step.

    Walking from the last step to the first, each entry becomes
    ``rewards[i] + discount_rate * result[i + 1]`` (the final entry is just
    ``rewards[-1]``).

    Args:
        rewards: Sequence of per-step rewards for one episode.
        discount_rate: Factor applied to the accumulated future reward at
            each step back in time.

    Returns:
        A NumPy array with ``len(rewards)`` entries.
    """
    n_steps = len(rewards)
    result = np.empty(n_steps)
    running_total = 0

    for i in range(n_steps - 1, -1, -1):
        running_total = rewards[i] + running_total * discount_rate
        result[i] = running_total

    return result

def discount_and_normalize_rewards(all_rewards, discount_rate):
    """Discount each episode's rewards, then standardise across all episodes.

    Every reward sequence is passed through ``discount_rewards``. The mean and
    standard deviation are computed over the concatenation of all discounted
    sequences, and each sequence is shifted and scaled with those statistics.

    Args:
        all_rewards: List of per-episode reward sequences.
        discount_rate: Discount factor forwarded to ``discount_rewards``.

    Returns:
        A list with one NumPy array per episode, in the same order as
        ``all_rewards``. An empty input yields an empty list. When every
        discounted reward is identical (zero standard deviation) the result
        is all zeros rather than NaN.
    """
    all_discounted_rewards = [discount_rewards(rewards, discount_rate) for rewards in all_rewards]
    if not all_discounted_rewards:
        # np.concatenate raises on an empty list; there is nothing to normalise.
        return []

    flat_rewards = np.concatenate(all_discounted_rewards)
    if flat_rewards.size == 0:
        # Only empty episodes: no statistics to compute, return them unchanged.
        return all_discounted_rewards

    reward_mean  = flat_rewards.mean()
    reward_std   = flat_rewards.std()
    if reward_std == 0:
        # All values equal: dividing by zero would produce NaN and corrupt the
        # gradient update downstream. Scale by 1 so the centred result is zero.
        reward_std = 1.0

    return [(discounted_rewards - reward_mean)/reward_std
            for discounted_rewards in all_discounted_rewards]


In [7]:
n_iterations       = 250   # number of policy updates
n_max_steps        = 1000  # upper bound on steps per game
n_games_per_update = 10    # games played between two updates
save_iterations    = 10    # defined for configuration; not referenced in this cell
discount_rate      = 0.95

# Create the environment in this cell instead of reusing the `env` global from
# the basic-policy cell, which has already called env.close() on it. This keeps
# the cell runnable on its own once the imports and graph cells have executed.
env = gym.make('CartPole-v0')

try:
    with tf.Session() as sess:
        #Training the Neural Network
        init.run()
        for iteration in range(n_iterations):

            all_rewards   = []  # per game: list of per-step rewards
            all_gradients = []  # per game: list of per-step gradient values
            for game in range(n_games_per_update):

                current_rewards   = []
                current_gradients = []
                obs = env.reset()
                for step in range(n_max_steps):
                    # Sample an action and, in the same run, evaluate the
                    # gradients that treat that sampled action as the target.
                    action_val, gradients_val = sess.run(
                        [action, gradients],
                        feed_dict = {X: obs.reshape(1, n_inputs)})

                    obs, reward, done, info = env.step(action_val[0][0])
                    current_rewards.append(reward)
                    current_gradients.append(gradients_val)
                    if done: break

                all_rewards.append(current_rewards)
                all_gradients.append(current_gradients)

            # Replace raw rewards with discounted, normalised scores; each
            # step's gradients are weighted by its score and averaged over all
            # steps of all games before being applied.
            all_rewards = discount_and_normalize_rewards(all_rewards, discount_rate)
            feed_dict = {}
            for var_index, grad_placeholder in enumerate(gradient_placeholders):
                mean_gradients = np.mean(
                    [reward * all_gradients[game_index][step][var_index]
                        for game_index, rewards in enumerate(all_rewards)
                        for step, reward in enumerate(rewards)],
                    axis = 0)
                feed_dict[grad_placeholder] = mean_gradients

            sess.run(training_op, feed_dict = feed_dict)

        #Evaluating the Neural Network
        totals = []  # one entry per episode: the sum of the rewards from env.step
        for episode in range(500):
            episode_rewards = 0
            obs = env.reset()

            for _ in range(1000):
                env.render()
                action_val = sess.run(action, feed_dict = {X: obs.reshape(1, n_inputs)})
                obs, reward, done, _ = env.step(action_val[0][0])
                episode_rewards += reward
                if done: break

            totals.append(episode_rewards)
finally:
    # Release the environment even if training or evaluation fails part-way.
    env.close()

# NOTE(review): `totals` holds summed rewards; the "seconds"/"Time" labels are
# kept unchanged — confirm the intended unit.
print("Results (in seconds)")
print("--------------------")
print("Mean Time: %.2f" % np.mean(totals))
print("Std Time: %.2f" % np.std(totals))
print("Min Time: %.2f" % np.min(totals))
print("Max Time: %.2f" % np.max(totals))

Results (in seconds)
--------------------
Mean Time: 186.90
Std Time: 23.91
Min Time: 42.00
Max Time: 200.00
