In [1]:
def discount_rewards(rewards,discount_rate):
    """Compute the discounted return at every step of a single episode.

    The episode is walked from its last step back to its first, so entry
    ``t`` of the result is ``rewards[t] + discount_rate * result[t + 1]``
    (the final entry is just the final reward).

    Args:
        rewards: sequence of per-step rewards for one episode.
        discount_rate: factor applied to the return of the following step.

    Returns:
        NumPy array with one discounted return per entry of ``rewards``.
    """
    n_steps=len(rewards)
    returns=np.empty(n_steps)
    running_return=0
    step=n_steps-1
    # Backwards pass: each step reuses the return already accumulated
    # for the step that follows it.
    while step>=0:
        running_return=rewards[step]+running_return*discount_rate
        returns[step]=running_return
        step-=1
    return returns

def discount_and_normalize_rewards(all_rewards,discount_rate):
    """Discount the rewards of several episodes and standardize them jointly.

    Each episode is discounted with ``discount_rewards``; the mean and
    standard deviation are then computed over all steps of all episodes
    together and used to standardize every episode.

    Args:
        all_rewards: list of episodes, each a sequence of per-step rewards.
        discount_rate: discount factor forwarded to ``discount_rewards``.

    Returns:
        List with one NumPy array per episode holding the standardized
        discounted rewards. An empty ``all_rewards`` yields ``[]``. When
        every discounted reward is identical (standard deviation of zero)
        the rewards are only centred, giving zeros instead of NaN.
    """
    all_discounted_rewards=[discount_rewards(rewards,discount_rate)
                           for rewards in all_rewards]
    if not all_discounted_rewards:
        # np.concatenate raises ValueError on an empty list of arrays.
        return []
    flat_rewards=np.concatenate(all_discounted_rewards)
    if flat_rewards.size==0:
        # Only empty episodes: there is nothing to normalize.
        return all_discounted_rewards
    reward_mean=flat_rewards.mean()
    reward_std=flat_rewards.std()
    if reward_std==0:
        # Dividing by a zero spread would produce NaN and poison any
        # gradient scaled by these values; fall back to centring only.
        reward_std=1.0
    # The loop variable deliberately does not reuse the name of the
    # discount_rewards function.
    return [(discounted-reward_mean)/reward_std
            for discounted in all_discounted_rewards]

In [3]:
# Define the neural network policy and the ops used for policy-gradient
# training (TensorFlow 1.x graph mode).
import tensorflow as tf
tf.reset_default_graph()
n_inputs=4
n_hidden=6
n_outputs=1
initializer=tf.contrib.layers.variance_scaling_initializer()

# The network: one ELU hidden layer followed by a single logit whose
# sigmoid is used as the probability of the first action.
X=tf.placeholder(tf.float32,shape=[None,n_inputs])
hidden=tf.layers.dense(X,n_hidden,activation=tf.nn.elu,
                      kernel_initializer=initializer)
logits=tf.layers.dense(hidden,n_outputs,
                      kernel_initializer=initializer)
outputs=tf.nn.sigmoid(logits)

# Two-column probability row [outputs, 1-outputs]; going by the variable
# name, column 0 is "left" and column 1 is "right". One action index is
# sampled per input row from the log of these probabilities.
p_left_and_right=tf.concat(axis=1,values=[outputs,1-outputs])
action=tf.multinomial(tf.log(p_left_and_right),num_samples=1)


# Treat the sampled action as if it were the correct one: y is 1 when
# action 0 was sampled and 0 when action 1 was sampled, i.e. the value
# `outputs` should have taken for the action that was actually chosen.
y=1.0-tf.to_float(action)
learning_rate=0.01
cross_entropy=tf.nn.sigmoid_cross_entropy_with_logits(labels=y,
                                                      logits=logits)
optimizer=tf.train.AdamOptimizer(learning_rate)
# Only compute the gradients here; they are applied later through
# training_op after being combined outside the graph.
grads_and_vars=optimizer.compute_gradients(cross_entropy)

# Gradient tensors alone, fetched together with `action` by the training loop.
gradients=[grad for grad,variable in grads_and_vars]

# One placeholder per gradient (same shape), paired with its variable, so
# that externally computed gradient values can be fed into the optimizer.
gradient_placeholders=[]
grads_and_vars_feed=[]
for grad,variable in grads_and_vars:
    gradient_placeholder=tf.placeholder(tf.float32,shape=grad.get_shape())
    gradient_placeholders.append(gradient_placeholder)
    grads_and_vars_feed.append((gradient_placeholder,variable))

# Applies whatever gradient values are fed through gradient_placeholders.
training_op=optimizer.apply_gradients(grads_and_vars_feed)
init=tf.global_variables_initializer()
saver=tf.train.Saver()


In [8]:
import gym
import numpy as np
env = gym.make('CartPole-v1')
env._max_episode_steps = 1000
n_iterations=250
n_max_step=1000
##Important
obs=env.reset()
%matplotlib inline


n_games_per_update=10
save_iterations=10
discount_rate=0.95

try:
    with tf.Session() as sess:
        init.run()
        try:
            saver.restore(sess, "model_checkpoints/carpole_net_pg.ckpt")
        except:
            print("start over!")
        for iteration in range(n_iterations):
            
            all_rewards=[]
            all_gradients=[]
            for game in range(n_games_per_update):
                current_rewards=[]
                current_gradients=[]
                obs=env.reset()
                for step in range(n_max_step):
                    action_val,gradients_val=sess.run(
                    [action,gradients],
                    feed_dict={X:obs.reshape(1,n_inputs)}
                    )
                    obs,reward,done,info=env.step(action_val[0][0])
                    current_rewards.append(reward)
                    current_gradients.append(gradients_val)
                    if (iteration % save_iterations)==0 and (game==n_games_per_update-1):
                        env.render()
                    if done:
                        break
                all_rewards.append(current_rewards)
                all_gradients.append(current_gradients) # index by game, step, var_index
            # gradient ascent
            all_rewards=discount_and_normalize_rewards(all_rewards,discount_rate)
            feed_dict={}
            for var_index,grad_placeholder in enumerate(gradient_placeholders):
                mean_gradients=np.mean(
                [reward*all_gradients[game_index][step][var_index]
                for game_index,rewards in enumerate(all_rewards) 
                for step,reward in enumerate(rewards)],axis=0
                )
                feed_dict[grad_placeholder]=mean_gradients
            sess.run(training_op,feed_dict=feed_dict)
            if (iteration % save_iterations)==0:
                print("iteration: {} ".format(iteration),max([len(rewards) for rewards in all_rewards]))
                saver.save(sess,"model_checkpoints/carpole_net_pg.ckpt")
except:
    env.close()

            

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
INFO:tensorflow:Restoring parameters from model_checkpoints/carpole_net_pg.ckpt
iteration: 0  1000
iteration: 10  1000
iteration: 20  406
iteration: 30  1000
iteration: 40  1000
iteration: 50  1000
iteration: 60  1000
iteration: 70  1000
iteration: 80  1000
iteration: 90  1000
iteration: 100  1000
iteration: 110  1000
iteration: 120  1000
iteration: 130  1000
iteration: 140  1000
iteration: 150  786
iteration: 160  531
iteration: 170  1000
iteration: 180  1000
iteration: 190  1000
iteration: 200  1000
iteration: 210  1000
iteration: 220  1000
iteration: 230  1000
iteration: 240  1000
