In [3]:
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
import math
%matplotlib inline

In [4]:
import gym

In [5]:
env = gym.make('CartPole-v0')

[2017-05-17 15:54:31,894] Making new env: CartPole-v0


In [6]:
env.action_space

Discrete(2)

In [7]:
env.observation_space

Box(4,)

In [28]:
#running with random actions

env.reset()
random_episodes = 0
reward_sum = 0

while random_episodes < 10:
    env.render()
    observation, reward,done,_ = env.step(np.random.randint(0,2))
    reward_sum += reward
    if done:
        random_episodes += 1
        print("reward for episode was ",reward_sum)
        reward_sum = 0
        env.reset()

reward for episode was  42.0
reward for episode was  38.0
reward for episode was  37.0
reward for episode was  14.0
reward for episode was  47.0
reward for episode was  13.0
reward for episode was  13.0
reward for episode was  35.0
reward for episode was  36.0
reward for episode was  23.0


In [8]:
H = 10 #number of hidden neurons
batch_size = 5 #num of episodes after which to update params
learning_rate = 1e-2
gamma = 0.99 #discount factor for future rewards

D = 4 # input dimensionality

In [9]:
tf.reset_default_graph()

In [10]:
# network takes in obs and gives probablitiy of actions - left or right (2 actions)

observations = tf.placeholder(tf.float32, [None, D], name='input_x')
W1 = tf.get_variable("W1", shape=[D,H], initializer=tf.contrib.layers.xavier_initializer())
layer1 = tf.nn.relu(tf.matmul(observations, W1))
W2 = tf.get_variable("W2", shape=[H,1], initializer=tf.contrib.layers.xavier_initializer())
score = tf.matmul(layer1,W2)
probability = tf.nn.sigmoid(score)

In [11]:
# define parts of network to learn good policy
tvars = tf.trainable_variables()
input_y = tf.placeholder(tf.float32, [None,1], name='input_y')
advantages = tf.placeholder(tf.float32, name='reward_signal')

In [12]:
# the loss function, increase weight of good actions and decrease of bad actions
loglik = tf.log(input_y*(input_y - probability) + (1-input_y)*(input_y + probability))
loss = -tf.reduce_mean(loglik * advantages)
newGrads = tf.gradients(loss, tvars)

In [13]:
# collect series of gradients from multiple episodes and then apply them and notafterevery episode
adam = tf.train.AdamOptimizer(learning_rate=learning_rate)
W1Grad = tf.placeholder(tf.float32, name='batch_grad1') #placeholder to send final gradient through when we update
W2Grad = tf.placeholder(tf.float32, name='batch_grad2')
batch_grad = [W1Grad, W2Grad]
updateGrads = adam.apply_gradients(zip(batch_grad, tvars))

In [14]:
# advantage function

def discount_rewards(r):
    # take 1D array of rewards and compute discounted reward
    discounted_r = np.zeros_like(r)
    running_add = 0
    for t in reversed(range(0, r.size)):
        running_add = running_add * gamma + r[t]
        #print(running_add)
        discounted_r[t] = running_add
    return discounted_r

In [15]:
t = np.array([1,2,3,4,5])
print(discount_rewards(t))

[14 13 11  8  5]


In [16]:
xs, hs, dlogps, drs, ys, tfps = [],[],[],[],[],[]
running_reward = None
reward_sum = 0
episode_number = 1
total_episodes = 10000
init = tf.global_variables_initializer()

In [17]:
with tf.Session() as sess:
    rendering = False
    sess.run (init)
    observation = env.reset() #get initial obs from env
    
    # reset the gradient buffer
    gradBuffer = sess.run(tvars)
    for ix,grad in enumerate(gradBuffer):
        gradBuffer[ix] = grad * 0
        
    while episode_number <= total_episodes:
        
        # render onlyy if doing a good job
        if reward_sum/batch_size > 100 or rendering == True:
            env.render()
            rendering = True
            
        x = np.reshape(observation, [1,D])
        
        # run the policy network and get an action
        tfprob = sess.run(probability, feed_dict={observations:x})
        action = 1 if np.random.uniform() < tfprob else 0
        
        xs.append(x) #add observation
        y = 1 if action == 0 else 0 #fake label
        ys.append(y)
        
        # step env and get new obs
        observation, reward, done, info = env.step(action)
        reward_sum += reward
        
        drs.append(reward)
        
        if done:
            episode_number += 1
            epx = np.vstack(xs)
            epy = np.vstack(ys)
            epr = np.vstack(drs)
            tfp = tfps
            xs, hs, dlogps, drs, ys, tfps = [],[],[],[],[],[]  #reset array memory
            
            discounted_epr = discount_rewards(epr)
            discounted_epr -= np.mean(discounted_epr) #normalize
            discounted_epr /= np.std(discounted_epr)
            
            #get gradient for this episode and save in gradbuffer
            tGrad = sess.run(newGrads, feed_dict={observations:epx,
                                                 input_y:epy,
                                                 advantages:discounted_epr})
            
            for ix, grad in enumerate(tGrad):
                gradBuffer[ix] += grad
                
            if episode_number % batch_size == 0:
                sess.run(updateGrads, feed_dict={W1Grad: gradBuffer[0], W2Grad: gradBuffer[1]})
                for ix, grad in enumerate(gradBuffer):
                    gradBuffer[ix] = grad * 0
                    
                running_reward = reward_sum if running_reward is None else running_reward * 0.99 + reward_sum * 0.01
                print("Average reward for epi %f. Total average reward %f" % (reward_sum/batch_size, running_reward/batch_size))
                
                if reward_sum/batch_size > 200:
                    print('Task is solved in episode', episode_number)
                    break
                    
                reward_sum = 0
                
            observation = env.reset()
            
print(episode_number, " episodes completed")
            
            
            
            
            

Average reward for epi 19.600000. Total average reward 19.600000
Average reward for epi 28.600000. Total average reward 19.690000
Average reward for epi 24.800000. Total average reward 19.741100
Average reward for epi 19.400000. Total average reward 19.737689
Average reward for epi 30.200000. Total average reward 19.842312
Average reward for epi 16.000000. Total average reward 19.803889
Average reward for epi 28.600000. Total average reward 19.891850
Average reward for epi 42.000000. Total average reward 20.112932
Average reward for epi 29.000000. Total average reward 20.201802
Average reward for epi 39.400000. Total average reward 20.393784
Average reward for epi 30.200000. Total average reward 20.491846
Average reward for epi 29.800000. Total average reward 20.584928
Average reward for epi 24.000000. Total average reward 20.619079
Average reward for epi 27.000000. Total average reward 20.682888
Average reward for epi 41.200000. Total average reward 20.888059
Average reward for epi 37