In [1]:
import tensorflow as tf
import numpy as np
import random
import gym
import math
import matplotlib.pyplot as plt

In [2]:
def softmax(x):
    e_x = np.exp(x - np.max(x))
    out = e_x / e_x.sum()
    return out

In [49]:
def policy_gradient():
    
    with tf.variable_scope("policy1", reuse=True):
        params=tf.get_variable("policy_parameters",[4,2])
        state=tf.placeholder("float",[None,4])
        actions=tf.placeholder("float",[None,2])
        advantages=tf.placeholder("float",[None,1])
        linear=tf.matmul(state,params)
        probabilities=tf.nn.softmax(linear)
        good_probs=tf.reduce_sum(tf.mul(probabilities, actions), reduction_indices=[1])
        eligibility = tf.log(good_probs)*advantages
        loss=-tf.reduce_sum(eligibility)
        optimizer=tf.train.AdamOptimizer(0.01).minimize(loss)
        return probabilities, state, actions, advantages, optimizer


In [50]:
def value_fn():
    with tf.variable_scope("value1" , reuse=True):
        state=tf.placeholder("float",[None,4])
        newvals=tf.placeholder("float",[None,1])
        w1 = tf.get_variable("w1",[4,20])
        b1 = tf.get_variable("b1",[20])
        h1 = tf.matmul(state,w1)+b1
        h1=tf.nn.relu(h1)
        w2 = tf.get_variable("w2",[20,1])
        b2 = tf.get_variable("b2",[1])
        calculated = tf.matmul(h1,w2) + b2
        diffs = calculated-newvals
        loss = tf.nn.l2_loss(diffs)
        optimizer = tf.train.AdamOptimizer(0.1).minimize(loss)
        return calculated, state, newvals, optimizer, loss
        

In [51]:
def run_episode(env, policy_gradient, value_fn, sess):
    pl_calculated, pl_state, pl_actions, pl_advantages, pl_optimizer = policy_gradient
    vl_calculated, vl_state, vl_newvals, vl_optimizer, vl_loss = value_fn
    observation = env.reset()
    totalreward = 0
    states = []
    actions = []
    advantages = []
    transitions = []
    update_vals = []


    for _ in xrange(200):
        obs_vector=np.expand_dims(observation,axis=0)
        probs = sess.run(pl_calculated,feed_dict={pl_state:obs_vector})
        action = 0 if random.uniform(0,1)< probs[0][0] else 1
        states.append(observation)
        action_blank=np.zeros(2)
        action_blank[action]=1
        actions.append(action_blank)
        old_observation = observation
        observation, reward, done, info = env.step(action)
        transitions.append((old_observation,action,reward))
        totalreward+=reward
        if done:
            break
    
    for index, trans in enumerate(transitions):
        obs,action,reward = trans
        
        future_reward = 0
        future_steps = len(transitions)-index
        
        decrease = 1
        for index2 in xrange(future_steps):
            future_reward+=transitions[index2 + index][2]*decrease
            decrease*=0.97
        obs_vector = np.expand_dims(obs, axis=0)
        currentval = sess.run(vl_calculated, feed_dict={vl_state:obs_vector})[0][0]
        
        advantages.append(future_reward-currentval)
        update_vals.append(future_reward)
        
    update_vals_vector=np.expand_dims(update_vals, axis=1)
    sess.run(vl_optimizer, feed_dict={vl_state:states, vl_newvals:update_vals_vector})
    advantages_vector = np.expand_dims(advantages, axis=1)
    sess.run(pl_optimizer, feed_dict={pl_state:states, pl_actions:actions, pl_advantages:advantages_vector})
    
    return totalreward

        

In [52]:

from gym import wrappers
env = gym.make('CartPole-v0')
env = wrappers.Monitor(env, '/home/vishal/RL/cartpole-experiment-1')

policy_grad=policy_gradient()
value_grad=value_fn()
sess=tf.InteractiveSession()
sess.run(tf.initialize_all_variables())
for i in xrange(1000):
    reward = run_episode(env, policy_grad, value_grad, sess)
    if reward ==200:
        print "success - reward is 200"
        print i
        break
t = 0
for _ in xrange(1000):
    reward = run_episode(env, policy_grad, value_grad, sess)
    t+= reward
print t/1000


INFO:gym.envs.registration:Making new env: CartPole-v0
[2017-01-26 11:03:58,642] Making new env: CartPole-v0


Instructions for updating:
Use `tf.global_variables_initializer` instead.


Instructions for updating:
Use `tf.global_variables_initializer` instead.
[2017-01-26 11:03:58,926] From <ipython-input-52-9c24845f66ac>:9 in <module>.: initialize_all_variables (from tensorflow.python.ops.variables) is deprecated and will be removed after 2017-03-02.
Instructions for updating:
Use `tf.global_variables_initializer` instead.
INFO:gym.monitoring.video_recorder:Starting new video recorder writing to /home/vishal/RL/cartpole-experiment-1/openaigym.video.12.6912.video000000.mp4
[2017-01-26 11:03:59,363] Starting new video recorder writing to /home/vishal/RL/cartpole-experiment-1/openaigym.video.12.6912.video000000.mp4
INFO:gym.monitoring.video_recorder:Starting new video recorder writing to /home/vishal/RL/cartpole-experiment-1/openaigym.video.12.6912.video000001.mp4
[2017-01-26 11:04:00,621] Starting new video recorder writing to /home/vishal/RL/cartpole-experiment-1/openaigym.video.12.6912.video000001.mp4
INFO:gym.monitoring.video_recorder:Starting new video recorder writ

success - reward is 200
362
164.329


In [53]:
reward = run_episode(env, policy_grad, value_grad, sess)

In [54]:
print reward

200.0
