In [51]:
import tensorflow as tf
import tensorflow.contrib.slim as slim
import numpy as np
import gym
import matplotlib.pyplot as plt
%matplotlib inline

try:
    xrange = xrange
except:
    xrange = range

In [52]:
env = gym.make('CartPole-v0')

[2017-06-21 16:36:04,568] Making new env: CartPole-v0


In [53]:

gamma = 1.0

def discount_rewards(r):
    """ take 1D float array of rewards and compute discounted reward """
    discounted_r = np.zeros_like(r)
    running_add = 0
    for t in reversed(xrange(0, r.size)):
        running_add = running_add * gamma + r[t]
        discounted_r[t] = running_add   
    return discounted_r

In [54]:
class agent():
    def __init__(self, lr, s_size,a_size,h_size):
        #These lines established the feed-forward part of the network. The agent takes a state and produces an action.
        self.state_in= tf.placeholder(shape=[None,s_size],dtype=tf.float32)
        hidden = slim.fully_connected(self.state_in,h_size,biases_initializer=None,activation_fn=tf.nn.relu)
        hidden2= slim.fully_connected(hidden,h_size,biases_initializer=None,activation_fn=tf.nn.relu)
        self.output = slim.fully_connected(hidden2,a_size,activation_fn=tf.nn.softmax,biases_initializer=None)
        self.chosen_action = tf.argmax(self.output,1)

        #The next six lines establish the training proceedure. We feed the reward and chosen action into the network
        #to compute the loss, and use it to update the network.
        self.reward_holder = tf.placeholder(shape=[None],dtype=tf.float32)
        self.action_holder = tf.placeholder(shape=[None],dtype=tf.int32)
        
        self.indexes = tf.range(0, tf.shape(self.output)[0]) * 2 + self.action_holder
        self.responsible_outputs = tf.gather(tf.reshape(self.output, [-1]), self.indexes)

        self.loss = -tf.reduce_mean(tf.log(self.responsible_outputs)*self.reward_holder)
        
        tvars = tf.trainable_variables()
        self.gradient_holders = []
        for idx,var in enumerate(tvars):
            placeholder = tf.placeholder(tf.float32,name=str(idx)+'_holder')
            self.gradient_holders.append(placeholder)
        
        self.gradients = tf.gradients(self.loss,tvars)
        
        optimizer = tf.train.AdamOptimizer(learning_rate=lr)
        self.update_batch = optimizer.apply_gradients(zip(self.gradient_holders,tvars))

In [55]:
tf.reset_default_graph() #Clear the Tensorflow graph.

myAgent = agent(lr=1e-2,s_size=4,a_size=2,h_size=8) #Load the agent.

total_episodes = 5000 #Set total number of episodes to train agent on.
max_ep = 999
update_frequency = 5

init = tf.global_variables_initializer()

# Launch the tensorflow graph
with tf.Session() as sess:
    sess.run(init)
    i = 0
    total_reward = []
    total_lenght = []
        
    gradBuffer = sess.run(tf.trainable_variables())
    for ix,grad in enumerate(gradBuffer):
        gradBuffer[ix] = grad * 0
        
    while i < total_episodes:
        s = env.reset()
        running_reward = 0
        ep_history = []
        for j in range(max_ep):
            #Probabilistically pick an action given our network outputs.
            a_dist = sess.run(myAgent.output,feed_dict={myAgent.state_in:[s]})
            a = np.random.choice(a_dist[0],p=a_dist[0])
            a = np.argmax(a_dist == a)

            s1,r,d,_ = env.step(a) #Get our reward for taking an action given a bandit.
            ep_history.append([s,a,r])
            s = s1
            running_reward += r
            if d == True:
                #Update the network.
                ep_history = np.array(ep_history)
                ep_history[:,2] = discount_rewards(ep_history[:,2])
                feed_dict={myAgent.reward_holder:ep_history[:,2],
                        myAgent.action_holder:ep_history[:,1],myAgent.state_in:np.vstack(ep_history[:,0])}
                grads = sess.run(myAgent.gradients, feed_dict=feed_dict)
                for idx,grad in enumerate(grads):
                    gradBuffer[idx] += grad

                if i % update_frequency == 0 and i != 0:
                    feed_dict= dict(zip(myAgent.gradient_holders, gradBuffer))
                    _ = sess.run(myAgent.update_batch, feed_dict=feed_dict)
                    for ix,grad in enumerate(gradBuffer):
                        gradBuffer[ix] = grad * 0
                
                total_reward.append(running_reward)
                total_lenght.append(j)
                break

        
            #Update our running tally of scores.
        if i % 100 == 0:
            print(np.mean(total_reward[-100:]))
        i += 1

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


18.0
27.31
48.18
113.33
174.55
188.88
181.13
164.98
186.59
154.86
176.19
198.72
200.0
200.0
195.99
196.28
192.12
197.22
198.29
198.61
200.0
200.0
200.0
200.0
198.46
184.4
196.09
198.71
200.0
199.92
200.0
200.0
197.21
199.99
198.63
198.35
183.26
198.17
198.03
198.97
199.68
196.61
196.3
197.85
199.4
199.14
200.0
199.88
199.68
200.0


In [55]:
print(np.vstack(ep_history[20:30:,0]))
print((ep_history[20:30:,0]))


[[ -4.76735919e-02  -4.27363388e-02   6.79212013e-02   3.03759914e-01]
 [ -4.85283187e-02   1.51355209e-01   7.39963996e-02   3.32473673e-02]
 [ -4.55012145e-02   3.45342315e-01   7.46613469e-02  -2.35201933e-01]
 [ -3.85943682e-02   5.39322541e-01   6.99573083e-02  -5.03431620e-01]
 [ -2.78079174e-02   3.43287938e-01   5.98886759e-02  -1.89548295e-01]
 [ -2.09421586e-02   1.47362603e-01   5.60977100e-02   1.21409576e-01]
 [ -1.79949066e-02   3.41637836e-01   5.85259015e-02  -1.53060785e-01]
 [ -1.11621499e-02   5.35875066e-01   5.54646858e-02  -4.26720992e-01]
 [ -4.44648534e-04   3.40013192e-01   4.69302659e-02  -1.17081708e-01]
 [  6.35561530e-03   1.44251332e-01   4.45886318e-02   1.90030209e-01]]
[array([-0.04767359, -0.04273634,  0.0679212 ,  0.30375991])
 array([-0.04852832,  0.15135521,  0.0739964 ,  0.03324737])
 array([-0.04550121,  0.34534231,  0.07466135, -0.23520193])
 array([-0.03859437,  0.53932254,  0.06995731, -0.50343162])
 array([-0.02780792,  0.34328794,  0.05988868

In [15]:
#tf.reset_default_graph() #Clear the Tensorflow graph.
myAgent = agent(lr=1e-2,s_size=4,a_size=2,h_size=8) #Load the agent.
max_ep=999
init = tf.global_variables_initializer()
with tf.Session() as sess:
    sess.run(init)
    s = env.reset()
    for j in range(max_ep):
        env.render()
        a_dist = sess.run(myAgent.output,feed_dict={myAgent.state_in:[s]})
        a = np.random.choice(a_dist[0],p=a_dist[0])
        a = np.argmax(a_dist == a)
        s1=env.step(a)
        s = s1[0]
        

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
[2017-06-20 14:40:14,816] You are calling 'step()' even though this environment has already returned done = True. You should always call 'reset()' once you receive 'done = True' -- any further steps are undefined behavior.


In [92]:
#np.random.choice([2,4],p=[.5,.5])
# indexes = tf.range(0, tf.shape(output)[0]) * tf.shape(output)[1] + action_holder
#         tf.Print(indexes, [indexes], message="This is a: ")
output

<tf.Tensor 'fully_connected_17/Softmax:0' shape=(1, 2) dtype=float32>

In [48]:
sess2 = tf.InteractiveSession()
tf.global_variables_initializer()
s_size=4
h_size=160
a_size=2
#state_in=  tf.constant([[3.0,4.0,2.0,1.0],[3.0,4.0,2.0,1.0]])
output=tf.constant([[1.0,2.0],[3.0,4.0],[5.0,6.0],[7.0,8.0]])
var = tf.Variable(tf.random_normal([784, 200], stddev=0.35),
                      name="weights") 
action_holder=  tf.constant([0,0,0,1])
reward_holder=  tf.constant([4.5,3.6,2.5,1])
indexes = tf.range(0, 4) * 2 + action_holder
indexes=tf.Print(indexes, [indexes], message="Indexes: ")
responsible_outputs = tf.gather(tf.reshape(output, [-1]), indexes)
responsible_outputs=tf.Print(responsible_outputs, [responsible_outputs], message="Responsible Outputs: ")
loss=-tf.reduce_mean(tf.log(responsible_outputs)*reward_holder)
gradients = tf.gradients(loss,[var])[0]
print(gradients)
sess2.run(loss)




None


-2.5145102