In [1]:
import tensorflow as tf
import tensorflow.contrib.slim as slim
import numpy as np
import gym
import matplotlib.pyplot as plt
%matplotlib inline

try:
    xrange = xrange
except:
    xrange = range

In [2]:
env = gym.make('CartPole-v0')

[2017-07-01 20:59:18,456] Making new env: CartPole-v0


In [3]:

gamma = 1.0

def discount_rewards(r):
    """ take 1D float array of rewards and compute discounted reward """
    discounted_r = np.zeros_like(r)
    running_add = 0
    for t in reversed(xrange(0, r.size)):
        running_add = running_add * gamma + r[t]
        discounted_r[t] = running_add   
    return discounted_r

In [4]:
class agent():
    def __init__(self, lr, s_size,a_size,h_size):
        #These lines established the feed-forward part of the network. The agent takes a state and produces an action.
        self.state_in= tf.placeholder(shape=[None,s_size],dtype=tf.float32)
        hidden = slim.fully_connected(self.state_in,h_size,biases_initializer=None,activation_fn=tf.nn.relu)
        hidden2= slim.fully_connected(hidden,h_size,biases_initializer=None,activation_fn=tf.nn.relu)
        self.output = slim.fully_connected(hidden2,a_size,activation_fn=tf.nn.softmax,biases_initializer=None)
        self.chosen_action = tf.argmax(self.output,1)

        #The next six lines establish the training proceedure. We feed the reward and chosen action into the network
        #to compute the loss, and use it to update the network.
        self.reward_holder = tf.placeholder(shape=[None],dtype=tf.float32)
        self.action_holder = tf.placeholder(shape=[None],dtype=tf.int32)
        
        self.indexes = tf.range(0, tf.shape(self.output)[0]) * 2 + self.action_holder
        self.responsible_outputs = tf.gather(tf.reshape(self.output, [-1]), self.indexes)

        self.loss = -tf.reduce_mean(tf.log(self.responsible_outputs)*self.reward_holder)
        
        tvars = tf.trainable_variables()
        self.gradient_holders = []
        for idx,var in enumerate(tvars):
            placeholder = tf.placeholder(tf.float32,name=str(idx)+'_holder')
            self.gradient_holders.append(placeholder)
        
        self.gradients = tf.gradients(self.loss,tvars)
        
        optimizer = tf.train.AdamOptimizer(learning_rate=lr)
        self.update_batch = optimizer.apply_gradients(zip(self.gradient_holders,tvars))

In [7]:
tf.reset_default_graph() #Clear the Tensorflow graph.

myAgent = agent(lr=1e-2,s_size=4,a_size=2,h_size=8) #Load the agent.

total_episodes = 5000 #Set total number of episodes to train agent on.
max_ep = 999
update_frequency = 5

init = tf.global_variables_initializer()

# Launch the tensorflow graph
with tf.Session() as sess:
    sess.run(init)
    i = 0
    total_reward = []
    total_lenght = []
        
    gradBuffer = sess.run(tf.trainable_variables())
    for ix,grad in enumerate(gradBuffer):
        gradBuffer[ix] = grad * 0
        
    while i < total_episodes:
        s = env.reset()
        running_reward = 0
        ep_history = []
        for j in range(max_ep):
            #Probabilistically pick an action given our network outputs.
            a_dist = sess.run(myAgent.output,feed_dict={myAgent.state_in:[s]})
            print(a_dist)
            a = np.random.choice(a_dist[0],p=a_dist[0])
            a = np.argmax(a_dist == a)

            s1,r,d,_ = env.step(a) #Get our reward for taking an action given a bandit.
            ep_history.append([s,a,r])
            s = s1
            running_reward += r
            if d == True:
                #Update the network.
                ep_history = np.array(ep_history)
                ep_history[:,2] = discount_rewards(ep_history[:,2])
                feed_dict={myAgent.reward_holder:ep_history[:,2],
                        myAgent.action_holder:ep_history[:,1],myAgent.state_in:np.vstack(ep_history[:,0])}
                grads = sess.run(myAgent.gradients, feed_dict=feed_dict)
                for idx,grad in enumerate(grads):
                    gradBuffer[idx] += grad

                if i % update_frequency == 0 and i != 0:
                    feed_dict= dict(zip(myAgent.gradient_holders, gradBuffer))
                    _ = sess.run(myAgent.update_batch, feed_dict=feed_dict)
                    for ix,grad in enumerate(gradBuffer):
                        gradBuffer[ix] = grad * 0
                
                total_reward.append(running_reward)
                total_lenght.append(j)
                break

        
            #Update our running tally of scores.
        if i % 100 == 0:
            print(np.mean(total_reward[-100:]))
        i += 1

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


[[ 0.49947435  0.50052559]]
[[ 0.48720866  0.51279128]]
[[ 0.49930152  0.50069851]]
[[ 0.4874467   0.51255327]]
[[ 0.47574583  0.52425414]]
[[ 0.46484095  0.53515899]]
[[ 0.47805887  0.52194118]]
[[ 0.48952234  0.51047766]]
[[ 0.49856395  0.50143605]]
[[ 0.48584142  0.51415861]]
[[ 0.49541399  0.50458604]]
[[ 0.48896113  0.5110389 ]]
[[ 0.47481236  0.52518767]]
[[ 0.49149802  0.50850201]]
[[ 0.48373133  0.51626867]]
[[ 0.49209994  0.50790012]]
[[ 0.48093987  0.51906013]]
[[ 0.49290258  0.50709748]]
[[ 0.48282319  0.51717675]]
[[ 0.49367243  0.50632763]]
[[ 0.48368695  0.51631302]]
[[ 0.49214932  0.50785071]]
[[ 0.4635478   0.53645223]]
[[ 0.45053267  0.54946733]]
[[ 0.46223009  0.53776991]]
[[ 0.48144966  0.51855028]]
[[ 0.48562956  0.51437044]]
[[ 0.47067493  0.52932507]]
[[ 0.4866108   0.51338917]]
[[ 0.46117434  0.53882563]]
[[ 0.48762676  0.51237327]]
[[ 0.47790408  0.52209592]]
[[ 0.48717564  0.5128243 ]]
[[ 0.47906029  0.52093971]]
[[ 0.46945116  0.53054881]]
[[ 0.45963198  0.540

[[ 0.4558444   0.54415566]]
[[ 0.4377147   0.56228524]]
[[ 0.41972867  0.58027136]]
[[ 0.40180504  0.59819496]]
[[ 0.38388079  0.61611921]]
[[ 0.36591586  0.63408411]]
[[ 0.38113555  0.61886448]]
[[ 0.49675682  0.50324315]]
[[ 0.48234293  0.5176571 ]]
[[ 0.465864  0.534136]]
[[ 0.48742509  0.51257497]]
[[ 0.4941462   0.50585389]]
[[ 0.4886393   0.51136065]]
[[ 0.47055322  0.52944672]]
[[ 0.45299405  0.54700589]]
[[ 0.47304842  0.52695161]]
[[ 0.49185935  0.50814062]]
[[ 0.47157872  0.52842122]]
[[ 0.48968306  0.51031697]]
[[ 0.49476025  0.50523978]]
[[ 0.4799118  0.5200882]]
[[ 0.46486714  0.53513283]]
[[ 0.44947454  0.55052543]]
[[ 0.47036254  0.5296374 ]]
[[ 0.45419505  0.54580498]]
[[ 0.47446111  0.52553886]]
[[ 0.45720324  0.54279673]]
[[ 0.44009084  0.55990916]]
[[ 0.4590928   0.54090714]]
[[ 0.44111025  0.55888969]]
[[ 0.45924267  0.5407573 ]]
[[ 0.44045085  0.55954909]]
[[ 0.45767617  0.54232389]]
[[ 0.47414792  0.52585208]]
[[ 0.45385906  0.54614097]]
[[ 0.43382952  0.56617045]

[[ 0.4947913   0.50520867]]
[[ 0.46055898  0.53944105]]
[[ 0.49671829  0.50328177]]
[[ 0.46264821  0.53735179]]
[[ 0.43806306  0.56193703]]
[[ 0.46525803  0.53474194]]
[[ 0.43962395  0.56037605]]
[[ 0.4658092   0.53419083]]
[[ 0.49485564  0.50514436]]
[[ 0.46305078  0.53694928]]
[[ 0.436124    0.56387603]]
[[ 0.46085247  0.5391475 ]]
[[ 0.48836821  0.51163179]]
[[ 0.48676839  0.51323158]]
[[ 0.47832593  0.5216741 ]]
[[ 0.4884688   0.51153117]]
[[ 0.47736901  0.52263099]]
[[ 0.48943654  0.51056349]]
[[ 0.47962976  0.52037024]]
[[ 0.46690804  0.53309202]]
[[ 0.45209828  0.54790175]]
[[ 0.47006243  0.52993751]]
[[ 0.45483372  0.54516631]]
[[ 0.47139606  0.52860397]]
[[ 0.48205858  0.51794142]]
[[ 0.48955542  0.51044458]]
[[ 0.44488877  0.55511123]]
[[ 0.48992243  0.51007754]]
[[ 0.48268911  0.51731086]]
[[ 0.46901989  0.53098011]]
[[ 0.48287421  0.51712579]]
[[ 0.46772525  0.53227478]]
[[ 0.45031887  0.54968113]]
[[ 0.43304276  0.56695718]]
[[ 0.44839367  0.55160636]]
[[ 0.43016994  0.569

[[ 0.40517613  0.59482384]]
[[ 0.49273616  0.50726384]]
[[ 0.47053149  0.52946848]]
[[ 0.43817377  0.56182623]]
[[ 0.46699575  0.53300428]]
[[ 0.49509123  0.50490874]]
[[ 0.48553362  0.51446635]]
[[ 0.471865    0.52813506]]
[[ 0.45792529  0.54207474]]
[[ 0.47585905  0.52414089]]
[[ 0.46120176  0.53879821]]
[[ 0.47852239  0.52147758]]
[[ 0.49173915  0.50826085]]
[[ 0.46597412  0.53402591]]
[[ 0.43791369  0.56208628]]
[[ 0.46825907  0.53174096]]
[[ 0.43974715  0.56025279]]
[[ 0.46943161  0.53056842]]
[[ 0.44028994  0.55971003]]
[[ 0.41007942  0.58992064]]
[[ 0.4401089  0.5598911]]
[[ 0.46906909  0.53093094]]
[[ 0.43596107  0.56403893]]
[[ 0.46418399  0.53581607]]
[[ 0.4305068   0.56949323]]
[[ 0.45790482  0.54209524]]
[[ 0.42364264  0.57635736]]
[[ 0.39048946  0.60951054]]
[[ 0.35836896  0.64163107]]
[[ 0.3829481  0.6170519]]
[[ 0.34917459  0.65082538]]
[[ 0.37091142  0.62908858]]
[[ 0.33577564  0.66422439]]
[[ 0.48104665  0.51895326]]
[[ 0.48515692  0.51484299]]
[[ 0.48304874  0.5169513

[[ 0.48574382  0.51425618]]
[[ 0.4958019   0.50419807]]
[[ 0.46390572  0.53609437]]
[[ 0.42652932  0.57347065]]
[[ 0.39012152  0.60987848]]
[[ 0.35466522  0.64533472]]
[[ 0.38697174  0.61302823]]
[[ 0.41807824  0.58192176]]
[[ 0.37731043  0.62268955]]
[[ 0.33811155  0.66188848]]
[[ 0.36458963  0.63541037]]
[[ 0.32356     0.67644006]]
[[ 0.49681208  0.50318789]]
[[ 0.47494021  0.52505976]]
[[ 0.4415535   0.55844653]]
[[ 0.40882668  0.59117329]]
[[ 0.4475131  0.5524869]]
[[ 0.41276982  0.58723021]]
[[ 0.44972208  0.55027795]]
[[ 0.48495111  0.51504886]]
[[ 0.48069289  0.51930708]]
[[ 0.4811362  0.5188638]]
[[ 0.44245297  0.55754703]]
[[ 0.47718364  0.52281642]]
[[ 0.4375186  0.5624814]]
[[ 0.4710122   0.52898788]]
[[ 0.43037987  0.56962013]]
[[ 0.46250787  0.5374921 ]]
[[ 0.42092311  0.57907695]]
[[ 0.45151222  0.54848784]]
[[ 0.40900207  0.59099799]]
[[ 0.437828  0.562172]]
[[ 0.4681218   0.53187823]]
[[ 0.47885376  0.52114618]]
[[ 0.44562823  0.55437177]]
[[ 0.47570714  0.52429283]]
[[

[[ 0.47880918  0.52119088]]
[[ 0.48558816  0.51441181]]
[[ 0.49174955  0.50825047]]
[[ 0.49650499  0.50349504]]
[[ 0.49636251  0.50363755]]
[[ 0.49474651  0.50525349]]
[[ 0.49461243  0.50538766]]
[[ 0.49191165  0.50808829]]
[[ 0.49252495  0.50747508]]
[[ 0.48260739  0.51739264]]
[[ 0.45600495  0.54399508]]
[[ 0.49340379  0.50659621]]
[[ 0.46875921  0.53124076]]
[[ 0.48878384  0.51121622]]
[[ 0.48575538  0.51424462]]
[[ 0.48606172  0.51393825]]
[[ 0.48417848  0.51582152]]
[[ 0.48203424  0.51796579]]
[[ 0.47931072  0.52068931]]
[[ 0.47832152  0.52167845]]
[[ 0.47570449  0.52429551]]
[[ 0.46808773  0.53191233]]
[[ 0.46056318  0.53943682]]
[[ 0.45304465  0.54695535]]
[[ 0.45753974  0.5424602 ]]
[[ 0.49846077  0.50153917]]
[[ 0.49175119  0.50824881]]
[[ 0.48524174  0.51475829]]
[[ 0.47882044  0.5211795 ]]
[[ 0.47243494  0.52756512]]
[[ 0.47861993  0.52138001]]
[[ 0.48406935  0.51593065]]
[[ 0.4889535   0.51104653]]
[[ 0.48850241  0.51149762]]
[[ 0.49419197  0.505808  ]]
[[ 0.48623678  0.513

[[ 0.49564922  0.50435078]]
[[ 0.49551445  0.50448555]]
[[ 0.49482891  0.50517106]]
[[ 0.49432087  0.50567919]]
[[ 0.49344918  0.50655079]]
[[ 0.49279425  0.50720567]]
[[ 0.49173459  0.50826544]]
[[ 0.49092162  0.50907838]]
[[ 0.48965335  0.51034665]]
[[ 0.48866427  0.51133573]]
[[ 0.4877663   0.51223373]]
[[ 0.48685637  0.51314366]]
[[ 0.4858115   0.51418853]]
[[ 0.49708167  0.50291836]]
[[ 0.50135809  0.49864197]]
[[ 0.49828324  0.50171685]]
[[ 0.44626606  0.55373394]]
[[ 0.49895993  0.5010401 ]]
[[ 0.45255348  0.54744655]]
[[ 0.40766764  0.59233236]]
[[ 0.45833224  0.54166776]]
[[ 0.49845082  0.50154918]]
[[ 0.46073389  0.53926611]]
[[ 0.41379848  0.58620149]]
[[ 0.46326014  0.53673983]]
[[ 0.49617967  0.5038203 ]]
[[ 0.49917844  0.50082165]]
[[ 0.49712312  0.50287688]]
[[ 0.49952772  0.50047219]]
[[ 0.49786934  0.50213069]]
[[ 0.460648    0.53935206]]
[[ 0.49729657  0.50270343]]
[[ 0.46152261  0.53847736]]
[[ 0.41352969  0.58647031]]
[[ 0.3671762   0.63282382]]
[[ 0.41370878  0.586

KeyboardInterrupt: 

In [55]:
print(np.vstack(ep_history[20:30:,0]))
print((ep_history[20:30:,0]))


[[ -4.76735919e-02  -4.27363388e-02   6.79212013e-02   3.03759914e-01]
 [ -4.85283187e-02   1.51355209e-01   7.39963996e-02   3.32473673e-02]
 [ -4.55012145e-02   3.45342315e-01   7.46613469e-02  -2.35201933e-01]
 [ -3.85943682e-02   5.39322541e-01   6.99573083e-02  -5.03431620e-01]
 [ -2.78079174e-02   3.43287938e-01   5.98886759e-02  -1.89548295e-01]
 [ -2.09421586e-02   1.47362603e-01   5.60977100e-02   1.21409576e-01]
 [ -1.79949066e-02   3.41637836e-01   5.85259015e-02  -1.53060785e-01]
 [ -1.11621499e-02   5.35875066e-01   5.54646858e-02  -4.26720992e-01]
 [ -4.44648534e-04   3.40013192e-01   4.69302659e-02  -1.17081708e-01]
 [  6.35561530e-03   1.44251332e-01   4.45886318e-02   1.90030209e-01]]
[array([-0.04767359, -0.04273634,  0.0679212 ,  0.30375991])
 array([-0.04852832,  0.15135521,  0.0739964 ,  0.03324737])
 array([-0.04550121,  0.34534231,  0.07466135, -0.23520193])
 array([-0.03859437,  0.53932254,  0.06995731, -0.50343162])
 array([-0.02780792,  0.34328794,  0.05988868

In [15]:
#tf.reset_default_graph() #Clear the Tensorflow graph.
myAgent = agent(lr=1e-2,s_size=4,a_size=2,h_size=8) #Load the agent.
max_ep=999
init = tf.global_variables_initializer()
with tf.Session() as sess:
    sess.run(init)
    s = env.reset()
    for j in range(max_ep):
        env.render()
        a_dist = sess.run(myAgent.output,feed_dict={myAgent.state_in:[s]})
        a = np.random.choice(a_dist[0],p=a_dist[0])
        a = np.argmax(a_dist == a)
        s1=env.step(a)
        s = s1[0]
        

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
[2017-06-20 14:40:14,816] You are calling 'step()' even though this environment has already returned done = True. You should always call 'reset()' once you receive 'done = True' -- any further steps are undefined behavior.


In [92]:
#np.random.choice([2,4],p=[.5,.5])
# indexes = tf.range(0, tf.shape(output)[0]) * tf.shape(output)[1] + action_holder
#         tf.Print(indexes, [indexes], message="This is a: ")
output

<tf.Tensor 'fully_connected_17/Softmax:0' shape=(1, 2) dtype=float32>

In [48]:
sess2 = tf.InteractiveSession()
tf.global_variables_initializer()
s_size=4
h_size=160
a_size=2
#state_in=  tf.constant([[3.0,4.0,2.0,1.0],[3.0,4.0,2.0,1.0]])
output=tf.constant([[1.0,2.0],[3.0,4.0],[5.0,6.0],[7.0,8.0]])
var = tf.Variable(tf.random_normal([784, 200], stddev=0.35),
                      name="weights") 
action_holder=  tf.constant([0,0,0,1])
reward_holder=  tf.constant([4.5,3.6,2.5,1])
indexes = tf.range(0, 4) * 2 + action_holder
indexes=tf.Print(indexes, [indexes], message="Indexes: ")
responsible_outputs = tf.gather(tf.reshape(output, [-1]), indexes)
responsible_outputs=tf.Print(responsible_outputs, [responsible_outputs], message="Responsible Outputs: ")
loss=-tf.reduce_mean(tf.log(responsible_outputs)*reward_holder)
gradients = tf.gradients(loss,[var])[0]
print(gradients)
sess2.run(loss)




None


-2.5145102