In [1]:
import gym
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt

# Create the CartPole-v0 environment; every later cell reads its spaces and steps it.
env = gym.make('CartPole-v0')



[2017-02-13 21:05:28,263] Making new env: CartPole-v0


In [2]:
# Hyper-parameter handed to the optimizer (the misspelled name is kept because
# the optimizer cell reads it under this exact spelling).
learining_rate = 0.1

# Network widths are taken from the environment rather than hard-coded:
# first dimension of the observation shape in, one output per discrete action.
input_size = env.observation_space.shape[0]
output_size = env.action_space.n

In [3]:
# Feed-forward input of the Q-network: a batch of states, one row per state.
X = tf.placeholder(dtype=tf.float32, shape=[None, input_size], name="input_x")

In [4]:
# The network's only weight matrix (state features -> per-action Q-values),
# Xavier-initialized.
W = tf.get_variable(
    name="W1",
    shape=[input_size, output_size],
    initializer=tf.contrib.layers.xavier_initializer(),
)

In [5]:
# Linear Q-function: each input state row is mapped to one value per action.
Qpred = tf.matmul(X, W) # Our Q prediction, shape [batch, output_size]

In [6]:
# Training target: the Q-values the network should have predicted for the
# fed states (same width as Qpred).
Y = tf.placeholder(dtype=tf.float32, shape=[None, output_size])  # Y label

In [7]:
# Sum of squared differences between the target Q-values (Y) and the
# network's predictions (Qpred), over every action and every fed state.
loss = tf.reduce_sum(tf.square(Y - Qpred))

In [8]:
# Adam drives the squared-error loss down; `train` is the op the training
# loop runs to apply one update.
optimizer = tf.train.AdamOptimizer(learning_rate=learining_rate)
train = optimizer.minimize(loss)

In [9]:
# Q-learning settings
num_episodes = 2000  # upper bound on the number of training episodes
dis = 0.99           # discount factor applied to the next state's best Q-value

In [10]:
# History of episode lengths: the training loop appends one step count per episode
rList = list()

In [11]:
# Op that initializes every global TensorFlow variable (here the weight
# matrix); it must be run in the session before any training step.
init = tf.global_variables_initializer()

In [12]:
# Online Q-network training: after every environment step the network takes
# one gradient step toward the one-step Q-learning target for the action taken.
# The session is left open on purpose; the evaluation cell reuses it.
sess = tf.Session()
sess.run(init)

# Read the episode step limit and the "solved" target from the environment
# spec when it exposes them. CartPole-v0 is registered with a 200-step limit
# and a reward threshold of 195, and pays +1 reward per step, so the reward
# threshold is comparable to a step count (assumption: this holds for the
# environment in use). With a 200-step cap the previous "> 500" stop test
# could never fire. If the spec lacks these fields, the original 500-step
# target and the original terminal handling are used.
env_spec = getattr(env, "spec", None)
step_limit = getattr(env_spec, "max_episode_steps", None)
solved_steps = getattr(env_spec, "reward_threshold", None) or 500

for i in range(num_episodes):
    # Exploration rate: starts at 1 and decays as the episode index grows
    e = 1. / ((i / 10) + 1)
    rAll = 0
    step_count = 0
    # Reset environment and get first new observation
    s = env.reset()
    done = False

    # The Q-Network training
    while not done:
        step_count += 1
        x = np.reshape(s, [1, input_size])
        # Choose an action greedily (with probability e of a random action)
        # from the Q-network
        Qs = sess.run(Qpred, feed_dict={X: x})
        if np.random.rand(1) < e:
            a = env.action_space.sample()
        else:
            a = np.argmax(Qs)

        # Get new state and reward from environment
        s1, reward, done, _ = env.step(a)

        # An episode that ends because it reached the step limit did not
        # fail, so it must not receive the failure penalty below.
        timed_out = step_limit is not None and step_count >= step_limit

        if done and not timed_out:
            # True terminal state (failure): fixed penalty, no bootstrap from s1
            Qs[0, a] = -100
        else:
            x1 = np.reshape(s1, [1, input_size])
            # Obtain the Q_s1 values by feeding the new state through our network
            Qs1 = sess.run(Qpred, feed_dict={X: x1})
            # One-step Q-learning target for the action that was taken
            Qs[0, a] = reward + dis * np.max(Qs1)

        # Train our network using target (Y) and predicted Q (Qpred) values;
        # only the entry for action `a` differs from the current prediction.
        sess.run(train, feed_dict={X: x, Y: Qs})

        s = s1

    rList.append(step_count)
    print("Episode: {} steps: {}".format(i, step_count))
    # Stop early once the last 10 episodes average at least the solved target
    if len(rList) > 10 and np.mean(rList[-10:]) >= solved_steps:
        break

Episode: 0 steps: 15
Episode: 1 steps: 14
Episode: 2 steps: 15
Episode: 3 steps: 16
Episode: 4 steps: 13
Episode: 5 steps: 11
Episode: 6 steps: 12
Episode: 7 steps: 17
Episode: 8 steps: 15
Episode: 9 steps: 11
Episode: 10 steps: 9
Episode: 11 steps: 11
Episode: 12 steps: 9
Episode: 13 steps: 11
Episode: 14 steps: 9
Episode: 15 steps: 14
Episode: 16 steps: 13
Episode: 17 steps: 11
Episode: 18 steps: 11
Episode: 19 steps: 12
Episode: 20 steps: 66
Episode: 21 steps: 17
Episode: 22 steps: 13
Episode: 23 steps: 10
Episode: 24 steps: 28
Episode: 25 steps: 66
Episode: 26 steps: 166
Episode: 27 steps: 39
Episode: 28 steps: 30
Episode: 29 steps: 22
Episode: 30 steps: 22
Episode: 31 steps: 25
Episode: 32 steps: 16
Episode: 33 steps: 9
Episode: 34 steps: 11
Episode: 35 steps: 18
Episode: 36 steps: 30
Episode: 37 steps: 37
Episode: 38 steps: 31
Episode: 39 steps: 47
Episode: 40 steps: 15
Episode: 41 steps: 28
Episode: 42 steps: 23
Episode: 43 steps: 49
Episode: 44 steps: 20
Episode: 45 steps: 29
E

In [14]:
# See our trained network in action: play one rendered episode, always taking
# the action with the highest predicted Q-value (no exploration, no training).
observation = env.reset()
reward_sum = 0
done = False
while not done:
    env.render()

    # Greedy action for the current observation
    x = np.reshape(observation, [1, input_size])
    Qs = sess.run(Qpred, feed_dict={X: x})
    a = np.argmax(Qs)

    observation, reward, done, _ = env.step(a)
    reward_sum += reward

print("Total score: {}".format(reward_sum))


Total score: 15.0
