In [1]:
import gym
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt

# Build the CartPole-v0 environment that the later cells train and evaluate on.
env = gym.make('CartPole-v0')



[2017-03-04 02:25:35,168] Making new env: CartPole-v0


In [2]:
# Input and output size based on the Env:
# one network input per observation component, one output per discrete action.
input_size = env.observation_space.shape[0]
output_size = env.action_space.n
# Step size handed to the optimizer.
# NOTE(review): the name is misspelled ("learining"); it is kept as-is because
# other cells refer to it by this exact spelling.
learining_rate = 1e-1

In [3]:
# Sanity check: echo the derived layer sizes and the optimizer step size.
print("{} {} {}".format(input_size, output_size, learining_rate))

4 2 0.1


In [4]:
# Feed-forward input of the network used to choose actions:
# a batch of states, one row per state with `input_size` features each.
X = tf.placeholder(dtype=tf.float32, shape=[None, input_size], name="input_x")

In [5]:
# First (and only) layer of weights: maps a state with `input_size` features
# to `output_size` values, one per action. Initialized with the Xavier scheme.
# NOTE(review): the variable is registered under the fixed name "W1", so
# re-running this cell against the same graph presumably fails with a
# "variable already exists" error -- confirm; restart the kernel to re-run.
W = tf.get_variable("W1", shape=[input_size, output_size],
                    initializer = tf.contrib.layers.xavier_initializer())

In [6]:
# Our Q prediction: one value per action for every state row fed through X.
# The model is a plain matrix product -- no bias term and no activation.
Qpred = tf.matmul(X, W)

In [7]:
# Training target for the Q-values (the "label"): laid out like Qpred,
# one row per state and one column per action.
Y = tf.placeholder(tf.float32, [None, output_size])

In [8]:
# Sum of squared differences between the target Q-values (Y) and the predictions.
loss = tf.reduce_sum(tf.square(Y - Qpred))

In [9]:
# Op that performs one Adam update to reduce `loss`, using the step size above.
train = tf.train.AdamOptimizer(learning_rate=learining_rate).minimize(loss)

In [10]:
# Set Q-learning related parameters
dis = .99            # discount factor applied to the best next-state Q-value
num_episodes = 2000  # upper bound on the number of training episodes

In [11]:
# Per-episode history of training: the number of steps each episode lasted.
# (Only step counts are appended to this list, not reward totals.)
rList = []

In [12]:
# Initializer op for the graph's global variables; it is run once, right
# after the session is created.
init = tf.global_variables_initializer()

In [13]:
# Train the Q-network online: one optimizer step per environment step.
# The session is intentionally left open; the evaluation cell reuses `sess`.
sess = tf.Session()
sess.run(init)

for i in range(num_episodes):
    # Exploration rate for this episode: 1.0 for i = 0, shrinking as i grows.
    e = 1. / ((i / 10) + 1)
    rAll = 0  # NOTE(review): assigned here but never read in this cell
    step_count = 0
    # Reset environment and get first new observation
    s = env.reset()
    done = False

    # The Q-Network training
    while not done:
        step_count += 1
        x = np.reshape(s, [1, input_size])
        # Choose an action greedily (with probability e of a random action) from the Q-network
        Qs = sess.run(Qpred,feed_dict={X: x})
        if np.random.rand(1) < e:
            a = env.action_space.sample()
        else:
            a = np.argmax(Qs)

        # Get new state and reward from environment
        s1, reward, done, _ = env.step(a)
        # Build the training target in place: only the entry of the chosen
        # action is overwritten, so the targets for the other actions equal
        # the current predictions.
        if done:
            # Update Q, and no Qs+1, since it's a terminal state
            # NOTE(review): every termination gets the -100 penalty; if the
            # environment also ends episodes on a step limit, a successful
            # full-length episode is penalized as well -- confirm.
            Qs[0, a] = -100
        else:
            x1 = np.reshape(s1, [1, input_size])
            # Obtain the Q_s1 values by feeding the new state through our network
            Qs1 = sess.run(Qpred, feed_dict={X: x1})
            # Update Q: immediate reward plus discounted best next-state value
            Qs[0, a] = reward + dis * np.max(Qs1)

        # Train our network using target (Y) and predicted Q (Qpred) values
        sess.run(train, feed_dict={X: x, Y: Qs})

        s = s1

    rList.append(step_count)
    # Progress log every 50 episodes
    if i % 50 == 0 :
        print("Episode: {} steps: {}".format(i, step_count))
    # Stop early once the last 10 episodes average more than 500 steps.
    # NOTE(review): CartPole-v0 presumably caps episodes at 200 steps, in which
    # case this condition can never fire and all episodes always run -- confirm.
    if len(rList) > 10 and np.mean(rList[-10:]) > 500:
        break

Episode: 0 steps: 14
Episode: 50 steps: 25
Episode: 100 steps: 38
Episode: 150 steps: 28
Episode: 200 steps: 9
Episode: 250 steps: 17
Episode: 300 steps: 22
Episode: 350 steps: 15
Episode: 400 steps: 36
Episode: 450 steps: 26
Episode: 500 steps: 15
Episode: 550 steps: 50
Episode: 600 steps: 31
Episode: 650 steps: 26
Episode: 700 steps: 42
Episode: 750 steps: 22
Episode: 800 steps: 16
Episode: 850 steps: 9
Episode: 900 steps: 17
Episode: 950 steps: 30
Episode: 1000 steps: 16
Episode: 1050 steps: 19
Episode: 1100 steps: 15
Episode: 1150 steps: 9
Episode: 1200 steps: 20
Episode: 1250 steps: 23
Episode: 1300 steps: 62
Episode: 1350 steps: 11
Episode: 1400 steps: 9
Episode: 1450 steps: 44
Episode: 1500 steps: 10
Episode: 1550 steps: 33
Episode: 1600 steps: 24
Episode: 1650 steps: 43
Episode: 1700 steps: 34
Episode: 1750 steps: 10
Episode: 1800 steps: 23
Episode: 1850 steps: 19
Episode: 1900 steps: 31
Episode: 1950 steps: 20


In [14]:
# See our trained network in action: play one rendered episode, always taking
# the greedy action, and report the total reward collected.
observation = env.reset()
reward_sum = 0
done = False
while not done:
    env.render()

    # Greedy policy: pick the action with the largest predicted Q-value.
    x = np.reshape(observation, [1, input_size])
    Qs = sess.run(Qpred, feed_dict={X: x})
    a = np.argmax(Qs)

    observation, reward, done, _ = env.step(a)
    reward_sum += reward

print("Total score: {}".format(reward_sum))


Total score: 29.0


In [15]:
# Scratch cell: confirms that the literal 1e-1 evaluates to 0.1.
1e-1

0.1

In [16]:
import cmath

In [17]:
# Scratch cell: log(exp(1)) (expected 1) alongside exp(1), both as complex numbers.
cmath.log(cmath.exp(1)), cmath.exp(1)

((1+0j), (2.718281828459045+0j))