In [1]:
import gym
import tensorflow as tf
import numpy as np
from collections import defaultdict

In [2]:
class NeuralAgent(object):
    def __init__(self, state_shape, num_actions, learning_rate=1e-3, hidden_size=32, gamma=0.9):
        self.gamma = gamma
        
        self.action = tf.placeholder(tf.int32, shape=[None])
        self.actions_one_hot = tf.one_hot(self.action, num_actions)
        
        self.state = tf.placeholder(tf.float32, shape=[None, *state_shape])
        self.layer0 = tf.layers.dense(self.state, hidden_size, activation=tf.nn.sigmoid)
        self.layer1 = tf.layers.dense(self.layer0, hidden_size, activation=tf.nn.sigmoid)
        self.value = tf.layers.dense(self.layer1, num_actions, activation=None)

        self.best_action = tf.squeeze(tf.argmax(self.value, axis=1))
        self.best_reward = tf.squeeze(tf.reduce_max(self.value, axis=1))

        # self.expected_reward = tf.squeeze(tf.gather(self.value, self.action, axis=1))
        self.expected_reward = tf.reduce_sum(tf.multiply(self.value, self.actions_one_hot), axis=1)
        
        # self.reward = tf.placeholder(tf.float32, shape=[None])
        self.target = tf.placeholder(tf.float32, shape=[None])
        # self.next_reward = tf.placeholder(tf.float32, shape=[None])
        # self.loss = tf.nn.l2_loss(self.reward + self.gamma * self.next_reward - self.expected_reward)
        self.loss = tf.reduce_mean(tf.square(self.target - self.expected_reward))
        self.train = tf.train.AdamOptimizer(learning_rate).minimize(self.loss)

def train(sess, env, agent, num_episodes, explore_decay=1e-4):
    for i in range(num_episodes):
        epsilon = np.exp(-explore_decay*i)
        state = env.reset()
        episode = []
        while True:
            if np.random.random() < epsilon:
                action = env.action_space.sample()
            else:
                action, = sess.run([agent.best_action], feed_dict={
                    agent.state: [state],
                })
            next_state, reward, done, info = env.step(action)
            if done:
                # next_state is 0
                episode.append((state, action, reward, np.zeros(state.shape)))
                break
            episode.append((state, action, reward, next_state))
            next_state = state
        state, action, reward, next_state = zip(*episode)
        next_reward, = sess.run([agent.best_reward], feed_dict={
            agent.state: next_state,
        })
        episode_ends = (next_state == np.zeros(state[0].shape)).all(axis=1)
        target = reward + agent.gamma * next_reward
        target[episode_ends] = 0
        _, loss = sess.run([agent.train, agent.loss], feed_dict={
            agent.state: state,
            agent.action: action,
            agent.target: target,
            # agent.reward: reward,
            # agent.next_reward: next_reward,
        })
        if i % (num_episodes//30) == 0:
            print(i, loss, epsilon)
        
def play(sess, env, agent):
    state = env.reset()
    env.render()
    done = False
    while not done:
        action, = sess.run([agent.best_action], feed_dict={
            agent.state: [state],
        })
        state, reward, done, info = env.step(action)
        print(state, action, reward)
        env.render()
    env.close()

In [3]:
env = gym.make('CartPole-v0')
tf.reset_default_graph()
agent = NeuralAgent(env.reset().shape, env.action_space.n, learning_rate=1e-3, hidden_size=32, gamma=0.99)
sess = tf.Session()
sess.run(tf.global_variables_initializer())
train(sess, env, agent, 10000, explore_decay=1e-4)

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
0 9.737889 1.0
333 5.8244243 0.9672483415560369
666 11.266284 0.9355693542429038
999 6.285566 0.9049279063021011
1332 19.181429 0.8752900165984839
1665 7.7142377 0.8466228169354396
1998 15.550501 0.8188945156043043
2331 12.143339 0.7920743621275974
2664 6.5326624 0.7661326131569743
2997 14.248684 0.7410404994880763
3330 13.471622 0.716770194155699
3663 6.231778 0.6932947815738983
3996 11.107171 0.670588227686808
4329 11.821829 0.6486253510970671
4662 13.661849 0.6273817951398404
4995 14.910696 0.6068340008714599
5328 10.649235 0.5869591809427341
5661 10.137462 0.5677352943279493
5994 11.536904 0.5491410218815375
6327 17.509365 0.5311557426953045
6660 8.896585 0.5137595112299983
6993 8.924464 0.496933035195856
7326 9.115854 0.4806576541575994
7659 7.055336 0.4649153188401532
7992 9.06588 0.4496885711121343
8325 10.184616 0.43496052462491586
8658 12.971654 0.42071484608579357
8991 

In [4]:
play(sess, env, agent)

[-0.04523102  0.18193199 -0.00183326 -0.31767383] 1 1.0
[-0.04159238  0.37708    -0.00818674 -0.61093434] 1 1.0
[-0.03405078  0.57231542 -0.02040542 -0.90618452] 1 1.0
[-0.02260447  0.76770764 -0.03852911 -1.20521061] 1 1.0
[-0.00725032  0.96330578 -0.06263333 -1.50971484] 1 1.0
[ 0.0120158   1.15912832 -0.09282762 -1.82127484] 1 1.0
[ 0.03519837  1.35515071 -0.12925312 -2.14129464] 1 1.0
[ 0.06230138  1.55129075 -0.17207901 -2.47094523] 1 1.0
[ 0.09332719  1.74739146 -0.22149792 -2.81109387] 1 1.0
