In [40]:
import gym
import numpy as np
import tensorflow as tf
from collections import deque

In [59]:
class DeepQNet:
    """Small fully connected Q-network plus its training op (TF1 graph mode).

    Building an instance adds, under the variable scope ``name``:

    * ``state``  - float32 placeholder of shape ``[None, state_shape]``
    * ``action`` - int32 placeholder of shape ``[None]`` (index of the action taken)
    * ``target`` - float32 placeholder of shape ``[None]`` (regression target)
    * ``hidden0`` / ``hidden1`` - two hidden layers with ``hidden`` units each
    * ``value`` - linear output layer with one entry per action
    * ``predicted_reward`` - the entry of ``value`` selected by ``action``
    * ``loss`` - mean squared difference between ``target`` and ``predicted_reward``
    * ``opt``  - Adam minimisation step for ``loss``
    """

    def __init__(self,
                 learning_rate=1e-2,
                 state_shape=4,
                 num_actions=2,
                 hidden=16,
                 name='DeepQNet'
                ):
        dense = tf.contrib.layers.fully_connected

        with tf.variable_scope(name):
            # Graph inputs.
            self.state = tf.placeholder(tf.float32, [None, state_shape], name='state')
            self.action = tf.placeholder(tf.int32, [None], name='action')
            self.target = tf.placeholder(tf.float32, [None], name='target')

            # Hidden layers use fully_connected's default activation; the
            # output layer is linear. Creation order is kept as-is so the
            # auto-generated variable names do not change.
            self.hidden0 = dense(self.state, hidden)
            self.hidden1 = dense(self.hidden0, hidden)
            self.value = dense(self.hidden1, num_actions, activation_fn=None)

            # Keep only the output that belongs to the action actually taken.
            action_mask = tf.one_hot(self.action, num_actions)
            masked_value = tf.multiply(self.value, action_mask)
            self.predicted_reward = tf.reduce_sum(masked_value, axis=1)

            # Mean squared error against the supplied targets, minimised with Adam.
            error = self.target - self.predicted_reward
            self.loss = tf.reduce_mean(tf.square(error))
            optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
            self.opt = optimizer.minimize(self.loss)

class Memory():
    """Bounded FIFO store of experiences with uniform random sampling.

    Once ``max_size`` items are held, adding another silently drops the
    oldest one (behaviour of ``deque(maxlen=...)``).
    """

    def __init__(self, max_size=1000):
        self.buffer = deque(maxlen=max_size)

    def add(self, experience):
        """Append one experience to the buffer."""
        self.buffer.append(experience)

    def sample(self, batch_size):
        """Return a list of ``batch_size`` distinct stored experiences.

        Sampling is without replacement, so ``np.random.choice`` raises
        ``ValueError`` when ``batch_size`` exceeds the number of stored items.
        """
        positions = np.arange(len(self.buffer))
        chosen = np.random.choice(positions, batch_size, replace=False)
        return list(map(self.buffer.__getitem__, chosen))
    
def play(env_name, agent, checkpoint_path="checkpoints/cartpole.ckpt"):
    """Run one rendered episode, acting greedily on the agent's Q-values.

    Args:
        env_name: gym environment id passed to ``gym.make``.
        agent: object exposing ``state`` (input placeholder) and ``value``
            (per-action output tensor), e.g. a ``DeepQNet``. Its variables
            are expected to live in the current default graph.
        checkpoint_path: checkpoint to restore the weights from. Defaults to
            the path the training cell saves to.

    Prints ``state, action, reward`` after every greedy step and the episode's
    total reward at the end. The environment is always closed, even if
    restoring or stepping fails.
    """
    env = gym.make(env_name)
    try:
        # Build the saver here instead of relying on a notebook-global `saver`
        # from the training cell, so this works on a fresh kernel as long as
        # the agent graph and a checkpoint exist.
        saver = tf.train.Saver()
        with tf.Session() as sess:
            saver.restore(sess, checkpoint_path)
            state = env.reset()
            # One random kick-off step, mirroring how training starts episodes.
            action = env.action_space.sample()
            env.render()
            state, reward, done, info = env.step(action)
            env.render()
            total_reward = reward
            # `done` from the kick-off step is honoured (it used to be reset to
            # False), so an already-finished environment is never stepped.
            while not done:
                value = sess.run(agent.value, feed_dict={
                    agent.state: [state],
                })
                action = np.argmax(value)
                state, reward, done, info = env.step(action)
                total_reward += reward
                print(state, action, reward)
                env.render()
            print(total_reward)
    finally:
        env.close()

In [60]:
# --- Training schedule -------------------------------------------------------
train_episodes = 1000   # upper bound on the number of training episodes
max_steps = 200         # max steps in an episode
gamma = 0.99            # discount applied to future reward

# --- Epsilon-greedy exploration ----------------------------------------------
# Probability of a random action decays exponentially from start towards stop.
explore_start, explore_stop = 1.0, 0.01
decay_rate = 1e-4       # exponential decay rate of the exploration probability

# --- Q-network ---------------------------------------------------------------
hidden_size = 64        # units per hidden layer
learning_rate = 1e-4    # optimiser step size

# --- Replay memory -----------------------------------------------------------
memory_size = 10000     # capacity of the replay memory
batch_size = 20         # experiences per training mini-batch
# Experiences collected before training starts; one full batch is the minimum
# needed to sample a mini-batch without replacement.
pretrain_length = batch_size

In [68]:
# Start from an empty graph so re-running this cell does not pile up
# duplicate variables.
tf.reset_default_graph()

env = gym.make('CartPole-v1')
agent = DeepQNet(name='agent', hidden=hidden_size, learning_rate=learning_rate)
memory = Memory(max_size=memory_size)

# Seed the replay memory with random-policy transitions so the first
# mini-batch can be drawn (Memory.sample draws without replacement).
print("Seeding memory...")
# After every reset one random step is taken to get the cart and pole moving;
# the observation returned by that step is the episode's first `state`.
env.reset()
state, reward, done, _ = env.step(env.action_space.sample())
for i in range(pretrain_length):
    action = env.action_space.sample()
    next_state, reward, done, _ = env.step(action)

    if done:
        # Convention used throughout this cell: a terminal transition is
        # stored with an all-zero next state, which is how it is recognised
        # again when targets are computed.
        next_state = np.zeros(state.shape)
        memory.add((state, action, reward, next_state))

        # Start a new episode with the usual random kick-off step.
        env.reset()
        state, reward, done, _ = env.step(env.action_space.sample())
    else:
        memory.add((state, action, reward, next_state))
        state = next_state

print("Running training...")
saver = tf.train.Saver()
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())

    epsilon_counter = 0
    # Defined up front so the end-of-episode report cannot hit an unbound name
    # when an episode finishes before the first gradient step has run.
    loss = float('nan')
    for episode in range(train_episodes):
        total_reward = 0
        # `done` comes from the kick-off step taken after the latest reset
        # (seeding loop or previous episode), so it is expected to be False here.
        while not done:
            # Explore/exploit trade-off: epsilon decays with the global step count.
            epsilon_counter += 1
            epsilon = explore_stop + (explore_start - explore_stop) * np.exp(-decay_rate * epsilon_counter)
            if epsilon > np.random.rand():
                action = env.action_space.sample()
            else:
                value = sess.run(agent.value, feed_dict={
                    agent.state: state.reshape((1, *state.shape))
                })
                action = np.argmax(value)

            # Take the action.
            next_state, reward, done, _ = env.step(action)
            total_reward += reward

            if done:
                # Store the terminal transition with the all-zero marker state.
                next_state = np.zeros(state.shape)
                memory.add((state, action, reward, next_state))

                print('Episode: {}'.format(episode),
                      'Total reward: {}'.format(total_reward),
                      'Training loss: {:.4f}'.format(loss),
                      'Epsilon: {:.4f}'.format(epsilon))

                # Reset and take the kick-off step for the next episode; this
                # also refreshes `done` for the next pass of the outer loop.
                env.reset()
                state, reward, done, _ = env.step(env.action_space.sample())
                break
            else:
                memory.add((state, action, reward, next_state))
                state = next_state

            # Train on one mini-batch drawn from the replay memory.
            states, actions, rewards, next_states = zip(*memory.sample(batch_size))
            next_states = np.array(next_states)

            next_values = sess.run(agent.value, feed_dict={
                agent.state: next_states,
            })
            # Terminal transitions (all-zero next state) contribute no future
            # value; assigning the scalar 0 works for any number of actions.
            episode_ends = (next_states == 0).all(axis=1)
            next_values[episode_ends] = 0

            # Target: reward plus discounted best next-state value.
            targets = np.array(rewards) + gamma * np.max(next_values, axis=1)
            loss, _ = sess.run([agent.loss, agent.opt], feed_dict={
                agent.state: states,
                agent.target: targets,
                agent.action: actions,
            })
    # NOTE(review): presumably the "checkpoints" directory has to exist
    # already - confirm against the Saver documentation.
    saver.save(sess, "checkpoints/cartpole.ckpt")

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
Seeding memory...
Running training...
Episode: 0 Total reward: 9.0 Training loss: 1.3202 Epsilon: 0.9991
Episode: 1 Total reward: 26.0 Training loss: 1.2901 Epsilon: 0.9965
Episode: 2 Total reward: 37.0 Training loss: 1.5444 Epsilon: 0.9929
Episode: 3 Total reward: 24.0 Training loss: 1.2669 Epsilon: 0.9905
Episode: 4 Total reward: 21.0 Training loss: 0.9769 Epsilon: 0.9885
Episode: 5 Total reward: 22.0 Training loss: 1.4089 Epsilon: 0.9863
Episode: 6 Total reward: 52.0 Training loss: 1.2830 Epsilon: 0.9813
Episode: 7 Total reward: 12.0 Training loss: 1.2137 Epsilon: 0.9801
Episode: 8 Total reward: 16.0 Training loss: 1.2570 Epsilon: 0.9786
Episode: 9 Total reward: 21.0 Training loss: 1.2114 Epsilon: 0.9765
Episode: 10 Total reward: 23.0 Training loss: 1.4420 Epsilon: 0.9743
Episode: 11 Total reward: 40.0 Training loss: 1.8219 Epsilon: 0.9705
Episode: 12 Total reward: 22.0 Traini

Episode: 132 Total reward: 10.0 Training loss: 234.3138 Epsilon: 0.7807
Episode: 133 Total reward: 17.0 Training loss: 65.8640 Epsilon: 0.7794
Episode: 134 Total reward: 16.0 Training loss: 4.8000 Epsilon: 0.7782
Episode: 135 Total reward: 7.0 Training loss: 76.0896 Epsilon: 0.7776
Episode: 136 Total reward: 22.0 Training loss: 67.2840 Epsilon: 0.7759
Episode: 137 Total reward: 12.0 Training loss: 137.0765 Epsilon: 0.7750
Episode: 138 Total reward: 8.0 Training loss: 105.0211 Epsilon: 0.7744
Episode: 139 Total reward: 29.0 Training loss: 4.9385 Epsilon: 0.7722
Episode: 140 Total reward: 48.0 Training loss: 141.5664 Epsilon: 0.7685
Episode: 141 Total reward: 13.0 Training loss: 4.6600 Epsilon: 0.7676
Episode: 142 Total reward: 45.0 Training loss: 125.7756 Epsilon: 0.7642
Episode: 143 Total reward: 13.0 Training loss: 96.9360 Epsilon: 0.7632
Episode: 144 Total reward: 23.0 Training loss: 205.0935 Epsilon: 0.7615
Episode: 145 Total reward: 16.0 Training loss: 3.9069 Epsilon: 0.7602
Episod

Episode: 267 Total reward: 9.0 Training loss: 1.7540 Epsilon: 0.6289
Episode: 268 Total reward: 9.0 Training loss: 19.6852 Epsilon: 0.6284
Episode: 269 Total reward: 18.0 Training loss: 46.4078 Epsilon: 0.6273
Episode: 270 Total reward: 14.0 Training loss: 59.7192 Epsilon: 0.6264
Episode: 271 Total reward: 7.0 Training loss: 16.4538 Epsilon: 0.6260
Episode: 272 Total reward: 12.0 Training loss: 34.7354 Epsilon: 0.6252
Episode: 273 Total reward: 15.0 Training loss: 1.9164 Epsilon: 0.6243
Episode: 274 Total reward: 10.0 Training loss: 105.1852 Epsilon: 0.6237
Episode: 275 Total reward: 18.0 Training loss: 58.6560 Epsilon: 0.6226
Episode: 276 Total reward: 25.0 Training loss: 35.2930 Epsilon: 0.6211
Episode: 277 Total reward: 20.0 Training loss: 25.6033 Epsilon: 0.6198
Episode: 278 Total reward: 16.0 Training loss: 1.2940 Epsilon: 0.6189
Episode: 279 Total reward: 19.0 Training loss: 1.7307 Epsilon: 0.6177
Episode: 280 Total reward: 9.0 Training loss: 59.9218 Epsilon: 0.6172
Episode: 281 

Episode: 384 Total reward: 70.0 Training loss: 17.3016 Epsilon: 0.3719
Episode: 385 Total reward: 47.0 Training loss: 22.6296 Epsilon: 0.3702
Episode: 386 Total reward: 50.0 Training loss: 0.7775 Epsilon: 0.3684
Episode: 387 Total reward: 28.0 Training loss: 1.3259 Epsilon: 0.3674
Episode: 388 Total reward: 45.0 Training loss: 1.4220 Epsilon: 0.3658
Episode: 389 Total reward: 74.0 Training loss: 0.7588 Epsilon: 0.3632
Episode: 390 Total reward: 101.0 Training loss: 20.6088 Epsilon: 0.3596
Episode: 391 Total reward: 77.0 Training loss: 21.9845 Epsilon: 0.3569
Episode: 392 Total reward: 185.0 Training loss: 37.4617 Epsilon: 0.3506
Episode: 393 Total reward: 192.0 Training loss: 48.4974 Epsilon: 0.3441
Episode: 394 Total reward: 71.0 Training loss: 33.0499 Epsilon: 0.3417
Episode: 395 Total reward: 55.0 Training loss: 49.3607 Epsilon: 0.3399
Episode: 396 Total reward: 51.0 Training loss: 33.1060 Epsilon: 0.3382
Episode: 397 Total reward: 50.0 Training loss: 2.0712 Epsilon: 0.3366
Episode:

Episode: 507 Total reward: 25.0 Training loss: 0.8357 Epsilon: 0.1249
Episode: 508 Total reward: 18.0 Training loss: 302.3828 Epsilon: 0.1247
Episode: 509 Total reward: 21.0 Training loss: 1.9696 Epsilon: 0.1245
Episode: 510 Total reward: 23.0 Training loss: 2.7236 Epsilon: 0.1242
Episode: 511 Total reward: 19.0 Training loss: 1.1180 Epsilon: 0.1240
Episode: 512 Total reward: 22.0 Training loss: 554.8776 Epsilon: 0.1238
Episode: 513 Total reward: 23.0 Training loss: 2.2141 Epsilon: 0.1235
Episode: 514 Total reward: 22.0 Training loss: 0.9496 Epsilon: 0.1232
Episode: 515 Total reward: 18.0 Training loss: 87.0895 Epsilon: 0.1230
Episode: 516 Total reward: 16.0 Training loss: 1.7243 Epsilon: 0.1229
Episode: 517 Total reward: 18.0 Training loss: 1.5886 Epsilon: 0.1227
Episode: 518 Total reward: 22.0 Training loss: 1.8271 Epsilon: 0.1224
Episode: 519 Total reward: 14.0 Training loss: 1.9353 Epsilon: 0.1222
Episode: 520 Total reward: 16.0 Training loss: 1.7351 Epsilon: 0.1221
Episode: 521 To

Episode: 633 Total reward: 12.0 Training loss: 5.3152 Epsilon: 0.0357
Episode: 634 Total reward: 13.0 Training loss: 9.3164 Epsilon: 0.0357
Episode: 635 Total reward: 10.0 Training loss: 7.7914 Epsilon: 0.0357
Episode: 636 Total reward: 8.0 Training loss: 8.4832 Epsilon: 0.0356
Episode: 637 Total reward: 10.0 Training loss: 8.2978 Epsilon: 0.0356
Episode: 638 Total reward: 16.0 Training loss: 1434.7599 Epsilon: 0.0356
Episode: 639 Total reward: 10.0 Training loss: 4.5956 Epsilon: 0.0356
Episode: 640 Total reward: 14.0 Training loss: 932.8688 Epsilon: 0.0355
Episode: 641 Total reward: 13.0 Training loss: 3.9004 Epsilon: 0.0355
Episode: 642 Total reward: 12.0 Training loss: 6.4953 Epsilon: 0.0355
Episode: 643 Total reward: 14.0 Training loss: 3.3754 Epsilon: 0.0354
Episode: 644 Total reward: 14.0 Training loss: 3.2676 Epsilon: 0.0354
Episode: 645 Total reward: 17.0 Training loss: 1288.5881 Epsilon: 0.0353
Episode: 646 Total reward: 16.0 Training loss: 6.0662 Epsilon: 0.0353
Episode: 647 

Episode: 750 Total reward: 193.0 Training loss: 10.6497 Epsilon: 0.0236
Episode: 751 Total reward: 245.0 Training loss: 4.4834 Epsilon: 0.0233
Episode: 752 Total reward: 499.0 Training loss: 3.1278 Epsilon: 0.0227
Episode: 753 Total reward: 430.0 Training loss: 6.7297 Epsilon: 0.0221
Episode: 754 Total reward: 412.0 Training loss: 1.7403 Epsilon: 0.0216
Episode: 755 Total reward: 499.0 Training loss: 4.3917 Epsilon: 0.0211
Episode: 756 Total reward: 288.0 Training loss: 1.5160 Epsilon: 0.0208
Episode: 757 Total reward: 323.0 Training loss: 3.2289 Epsilon: 0.0204
Episode: 758 Total reward: 381.0 Training loss: 1.2445 Epsilon: 0.0200
Episode: 759 Total reward: 468.0 Training loss: 0.8396 Epsilon: 0.0196
Episode: 760 Total reward: 286.0 Training loss: 299.7230 Epsilon: 0.0193
Episode: 761 Total reward: 230.0 Training loss: 204.0134 Epsilon: 0.0191
Episode: 762 Total reward: 258.0 Training loss: 1.1457 Epsilon: 0.0189
Episode: 763 Total reward: 361.0 Training loss: 0.7914 Epsilon: 0.0185
E

Episode: 866 Total reward: 499.0 Training loss: 0.2628 Epsilon: 0.0104
Episode: 867 Total reward: 499.0 Training loss: 0.1465 Epsilon: 0.0103
Episode: 868 Total reward: 499.0 Training loss: 0.2702 Epsilon: 0.0103
Episode: 869 Total reward: 343.0 Training loss: 0.2242 Epsilon: 0.0103
Episode: 870 Total reward: 499.0 Training loss: 0.1864 Epsilon: 0.0103
Episode: 871 Total reward: 499.0 Training loss: 0.1409 Epsilon: 0.0103
Episode: 872 Total reward: 499.0 Training loss: 0.1564 Epsilon: 0.0103
Episode: 873 Total reward: 499.0 Training loss: 0.1657 Epsilon: 0.0103
Episode: 874 Total reward: 499.0 Training loss: 0.1908 Epsilon: 0.0102
Episode: 875 Total reward: 499.0 Training loss: 0.2210 Epsilon: 0.0102
Episode: 876 Total reward: 499.0 Training loss: 0.1578 Epsilon: 0.0102
Episode: 877 Total reward: 499.0 Training loss: 0.2042 Epsilon: 0.0102
Episode: 878 Total reward: 499.0 Training loss: 0.1256 Epsilon: 0.0102
Episode: 879 Total reward: 499.0 Training loss: 0.1145 Epsilon: 0.0102
Episod

Episode: 982 Total reward: 499.0 Training loss: 0.1828 Epsilon: 0.0100
Episode: 983 Total reward: 499.0 Training loss: 0.1225 Epsilon: 0.0100
Episode: 984 Total reward: 499.0 Training loss: 0.2002 Epsilon: 0.0100
Episode: 985 Total reward: 499.0 Training loss: 0.1672 Epsilon: 0.0100
Episode: 986 Total reward: 499.0 Training loss: 0.0355 Epsilon: 0.0100
Episode: 987 Total reward: 499.0 Training loss: 0.0566 Epsilon: 0.0100
Episode: 988 Total reward: 499.0 Training loss: 0.1001 Epsilon: 0.0100
Episode: 989 Total reward: 499.0 Training loss: 0.0906 Epsilon: 0.0100
Episode: 990 Total reward: 499.0 Training loss: 0.0309 Epsilon: 0.0100
Episode: 991 Total reward: 499.0 Training loss: 0.1462 Epsilon: 0.0100
Episode: 992 Total reward: 499.0 Training loss: 0.1986 Epsilon: 0.0100
Episode: 993 Total reward: 499.0 Training loss: 0.1252 Epsilon: 0.0100
Episode: 994 Total reward: 499.0 Training loss: 0.0722 Epsilon: 0.0100
Episode: 995 Total reward: 499.0 Training loss: 0.0994 Epsilon: 0.0100
Episod

In [69]:
# Watch the trained agent play one rendered CartPole-v1 episode.
play(env_name="CartPole-v1", agent=agent)

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
INFO:tensorflow:Restoring parameters from checkpoints/cartpole.ckpt
[ 0.02398992  0.02048327 -0.00087136  0.00638182] 0 1.0
[ 0.02439959 -0.17462617 -0.00074373  0.29878969] 0 1.0
[ 0.02090706 -0.36973751  0.00523207  0.59123797] 0 1.0
[ 0.01351231 -0.1746892   0.01705683  0.30020773] 1 1.0
[0.01001853 0.02018553 0.02306098 0.01295262] 1 1.0
[ 0.01042224 -0.17525943  0.02332003  0.31282142] 0 1.0
[0.00691705 0.01952268 0.02957646 0.02758317] 1 1.0
[ 0.0073075  -0.17601066  0.03012813  0.32944912] 0 1.0
[0.00378729 0.01866973 0.03671711 0.04641738] 1 1.0
[ 0.00416069  0.21324647  0.03764546 -0.23445861] 1 1.0
[0.00842562 0.01760744 0.03295628 0.06985718] 0 1.0
[ 0.00877776  0.21224178  0.03435343 -0.21224841] 1 1.0
[0.0130226  0.01664593 0.03010846 0.09107017] 0 1.0
[ 0.01335552  0.21132368  0.03192986 -0.19196355] 1 1.0
[0.01758199 0.01575985 0.02809059 0.11061846] 0 1.0
[ 0.0178

[ 0.51167374  0.39954289  0.00291997 -0.33226931] 1 1.0
[ 0.51966459  0.2043795  -0.00372542 -0.038667  ] 0 1.0
[ 0.52375218  0.00931117 -0.00449876  0.25283819] 0 1.0
[ 0.52393841  0.20449707  0.00055801 -0.04126032] 1 1.0
[ 5.28028348e-01  9.36712023e-03 -2.67201322e-04  2.51598607e-01] 0 1.0
[ 0.52821569  0.20449289  0.00476477 -0.04116859] 1 1.0
[ 0.53230555  0.39954619  0.0039414  -0.33234438] 1 1.0
[ 0.54029647  0.20436836 -0.00270549 -0.03842114] 0 1.0
[ 0.54438384  0.00928531 -0.00347391  0.25340695] 0 1.0
[ 0.54456954  0.20445669  0.00159423 -0.04036969] 1 1.0
[ 0.54865868  0.39955574  0.00078683 -0.33254919] 1 1.0
[ 0.55664979  0.2044226  -0.00586415 -0.03961824] 0 1.0
[ 0.56073825  0.00938523 -0.00665651  0.25120873] 0 1.0
[ 0.56092595  0.2046016  -0.00163234 -0.04356633] 1 1.0
[ 0.56501798  0.0095031  -0.00250367  0.24860113] 0 1.0
[ 0.56520804  0.20466071  0.00246836 -0.04487046] 1 1.0
[ 0.56930126  0.39974718  0.00157095 -0.33677358] 1 1.0
[ 0.5772962   0.20460291 -0.0051

[ 1.01944391  0.39763858  0.00835927 -0.29027376] 1 1.0
[1.02739668 0.20239844 0.00255379 0.00503382] 0 1.0
[1.03144465 0.00723995 0.00265447 0.29852141] 0 1.0
[1.03158945 0.20232397 0.0086249  0.00667683] 1 1.0
[ 1.03563593  0.39732117  0.00875844 -0.2832724 ] 1 1.0
[1.04358235 0.2020754  0.00309299 0.01215997] 0 1.0
[ 1.04762386  0.39715286  0.00333619 -0.27954548] 1 1.0
[ 1.05556692  0.20198347 -0.00225472  0.0141878 ] 0 1.0
[ 1.05960659  0.00689393 -0.00197097  0.30615849] 0 1.0
[1.05974447 0.20204391 0.0041522  0.01285462] 1 1.0
[1.06378535 0.00686266 0.0044093  0.30684472] 0 1.0
[1.0639226  0.2019215  0.01054619 0.01555563] 1 1.0
[ 1.06796103  0.39689063  0.0108573  -0.27378128] 1 1.0
[1.07589884 0.20161546 0.00538168 0.02230619] 0 1.0
[ 1.07993115  0.39665982  0.0058278  -0.26867391] 1 1.0
[1.08786435e+00 2.01455190e-01 4.54323057e-04 2.58414341e-02] 0 1.0
[1.09189345e+00 6.32672650e-03 9.71151739e-04 3.18667670e-01] 0 1.0
[1.09201999 0.20143483 0.00734451 0.02629117] 1 1.0
[ 1.

[ 1.69728919e+00  2.07278410e-01 -1.45873359e-03 -1.02616115e-01] 1 1.0
[ 1.70143475  0.40242124 -0.00351106 -0.39575891] 1 1.0
[ 1.70948318  0.20734928 -0.01142623 -0.10418502] 0 1.0
[ 1.71363017  0.01239292 -0.01350993  0.18487118] 0 1.0
[ 1.71387802  0.20770554 -0.00981251 -0.11204282] 1 1.0
[ 1.71803213  0.01272556 -0.01205337  0.17752824] 0 1.0
[ 1.71828665  0.20801791 -0.0085028  -0.11893261] 1 1.0
[ 1.722447    0.01301881 -0.01088145  0.17105567] 0 1.0
[ 1.72270738  0.20829481 -0.00746034 -0.12504006] 1 1.0
[ 1.72687328  0.01328052 -0.00996114  0.16527991] 0 1.0
[ 1.72713889  0.20854364 -0.00665554 -0.13052876] 1 1.0
[ 1.73130976  0.01351766 -0.00926612  0.16004701] 0 1.0
[ 1.73158011  0.20877103 -0.00606518 -0.13554468] 1 1.0
[ 1.73575553  0.01373648 -0.00877607  0.15521862] 0 1.0
[ 1.73603026  0.20898298 -0.0056717  -0.14021998] 1 1.0
[ 1.74020992  0.01394272 -0.0084761   0.15066824] 0 1.0
[ 1.74048878  0.20918501 -0.00546274 -0.14467662] 1 1.0
[ 1.74467248  0.01414171 -0.0083

In [8]:
# Close the module-level `env` created with gym.make in the training cell.
# NOTE(review): this cell's execution count (8) is lower than the other cells'
# (40-69), so it looks like it was last run in an earlier pass - re-run it
# after training when executing the notebook top to bottom.
env.close()