In [26]:
import gym
import numpy as np

# Create the Cart-Pole game environment
env = gym.make('CartPole-v1')

# Number of possible actions
print('Number of possible actions:', env.action_space.n)

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
Number of possible actions: 2


In [27]:
import tensorflow as tf

class QNetwork:
    def __init__(self, learning_rate=0.01, state_size=4, 
                 action_size=2, hidden_size=10, 
                 name='QNetwork'):
        # state inputs to the Q-network
        with tf.variable_scope(name):
            self.inputs_ = tf.placeholder(tf.float32, [None, state_size], name='inputs')
            
            # One hot encode the actions to later choose the Q-value for the action
            self.actions_ = tf.placeholder(tf.int32, [None], name='actions')
            one_hot_actions = tf.one_hot(self.actions_, action_size)
            
            # Target Q values for training
            self.targetQs_ = tf.placeholder(tf.float32, [None], name='target')
            
            # ReLU hidden layers
            self.fc1 = tf.contrib.layers.fully_connected(self.inputs_, hidden_size)
            self.fc2 = tf.contrib.layers.fully_connected(self.fc1, hidden_size)

            # Linear output layer
            self.output = tf.contrib.layers.fully_connected(self.fc2, action_size, 
                                                            activation_fn=None)
            
            ### Train with loss (targetQ - Q)^2
            # output has length 2, for two actions. This next line chooses
            # one value from output (per row) according to the one-hot encoded actions.
            self.Q = tf.reduce_sum(tf.multiply(self.output, one_hot_actions), axis=1)
            
            self.loss = tf.reduce_mean(tf.square(self.targetQs_ - self.Q))
            self.opt = tf.train.AdamOptimizer(learning_rate).minimize(self.loss)

In [28]:
from collections import deque

class Memory():
    def __init__(self, max_size=1000):
        self.buffer = deque(maxlen=max_size)
    
    def add(self, experience):
        self.buffer.append(experience)
            
    def sample(self, batch_size):
        idx = np.random.choice(np.arange(len(self.buffer)), 
                               size=batch_size, 
                               replace=False)
        return [self.buffer[ii] for ii in idx]

In [29]:
train_episodes = 1000          # max number of episodes to learn from
max_steps = 200                # max steps in an episode
gamma = 0.99                   # future reward discount

# Exploration parameters
explore_start = 1.0            # exploration probability at start
explore_stop = 0.01            # minimum exploration probability 
decay_rate = 0.0001            # exponential decay rate for exploration prob

# Network parameters
hidden_size = 64               # number of units in each Q-network hidden layer
learning_rate = 0.0001         # Q-network learning rate

# Memory parameters
memory_size = 10000            # memory capacity
batch_size = 20                # experience mini-batch size
pretrain_length = batch_size   # number experiences to pretrain the memory

In [37]:
tf.reset_default_graph()

env = gym.make('CartPole-v1')
mainQN = QNetwork(name='main', hidden_size=hidden_size, learning_rate=learning_rate)

# Initialize the simulation
env.reset()
# Take one random step to get the pole and cart moving
state, reward, done, _ = env.step(env.action_space.sample())

memory = Memory(max_size=memory_size)

# Make a bunch of random actions and store the experiences
for ii in range(pretrain_length):

    # Make a random action
    action = env.action_space.sample()
    next_state, reward, done, _ = env.step(action)

    if done:
        # The simulation fails so no next state
        next_state = np.zeros(state.shape)
        # Add experience to memory
        memory.add((state, action, reward, next_state))
        
        # Start new episode
        env.reset()
        # Take one random step to get the pole and cart moving
        state, reward, done, _ = env.step(env.action_space.sample())
    else:
        # Add experience to memory
        memory.add((state, action, reward, next_state))
        state = next_state
        
# Now train with experiences
saver = tf.train.Saver()
rewards_list = []
with tf.Session() as sess:
    # Initialize variables
    sess.run(tf.global_variables_initializer())
    
    step = 0
    for ep in range(1, train_episodes):
        total_reward = 0
        t = 0
        while t < max_steps:
            step += 1
            # Uncomment this next line to watch the training
            # env.render() 
            
            # Explore or Exploit
            explore_p = explore_stop + (explore_start - explore_stop)*np.exp(-decay_rate*step) 
            if explore_p > np.random.rand():
                # Make a random action
                action = env.action_space.sample()
            else:
                # Get action from Q-network
                feed = {mainQN.inputs_: state.reshape((1, *state.shape))}
                Qs = sess.run(mainQN.output, feed_dict=feed)
                action = np.argmax(Qs)
            
            # Take action, get new state and reward
            next_state, reward, done, _ = env.step(action)
    
            total_reward += reward
            
            if done:
                # the episode ends so no next state
                next_state = np.zeros(state.shape)
                t = max_steps
                
                print('Episode: {}'.format(ep),
                      'Total reward: {}'.format(total_reward),
                      'Training loss: {:.4f}'.format(loss),
                      'Explore P: {:.4f}'.format(explore_p))
                rewards_list.append((ep, total_reward))
                
                # Add experience to memory
                memory.add((state, action, reward, next_state))
                
                # Start new episode
                env.reset()
                # Take one random step to get the pole and cart moving
                state, reward, done, _ = env.step(env.action_space.sample())

            else:
                # Add experience to memory
                memory.add((state, action, reward, next_state))
                state = next_state
                t += 1
            
            # Sample mini-batch from memory
            batch = memory.sample(batch_size)
            states = np.array([each[0] for each in batch])
            actions = np.array([each[1] for each in batch])
            rewards = np.array([each[2] for each in batch])
            next_states = np.array([each[3] for each in batch])
            
            # Train network
            target_Qs = sess.run(mainQN.output, feed_dict={mainQN.inputs_: next_states})
            
            # Set target_Qs to 0 for states where episode ends
            episode_ends = (next_states == np.zeros(states[0].shape)).all(axis=1)
            target_Qs[episode_ends] = (0, 0)
            
            targets = rewards + gamma * np.max(target_Qs, axis=1)

            loss, _ = sess.run([mainQN.loss, mainQN.opt],
                                feed_dict={mainQN.inputs_: states,
                                           mainQN.targetQs_: targets,
                                           mainQN.actions_: actions})
        
    saver.save(sess, "checkpoints/cartpole-them.ckpt")

Episode: 1 Total reward: 11.0 Training loss: 1.0315 Explore P: 0.9989
Episode: 2 Total reward: 18.0 Training loss: 1.0720 Explore P: 0.9971
Episode: 3 Total reward: 11.0 Training loss: 1.0438 Explore P: 0.9960
Episode: 4 Total reward: 44.0 Training loss: 1.0128 Explore P: 0.9917
Episode: 5 Total reward: 40.0 Training loss: 0.9768 Explore P: 0.9878
Episode: 6 Total reward: 28.0 Training loss: 1.0224 Explore P: 0.9851
Episode: 7 Total reward: 14.0 Training loss: 1.0101 Explore P: 0.9837
Episode: 8 Total reward: 26.0 Training loss: 1.0613 Explore P: 0.9812
Episode: 9 Total reward: 44.0 Training loss: 1.0730 Explore P: 0.9769
Episode: 10 Total reward: 11.0 Training loss: 0.9692 Explore P: 0.9758
Episode: 11 Total reward: 58.0 Training loss: 1.1488 Explore P: 0.9703
Episode: 12 Total reward: 21.0 Training loss: 1.0503 Explore P: 0.9682
Episode: 13 Total reward: 35.0 Training loss: 0.9060 Explore P: 0.9649
Episode: 14 Total reward: 11.0 Training loss: 1.1143 Explore P: 0.9638
Episode: 15 Tot

Episode: 125 Total reward: 13.0 Training loss: 112.2249 Explore P: 0.7964
Episode: 126 Total reward: 47.0 Training loss: 115.4710 Explore P: 0.7927
Episode: 127 Total reward: 9.0 Training loss: 38.7038 Explore P: 0.7920
Episode: 128 Total reward: 25.0 Training loss: 2.8918 Explore P: 0.7901
Episode: 129 Total reward: 9.0 Training loss: 91.3488 Explore P: 0.7894
Episode: 130 Total reward: 11.0 Training loss: 41.5676 Explore P: 0.7885
Episode: 131 Total reward: 22.0 Training loss: 47.0988 Explore P: 0.7868
Episode: 132 Total reward: 20.0 Training loss: 51.1520 Explore P: 0.7853
Episode: 133 Total reward: 14.0 Training loss: 60.6304 Explore P: 0.7842
Episode: 134 Total reward: 13.0 Training loss: 57.0783 Explore P: 0.7832
Episode: 135 Total reward: 13.0 Training loss: 46.5582 Explore P: 0.7822
Episode: 136 Total reward: 9.0 Training loss: 6.1925 Explore P: 0.7815
Episode: 137 Total reward: 9.0 Training loss: 47.4498 Explore P: 0.7808
Episode: 138 Total reward: 10.0 Training loss: 60.7637 

Episode: 245 Total reward: 12.0 Training loss: 39.2709 Explore P: 0.6641
Episode: 246 Total reward: 12.0 Training loss: 48.6901 Explore P: 0.6633
Episode: 247 Total reward: 26.0 Training loss: 2.6709 Explore P: 0.6616
Episode: 248 Total reward: 8.0 Training loss: 2.7375 Explore P: 0.6611
Episode: 249 Total reward: 13.0 Training loss: 25.8094 Explore P: 0.6602
Episode: 250 Total reward: 10.0 Training loss: 23.1138 Explore P: 0.6596
Episode: 251 Total reward: 17.0 Training loss: 37.3459 Explore P: 0.6585
Episode: 252 Total reward: 14.0 Training loss: 2.3844 Explore P: 0.6576
Episode: 253 Total reward: 15.0 Training loss: 3.6373 Explore P: 0.6566
Episode: 254 Total reward: 25.0 Training loss: 19.4205 Explore P: 0.6550
Episode: 255 Total reward: 12.0 Training loss: 2.9355 Explore P: 0.6542
Episode: 256 Total reward: 15.0 Training loss: 95.2248 Explore P: 0.6532
Episode: 257 Total reward: 18.0 Training loss: 39.9295 Explore P: 0.6521
Episode: 258 Total reward: 11.0 Training loss: 2.8798 Exp

Episode: 359 Total reward: 73.0 Training loss: 21.2399 Explore P: 0.5459
Episode: 360 Total reward: 25.0 Training loss: 12.6680 Explore P: 0.5446
Episode: 361 Total reward: 55.0 Training loss: 0.9695 Explore P: 0.5417
Episode: 362 Total reward: 35.0 Training loss: 7.7499 Explore P: 0.5398
Episode: 363 Total reward: 66.0 Training loss: 8.9419 Explore P: 0.5363
Episode: 364 Total reward: 19.0 Training loss: 9.0823 Explore P: 0.5353
Episode: 365 Total reward: 21.0 Training loss: 6.3500 Explore P: 0.5342
Episode: 366 Total reward: 52.0 Training loss: 9.2004 Explore P: 0.5315
Episode: 367 Total reward: 54.0 Training loss: 20.7629 Explore P: 0.5287
Episode: 368 Total reward: 38.0 Training loss: 1.7379 Explore P: 0.5267
Episode: 369 Total reward: 25.0 Training loss: 5.8685 Explore P: 0.5254
Episode: 370 Total reward: 80.0 Training loss: 1.8076 Explore P: 0.5213
Episode: 371 Total reward: 41.0 Training loss: 1.6989 Explore P: 0.5192
Episode: 372 Total reward: 96.0 Training loss: 7.6388 Explore

Episode: 475 Total reward: 174.0 Training loss: 1.5152 Explore P: 0.2549
Episode: 476 Total reward: 101.0 Training loss: 52.5082 Explore P: 0.2525
Episode: 477 Total reward: 64.0 Training loss: 32.0341 Explore P: 0.2509
Episode: 478 Total reward: 92.0 Training loss: 1.9479 Explore P: 0.2487
Episode: 479 Total reward: 88.0 Training loss: 1.0063 Explore P: 0.2466
Episode: 480 Total reward: 129.0 Training loss: 39.5081 Explore P: 0.2436
Episode: 481 Total reward: 147.0 Training loss: 0.9590 Explore P: 0.2402
Episode: 483 Total reward: 22.0 Training loss: 2.5352 Explore P: 0.2351
Episode: 485 Total reward: 43.0 Training loss: 1.7627 Explore P: 0.2297
Episode: 486 Total reward: 130.0 Training loss: 1.0097 Explore P: 0.2269
Episode: 487 Total reward: 159.0 Training loss: 1.9966 Explore P: 0.2235
Episode: 488 Total reward: 73.0 Training loss: 0.8327 Explore P: 0.2219
Episode: 489 Total reward: 136.0 Training loss: 1.1225 Explore P: 0.2191
Episode: 491 Total reward: 17.0 Training loss: 0.9376 

Episode: 692 Total reward: 89.0 Training loss: 0.2360 Explore P: 0.0209
Episode: 694 Total reward: 102.0 Training loss: 0.1549 Explore P: 0.0206
Episode: 696 Total reward: 22.0 Training loss: 0.1895 Explore P: 0.0204
Episode: 698 Total reward: 15.0 Training loss: 0.1540 Explore P: 0.0201
Episode: 700 Total reward: 82.0 Training loss: 0.0952 Explore P: 0.0199
Episode: 701 Total reward: 172.0 Training loss: 0.1240 Explore P: 0.0197
Episode: 703 Total reward: 80.0 Training loss: 0.1355 Explore P: 0.0194
Episode: 705 Total reward: 11.0 Training loss: 0.0967 Explore P: 0.0192
Episode: 707 Total reward: 177.0 Training loss: 0.1278 Explore P: 0.0189
Episode: 709 Total reward: 35.0 Training loss: 0.0724 Explore P: 0.0187
Episode: 710 Total reward: 190.0 Training loss: 0.3937 Explore P: 0.0185
Episode: 712 Total reward: 131.0 Training loss: 0.1341 Explore P: 0.0182
Episode: 715 Total reward: 37.0 Training loss: 0.0995 Explore P: 0.0179
Episode: 716 Total reward: 133.0 Training loss: 0.4001 Expl

Episode: 946 Total reward: 15.0 Training loss: 795.0436 Explore P: 0.0103
Episode: 949 Total reward: 99.0 Training loss: 1.6740 Explore P: 0.0103
Episode: 950 Total reward: 50.0 Training loss: 2.3543 Explore P: 0.0103
Episode: 951 Total reward: 43.0 Training loss: 3.4401 Explore P: 0.0103
Episode: 952 Total reward: 20.0 Training loss: 1.9404 Explore P: 0.0103
Episode: 953 Total reward: 13.0 Training loss: 2.2597 Explore P: 0.0103
Episode: 954 Total reward: 16.0 Training loss: 1.9588 Explore P: 0.0103
Episode: 955 Total reward: 14.0 Training loss: 586.8810 Explore P: 0.0103
Episode: 956 Total reward: 14.0 Training loss: 2.4387 Explore P: 0.0103
Episode: 957 Total reward: 10.0 Training loss: 4.0468 Explore P: 0.0103
Episode: 958 Total reward: 9.0 Training loss: 2.4275 Explore P: 0.0103
Episode: 959 Total reward: 12.0 Training loss: 3.2327 Explore P: 0.0103
Episode: 960 Total reward: 12.0 Training loss: 2.6738 Explore P: 0.0103
Episode: 961 Total reward: 9.0 Training loss: 2.1138 Explore 

In [35]:
def play(env_name, agent):
    env = gym.make(env_name)
    try:
        with tf.Session() as sess:
            saver.restore(sess, "checkpoints/cartpole-them.ckpt")
            state = env.reset()
            action = env.action_space.sample()
            env.render()
            state, reward, done, info = env.step(action)
            env.render()
            done = False
            total_reward = 0
            total_reward += reward
            while not done:
                feed = {agent.inputs_: state.reshape((1, *state.shape))}
                Qs = sess.run(agent.output, feed_dict=feed)
                action = np.argmax(Qs)
                state, reward, done, info = env.step(action)
                total_reward += reward
                print(state, action, reward)
                env.render()
            env.close()
            print(total_reward)
    finally:
        env.close()

In [36]:
play('CartPole-v1', mainQN)

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
INFO:tensorflow:Restoring parameters from checkpoints/cartpole.ckpt


NotFoundError: Key main/beta1_power not found in checkpoint
	 [[Node: save/RestoreV2 = RestoreV2[dtypes=[DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, ..., DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT], _device="/job:localhost/replica:0/task:0/device:CPU:0"](_arg_save/Const_0_0, save/RestoreV2/tensor_names, save/RestoreV2/shape_and_slices)]]

Caused by op 'save/RestoreV2', defined at:
  File "/Users/uniyomi/omi/miniconda3/envs/py3/lib/python3.6/runpy.py", line 193, in _run_module_as_main
    "__main__", mod_spec)
  File "/Users/uniyomi/omi/miniconda3/envs/py3/lib/python3.6/runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "/Users/uniyomi/omi/miniconda3/envs/py3/lib/python3.6/site-packages/ipykernel/__main__.py", line 3, in <module>
    app.launch_new_instance()
  File "/Users/uniyomi/omi/miniconda3/envs/py3/lib/python3.6/site-packages/traitlets/config/application.py", line 658, in launch_instance
    app.start()
  File "/Users/uniyomi/omi/miniconda3/envs/py3/lib/python3.6/site-packages/ipykernel/kernelapp.py", line 478, in start
    self.io_loop.start()
  File "/Users/uniyomi/omi/miniconda3/envs/py3/lib/python3.6/site-packages/zmq/eventloop/ioloop.py", line 177, in start
    super(ZMQIOLoop, self).start()
  File "/Users/uniyomi/omi/miniconda3/envs/py3/lib/python3.6/site-packages/tornado/ioloop.py", line 888, in start
    handler_func(fd_obj, events)
  File "/Users/uniyomi/omi/miniconda3/envs/py3/lib/python3.6/site-packages/tornado/stack_context.py", line 277, in null_wrapper
    return fn(*args, **kwargs)
  File "/Users/uniyomi/omi/miniconda3/envs/py3/lib/python3.6/site-packages/zmq/eventloop/zmqstream.py", line 440, in _handle_events
    self._handle_recv()
  File "/Users/uniyomi/omi/miniconda3/envs/py3/lib/python3.6/site-packages/zmq/eventloop/zmqstream.py", line 472, in _handle_recv
    self._run_callback(callback, msg)
  File "/Users/uniyomi/omi/miniconda3/envs/py3/lib/python3.6/site-packages/zmq/eventloop/zmqstream.py", line 414, in _run_callback
    callback(*args, **kwargs)
  File "/Users/uniyomi/omi/miniconda3/envs/py3/lib/python3.6/site-packages/tornado/stack_context.py", line 277, in null_wrapper
    return fn(*args, **kwargs)
  File "/Users/uniyomi/omi/miniconda3/envs/py3/lib/python3.6/site-packages/ipykernel/kernelbase.py", line 281, in dispatcher
    return self.dispatch_shell(stream, msg)
  File "/Users/uniyomi/omi/miniconda3/envs/py3/lib/python3.6/site-packages/ipykernel/kernelbase.py", line 232, in dispatch_shell
    handler(stream, idents, msg)
  File "/Users/uniyomi/omi/miniconda3/envs/py3/lib/python3.6/site-packages/ipykernel/kernelbase.py", line 397, in execute_request
    user_expressions, allow_stdin)
  File "/Users/uniyomi/omi/miniconda3/envs/py3/lib/python3.6/site-packages/ipykernel/ipkernel.py", line 208, in do_execute
    res = shell.run_cell(code, store_history=store_history, silent=silent)
  File "/Users/uniyomi/omi/miniconda3/envs/py3/lib/python3.6/site-packages/ipykernel/zmqshell.py", line 533, in run_cell
    return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
  File "/Users/uniyomi/omi/miniconda3/envs/py3/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 2728, in run_cell
    interactivity=interactivity, compiler=compiler, result=result)
  File "/Users/uniyomi/omi/miniconda3/envs/py3/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 2850, in run_ast_nodes
    if self.run_code(code, result):
  File "/Users/uniyomi/omi/miniconda3/envs/py3/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 2910, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-30-9d23205b3d1b>", line 34, in <module>
    saver = tf.train.Saver()
  File "/Users/uniyomi/omi/miniconda3/envs/py3/lib/python3.6/site-packages/tensorflow/python/training/saver.py", line 1311, in __init__
    self.build()
  File "/Users/uniyomi/omi/miniconda3/envs/py3/lib/python3.6/site-packages/tensorflow/python/training/saver.py", line 1320, in build
    self._build(self._filename, build_save=True, build_restore=True)
  File "/Users/uniyomi/omi/miniconda3/envs/py3/lib/python3.6/site-packages/tensorflow/python/training/saver.py", line 1357, in _build
    build_save=build_save, build_restore=build_restore)
  File "/Users/uniyomi/omi/miniconda3/envs/py3/lib/python3.6/site-packages/tensorflow/python/training/saver.py", line 809, in _build_internal
    restore_sequentially, reshape)
  File "/Users/uniyomi/omi/miniconda3/envs/py3/lib/python3.6/site-packages/tensorflow/python/training/saver.py", line 448, in _AddRestoreOps
    restore_sequentially)
  File "/Users/uniyomi/omi/miniconda3/envs/py3/lib/python3.6/site-packages/tensorflow/python/training/saver.py", line 860, in bulk_restore
    return io_ops.restore_v2(filename_tensor, names, slices, dtypes)
  File "/Users/uniyomi/omi/miniconda3/envs/py3/lib/python3.6/site-packages/tensorflow/python/ops/gen_io_ops.py", line 1458, in restore_v2
    shape_and_slices=shape_and_slices, dtypes=dtypes, name=name)
  File "/Users/uniyomi/omi/miniconda3/envs/py3/lib/python3.6/site-packages/tensorflow/python/framework/op_def_library.py", line 787, in _apply_op_helper
    op_def=op_def)
  File "/Users/uniyomi/omi/miniconda3/envs/py3/lib/python3.6/site-packages/tensorflow/python/framework/ops.py", line 3290, in create_op
    op_def=op_def)
  File "/Users/uniyomi/omi/miniconda3/envs/py3/lib/python3.6/site-packages/tensorflow/python/framework/ops.py", line 1654, in __init__
    self._traceback = self._graph._extract_stack()  # pylint: disable=protected-access

NotFoundError (see above for traceback): Key main/beta1_power not found in checkpoint
	 [[Node: save/RestoreV2 = RestoreV2[dtypes=[DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, ..., DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT], _device="/job:localhost/replica:0/task:0/device:CPU:0"](_arg_save/Const_0_0, save/RestoreV2/tensor_names, save/RestoreV2/shape_and_slices)]]


In [12]:
env.close()

In [3]:
import numpy as np
x = np.array(range(16)).reshape((4, 4))
x[1] *= 0
np.zeros(x.shape[1:])

array([0., 0., 0., 0.])

In [4]:
def one_hot(values, n_values:int):
    return np.eye(n_values)[values]

In [14]:
x = one_hot(np.array([[2, 1], [2, 3]]), 4)
x

array([[[0., 0., 1., 0.],
        [0., 1., 0., 0.]],

       [[0., 0., 1., 0.],
        [0., 0., 0., 1.]]])

In [17]:
x[0, 0, 0] = 1
x

array([[[1., 0., 1., 0.],
        [0., 1., 0., 0.]],

       [[0., 0., 1., 0.],
        [0., 0., 0., 1.]]])