In [1]:
import tensorflow as tf
import numpy as np
import gym
import random
from collections import deque



In [2]:
num_episodes = 500
num_exploration_episodes = 100  
max_len_episode = 10000 
batch_size = 32           
gamma = 1.  
final_epsilon = 0.01
initial_epsilon = 1.         # epsilon for epsilon-greedy: epsilon probability to explore

In [3]:
# class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
#     def __init__(self):
#         super(CustomSchedule, self).__init__()
    
#     def __call__(self, step):

#         return 1.0e-3*(0.99**step)

In [4]:
learning_rate = 0.001

In [None]:
class QNetwork(tf.keras.Model):
    def __init__(self):
        super().__init__()
        self.dense1 = tf.keras.layers.Dense(units=24, activation=tf.nn.relu)
        self.dense2 = tf.keras.layers.Dense(units=24, activation=tf.nn.relu)
        self.dense3 = tf.keras.layers.Dense(units=2)

    def call(self, inputs):
        x = self.dense1(inputs)
        x = self.dense2(x)
        x = self.dense3(x)
        return x

    def predict(self, inputs):
        q_values = self(inputs)
        return tf.argmax(q_values, axis=-1)

In [None]:
env = gym.make('CartPole-v1') 
model = QNetwork()
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
replay_buffer = deque(maxlen=10000)
for episode_id in range(num_episodes):
    state = env.reset()             # init state
    epsilon = max(                  # dynamic changing epsilon (explore rate for epsilon greedy)
        initial_epsilon * (num_exploration_episodes - episode_id) / num_exploration_episodes,
        final_epsilon)
    for t in range(max_len_episode):
        env.render()                                # draw
        if random.random() < epsilon:               # by random choose explore
            action = env.action_space.sample()      # random pick one action
        else: # exploit 
            action = model.predict(np.expand_dims(state, axis=0)).numpy() 
            action = action[0]

        # excute action: return next_state, reward, if game is over, and extra info
        next_state, reward, done, info = env.step(action)
        if done:
            reward = -10.  # game over => negative reward
        # append (state, action, reward, next_state, done) to replay_buffer
        replay_buffer.append((state, action, reward, next_state, 1 if done else 0))
        # update state
        state = next_state

        if done:
#             print("episode %d, epsilon %f, score %d" % (episode_id, epsilon, t))
            print("episode %d, score %d" % (episode_id, t))
            break

        if len(replay_buffer) >= batch_size:
            # get a batch of state, action, reward, ext_state, done
            batch_state, batch_action, batch_reward, batch_next_state, batch_done = zip(
                *random.sample(replay_buffer, batch_size))
            batch_state, batch_reward, batch_next_state, batch_done = \
                [np.array(a, dtype=np.float32) for a in [batch_state, batch_reward, batch_next_state, batch_done]]
            batch_action = np.array(batch_action, dtype=np.int32)

            q_value = model(batch_next_state)
            # Bellman Equation
            y = batch_reward + (gamma * tf.reduce_max(q_value, axis=1)) * (1 - batch_done)
            with tf.GradientTape() as tape:
                loss = tf.keras.losses.mean_squared_error(  # Square
                    y_true=y,
                    y_pred=tf.reduce_sum(model(batch_state) * tf.one_hot(batch_action, depth=2), axis=1)
                )
            grads = tape.gradient(loss, model.variables)
            optimizer.apply_gradients(grads_and_vars=zip(grads, model.variables))       

episode 0, score 19
episode 1, score 32
episode 2, score 62
episode 3, score 14


To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.





To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.



episode 4, score 13
episode 5, score 44
episode 6, score 10
episode 7, score 30
episode 8, score 12
episode 9, score 19
episode 10, score 25
episode 11, score 13
episode 12, score 10
episode 13, score 10
episode 14, score 15
episode 15, score 14
episode 16, score 15
episode 17, score 21
episode 18, score 18
episode 19, score 19
episode 20, score 23
episode 21, score 35
episode 22, score 10
episode 23, score 18
episode 24, score 18
episode 25, score 12
episode 26, score 15
episode 27, score 17
episode 28, score 17
episode 29, score 18
episode 30, score 14
episode 31, score 20
episode 32, score 8
episode 33, score 17
episode 34, score 18
episode 35, score 14
episode 36, score 15
episode 37, score 7
episode 38, score 12
episode 39, score 17
episode 40, score 9
episode 41, score 14
episode 42, score 11
episode 43, score 12
episode 44, score 19
episode 45, score 13
episode 46, score 17
episode 47, score 17
episode 48, score 76
episode 49, score 29
episode 50, score 16
episode 51, score 31
e

episode 379, score 10
episode 380, score 10
episode 381, score 11
episode 382, score 11
episode 383, score 8
episode 384, score 9
episode 385, score 9
episode 386, score 8
episode 387, score 9
episode 388, score 8
episode 389, score 9
episode 390, score 10
episode 391, score 8
episode 392, score 9
episode 393, score 9
episode 394, score 9
episode 395, score 8
episode 396, score 11
episode 397, score 11
episode 398, score 12
episode 399, score 16
episode 400, score 200
episode 401, score 9
episode 402, score 9
episode 403, score 8
episode 404, score 9
episode 405, score 9
episode 406, score 7
episode 407, score 7
episode 408, score 7
episode 409, score 9
episode 410, score 7
episode 411, score 9
episode 412, score 8
episode 413, score 9
episode 414, score 9
episode 415, score 11
episode 416, score 9
episode 417, score 10
episode 418, score 10
episode 419, score 7
episode 420, score 8
episode 421, score 9
episode 422, score 8
episode 423, score 8
episode 424, score 7
episode 425, score 7