# Double DQN (DDQN) Taxi with gym and TensorFlow

In [0]:
## setup

import gym
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

import numpy as np
import random
import time

In [0]:
## hyperparameter

ENV_NAME = 'Taxi-v3'

# seed for initial position in environment
SEED = 123

# the learning rate used by RMSProp in "human-level control through deep reinforcement learning"
LEARNING_RATE = 0.00025
MOMENTUM = 0.95

# taxi environment
STATE_SIZE = 1
ACTION_SIZE = 6

# soft target update
# value used in "continuous control with deep reinforcement learning"
TAU = 0.001 

BATCH_SIZE = 32

# gamma in Bellman equation
GAMMA = 0.99

# epsilon in epsilon greedy algorithm
# we implement epsilon decay
EPSILON = 1.0
# EPSILON_DECAY = 0.99999
EPSILON_DECAY = 0.99995
# EPSILON_MIN = 0.1
EPSILON_MIN = 0.01

# max step in each episode
T_RANGE = 201

# taxi environment
STATE_SIZE = 1
ACTION_SIZE = 6

# training
# EPISODES = 5000
# MONITOR_INTERVAL = 100
EPISODES = 100
# MONITOR_INTERVAL = 10
MONITOR_INTERVAL = 1

In [36]:
## environment

env = gym.make(ENV_NAME)
# env.seed(SEED)
env.render()

+---------+
|[35mR[0m: | : :[43mG[0m|
| : | : : |
| : : : : |
| | : | : |
|[34;1mY[0m| : |B: |
+---------+



In [0]:
## experience replay

class Replay:
    def __init__(self):
        self.buffer = []
        self.length = 0
        self.max_length = 10000

    def write(self, data):
        if self.length >= self.max_length:
            self.buffer.pop(0)
            self.length -= 1
        self.buffer.append(data)
        self.length += 1

    def read(self, batch_size):
        # at beginning buffer is almost empty, so batch is smaller than batch_size
        return random.sample(self.buffer, min(batch_size, self.length))

In [0]:
## network

# we use the same architectures for online_network and target_network
# we _build_model 2 times

class Network:
    def __init__(self, n_in, n_out):
        self.n_in = n_in
        self.n_out = n_out

    def _build_model(self):
        model = Sequential()
        model.add(Dense(24, input_shape = (self.n_in,), activation = 'relu'))
        model.add(Dense(48, activation = 'relu'))
        model.add(Dense(self.n_out, activation = 'linear'))

        optimizer = tf.keras.optimizers.RMSprop(LEARNING_RATE)
        model.compile(loss = 'mse', optimizer = optimizer)
        
        # debug
        # print("compiled model")
        
        return model

In [0]:
## agent

class Agent:
    def __init__(self):
        self.n_in = STATE_SIZE
        self.n_out = ACTION_SIZE
        self.total_reward = 0
        self.gamma = GAMMA
        self.tau = TAU
        self.epsilon = EPSILON
        self.epsilon_min = EPSILON_MIN
        self.epsilon_decay = EPSILON_DECAY
        self.batch_size = BATCH_SIZE
        self.replay_buffer = Replay()
        self.online_model = Network(self.n_in, self.n_out)._build_model()
        self.target_model = Network(self.n_in, self.n_out)._build_model()

    def gather_experience(self, last_observation, action, reward, observation):
        self.replay_buffer.write((last_observation, action, reward, observation))

    # return action index
    def choose_action(self, observation):
        # epsilon greedy policy is performed here
        # exploitation
        # np.random.rand is uniform [0,1]
        if np.random.rand() > self.epsilon:
            observation = self.reshape_state(observation)
            return np.argmax(self.online_model.predict(observation))
            
        # exploration
        else:
            # random action from 0 to 5 out of 6 actions
            return int(np.random.randint(low = 0, high = ACTION_SIZE-1, size = 1, dtype = 'int'))

    # set total reward
    def set_total_reward(self, new_total):
        self.total_reward = new_total

        # debug
        # print("initialized total reward")

    # gather reward
    def gather_reward(self, reward):
        self.total_reward += reward

    # get total rewards
    def get_total_reward(self):
        return self.total_reward

    # we start from large epsilon and gradually decay the epsilon in each episode
    def decay_epsilon(self):
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay
            
            # debug
            # print("decayed epsilon")
            
    # reshape state for model
    def reshape_state(self, state):
        return np.reshape(state, newshape = (self.n_in, ))

    # update q values
    # train online model
    def q_update(self):
        # batch to update q values and train online model
        batch = self.replay_buffer.read(self.batch_size)
        
        y_batch = []
        
        for b in batch:
            
            # debug
            # print("working?")
            
            last_observation, action, reward, observation = b
            
            last_observation = self.reshape_state(last_observation)
            # [0] because tf nn output is list of list
            q_last = self.online_model.predict(last_observation)[0]
            
            if observation is None:
                # TODO 20191209 Yuki
                q_this = reward
            else:
                observation = self.reshape_state(observation)
                
                # Double DQN logic ---------------------------------------------------------
                # select action by online model
                action_online = np.argmax(self.online_model.predict(observation))
                # evaluate action by target model
                q_this_target = self.target_model.predict(observation)[0][action_online]
                # Bellman equation
                q_this = reward + self.gamma * q_this_target
                # --------------------------------------------------------------------------
                
            # update q values
            q_last[action] = q_this
            # store y data
            y_batch.append(q_last)
            
        # numpy for tf nn model
        # b[0] is last_observation
        x_batch = np.array([b[0] for b in batch])
        y_batch = np.array(y_batch)
        
        # train online model
        history = self.online_model.fit(x_batch, y_batch, epochs = 1, verbose = 0)
        
        # debug
        # print("tained online model")
        
        # return online model loss
        return history.history['loss'][0]
        
    # update target model
    def update_target_model(self):
        # get_weights returns list of weights of each layer
        theta_online = self.online_model.get_weights()
        theta_target = self.target_model.get_weights()
        
        # soft target update from "continuous control with DRL"
        counter = 0
        for weight_online, weight_target in zip(theta_online, theta_target):
            # This equations need to be compared with paper
            # target weight is a weighted average of target weight and online weight
            weight_target = weight_target * (1 - self.tau) + weight_online * self.tau
            # update target weight
            theta_target[counter] = weight_target
            # iterate
            counter += 1
        
        # update target model
        self.target_model.set_weights(theta_target)
        
        # debug
        # print("updated target model")

In [0]:
## experience replay

class Replay:
    def __init__(self):
        self.buffer = []
        self.length = 0
        self.max_length = 10000

    def write(self, data):
        if self.length >= self.max_length:
            self.buffer.pop(0)
            self.length -= 1
        self.buffer.append(data)
        self.length += 1

    def read(self, batch_size):
        # at beginning buffer is almost empty, so batch is smaller than batch_size
        return random.sample(self.buffer, min(batch_size, self.length))

In [58]:
## training

agent = Agent()
ep_rewards = []
losses = []
start_time = time.time()

for ep in range(EPISODES):

    # initialize
    # env.reset() in taxi returns index of states out of 500
    last_observation = env.reset()
    agent.set_total_reward(0)

    # iterations within an episode
    for t in range(T_RANGE):

        # draw action
        action = agent.choose_action(last_observation)
        # draw next state and reward
        observation, reward, done, info = env.step(action)

        # when taxi drop a passenger at destination, done = True
        if done == True:
            observation = None

        # accumulate reward
        agent.gather_reward(reward)
        agent.gather_experience(last_observation, action, reward, observation)

        # update q values
        # train online model
        loss = agent.q_update()
        
        # update target model
        agent.update_target_model()
        
        # iterate
        last_observation = observation

        # goal
        if done == True:
            ep_rewards.append(agent.get_total_reward())
            break

    # store last loss of online model
    losses.append(loss)

    # In each episode we decay epsilon
    agent.decay_epsilon()

    # Monitor total reward during episodes
    if ep % MONITOR_INTERVAL == 0:
        print("episode:", ep,
              "reward:", agent.get_total_reward(),
              "loss:", np.round(loss, decimals = 3), 
              "epsilon:", np.round(agent.epsilon, decimals = 5),
              "time: {} seconds".format(np.round(time.time() - start_time, decimals = 0)))

episode: 0 reward: -551 loss: 1.593 epsilon: 0.99995 time: 28.0 seconds
episode: 1 reward: -560 loss: 0.884 epsilon: 0.9999 time: 56.0 seconds
episode: 2 reward: -587 loss: 0.99 epsilon: 0.99985 time: 84.0 seconds
episode: 3 reward: -560 loss: 1.052 epsilon: 0.9998 time: 112.0 seconds
episode: 4 reward: -587 loss: 0.805 epsilon: 0.99975 time: 140.0 seconds
episode: 5 reward: -506 loss: 1.851 epsilon: 0.9997 time: 168.0 seconds
episode: 6 reward: -560 loss: 4.794 epsilon: 0.99965 time: 195.0 seconds
episode: 7 reward: -542 loss: 6.277 epsilon: 0.9996 time: 223.0 seconds
episode: 8 reward: -488 loss: 7.406 epsilon: 0.99955 time: 251.0 seconds
episode: 9 reward: -515 loss: 7.368 epsilon: 0.9995 time: 279.0 seconds
episode: 10 reward: -614 loss: 20.849 epsilon: 0.99945 time: 307.0 seconds
episode: 11 reward: -587 loss: 36.804 epsilon: 0.9994 time: 336.0 seconds
episode: 12 reward: -542 loss: 17.531 epsilon: 0.99935 time: 363.0 seconds


KeyboardInterrupt: ignored

In [0]:
## debug q update

# experiences = []
# # (last_observation, action, reward, observation)
# experiences.append((1, 0, -1, 2))
# experiences.append((2, 1, -1, 3))
# experiences.append((3, 2, -1, 4))
# # print(experiences)

# BATCH_SIZE = 2

# net = Network(1, 6)
# online_model = net._build_model()
# target_model = net._build_model()

# # reshape state for model
# def reshape_state(state):
#     return np.reshape(state, newshape = (STATE_SIZE, -1))


# # def q_update(self):
# def q_update():
#     # sample batch
#     batch = random.sample(experiences, BATCH_SIZE)
#     print(batch)
    
#     # [0] because list of list
#     last_state_input = reshape_state([s[0] for s in batch])[0]
#     state_input = reshape_state([b[3] for b in batch if b[3] is not None])[0]
#     print(last_state_input)
#     print(state_input)
    
#     q_last = online_model.predict(last_state_input)
#     print(q_last)
    
#     q_this = np.zeros_like(q_last)
#     print(q_this)
    
#     ind_not_none = [i for i in range(np.shape(batch)[0]) if batch[i][3] is not None]
#     print(ind_not_none)
    
#     action_online_model = np.argmax(online_model.predict(state_input), axis = 1)
#     print("action_online_model", action_online_model)
#     # print(online_model.predict(state_input))
    
#     print("target_model.predict(state_input) \n", target_model.predict(state_input))
#     pred_target_model = target_model.predict(state_input)
#     # print(pred_target_model)
#     # print([outputs for outputs in pred_target_model])
    
#     q_value_target_model = []
#     for i in range(len(action_online_model)):
#         tmp = pred_target_model[i][action_online_model[i]]
#         q_value_target_model.append(tmp)
#     print(q_value_target_model)
    
# q_update()

In [0]:
# # easier version of q update

# experiences = []
# random.seed(1)
# # (last_observation, action, reward, observation)
# experiences.append((1, 0, -1, 2))
# experiences.append((2, 1, -1, 3))
# experiences.append((3, 2, -1, 4))
# # print(experiences)

# BATCH_SIZE = 2

# net = Network(1, 6)
# online_model = net._build_model()
# target_model = net._build_model()

# # reshape state for model
# def reshape_state(state):
#     # return np.reshape(state, newshape = (STATE_SIZE, -1))
#     return np.reshape(state, newshape = (STATE_SIZE, ))


# # def q_update(self):
# def q_update():
#     # sample batch
#     batch = random.sample(experiences, BATCH_SIZE)
#     # print(batch)
    
#     minibatch_new_q_values = []
    
#     for experience in batch:
#         last_state, action, reward, state = experience
        
#         last_state = reshape_state(last_state)
#         # [0] for list of list
#         experience_new_q_values = online_model.predict(last_state)[0]
#         # print(experience_new_q_values)
        
#         if state is None:
#             q_update = reward
#         else:
#             state = reshape_state(state)
#             action_online = np.argmax(online_model.predict(state))
#             # print("action_online", action_online)
#             # [0] for list of list
#             q_value_target = target_model.predict(state)[0][action_online]
#             # print(q_value_target)
#             q_update = reward + GAMMA * q_value_target
        
#         # print("before", experience_new_q_values)
#         experience_new_q_values[action] = q_update
#         # print("after", experience_new_q_values)
#         minibatch_new_q_values.append(experience_new_q_values)
#         # print(minibatch_new_q_values)
        
#     x_batch = np.array([b[0] for b in batch])
#     print(x_batch)
#     # x_batch = reshape_state(x_batch)
#     # print(x_batch)
#     y_batch = np.array(minibatch_new_q_values)
#     print(y_batch)
    
#     online_model.fit(x_batch, y_batch, epochs = 1)
    
# q_update()

In [0]:
# ## debug neural network model

# net = Network(1, 6)
# model = net._build_model()
# model.summary()

# data = 1
# tmp = np.reshape(data, newshape = (STATE_SIZE, -1))
# print(tmp.shape)
# print(model.predict(tmp))

# tmp = model.get_weights()
# # print(type(tmp))
# # print(len(tmp))
# # print(tmp[0])
# # print(len(tmp[0][0]))
# # print(tmp[1])
# print(tmp[0].shape)
# print(tmp[1].shape)
# print(tmp[2].shape)
# print(tmp[3].shape)
# print(tmp[4].shape)
# print(tmp[5].shape)
# # print(tmp)