# Double DQN (DDQN) Taxi with gym and TensorFlow

In [43]:
## setup

import random
from collections import deque

import gym
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential

In [82]:
## hyperparameter

# Name of the gym environment to train on.
ENV_NAME = 'Taxi-v3'

# seed for initial position in environment
SEED = 123

# the learning rate used by RMSProp in "human-level control through deep reinforcement learning"
LEARNING_RATE = 0.00025
# NOTE(review): MOMENTUM is defined here but is not referenced anywhere else
# in the visible notebook -- confirm whether the optimizer should receive it.
MOMENTUM = 0.95

# taxi environment: the state is fed to the network as a single number and
# there are six discrete actions (defined once; the notebook used to repeat
# these two assignments further down)
STATE_SIZE = 1
ACTION_SIZE = 6

# soft target update
# value used in "continuous control with deep reinforcement learning"
TAU = 0.001

# number of transitions sampled from the replay buffer per Q update
BATCH_SIZE = 32

# gamma (discount factor) in Bellman equation
GAMMA = 0.99

# epsilon in epsilon greedy algorithm
# we implement epsilon decay: epsilon starts at EPSILON, is multiplied by
# EPSILON_DECAY on every decay step and stops shrinking at EPSILON_MIN
EPSILON = 1.0
# previously tried: EPSILON_DECAY = 0.99999
EPSILON_DECAY = 0.99995
# previously tried: EPSILON_MIN = 0.1
EPSILON_MIN = 0.01

# max step in each episode
T_RANGE = 201

# training: number of episodes and how often (in episodes) progress is printed
# (for a quick smoke test use EPISODES = 100, MONITOR_INTERVAL = 10)
EPISODES = 5000
MONITOR_INTERVAL = 100

In [84]:
## environment

# Create the environment named by ENV_NAME; `env` is shared by the cells below.
env = gym.make(ENV_NAME)
# Seeding is currently disabled (SEED is defined but not applied here).
# env.seed(SEED)
# Draw the current grid once as a visual sanity check.
env.render()

+---------+
|[35mR[0m: | : :G|
| : | : : |
| : : : : |
| | : |[43m [0m: |
|[34;1mY[0m| : |B: |
+---------+



In [80]:
## experience replay

class Replay:
    """Fixed-capacity FIFO experience replay buffer.

    Transitions are appended with ``write``; once ``max_length`` items are
    stored, each new item evicts the oldest one. ``read`` draws a uniform
    random mini-batch without replacement.

    Attributes:
        buffer: the stored transitions, oldest first.
        length: number of transitions currently stored.
        max_length: capacity of the buffer.
    """

    def __init__(self, max_length=10000):
        """Create an empty buffer holding at most ``max_length`` transitions."""
        self.max_length = max_length
        # deque(maxlen=...) drops the oldest item in O(1) when full, whereas
        # list.pop(0) has to shift every remaining element.
        self.buffer = deque(maxlen=max_length)
        self.length = 0

    def write(self, data):
        """Store one transition, evicting the oldest one if the buffer is full."""
        self.buffer.append(data)
        self.length = len(self.buffer)

    def read(self, batch_size):
        """Return up to ``batch_size`` transitions sampled without replacement.

        At the beginning the buffer is almost empty, so the returned batch can
        be smaller than ``batch_size`` (and is an empty list for an empty
        buffer).
        """
        return random.sample(self.buffer, min(batch_size, len(self.buffer)))

In [89]:
## network

# The online network and the target network share the same architecture,
# so _build_model is called twice: once for each network.

class Network:
    """Builder for the fully connected Q-value network.

    Holds the input width (``n_in``) and the number of outputs (``n_out``);
    ``_build_model`` returns a freshly compiled Keras model each time it is
    called.
    """

    def __init__(self, n_in, n_out):
        self.n_in, self.n_out = n_in, n_out

    def _build_model(self):
        """Return a new compiled model: two ReLU hidden layers and a linear head.

        The model is trained with mean squared error and RMSprop at
        LEARNING_RATE.
        """
        layers = [
            Dense(24, input_shape=(self.n_in,), activation='relu'),
            Dense(48, activation='relu'),
            # one linear output per action
            Dense(self.n_out, activation='linear'),
        ]
        model = Sequential(layers)
        model.compile(
            loss='mse',
            optimizer=tf.keras.optimizers.RMSprop(LEARNING_RATE),
        )
        return model

In [53]:
## agent

class Agent:
    """Double DQN agent with epsilon-greedy exploration.

    The agent owns a replay buffer, an online Q-network that is trained on
    sampled mini-batches, and a target Q-network that slowly tracks the online
    one through soft updates. ``q_update`` trains the online network only;
    ``update_target_model`` must be called separately to move the target
    network.
    """

    def __init__(self):
        """Build the agent from the module-level hyperparameters."""
        self.n_in = STATE_SIZE
        self.n_out = ACTION_SIZE
        self.total_reward = 0
        self.gamma = GAMMA
        self.tau = TAU
        self.epsilon = EPSILON
        self.epsilon_min = EPSILON_MIN
        self.epsilon_decay = EPSILON_DECAY
        self.batch_size = BATCH_SIZE
        self.replay_buffer = Replay()
        self.online_model = Network(self.n_in, self.n_out)._build_model()
        self.target_model = Network(self.n_in, self.n_out)._build_model()
        # Start both networks from identical weights; with a small tau the
        # target would otherwise stay close to its own random initialization
        # for a very long time.
        self.target_model.set_weights(self.online_model.get_weights())

    def gather_experience(self, last_observation, action, reward, observation):
        """Store one transition; ``observation`` is None for a terminal step."""
        self.replay_buffer.write((last_observation, action, reward, observation))

    def choose_action(self, observation):
        """Return an action index chosen by the epsilon-greedy policy.

        With probability ``epsilon`` a uniformly random action is returned,
        otherwise the action with the largest online-network Q-value.
        """
        # np.random.rand is uniform on [0, 1)
        if np.random.rand() > self.epsilon:
            # exploitation
            q_values = self.online_model.predict(
                self.reshape_state(observation), verbose=0
            )
            return int(np.argmax(q_values[0]))
        # exploration: `high` is exclusive, so this covers 0 .. n_out - 1
        return int(np.random.randint(low=0, high=self.n_out))

    def set_total_reward(self, new_total):
        """Overwrite the accumulated episode reward."""
        self.total_reward = new_total

    def gather_reward(self, reward):
        """Add ``reward`` to the accumulated episode reward."""
        self.total_reward += reward

    def get_total_reward(self):
        """Return the accumulated episode reward."""
        return self.total_reward

    def decay_epsilon(self):
        """Multiply epsilon by the decay factor, never going below epsilon_min."""
        self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)

    def reshape_state(self, state):
        """Return ``state`` as a float32 array of shape (batch, n_in).

        Accepts a single state or a sequence of states; the networks expect
        one row per sample.
        """
        return np.reshape(np.asarray(state, dtype=np.float32), (-1, self.n_in))

    def _double_dqn_targets(self, batch, q_last, ind_not_none,
                            q_next_online, q_next_target):
        """Compute the regression targets for one mini-batch.

        Args:
            batch: list of (last_observation, action, reward, observation).
            q_last: online-network Q-values of the previous states,
                shape (len(batch), n_out).
            ind_not_none: indices into ``batch`` of non-terminal transitions.
            q_next_online: online-network Q-values of the next states of the
                non-terminal transitions (same order as ``ind_not_none``),
                or None when there are none.
            q_next_target: target-network Q-values of the same next states.

        Returns:
            Array shaped like ``q_last`` in which only the entry of the taken
            action differs: it holds ``reward + gamma * Q_target(s', a*)``
            with ``a* = argmax_a Q_online(s', a)`` (just ``reward`` for
            terminal transitions).
        """
        # Start from the current predictions so untaken actions add no error.
        y_batch = np.array(q_last, dtype=float)

        # Bootstrap value per transition; stays 0 for terminal transitions.
        bootstrap = np.zeros(len(batch))
        if ind_not_none:
            # Double DQN: the online model selects the action ...
            best_actions = np.argmax(q_next_online, axis=1)
            # ... and the target model evaluates it.
            rows = np.arange(len(ind_not_none))
            bootstrap[ind_not_none] = np.asarray(q_next_target)[rows, best_actions]

        for i, (_, action, reward, _) in enumerate(batch):
            y_batch[i, action] = reward + self.gamma * bootstrap[i]
        return y_batch

    def q_update(self):
        """Sample a mini-batch and take one training step on the online network.

        Does nothing while the replay buffer is empty.
        """
        # batch is a list of tuples (last_observation, action, reward, observation)
        batch = self.replay_buffer.read(self.batch_size)
        if not batch:
            return

        last_state_input = self.reshape_state([b[0] for b in batch])
        q_last = self.online_model.predict(last_state_input, verbose=0)

        # Terminal transitions carry observation None and have no next state.
        ind_not_none = [i for i, b in enumerate(batch) if b[3] is not None]
        q_next_online = q_next_target = None
        if ind_not_none:
            state = self.reshape_state([batch[i][3] for i in ind_not_none])
            q_next_online = self.online_model.predict(state, verbose=0)
            q_next_target = self.target_model.predict(state, verbose=0)

        y_batch = self._double_dqn_targets(
            batch, q_last, ind_not_none, q_next_online, q_next_target
        )

        # train online model
        self.online_model.fit(last_state_input, y_batch, epochs=1, verbose=0)

    def update_target_model(self):
        """Soft-update the target network towards the online network.

        Each target weight becomes ``(1 - tau) * target + tau * online``, the
        soft target update from "continuous control with deep reinforcement
        learning".
        """
        # get_weights returns one array per weight tensor, in matching order
        theta_online = self.online_model.get_weights()
        theta_target = self.target_model.get_weights()
        blended = [
            weight_target * (1 - self.tau) + weight_online * self.tau
            for weight_online, weight_target in zip(theta_online, theta_target)
        ]
        self.target_model.set_weights(blended)

In [None]:
## experience replay

class Replay:
    """Fixed-capacity FIFO experience replay buffer.

    Transitions are appended with ``write``; once ``max_length`` items are
    stored, each new item evicts the oldest one. ``read`` draws a uniform
    random mini-batch without replacement.

    Attributes:
        buffer: the stored transitions, oldest first.
        length: number of transitions currently stored.
        max_length: capacity of the buffer.
    """

    def __init__(self, max_length=10000):
        """Create an empty buffer holding at most ``max_length`` transitions."""
        self.max_length = max_length
        # deque(maxlen=...) drops the oldest item in O(1) when full, whereas
        # list.pop(0) has to shift every remaining element.
        self.buffer = deque(maxlen=max_length)
        self.length = 0

    def write(self, data):
        """Store one transition, evicting the oldest one if the buffer is full."""
        self.buffer.append(data)
        self.length = len(self.buffer)

    def read(self, batch_size):
        """Return up to ``batch_size`` transitions sampled without replacement.

        At the beginning the buffer is almost empty, so the returned batch can
        be smaller than ``batch_size`` (and is an empty list for an empty
        buffer).
        """
        return random.sample(self.buffer, min(batch_size, len(self.buffer)))

In [None]:
## training

# Train the agent for EPISODES episodes, printing progress every
# MONITOR_INTERVAL episodes. ep_rewards collects the total reward of every
# episode that ended with done = True.
agent = Agent()
ep_rewards = []

for ep in range(EPISODES):
    monitor = ep % MONITOR_INTERVAL == 0

    # monitor training process
    if monitor:
        print("episode", ep, "epsilon", agent.epsilon)

    # initialize
    # env.reset() in taxi returns index of states out of 500
    last_observation = env.reset()
    agent.set_total_reward(0)

    # iterations within an episode
    for t in range(T_RANGE):
        # draw action
        action = agent.choose_action(last_observation)
        # draw next state and reward
        observation, reward, done, info = env.step(action)

        # when taxi drops a passenger at destination, done = True; the next
        # state is stored as None so the Q update treats the step as terminal
        # NOTE(review): done may also be set by the environment's own step
        # limit, which would then be stored as terminal too -- confirm.
        if done:
            observation = None

        # accumulate reward
        agent.gather_reward(reward)
        agent.gather_experience(last_observation, action, reward, observation)

        # update q function (trains the online network) ...
        agent.q_update()
        # ... then let the target network softly track it; without this call
        # the target network would never change during training
        agent.update_target_model()

        # iterate
        last_observation = observation

        # goal
        if done:
            ep_rewards.append(agent.get_total_reward())
            break

    # In each episode we decay epsilon
    agent.decay_epsilon()

    # Monitor total reward during episodes
    if monitor:
        print("reward", agent.get_total_reward())

In [123]:
## debug q update

# Three hand-written transitions used to trace the Q update by hand.
# Each tuple is (last_observation, action, reward, observation).
experiences = [
    (1, 0, -1, 2),
    (2, 1, -1, 3),
    (3, 2, -1, 4),
]

# NOTE: this rebinds the module-level BATCH_SIZE, so anything that reads it
# after this cell has run sees 2 instead of the training value.
BATCH_SIZE = 2

# Two independently initialized models with the same architecture.
net = Network(1, 6)
online_model, target_model = net._build_model(), net._build_model()

# reshape state for model
def reshape_state(state):
    """Return ``state`` reshaped to (STATE_SIZE, -1) for the debug cell."""
    target_shape = (STATE_SIZE, -1)
    return np.reshape(state, target_shape)


# def q_update(self):
def q_update():
    """Trace the Double DQN target computation on the debug transitions.

    Samples BATCH_SIZE transitions from ``experiences`` and prints every
    intermediate value: the batch, the model inputs, the online Q-values of
    the previous states, the indices of non-terminal transitions, the actions
    the online model picks for the next states, and the target-model Q-values
    of those actions. Nothing is trained and nothing is returned.
    """
    # sample batch
    batch = random.sample(experiences, BATCH_SIZE)
    print(batch)

    prev_states = [transition[0] for transition in batch]
    next_states = [transition[3] for transition in batch if transition[3] is not None]

    # reshape_state returns a (STATE_SIZE, n) array; row 0 is the flat list of states
    last_state_input = reshape_state(prev_states)[0]
    state_input = reshape_state(next_states)[0]
    print(last_state_input)
    print(state_input)

    q_last = online_model.predict(last_state_input)
    print(q_last)

    q_this = np.zeros_like(q_last)
    print(q_this)

    # positions in the batch whose next observation exists
    ind_not_none = [
        idx for idx, transition in enumerate(batch) if transition[3] is not None
    ]
    print(ind_not_none)

    # action selection by the online model (one argmax per row)
    online_q_next = online_model.predict(state_input)
    action_online_model = np.argmax(online_q_next, axis=1)
    print("action_online_model", action_online_model)

    # action evaluation by the target model
    print("target_model.predict(state_input) \n", target_model.predict(state_input))
    pred_target_model = target_model.predict(state_input)

    q_value_target_model = [
        pred_target_model[row][chosen]
        for row, chosen in enumerate(action_online_model)
    ]
    print(q_value_target_model)
    
# Run the debug trace once; its printed output is shown below.
q_update()

[(1, 0, -1, 2), (3, 2, -1, 4)]
[1 3]
[2 4]
[[ 0.01578611  0.32744968 -0.09944885  0.09646658  0.05530835  0.03442656]
 [ 0.04735828  0.9823491  -0.29834646  0.28939974  0.16592506  0.10327965]]
[[0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]]
[0, 1]
action_online_model [1 1]
target_model.predict(state_input) 
 [[-0.15322496  0.01192236  0.154649    0.6328393   0.197792    0.080158  ]
 [-0.30644992  0.02384472  0.309298    1.2656786   0.395584    0.160316  ]]
[0.0119223595, 0.023844719]


In [124]:
# ## debug neural network model

# net = Network(1, 6)
# model = net._build_model()
# model.summary()

# data = 1
# tmp = np.reshape(data, newshape = (STATE_SIZE, -1))
# print(tmp.shape)
# print(model.predict(tmp))

# tmp = model.get_weights()
# # print(type(tmp))
# # print(len(tmp))
# # print(tmp[0])
# # print(len(tmp[0][0]))
# # print(tmp[1])
# print(tmp[0].shape)
# print(tmp[1].shape)
# print(tmp[2].shape)
# print(tmp[3].shape)
# print(tmp[4].shape)
# print(tmp[5].shape)
# # print(tmp)