In [0]:
## setup

import gym

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

import random
import numpy as np
from matplotlib import pyplot as plt

# clear rendered env display
from IPython.display import clear_output
# freeze rendered env display
from time import sleep
import time
from google.colab import files
import pickle

In [0]:
## hyperparameter

# Tunable settings for the whole notebook. Commented-out assignments are
# alternative values kept for experimentation; uncomment one to switch.

# Learning rate handed to the Keras optimizers built in Network.
# LEARNING_RATE = 0.01
LEARNING_RATE = 0.00025
# MOMENTUM = 0.95

# Number of transitions sampled from the replay buffer per Q update.
# BATCH_SIZE = 16
BATCH_SIZE = 32

# Discount factor applied to the bootstrapped next-state value.
GAMMA = 0.95
# GAMMA = 1.0

# Initial exploration probability of the epsilon-greedy policy.
EPSILON = 1.0

# Multiplicative decay applied to epsilon once per episode.
# With 0.9972, epsilon falls from 1.0 to EPSILON_MIN after about 820 episodes.
# EPSILON_DECAY = 0.99999
# EPSILON_DECAY = 0.9995
EPSILON_DECAY = 0.9972

# Epsilon stops decaying once it is no longer above this value.
EPSILON_MIN = 0.1
# EPSILON_MIN = 0.01

# Upper bound on steps per episode (the training loop iterates range(T_RANGE)).
T_RANGE = 201

# Network input width when the raw state index is used directly
# (alternative representation; Agent currently selects the one-hot size).
STATE_SIZE = 1
# Network input width for the one-hot state encoding (one slot per state index).
ONEHOT_STATE_SIZE = 500

# Number of network outputs, one Q-value per action.
ACTION_SIZE = 6

# Number of training episodes, and how often (in episodes) progress is printed.
# EPISODES = 15000
# MONITOR_INTERVAL = 1000
# EPISODES = 10000
# MONITOR_INTERVAL = 100
EPISODES = 5000
MONITOR_INTERVAL = 100
# EPISODES = 100
# MONITOR_INTERVAL = 10
# EPISODES = 10
# MONITOR_INTERVAL = 1

# Only referenced by the commented-out env.seed(SEED) call in the
# environment cell, so it currently has no effect.
SEED = 123

# Gym environment id passed to gym.make.
ENV_NAME = 'Taxi-v3'

# Output files written at the end of training:
# the saved Keras model, the two figures, and the pickled metric lists.
PATH_SAVE_WEIGHTS = 'model.h5'
SAVE_FIG_REWARD = 'dqn_taxi_reward.png'
SAVE_FIG_LOSS = 'dqn_taxi_loss.png'
REWARD_OBJ = 'reward_list.sav'
LOSS_OBJ = 'loss_list.sav'

In [4]:
## environment

# Build the Gym environment named by ENV_NAME; `env` is shared by the
# training cell below.
env = gym.make(ENV_NAME)
# Seeding is currently disabled; uncomment to fix the environment seed.
# env.seed(SEED)
# Draw the environment's current state in the cell output.
env.render()

+---------+
|[35mR[0m: | : :[34;1mG[0m|
| : | : : |
| : : : : |
| | : | : |
|Y| : |[43mB[0m: |
+---------+



In [0]:
## experience replay

class Replay:
    """Fixed-capacity FIFO buffer of experience tuples for replay.

    Once the buffer holds ``max_length`` items, each new write evicts the
    oldest item first.

    Attributes:
        buffer: list of stored items, oldest first.
        length: number of items currently stored (mirrors ``len(buffer)``).
        max_length: maximum number of items kept.
    """

    def __init__(self, max_length=10000):
        """Create an empty buffer.

        Args:
            max_length: capacity of the buffer; defaults to the previously
                hard-coded value of 10000.

        Raises:
            ValueError: if ``max_length`` is not positive.
        """
        if max_length <= 0:
            raise ValueError("max_length must be a positive integer")
        self.buffer = []
        self.length = 0
        self.max_length = max_length

    def write(self, data):
        """Append ``data``, dropping the oldest entry when at capacity."""
        if self.length >= self.max_length:
            self.buffer.pop(0)
        self.buffer.append(data)
        # Derive the counter from the list so the two can never drift apart.
        self.length = len(self.buffer)

    def read(self, batch_size):
        """Return a random sample (without replacement) of stored items.

        Early in training the buffer holds fewer than ``batch_size`` items,
        so the returned list may be shorter than requested (empty if nothing
        has been written yet).
        """
        return random.sample(self.buffer, min(batch_size, self.length))

In [0]:
## network

class Network:
    """Builder for the Keras Q-value models used by the agent.

    Holds the input width (``n_in``) and output width (``n_out``) and exposes
    two alternative model constructors. Both compile with mean-squared-error
    loss and an optimizer configured with ``LEARNING_RATE``.
    """

    def __init__(self, n_in, n_out):
        self.n_in, self.n_out = n_in, n_out

    def _build_model_1(self):
        """Return a compiled MLP with two hidden ReLU layers (24 and 48 units)."""
        stack = [
            Dense(24, input_shape = (self.n_in,), activation = 'relu'),
            Dense(48, activation = 'relu'),
            Dense(self.n_out, activation = 'linear'),
        ]
        net = Sequential(stack)
        net.compile(loss = 'mse',
                    optimizer = tf.keras.optimizers.RMSprop(LEARNING_RATE))
        return net

    def _build_model_2(self):
        """Return a compiled single linear layer with no hidden layer and no bias."""
        linear_layer = Dense(self.n_out,
                             input_shape = (self.n_in,),
                             activation = 'linear',
                             use_bias = False)
        net = Sequential([linear_layer])
        net.compile(loss = 'mse',
                    optimizer = tf.keras.optimizers.Adam(learning_rate = LEARNING_RATE))
        return net

In [0]:
## agent

class Agent:
    """Epsilon-greedy Q-learning agent backed by a Keras model and a replay buffer.

    States are integer indices that are one-hot encoded to width ``n_in``
    before being fed to the model; the model outputs ``n_out`` Q-values,
    one per action.
    """

    def __init__(self):
        # choose what style you want for state representation
        # self.n_in = STATE_SIZE
        self.n_in = ONEHOT_STATE_SIZE
        self.n_out = ACTION_SIZE
        self.total_reward = 0
        self.gamma = GAMMA
        self.epsilon = EPSILON
        self.epsilon_min = EPSILON_MIN
        self.epsilon_decay = EPSILON_DECAY
        self.batch_size = BATCH_SIZE
        self.replay_buffer = Replay()
        # self.model = Network(self.n_in, self.n_out)._build_model_1()
        self.model = Network(self.n_in, self.n_out)._build_model_2()

    def gather_experience(self, last_observation, action, reward, observation):
        """Store one transition; ``observation`` is None for a terminal step."""
        self.replay_buffer.write((last_observation, action, reward, observation))

    def set_total_reward(self, new_total):
        """Reset the episode reward accumulator (used at episode start)."""
        self.total_reward = new_total

    def gather_reward(self, reward):
        """Add one step's reward to the episode accumulator."""
        self.total_reward += reward

    def get_total_reward(self):
        """Return the reward accumulated since the last reset."""
        return self.total_reward

    def decay_epsilon(self):
        """Multiply epsilon by the decay factor, never dropping below epsilon_min.

        Called once per episode so exploration shrinks gradually.
        """
        if self.epsilon > self.epsilon_min:
            # Clamp so the last decay step cannot undershoot the floor.
            self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)

    def one_hot_encode(self, observation):
        """Return a (1, n_in) array that is 1 at ``observation`` and 0 elsewhere."""
        state = np.zeros((1, self.n_in))
        state[0, observation] = 1
        return state

    def _one_hot_batch(self, observations):
        """Return a (len(observations), n_in) one-hot matrix, one row per index."""
        indices = np.asarray(observations, dtype = int)
        encoded = np.zeros((len(indices), self.n_in))
        encoded[np.arange(len(indices)), indices] = 1
        return encoded

    def choose_action(self, observation):
        """Pick an action index with an epsilon-greedy policy.

        With probability ``1 - epsilon`` the action with the highest predicted
        Q-value is returned; otherwise an action is drawn uniformly from all
        ``n_out`` actions.

        Returns:
            int: action index in ``[0, n_out)``.
        """
        # exploitation
        # np.random.rand is uniform [0,1)
        if np.random.rand() > self.epsilon:
            state = self.one_hot_encode(observation)
            q_values = self.model.predict(state, verbose = 0)[0]
            return int(np.argmax(q_values))

        # exploration
        # randint's upper bound is exclusive, so pass n_out (not n_out - 1)
        # to make every action, including the last one, reachable.
        return int(np.random.randint(self.n_out))

    def q_update(self):
        """Run one training step on a batch sampled from the replay buffer.

        For each sampled transition the target for the taken action is
        ``reward`` when the next state is terminal (stored as None), else
        ``reward + gamma * max_a Q(next_state, a)``; targets for the other
        actions are the model's current predictions.

        Returns:
            The loss reported by ``model.fit`` for this step, or None if the
            replay buffer is empty and no update was performed.
        """
        # sample batch
        batch = self.replay_buffer.read(self.batch_size)
        n = len(batch)
        if n == 0:
            return None

        last_obs, actions, rewards, next_obs = zip(*batch)

        # Predict all current Q-values in one call instead of once per sample.
        x_batch = self._one_hot_batch(last_obs)
        y_batch = np.array(self.model.predict(x_batch, verbose = 0), dtype = float)

        # Terminal transitions keep the bare reward as their target.
        targets = np.asarray(rewards, dtype = float)
        non_terminal = np.array([obs is not None for obs in next_obs], dtype = bool)
        if non_terminal.any():
            next_states = self._one_hot_batch(
                [obs for obs in next_obs if obs is not None])
            q_next = np.asarray(self.model.predict(next_states, verbose = 0))
            targets[non_terminal] += self.gamma * q_next.max(axis = 1)

        # Overwrite only the Q-value of the action actually taken in each row.
        y_batch[np.arange(n), np.asarray(actions, dtype = int)] = targets

        history = self.model.fit(x_batch, y_batch, epochs = 1, verbose = 0)

        # return online model loss
        return history.history['loss'][0]

In [11]:
## training

# A Q update is attempted on a random fraction of steps ...
UPDATE_PROBABILITY = 0.3
# ... and only while the previous episode's total reward is below this value.
REWARD_UPDATE_THRESHOLD = 9.7

agent = Agent()
ep_rewards = []
# None means "no Q update has run yet"; a real loss of 0.0 is still recorded.
loss = None
losses = []
start_time = time.time()

for ep in range(EPISODES):

    # initialize
    # env.reset() in taxi returns index of states out of 500
    last_observation = env.reset()
    agent.set_total_reward(0)

    # iterations within an episode
    for _step in range(T_RANGE):

        # draw action
        action = agent.choose_action(last_observation)
        # draw next state and reward
        observation, reward, done, info = env.step(action)

        # a finished episode is stored with next state None so that
        # q_update treats the transition as terminal
        if done:
            observation = None

        # accumulate reward
        agent.gather_reward(reward)
        agent.gather_experience(last_observation, action, reward, observation)

        # update q function on a random subset of steps, gated on the
        # previous episode's reward (ep_rewards holds finished episodes only)
        if (np.random.random() < UPDATE_PROBABILITY
                and ep_rewards
                and ep_rewards[-1] < REWARD_UPDATE_THRESHOLD):
            loss = agent.q_update()

        # iterate
        last_observation = observation

        if done:
            break

    # Record every episode, even one that hit the step limit without
    # finishing, so ep_rewards always has one entry per episode run.
    ep_rewards.append(agent.get_total_reward())

    # store last loss of online model
    if loss is not None:
        losses.append(loss)

    # In each episode we decay epsilon
    agent.decay_epsilon()

    # Monitor total reward during episodes
    if ep % MONITOR_INTERVAL == 0 and loss is not None:
        print("episode:", ep,
              "reward:", agent.get_total_reward(),
              "loss:", np.round(loss, decimals = 3),
              "epsilon:", np.round(agent.epsilon, decimals = 5),
              "time: {} seconds".format(np.round(time.time() - start_time, decimals = 0)))


# when training finishes

# save the trained Keras model
agent.model.save(PATH_SAVE_WEIGHTS)

# plot the reward result; the x-axis is derived from the data so the
# lengths always match
plt.plot(range(len(ep_rewards)), ep_rewards)
plt.ylabel("total rewards per episode")
plt.xlabel("episode")
plt.title("DQN Taxi rewards")
plt.savefig(SAVE_FIG_REWARD)
plt.show()

# plot the loss result (one point per episode recorded after the first update)
plt.plot(range(len(losses)), losses)
plt.ylabel("loss per episode")
plt.xlabel("episode")
plt.title("DQN Taxi loss")
plt.savefig(SAVE_FIG_LOSS)
plt.show()

# save objects; context managers make sure the files are flushed and closed
with open(REWARD_OBJ, 'wb') as reward_file:
    pickle.dump(ep_rewards, reward_file)
with open(LOSS_OBJ, 'wb') as loss_file:
    pickle.dump(losses, loss_file)

episode: 1 reward: -596 loss: 2.702 epsilon: 0.99441 time: 11.0 seconds


KeyboardInterrupt: ignored

In [0]:
# Download the saved model file via the google.colab `files` helper.
files.download(PATH_SAVE_WEIGHTS)

In [0]:
# Download the reward-curve figure saved by the training cell.
files.download(SAVE_FIG_REWARD)

In [0]:
# Download the loss-curve figure saved by the training cell.
files.download(SAVE_FIG_LOSS)

In [0]:
# Download the pickled list of per-episode rewards.
files.download(REWARD_OBJ)

In [0]:
# Download the pickled list of recorded losses.
files.download(LOSS_OBJ)

In [0]:
# ## Evaluation

# episode = range(0, EPISODES, 1)
# plt.plot(episode, ep_rewards)
# plt.ylabel("total rewards per episode")
# plt.xlabel("episode")
# plt.title("DQN Taxi-v3 q-learning (training)")
# plt.show()

In [0]:
# # easier version of q update

# experiences = []
# random.seed(1)
# # (last_observation, action, reward, observation)
# experiences.append((1, 0, -1, 2))
# experiences.append((2, 1, -1, 3))
# experiences.append((3, 2, -1, 4))
# # print(experiences)

# BATCH_SIZE = 2
# GAMMA = 0.95

# net = Network(500, 6)
# model = net._build_model_2()

# def one_hot_encode(observation):
#     state = np.zeros(500)
#     state[observation] = 1
#     state = np.reshape(state, (-1, 500))
#     return state

# # def q_update(self):
# def q_update():
#     # sample batch
#     batch = random.sample(experiences, BATCH_SIZE)
#     # print(batch)
    
#     n = np.shape(batch)[0]
    
#     x_batch = np.zeros([n, 500])
#     y_batch = np.zeros([n, 6])
    
#     counter = 0
    
#     for b in batch:
#         last_state, action, reward, state = b
        
#         last_state = one_hot_encode(last_state)
#         # print(last_state.shape)
#         # print(last_state)
        
#         # [0] for list of list
#         q_last = model.predict(last_state)[0]
#         # print(q_last)
        
#         if state is None:
#             q_last[action] = reward
#         else:
#             state = one_hot_encode(state)
#             # print("state", state)
#             # [0] for list of list
#             q_this = model.predict(state)[0]
#             # print("q_this", q_this)
#             # new_action = np.argmax(q_this)
#             # print("new_action", new_action)
            
#             # calculate new q
#             # Bellman equation
#             # y = reward + GAMMA * q_this[new_action]
#             y = reward + GAMMA * np.max(q_this)
#             # print("y", y)
            
#             # update q only for current action
#             q_last[action] = y
#             # print(q_last)
        
#         y_batch[counter, :] = q_last
#         x_batch[counter, :] = last_state
        
#         counter += 1
        
#     # debug
#     # print("x_batch.shape", x_batch.shape)
#     # print("y_batch.shape", y_batch.shape)
#     # print("x_batch", x_batch)
#     # print("y_batch", y_batch)
                
#     model.fit(x_batch, y_batch, epochs = 1, verbose = 0)
    
# q_update()

In [0]:
## debug training

# EPSILON = 0.1
# agent = Agent()
# # agent.model.predict(np.array([100]))
# last_observation = env.reset()
# print(last_observation)
# action = agent.choose_action(np.array([last_observation]))
# print(action)

In [0]:
# replay_buffer = Replay()
# replay_buffer.write

In [0]:
## debug neural network

# class Network:
#   def __init__(self, n_in, n_out):
#     self.n_in = n_in
#     self.n_out = n_out
#     self.model = self._build_model()

#   def _build_model(self):
#     model = Sequential()
#     model.add(Dense(24, input_shape = (self.n_in,), activation = 'relu'))
#     # model.add(Dense(24, input_dim = self.n_in, activation = 'relu'))
#     model.add(Dense(48, activation = 'relu'))
#     model.add(Dense(self.n_out, activation = 'linear'))

#     # optimizer = tf.keras.optimizer.RMSprop(0.001)
#     model.compile(loss = 'mse', optimizer = 'rmsprop')

#     return model

# net = Network(n_in = 1, n_out = 6)
# # net.model.summary()

# x = np.array([1])
# pred = net.model.predict(x)
# print(pred[0])
# print(pred)
# print(np.argmax(pred))

In [0]:
## Debug replay buffer

# replay_buffer = Replay()
# last_observation = env.reset()

# for _ in range(10):

#   action = 0
#   observation, reward, done, info = env.step(action)
#   # last_observation, action, reward, observation
#   replay_buffer.write((last_observation, action, reward, observation))
#   last_observation = observation

# print(replay_buffer.buffer)

# batch = replay_buffer.read(3)
# print(batch)

# print([s[0] for s in batch])
# print('np.shape(batch)[0]', np.shape(batch)[0])
# batch[0][3]

In [0]:
# print(env.observation_space)
# print(env.action_space)
# observation = env.reset()
# print("observation", observation)
# print("env.step", env.step(0))
# nb_actions = env.action_space.n
# nb_states = env.observation_space
# print(nb_actions)
# print(nb_states)

In [0]:
# reply = Replay()
# action = 0
# last_observation,  = env.step(action)
# print(step)
# reply.write()
# reply.read(10)

In [0]:
## random exploration

# for ep in range(20):
#   observation = env.reset()
#   for t in range(100):
#     clear_output(wait = True)
#     env.render()
#     action = env.action_space.sample()
#     observation, reward, done, info = env.step(action)
#     sleep(.5)

In [0]:
## demonstration

# # simple demonstration that network is able to train properly
# with tf.Graph().as_default():
#   with tf.Session() as sess:
#     f = Network(sess, 1, 6)
#     sess.run(tf.global_variables_initializer())
    
#     # make demo input
#     x = np.random.randn(10000, 1)
#     # make demo output
#     tmp_1 = 2 * x[:,0]
#     tmp_2 = x[:,0]**2
#     tmp_3 = x[:,0]**3
#     tmp_4 = 0.1 * x[:,0]
#     tmp_5 = 0.3 * x[:,0]
#     tmp_6 = 5 * x[:,0]**2
#     y = np.transpose([tmp_1, tmp_2, tmp_3, tmp_4, tmp_5, tmp_6])

#     # check MSE before training
#     print('MSE at iteration 0 is {}'.format(((f.compute(x) - y)**2).mean()))

#     # train
#     iteration = 5000
#     for i in range(iteration):
#       f.train(x, y)

#     # check MSE after training
#     print('MSE at iteration {} is {}'.format(iteration, ((f.compute(x) - y)**2).mean()))

# # We can check that MSE decreased after training so our network is working

In [0]:
# x = np.random.randn(10000, 4)
# tmp_1 = x[:,0] + x[:,1]**2
# tmp_2 = x[:,2] + x[:,3]**3
# print(tmp_1.shape, tmp_2.shape)
# y = np.transpose([ x[:,0] + x[:,1]**2, x[:,2] + x[:,3]**3 ])
# print(x.shape, y.shape)

# x = np.random.randn(10000, 1)
# tmp_1 = 2 * x[:,0]
# tmp_2 = x[:,0]**2
# tmp_3 = x[:,0]**3
# tmp_4 = 0.1 * x[:,0]
# y = np.transpose([tmp_1, tmp_2, tmp_3, tmp_4])
# print(x.shape, y.shape)
# print(y[0:4, :])
# print(y[0,:])
# print(np.argmax(y[0, :]))

In [0]:
# model = Sequential()
# model.add(Dense(32, input_shape = (INPUT_SIZE, ), activation = 'relu'))
# model.add(Dense(OUTPUT_SIZE, activation = 'linear'))

# model.summary()