In [1]:
import numpy as np
import gym

In [2]:
import simple_nn.activation_function as actv_func
import simple_nn.gd_updater as gd_upd
import simple_nn.nn_model as nn
import simple_nn.nn_layer as nn_l

### gym environment

In [3]:
# Gym environment id used by every cell below (the rendered board further down is an 8x8 grid).
env_name = 'FrozenLake8x8-v0'

In [4]:
# Single environment instance shared by the preprocessing defaults, both training loops
# and both evaluation loops in this notebook.
env = gym.make(env_name)

### common operations for RL

In [5]:
def calculate_batch_tgt_values(batch_action,
                            batch_q_values,
                            batch_next_q_values,
                            batch_reward,
                            batch_end,
                            gamma=None):
    """Build the Q-learning regression targets for a mini batch.

    For every sample only the entry of the action that was actually taken is
    replaced by the Bellman target ``r + gamma * max_a' Q(s', a')``; all other
    entries keep the current prediction so they contribute zero error.

    Parameters
    ----------
    batch_action : array-like of int, shape (batch,)
        Index of the action taken in each sample.
    batch_q_values : array-like, shape (batch, n_actions)
        Current Q-value predictions for the sampled states (not modified).
    batch_next_q_values : array-like, shape (batch, n_actions)
        Q-value predictions for the successor states.
    batch_reward : array-like, shape (batch,)
        Immediate reward of each sample.
    batch_end : array-like of bool/int, shape (batch,)
        Truthy where the transition ended the episode; the bootstrap term is
        dropped for those samples.
    gamma : float, optional
        Discount factor. When omitted, the notebook-level ``discount`` variable
        is used, which keeps existing call sites working unchanged.

    Returns
    -------
    numpy.ndarray, shape (batch, n_actions)
        A copy of ``batch_q_values`` with the taken-action entries replaced by
        their targets.
    """
    if gamma is None:
        # Backward-compatible fallback: the original implementation always read
        # the global `discount`, which must be defined before the first call.
        gamma = discount

    actions = np.asarray(batch_action, dtype=np.intp)
    # 1.0 for non-terminal transitions, 0.0 for terminal ones.
    not_done = 1.0 - np.asarray(batch_end, dtype=float)

    best_next = np.max(np.asarray(batch_next_q_values), axis=1)
    tgt_rewards = np.asarray(batch_reward) + gamma * best_next * not_done

    # Copy so the caller's predictions stay intact for the error computation.
    tgt_values = np.array(batch_q_values, copy=True)
    tgt_values[np.arange(len(tgt_values)), actions] = tgt_rewards

    return tgt_values

### behavior definition (state preprocessing)

In [6]:
# one-hot ("binary expand") encoding of a categorical state, used to preprocess states
def binary_expand(n, idx=env.observation_space.n):
    """One-hot encode the category ``n`` against the categories ``idx``.

    Parameters
    ----------
    n : scalar
        The category to encode (here: the discrete state number).
    idx : int or array-like, optional
        Either the number of categories (expanded to ``0 .. idx-1``) or an
        explicit array of category labels. Defaults to the number of states of
        the notebook's environment, evaluated once when this cell runs.

    Returns
    -------
    numpy.ndarray
        Float array with the same shape as the category array, holding 1 where
        the label equals ``n`` and 0 elsewhere.
    """
    # Accept NumPy integer scalars too: `type(idx) is int` rejected them and
    # the function then failed on the 0-d value.
    if isinstance(idx, (int, np.integer)):
        idx = np.arange(idx)
    else:
        idx = np.asarray(idx)
    res = np.zeros(idx.shape)
    res[n == idx] = 1
    return res

In [7]:
def binary_expand_mod(n, idx=env.observation_space.n):
    """One-hot encode ``n`` by selecting rows of an identity matrix.

    Unlike ``binary_expand`` the result is 2-D: one identity row is returned
    for every position where the category label equals ``n`` (so a single
    matching label yields shape ``(1, len(idx))`` and no match yields
    ``(0, len(idx))``).

    Parameters
    ----------
    n : scalar
        The category to encode.
    idx : int or array-like, optional
        Number of categories (expanded to ``0 .. idx-1``) or an explicit 1-D
        array of category labels. Defaults to the number of states of the
        notebook's environment, evaluated once when this cell runs.

    Returns
    -------
    numpy.ndarray, shape (matches, len(idx))
    """
    # Accept NumPy integer scalars too: `type(idx) is int` rejected them and
    # `len(idx)` then failed on the 0-d value.
    if isinstance(idx, (int, np.integer)):
        idx = np.arange(idx)
    else:
        idx = np.asarray(idx)
    return np.identity(len(idx))[np.where(n == idx)]

In [8]:
# Preprocess a raw state number into the network's input vector.
# Aliased so the encoding can be swapped in one place; the training and
# evaluation cells only ever call `preprocess_state`.
preprocess_state = binary_expand

---

### learning agent - online

#### hyperparameters

In [9]:
# Number of training episodes, step size handed to the gradient-descent updater,
# and the discount factor used in the Bellman target.
episodes = 15000
learning_rate = 0.02
discount = 0.99

# Exploration schedule, applied after every episode `ep` in the training loop:
#   epsilon = min_epsilon + (max_epsilon - min_epsilon) * exp(-epsilon_decay * ep)
max_epsilon = 1
min_epsilon = 0.01
epsilon_decay = 0.001
epsilon = max_epsilon

# Report (and reset) the running averages every `print_step` episodes.
print_step = 1000

In [10]:
# Q-network: the input size equals the number of discrete states, matching the
# one-hot vector produced by `preprocess_state`.
# NOTE(review): GDUpdaterNormal is presumably plain gradient descent - confirm in simple_nn.
m = nn.NNModel(env.observation_space.n, gd_upd.GDUpdaterNormal(learning_rate=learning_rate))

In [11]:
# Two fully connected layers without activation; the last one has one output per action,
# so the model output is read as one Q-value per action.
# NOTE(review): the arguments are presumably (units, use_bias, activation) - confirm in simple_nn.
m.add_layer(nn_l.FCLayer(20, True, actv_func.ActivationNone()))
m.add_layer(nn_l.FCLayer(env.action_space.n, True, actv_func.ActivationNone()))

In [12]:
%%time

# Online Q-learning: the model is updated after every single environment step
# using only the transition that was just observed.

# Per-episode statistics collected since the last report.
step_count_list = []
total_reward_list = []

for ep in range(1, episodes+1):
    end = False
    step_count = 0
    total_reward = 0

    # start a new episode and encode its initial state
    new_state = preprocess_state(env.reset())

    # run until the environment reports the end of the episode
    while not end:
        # predict Q-values for the current state with the latest model
        # NOTE(review): model_forward (rather than predict) is used here, presumably because
        # update_model relies on state cached by the forward pass - confirm in simple_nn.
        state = new_state
        q_values = m.model_forward(state)

        # epsilon-greedy action selection: exploit with probability (1 - epsilon)
        if np.random.rand() > epsilon:
            action = np.argmax(q_values)
        else:
            action = env.action_space.sample()

        # step the environment forward and accumulate episode statistics
        new_state_no, reward, end, _ = env.step(action)
        step_count += 1
        total_reward += reward

        # encode the successor state; it becomes `state` in the next iteration
        new_state = preprocess_state(new_state_no)

        # Bellman target: only the entry of the taken action differs from the prediction,
        # so only that output contributes error. `(not end)` drops the bootstrap term
        # when the episode has ended.
        tgt_q_values = q_values.copy()
        next_q_values = m.predict(new_state)
        tgt_q_values[action] = reward + discount * np.max(next_q_values) * (not end)

        # update the model from the (prediction, target) pair
        m.update_model(q_values, tgt_q_values)

    # decay epsilon exponentially with the episode number
    epsilon = min_epsilon + (max_epsilon - min_epsilon)*np.exp(-epsilon_decay*ep)

    # record this episode's step count and total reward
    step_count_list.append(step_count)
    total_reward_list.append(total_reward)
    # every `print_step` episodes: print the averages over that window, then reset it
    if (ep)%print_step == 0:
        print('In episode {}, avg {} steps are used, avg total reward is {}.'.format(
            ep, sum(step_count_list)/print_step, sum(total_reward_list)/print_step))
        step_count_list.clear()
        total_reward_list.clear()
    

In episode 1000, avg 48.721 steps are used, avg total reward is 0.005.
In episode 2000, avg 74.534 steps are used, avg total reward is 0.018.
In episode 3000, avg 90.089 steps are used, avg total reward is 0.065.
In episode 4000, avg 71.354 steps are used, avg total reward is 0.157.
In episode 5000, avg 76.702 steps are used, avg total reward is 0.212.
In episode 6000, avg 75.56 steps are used, avg total reward is 0.221.
In episode 7000, avg 76.437 steps are used, avg total reward is 0.253.
In episode 8000, avg 79.83 steps are used, avg total reward is 0.419.
In episode 9000, avg 94.433 steps are used, avg total reward is 0.754.
In episode 10000, avg 94.791 steps are used, avg total reward is 0.743.
In episode 11000, avg 95.926 steps are used, avg total reward is 0.746.
In episode 12000, avg 94.365 steps are used, avg total reward is 0.748.
In episode 13000, avg 95.189 steps are used, avg total reward is 0.751.
In episode 14000, avg 96.805 steps are used, avg total reward is 0.739.
In 

observe the trained model

In [13]:
# Render the board in the state the environment was left in by the training cell above.
env.render()

  (Up)
SFFFFFFF
FFFFFFFF
FFFHFFFF
FFFFF[41mH[0mFF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFFG


In [14]:
# Feed all 64 one-hot states at once (each identity row is one encoded state) and show
# the greedy action per state, laid out like the 8x8 board.
# NOTE(review): action codes are presumably 0=Left, 1=Down, 2=Right, 3=Up as in gym's
# FrozenLake - confirm against the installed gym version.
q_values = m.predict(np.identity(64))
np.argmax(q_values, axis=1).reshape(8,8)

array([[1, 1, 1, 1, 2, 2, 2, 0],
       [3, 3, 3, 3, 3, 2, 2, 0],
       [3, 3, 0, 0, 2, 3, 2, 0],
       [0, 3, 0, 1, 0, 0, 2, 2],
       [0, 3, 3, 1, 2, 3, 3, 2],
       [0, 0, 1, 1, 3, 1, 1, 2],
       [0, 0, 3, 3, 1, 0, 2, 2],
       [3, 3, 0, 0, 0, 3, 2, 0]], dtype=int32)

run with model

In [15]:
# Number of evaluation episodes played with the greedy policy.
test_episode = 10000

In [16]:
# Evaluate the trained model: always act greedily (no exploration) and report
# the reward averaged over `test_episode` episodes.
total_reward = 0
for _ in range(test_episode):
    state = preprocess_state(env.reset())
    end = False
    while True:
        # pick the action with the highest predicted Q-value
        q_values = m.predict(state)
        action = np.argmax(q_values)
        state_no, reward, end, _ = env.step(action)
        total_reward += reward
        state = preprocess_state(state_no)
        if end:
            break
print('average reward: {}'.format(total_reward/test_episode))

average reward: 0.7005


---

### learning agent - experience replay with mini batch

In [17]:
from collections import deque

#### hyperparameters

In [18]:
# Hyperparameters for the experience-replay agent. These rebind the names set in the
# online section, so this cell must run before the cells below.
episodes = 10000
learning_rate = 0.1
discount = 0.99

# Exploration schedule, applied after every episode `ep` in the training loop:
#   epsilon = min_epsilon + (max_epsilon - min_epsilon) * exp(-epsilon_decay * ep)
max_epsilon = 1
min_epsilon = 0.01
epsilon_decay = 0.01
epsilon = max_epsilon

# Report (and reset) the running averages every `print_step` episodes.
print_step = 1000

In [19]:
# Transitions sampled per training step, and capacity of the replay buffer
# (used as the deque's `maxlen`, so the oldest transitions are dropped once it is full).
batch_size = 50
max_buf_size = 10000

In [20]:
# Fresh Q-network for the replay agent (replaces the online agent's model bound to `m`).
# The input size equals the number of discrete states, matching the one-hot state encoding.
m = nn.NNModel(env.observation_space.n, gd_upd.GDUpdaterNormal(learning_rate=learning_rate))

In [21]:
# Same architecture as the online agent: two fully connected layers without activation,
# the last one with one output per action.
# NOTE(review): the arguments are presumably (units, use_bias, activation) - confirm in simple_nn.
m.add_layer(nn_l.FCLayer(20, True, actv_func.ActivationNone()))
m.add_layer(nn_l.FCLayer(env.action_space.n, True, actv_func.ActivationNone()))

In [22]:
%%time

# Q-learning with experience replay: every step is stored in a bounded buffer and the
# model is trained on a random mini batch of stored transitions after each step.

# Per-episode statistics collected since the last report.
step_count_list = []
total_reward_list = []

# Bounded FIFO of (state, action, reward, new_state, end) tuples.
experience_buf = deque(maxlen=max_buf_size)

for ep in range(1, episodes+1):
    end = False
    step_count = 0
    total_reward = 0

    # start a new episode and encode its initial state
    new_state = preprocess_state(env.reset())

    # run until the environment reports the end of the episode
    while not end:
        # predict Q-values for the current state with the latest model
        # (only used for action selection; training happens on the sampled batch below)
        state = new_state
        q_values = m.predict(state)

        # epsilon-greedy action selection: exploit with probability (1 - epsilon)
        if np.random.rand() > epsilon:
            action = np.argmax(q_values)
        else:
            action = env.action_space.sample()

        # step the environment forward and accumulate episode statistics
        new_state_no, reward, end, _ = env.step(action)
        step_count += 1
        total_reward += reward

        # encode the successor state; it becomes `state` in the next iteration
        new_state = preprocess_state(new_state_no)

        # record the transition in the experience buffer
        experience_buf.append((state, action, reward, new_state, end))

        # train on a mini batch once the buffer holds more than `batch_size` transitions
        if len(experience_buf) > batch_size:
            # sample distinct buffer positions (no transition appears twice in a batch)
            batch_samples_idx = np.random.choice(np.arange(len(experience_buf)), 
                                                 size=batch_size, replace=False)
            batch_samples = [experience_buf[i] for i in batch_samples_idx]
            # split the sampled tuples into one array per field
            batch_states = np.array([sample[0] for sample in batch_samples])
            batch_action = np.array([sample[1] for sample in batch_samples])
            batch_reward = np.array([sample[2] for sample in batch_samples])
            batch_new_states = np.array([sample[3] for sample in batch_samples])
            batch_end = np.array([sample[4] for sample in batch_samples])
            # NOTE(review): model_forward (rather than predict) is used for the states being
            # trained on, presumably because update_model relies on state cached by the
            # forward pass - confirm in simple_nn.
            batch_q_values = m.model_forward(batch_states)
            batch_next_q_values = m.predict(batch_new_states)

            # Bellman targets: only the taken-action entry of each row differs from the prediction
            batch_tgt_q_values = calculate_batch_tgt_values(batch_action,
                                                            batch_q_values, 
                                                            batch_next_q_values, 
                                                            batch_reward, 
                                                            batch_end)

            # update the model from the (prediction, target) pairs
            m.update_model(batch_q_values, batch_tgt_q_values)

    # decay epsilon exponentially with the episode number
    epsilon = min_epsilon + (max_epsilon - min_epsilon)*np.exp(-epsilon_decay*ep)

    # record this episode's step count and total reward
    step_count_list.append(step_count)
    total_reward_list.append(total_reward)
    # every `print_step` episodes: print the averages over that window, then reset it
    if (ep)%print_step == 0:
        print('In episode {}, avg {} steps are used, avg total reward is {}.'.format(
            ep, sum(step_count_list)/print_step, sum(total_reward_list)/print_step))
        step_count_list.clear()
        total_reward_list.clear()
    

In episode 1000, avg 89.163 steps are used, avg total reward is 0.162.
In episode 2000, avg 96.15 steps are used, avg total reward is 0.385.
In episode 3000, avg 98.227 steps are used, avg total reward is 0.644.
In episode 4000, avg 95.054 steps are used, avg total reward is 0.841.
In episode 5000, avg 102.165 steps are used, avg total reward is 0.801.
In episode 6000, avg 93.573 steps are used, avg total reward is 0.812.
In episode 7000, avg 94.897 steps are used, avg total reward is 0.816.
In episode 8000, avg 92.835 steps are used, avg total reward is 0.812.
In episode 9000, avg 98.056 steps are used, avg total reward is 0.777.
In episode 10000, avg 96.09 steps are used, avg total reward is 0.783.
Wall time: 12min 56s


observe the trained model

In [23]:
# Render the board in the state the environment was left in by the training cell above.
env.render()

  (Right)
SFFFFFFF
FFFFFFFF
FFFHFFFF
FFFFFHFF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFF[41mG[0m


In [24]:
# Feed all 64 one-hot states at once (each identity row is one encoded state) and show
# the greedy action per state, laid out like the 8x8 board.
# NOTE(review): action codes are presumably 0=Left, 1=Down, 2=Right, 3=Up as in gym's
# FrozenLake - confirm against the installed gym version.
q_values = m.predict(np.identity(64))
np.argmax(q_values, axis=1).reshape(8,8)

array([[2, 2, 2, 2, 2, 2, 2, 2],
       [3, 2, 3, 3, 3, 2, 2, 2],
       [3, 3, 0, 3, 2, 3, 2, 2],
       [3, 3, 3, 3, 0, 3, 2, 2],
       [3, 3, 3, 0, 2, 1, 3, 2],
       [0, 3, 1, 1, 3, 0, 2, 2],
       [1, 0, 2, 3, 3, 0, 2, 2],
       [3, 3, 3, 2, 2, 1, 2, 0]], dtype=int32)

run with model

In [25]:
# Number of evaluation episodes played with the greedy policy.
test_episode = 10000

In [26]:
# Evaluate the trained model: always act greedily (no exploration) and report
# the reward averaged over `test_episode` episodes.
total_reward = 0
for _ in range(test_episode):
    state = preprocess_state(env.reset())
    end = False
    while True:
        # pick the action with the highest predicted Q-value
        q_values = m.predict(state)
        action = np.argmax(q_values)
        state_no, reward, end, _ = env.step(action)
        total_reward += reward
        state = preprocess_state(state_no)
        if end:
            break
print('average reward: {}'.format(total_reward/test_episode))

average reward: 0.848
