In [1]:
import numpy as np
import gym

In [2]:
import simple_nn.simple_nn as snn
import simple_nn.gd_optimizor as gd_opt
import simple_nn.activation_function as actv_func

### gym environment

In [3]:
# Gym environment id; handed to gym.make() when the environment is created.
env_name = 'FrozenLake-v0'

In [4]:
# Create the environment. `env` is shared by all later cells: its observation/action
# space sizes shape the encoder and the network, and it is reset/stepped/rendered there.
env = gym.make(env_name)

### Common operations for RL

In [35]:
def calculate_batch_tgt_values(batch_action,
                               batch_q_values,
                               batch_next_q_values,
                               batch_reward,
                               batch_end,
                               gamma=None):
    """Build the Q-learning regression targets for a mini batch of transitions.

    For every sample the target equals the current prediction, except for the
    entry of the action actually taken, which is replaced by the Bellman
    target ``reward + gamma * max_a' Q(next_state, a')``.  The bootstrapped
    term is dropped for samples whose ``batch_end`` flag is true.

    Parameters
    ----------
    batch_action : array-like of int, shape (B,)
        Index of the action taken in each sample.
    batch_q_values : array-like, shape (B, n_actions)
        Current Q-value predictions for the sampled states.  Not modified.
    batch_next_q_values : array-like, shape (B, n_actions)
        Q-value predictions for the successor states.
    batch_reward : array-like, shape (B,)
        Reward observed in each sample.
    batch_end : array-like of bool (or 0/1), shape (B,)
        True where the transition ended the episode.
    gamma : float, optional
        Discount factor.  When omitted, the notebook-level ``discount``
        variable is used, which is what this function always did before the
        parameter existed.

    Returns
    -------
    numpy.ndarray, shape (B, n_actions)
        A new array holding the training targets.
    """
    if gamma is None:
        # Backward-compatible default: fall back to the global hyperparameter.
        gamma = discount

    batch_action = np.asarray(batch_action)
    batch_reward = np.asarray(batch_reward)
    # 1.0 where the episode continues, 0.0 where the transition was terminal.
    not_done = 1.0 - np.asarray(batch_end, dtype=float)

    tgt_rewards = batch_reward + gamma * np.max(batch_next_q_values, axis=1) * not_done

    # np.array copies by default, so the caller's predictions stay untouched
    # (and plain nested lists are accepted as well).
    tgt_values = np.array(batch_q_values)
    tgt_values[np.arange(len(tgt_values)), batch_action] = tgt_rewards

    return tgt_values

### Behavior definition

In [5]:
# binary expand categorical type to preprocess states
def binary_expand(n, idx=env.observation_space.n):
    """One-hot encode the category ``n``.

    Parameters
    ----------
    n : scalar
        The category value to encode (here: the discrete state number).
    idx : int or array-like, optional
        Either the number of categories, in which case the categories are
        ``0 .. idx-1``, or an explicit array of category values.  Defaults to
        the size of the environment's observation space (evaluated once, when
        this function is defined).

    Returns
    -------
    numpy.ndarray
        Float array with the same shape as the category array, holding 1
        where the category equals ``n`` and 0 elsewhere.
    """
    # Accept NumPy integer scalars as well as built-in ints: a strict
    # ``type(idx) is int`` check sends e.g. an np.int64 count down the
    # "already an array" path, where it yields a 0-d result.
    if isinstance(idx, (int, np.integer)):
        idx = np.arange(idx)
    else:
        idx = np.asarray(idx)
    res = np.zeros(idx.shape)
    res[n == idx] = 1
    return res

In [6]:
def binary_expand_mod(n, idx=env.observation_space.n):
    """One-hot encode ``n`` by selecting rows of an identity matrix.

    Parameters
    ----------
    n : scalar
        The category value to encode.
    idx : int or array-like, optional
        Either the number of categories (categories are ``0 .. idx-1``) or an
        explicit 1-D array of category values.  Defaults to the size of the
        environment's observation space (evaluated once, at definition time).

    Returns
    -------
    numpy.ndarray
        2-D array with one identity-matrix row per position where ``idx``
        equals ``n`` (so shape ``(1, len(idx))`` for a single match and
        ``(0, len(idx))`` when nothing matches).  Note this differs from
        ``binary_expand``, which returns a 1-D vector.
    """
    # Accept NumPy integer scalars too; they have no len() and would
    # otherwise fail in the identity-matrix construction below.
    if isinstance(idx, (int, np.integer)):
        idx = np.arange(idx)
    else:
        idx = np.asarray(idx)
    return np.identity(len(idx))[np.where(n == idx)]

In [7]:
# Encoder applied to every raw state number returned by env.reset()/env.step()
# before it is fed to the model; the one-hot `binary_expand` is used.
preprocess_state = binary_expand

---

### learning agent - online

#### hyperparameters

In [8]:
# number of training episodes run by the online training loop
episodes = 15000
# learning rate handed to the gradient-descent optimizer
learning_rate = 0.02
# discount factor applied to the bootstrapped next-state value in the Bellman target
discount = 0.99

# epsilon-greedy exploration schedule: epsilon starts at max_epsilon and, after each
# episode, is set to min_epsilon + (max_epsilon - min_epsilon) * exp(-epsilon_decay * episode)
max_epsilon = 1
min_epsilon = 0.01
epsilon_decay = 0.001
epsilon = max_epsilon

# averaged statistics are printed every `print_step` episodes
print_step = 1000

In [21]:
# Layer stack for the Q-network; the last layer is sized by the number of actions so the
# model output can be indexed by action.
# NOTE(review): presumably NNLayer's first argument is the layer width and the second
# enables a bias term; ActivationNone presumably means no non-linearity — confirm in simple_nn.
layers = [
    snn.NNLayer(20, True, actv_func.ActivationNone()),
    snn.NNLayer(env.action_space.n, True, actv_func.ActivationNone()),
]

In [22]:
# Q-network for the online agent, built from `layers` and an optimizer using `learning_rate`.
# NOTE(review): the first argument is presumably the input size (the one-hot state length) — confirm.
m = snn.NNModel(env.observation_space.n, layers, gd_opt.GDOptimizerNone(learning_rate=learning_rate))

In [23]:
# Online Q-learning: the model is updated after every environment step, using only the
# transition that was just observed.
# Per-episode statistics; both lists are emptied each time a report is printed.
step_count_list = []
total_reward_list = []
for ep in range(1, episodes+1):
    end = False
    step_count = 0
    total_reward = 0
    
    # startup state (encoded with preprocess_state)
    new_state = preprocess_state(env.reset())
    
    # run until game end
    while not end:
        # predict with the latest model
        # NOTE(review): model_forward is used here while predict is used for the next
        # state below — presumably model_forward keeps what update_model needs; confirm
        # in simple_nn that the intermediate predict call does not disturb it.
        state = new_state
        q_values = m.model_forward(state)
        
        # epsilon-greedy action selection: greedy with probability (1 - epsilon),
        # otherwise a random action sampled from the action space
        if np.random.rand() > epsilon:
            action = np.argmax(q_values)
        else:
            action = env.action_space.sample()

        # step forward
        new_state_no, reward, end, _ = env.step(action)
        step_count += 1
        total_reward += reward
        
        # encode the observed next state;
        # it becomes `state` at the top of the next iteration
        new_state = preprocess_state(new_state_no)
        
        # Bellman target: identical to the prediction except for the taken action, whose
        # entry becomes reward + discount * max next Q; `(not end)` zeroes the bootstrap
        # term when the step ended the episode
        tgt_q_values = q_values.copy()
        next_q_values = m.predict(new_state)
        tgt_q_values[action] = reward + discount * np.max(next_q_values) * (not end)
                
        # update the model from the (prediction, target) pair
        m.update_model(q_values, tgt_q_values)
    
    # update epsilon: exponential decay from max_epsilon towards min_epsilon in the episode index
    epsilon = min_epsilon + (max_epsilon - min_epsilon)*np.exp(-epsilon_decay*ep)
    
    # record step counts and total reward of this episode
    step_count_list.append(step_count)
    total_reward_list.append(total_reward)
    # every print_step episodes: print the averages over the episodes since the last
    # report, then start a new window
    if (ep)%print_step == 0:
        print('In episode {}, avg {} steps are used, avg total reward is {}.'.format(
            ep, sum(step_count_list)/print_step, sum(total_reward_list)/print_step))
        step_count_list.clear()
        total_reward_list.clear()
    

In episode 1000, avg 11.364 steps are used, avg total reward is 0.046.
In episode 2000, avg 21.996 steps are used, avg total reward is 0.163.
In episode 3000, avg 31.066 steps are used, avg total reward is 0.389.
In episode 4000, avg 39.125 steps are used, avg total reward is 0.547.
In episode 5000, avg 40.672 steps are used, avg total reward is 0.635.
In episode 6000, avg 42.074 steps are used, avg total reward is 0.657.
In episode 7000, avg 40.993 steps are used, avg total reward is 0.695.
In episode 8000, avg 42.601 steps are used, avg total reward is 0.666.
In episode 9000, avg 42.897 steps are used, avg total reward is 0.697.
In episode 10000, avg 43.027 steps are used, avg total reward is 0.7.
In episode 11000, avg 43.731 steps are used, avg total reward is 0.689.
In episode 12000, avg 42.778 steps are used, avg total reward is 0.693.
In episode 13000, avg 43.658 steps are used, avg total reward is 0.678.
In episode 14000, avg 43.855 steps are used, avg total reward is 0.67.
In e

Observe the model

In [24]:
# Render the environment in the state it was left in by the last step taken above.
env.render()

  (Down)
SFFF
FHFH
FFFH
HFF[41mG[0m


In [25]:
# Each row of the 16x16 identity matrix is the one-hot encoding of one state, so this
# predicts the Q-values of all 16 states in a single call.
q_values = m.predict(np.identity(16))
# Greedy action index per state, laid out as the 4x4 grid.
np.argmax(q_values, axis=1).reshape(4,4)

array([[0, 3, 3, 3],
       [0, 3, 2, 0],
       [3, 1, 0, 2],
       [1, 2, 1, 0]], dtype=int32)

Run with the trained model

In [26]:
# number of evaluation episodes played with the greedy policy
test_episode = 10000

In [27]:
# Evaluate the current model `m`: play `test_episode` episodes always taking the
# argmax action (no exploration, no model updates) and report the mean episode reward.
total_reward = 0
for _ in range(test_episode):
    end = False
    state = preprocess_state(env.reset())
    while not end:
        q_values = m.predict(state)
        action = np.argmax(q_values)
        # note: `_` (the unused 4th return value) reuses the outer loop variable's name;
        # harmless because the loop variable is never read
        state_no, reward, end, _ = env.step(action)
        state = preprocess_state(state_no)
        # rewards are accumulated across all episodes
        total_reward += reward
print('average reward: {}'.format(total_reward/test_episode))

average reward: 0.7408


---

### learning agent - experience replay with mini batch

In [29]:
from collections import deque

#### hyperparameters

In [30]:
# These assignments overwrite the values used by the online agent earlier in the notebook.
# number of training episodes run by the experience-replay training loop
episodes = 10000
# learning rate handed to the gradient-descent optimizer
learning_rate = 0.1
# discount factor applied to the bootstrapped next-state value in the Bellman target
discount = 0.99

# epsilon-greedy exploration schedule: epsilon starts at max_epsilon and, after each
# episode, is set to min_epsilon + (max_epsilon - min_epsilon) * exp(-epsilon_decay * episode)
max_epsilon = 1
min_epsilon = 0.01
epsilon_decay = 0.01
epsilon = max_epsilon

# averaged statistics are printed every `print_step` episodes
print_step = 1000

In [31]:
# number of stored transitions sampled for each training step
batch_size = 50
# capacity of the experience buffer (used as the deque's maxlen)
max_buf_size = 10000

In [32]:
# Fresh layer stack for the experience-replay agent (same configuration as the online one);
# the last layer is sized by the number of actions.
# NOTE(review): presumably NNLayer's first argument is the layer width and the second
# enables a bias term; ActivationNone presumably means no non-linearity — confirm in simple_nn.
layers = [
    snn.NNLayer(20, True, actv_func.ActivationNone()),
    snn.NNLayer(env.action_space.n, True, actv_func.ActivationNone()),
]

In [33]:
# New Q-network for the experience-replay agent; rebinding `m` replaces the online agent's model.
# NOTE(review): the first argument is presumably the input size (the one-hot state length) — confirm.
m = snn.NNModel(env.observation_space.n, layers, gd_opt.GDOptimizerNone(learning_rate=learning_rate))

In [36]:
# Q-learning with experience replay: every transition is stored, and each environment
# step is followed by one update on a random mini batch of stored transitions.
# Per-episode statistics; both lists are emptied each time a report is printed.
step_count_list = []
total_reward_list = []

# bounded buffer of (state, action, reward, new_state, end) tuples; once max_buf_size
# entries are stored, appending drops the oldest one
experience_buf = deque(maxlen=max_buf_size)

for ep in range(1, episodes+1):
    end = False
    step_count = 0
    total_reward = 0
    
    # startup state (encoded with preprocess_state)
    new_state = preprocess_state(env.reset())
    
    # run until game end
    while not end:
        # predict with the latest model (only used for action selection here)
        state = new_state
        q_values = m.predict(state)
        
        # epsilon-greedy action selection: greedy with probability (1 - epsilon),
        # otherwise a random action sampled from the action space
        if np.random.rand() > epsilon:
            action = np.argmax(q_values)
        else:
            action = env.action_space.sample()

        # step forward
        new_state_no, reward, end, _ = env.step(action)
        step_count += 1
        total_reward += reward
        
        # encode the observed next state;
        # it becomes `state` at the top of the next iteration
        new_state = preprocess_state(new_state_no)
        
        # record to the experience buf
        experience_buf.append((state, action, reward, new_state, end))
        
        # sample and mini batch training; starts once the buffer holds more than
        # batch_size transitions
        if len(experience_buf) > batch_size:
            # batch_size distinct buffer positions, drawn uniformly
            batch_samples_idx = np.random.choice(np.arange(len(experience_buf)), 
                                                 size=batch_size, replace=False)
            batch_samples = [experience_buf[i] for i in batch_samples_idx]
            # split the sampled tuples into one array per field
            batch_states = np.array([sample[0] for sample in batch_samples])
            batch_action = np.array([sample[1] for sample in batch_samples])
            batch_reward = np.array([sample[2] for sample in batch_samples])
            batch_new_states = np.array([sample[3] for sample in batch_samples])
            batch_end = np.array([sample[4] for sample in batch_samples])
            # NOTE(review): model_forward for the states being trained on, predict for
            # the successor states — presumably model_forward keeps what update_model
            # needs; confirm in simple_nn that the predict call does not disturb it.
            batch_q_values = m.model_forward(batch_states)
            batch_next_q_values = m.predict(batch_new_states)

            # calculate error for back propagation with Bellman equation
            batch_tgt_q_values = calculate_batch_tgt_values(batch_action,
                                                            batch_q_values, 
                                                            batch_next_q_values, 
                                                            batch_reward, 
                                                            batch_end)

            # update the model from the (prediction, target) batch
            m.update_model(batch_q_values, batch_tgt_q_values)
    
    # update epsilon: exponential decay from max_epsilon towards min_epsilon in the episode index
    epsilon = min_epsilon + (max_epsilon - min_epsilon)*np.exp(-epsilon_decay*ep)
    
    # record step counts and total reward of this episode
    step_count_list.append(step_count)
    total_reward_list.append(total_reward)
    # every print_step episodes: print the averages over the episodes since the last
    # report, then start a new window
    if (ep)%print_step == 0:
        print('In episode {}, avg {} steps are used, avg total reward is {}.'.format(
            ep, sum(step_count_list)/print_step, sum(total_reward_list)/print_step))
        step_count_list.clear()
        total_reward_list.clear()
    

In episode 1000, avg 34.965 steps are used, avg total reward is 0.515.
In episode 2000, avg 43.662 steps are used, avg total reward is 0.69.
In episode 3000, avg 44.921 steps are used, avg total reward is 0.66.
In episode 4000, avg 43.682 steps are used, avg total reward is 0.7.
In episode 5000, avg 44.235 steps are used, avg total reward is 0.692.
In episode 6000, avg 42.703 steps are used, avg total reward is 0.704.
In episode 7000, avg 43.807 steps are used, avg total reward is 0.656.
In episode 8000, avg 44.369 steps are used, avg total reward is 0.659.
In episode 9000, avg 42.236 steps are used, avg total reward is 0.691.
In episode 10000, avg 43.599 steps are used, avg total reward is 0.679.


Observe the model

In [37]:
# Render the environment in the state it was left in by the last step taken above.
env.render()

  (Left)
SFFF
F[41mH[0mFH
FFFH
HFFG


In [38]:
# Each row of the 16x16 identity matrix is the one-hot encoding of one state, so this
# predicts the Q-values of all 16 states in a single call.
q_values = m.predict(np.identity(16))
# Greedy action index per state, laid out as the 4x4 grid.
np.argmax(q_values, axis=1).reshape(4,4)

array([[0, 3, 3, 3],
       [0, 3, 0, 1],
       [3, 1, 0, 2],
       [3, 2, 1, 0]], dtype=int32)

Run with the trained model

In [39]:
# number of evaluation episodes played with the greedy policy
test_episode = 10000

In [40]:
# Evaluate the current model `m`: play `test_episode` episodes always taking the
# argmax action (no exploration, no model updates) and report the mean episode reward.
total_reward = 0
for _ in range(test_episode):
    end = False
    state = preprocess_state(env.reset())
    while not end:
        q_values = m.predict(state)
        action = np.argmax(q_values)
        # note: `_` (the unused 4th return value) reuses the outer loop variable's name;
        # harmless because the loop variable is never read
        state_no, reward, end, _ = env.step(action)
        state = preprocess_state(state_no)
        # rewards are accumulated across all episodes
        total_reward += reward
print('average reward: {}'.format(total_reward/test_episode))

average reward: 0.7337
