In [1]:
import numpy as np
import gym

In [2]:
# Switch every floating-point error category to 'warn' and keep whatever
# np.seterr returns (the previous settings) in old_settings.
old_settings = np.seterr(all='warn')

In [3]:
# Escalate floating-point errors to exceptions, except underflow which only warns.
np.seterr(all='raise', under='warn')

{'divide': 'warn', 'invalid': 'warn', 'over': 'warn', 'under': 'warn'}

### deep Q model

In [4]:
# model: global list with one entry per layer; model_init fills it with
# [weight matrix, bias vector, activation function, activation derivative]
model = []

In [5]:
# (re)build the global model from a list of layer specifications
def model_init(input_size, model_define):
    """Reset the global ``model`` and fill it with freshly initialised layers.

    Parameters
    ----------
    input_size : int
        Number of inputs feeding the first layer.
    model_define : iterable of (node_count, activation_func, activation_derivative)
        One triple per layer, in forward order.

    Each layer is stored as ``[weights, biases, activation_func,
    activation_derivative]``. Weights have shape (fan_in, node_count) and
    biases have shape (node_count,); both are drawn with ``np.random.randn``
    and divided by sqrt(fan_in).
    """
    model.clear()
    fan_in = input_size
    for width, act, act_grad in model_define:
        scale = np.sqrt(fan_in)
        # draw weights before biases so the random stream is consumed in a fixed order
        weights = np.random.randn(fan_in, width) / scale
        biases = np.random.randn(width) / scale
        model.append([weights, biases, act, act_grad])
        # this layer's width is the next layer's fan-in
        fan_in = width

In [6]:
# forward pass: push `state` through every layer of the current model
def model_forward(state):
    """Evaluate the global ``model`` on ``state``.

    Returns
    -------
    (output, layer_inputs)
        ``output`` is the result after the last layer. ``layer_inputs`` is a
        list holding, for every layer in order, the value that was fed into
        that layer (the first element is ``state`` itself).
    """
    layer_inputs = []
    activation = state
    for weights, biases, act, _ in model:
        # remember what went into this layer; back propagation needs it
        layer_inputs.append(activation)
        pre_activation = np.dot(activation, weights)
        pre_activation += biases
        # a falsy activation means the layer is purely linear
        activation = act(pre_activation) if act else pre_activation
    return activation, layer_inputs

In [7]:
# batch back propagation: turn output errors into per-layer gradients
def back_propagation(errs, hidden_layer_input_buf):
    """Back-propagate output errors through the global ``model``.

    Parameters
    ----------
    errs : array_like
        Error at the network output (target minus prediction), either a
        single vector or a (batch, outputs) array.
    hidden_layer_input_buf : list
        The per-layer inputs recorded by ``model_forward`` for the same
        sample(s): element ``i`` is what was fed into layer ``i``.

    Returns
    -------
    list of (weight_grad, bias_grad)
        One pair per layer, in model order. ``weight_grad`` is
        ``input.T @ delta`` divided by the batch size and ``bias_grad`` is the
        batch mean of ``delta``.

    Notes
    -----
    Activation derivatives are evaluated on the *activated* output of the
    layer they belong to (the convention ``sigmoid_derivative`` uses). The
    delta reaching layer ``i-1`` is therefore scaled by layer ``i-1``'s own
    derivative, evaluated on the input of layer ``i``. The output layer's
    derivative is applied to the incoming errors first; its activated output
    is recomputed from the recorded input because it is not part of
    ``hidden_layer_input_buf``. For layers without an activation the result
    is unchanged from the purely linear computation.
    """
    grads = []
    if not model:
        return grads
    delta = np.atleast_2d(errs)

    # Convert the error on the activated output into an error on the output
    # layer's pre-activation.
    last_weight, last_bias, last_activation, last_derivative = model[-1]
    if last_activation and last_derivative:
        last_input = np.atleast_2d(hidden_layer_input_buf[-1])
        output = last_activation(np.dot(last_input, last_weight) + last_bias)
        delta = delta * last_derivative(output)

    for idx in range(len(model) - 1, -1, -1):
        layer_in = np.atleast_2d(hidden_layer_input_buf[idx])
        grads.append((np.dot(layer_in.T, delta) / delta.shape[0],
                      delta.mean(axis=0)))
        if idx == 0:
            # nothing upstream of the first layer needs a delta
            break
        # push the error back through this layer's weights ...
        delta = np.dot(delta, model[idx][0].T)
        # ... then through the activation of the layer that produced layer_in
        prev_derivative = model[idx - 1][3]
        if prev_derivative:
            delta = delta * prev_derivative(layer_in)
    grads.reverse()
    return grads

In [8]:
# apply one gradient step to every layer of the model
def update_model(grads):
    """Add ``learning_rate``-scaled gradients to the global ``model``.

    ``grads[i]`` supplies the (weight, bias) gradients for layer ``i``.
    The weight and bias entries of each layer are rebound to new arrays;
    the previous arrays are not modified in place.
    """
    for idx, layer in enumerate(model):
        layer_grads = grads[idx]
        layer[0] = layer[0] + learning_rate * layer_grads[0]
        layer[1] = layer[1] + learning_rate * layer_grads[1]

### common operations for RL

In [9]:
def calculate_batch_td_errs(batch_action,
                            batch_q_values,
                            batch_next_q_values,
                            batch_reward,
                            batch_end):
    """Build the TD-error matrix for a batch of transitions.

    For every row the target is
    ``reward + discount * max(next_q_values) * (1 - end)``, so the bootstrap
    term vanishes where ``batch_end`` is true. The returned array has the
    shape of ``batch_q_values`` and is zero everywhere except at the column
    of the action taken in each row, where it holds target minus the current
    Q value.
    """
    rows = np.arange(len(batch_q_values))
    best_next_q = np.max(batch_next_q_values, axis=1)
    not_done = 1 - batch_end
    tgt_rewards = batch_reward + discount * best_next_q * not_done

    td_errs = np.zeros(batch_q_values.shape)
    taken_q = batch_q_values[rows, batch_action]
    td_errs[rows, batch_action] = tgt_rewards - taken_q
    return td_errs

### gym environment

In [10]:
from gym.envs.registration import register
# Register an extra FrozenLake id that builds FrozenLakeEnv with
# map_name='8x8' and is_slippery=False.
register(
    id='FrozenLakeNotSlippery-v0',
    entry_point='gym.envs.toy_text:FrozenLakeEnv',
    kwargs={'map_name' : '8x8', 'is_slippery': False},
)

In [11]:
# Environment id passed to gym.make; the commented lines are alternative ids
# that can be swapped in instead.
env_name = 'FrozenLake-v0'
# env_name = 'FrozenLake8x8-v0'
# env_name = 'FrozenLakeNotSlippery-v0'

In [12]:
# Create the environment selected by env_name; every later cell uses this instance.
env = gym.make(env_name)

### behavior definitions

In [13]:
# binary expand (one-hot encode) a categorical value to preprocess states
def binary_expand(n, idx=env.observation_space.n):
    """Return a one-hot style vector marking where ``idx`` equals ``n``.

    Parameters
    ----------
    n : scalar
        The category to encode (e.g. a discrete state number).
    idx : int or array_like, optional
        Either the number of categories (expanded to ``0 .. idx-1``) or an
        explicit sequence of category values. The default is read from
        ``env.observation_space.n`` once, when this function is defined.

    Returns
    -------
    numpy.ndarray
        Float array with the shape of the category array: 1 where the
        category equals ``n`` and 0 elsewhere.
    """
    # Accept NumPy integer scalars as well as Python ints: an exact
    # `type(idx) is int` check lets e.g. np.int64 counts fall through and be
    # treated as an already-built category array.
    if isinstance(idx, (int, np.integer)):
        idx = np.arange(idx)
    else:
        # allow plain lists/tuples of category values
        idx = np.asarray(idx)
    res = np.zeros(idx.shape)
    res[n==idx] = 1
    return res

preprocess_func = binary_expand

In [14]:
def binary_expand_mod(n, idx=env.observation_space.n):
    """One-hot encode ``n`` by selecting rows of an identity matrix.

    Parameters
    ----------
    n : scalar
        The category to encode.
    idx : int or array_like, optional
        Either the number of categories (expanded to ``0 .. idx-1``) or an
        explicit sequence of category values. The default is read from
        ``env.observation_space.n`` once, when this function is defined.

    Returns
    -------
    numpy.ndarray
        2-D array containing one identity-matrix row for every position where
        the category equals ``n`` (a single row when ``n`` matches exactly one
        category, no rows when it matches none).
    """
    # Accept NumPy integer scalars as well as Python ints; an exact
    # `type(idx) is int` check would let np.int64 counts reach len() and fail.
    if isinstance(idx, (int, np.integer)):
        idx = np.arange(idx)
    else:
        # allow plain lists/tuples of category values
        idx = np.asarray(idx)
    return np.identity(len(idx))[np.where(n==idx)]

In [15]:
# activation function: sigmoid
def sigmoid(v):
    """Element-wise logistic function of the array ``v``.

    The input is clipped to [-500, 500] before exponentiation so that
    ``np.exp`` stays within floating-point range.
    """
    bounded = v.clip(min=-500, max=500)
    return 1.0 / (1.0 + np.exp(-bounded))

def sigmoid_derivative(sig_v):
    """Derivative of the sigmoid, expressed through its output.

    ``sig_v`` is the already-activated value ``sigmoid(x)``, not ``x``.
    """
    complement = 1 - sig_v
    return sig_v * complement

In [16]:
# activation function: ReLU
def relu(v):
    """Element-wise rectifier: values of the array ``v`` below zero become zero."""
    floor = 0
    return v.clip(min=floor)

def relu_derivative(v):
    """Return 1 where ``v`` is strictly positive and 0 elsewhere."""
    is_positive = v > 0
    return np.where(is_positive, 1, 0)

In [17]:
# nn model defined layer by layer; each entry is
# (node_count, activation_function, activation_derivative).
# None for the two callables makes the layer purely linear.
model_define = [
#     (20, relu, relu_derivative),
#     (env.action_space.n, sigmoid, sigmoid_derivative),
    (20, None, None),
    (env.action_space.n, None, None),
]

In [18]:
# preprocess_state converts the raw state returned by the environment into the
# network input; it is an alias of the preprocess_func chosen earlier
preprocess_state = preprocess_func

### learning agent - online

#### hyperparameters

In [19]:
# number of training episodes
episodes = 30000
# step size applied to the gradients in update_model
learning_rate = 0.02
# discount factor applied to the best next-state Q value
discount = 0.99

# epsilon-greedy exploration: epsilon starts at max_epsilon and, after each
# episode, is set to min_epsilon + (max_epsilon - min_epsilon) * exp(-epsilon_decay * ep)
max_epsilon = 1
min_epsilon = 0.01
epsilon_decay = 0.001
epsilon = max_epsilon

# report averaged statistics every print_step episodes
print_step = 1000

In [20]:
# (re)initialise the network: one input per discrete state, layers as listed in model_define
model_init(env.observation_space.n, model_define)

In [21]:
# Online Q-learning: after every environment step the network is updated from
# that single transition.
# Per-episode statistics, accumulated between two progress reports.
step_count_list = []
total_reward_list = []
for ep in range(1, episodes+1):
    end = False
    step_count = 0
    total_reward = 0

    # startup state
    new_state = preprocess_state(env.reset())

    # run until game end
    while not end:
        # predict with the latest model; keep the per-layer inputs for back propagation
        state = new_state
        q_values, hidden_layer_input_buf = model_forward(state)

        # epsilon-greedy action selection: greedy with probability 1 - epsilon
        if np.random.rand() > epsilon:
            action = np.argmax(q_values)
        else:
            action = env.action_space.sample()

        # step forward
        new_state_no, reward, end, _ = env.step(action)
        step_count += 1
        total_reward += reward

        # save the (preprocessed) new state for the next step
        new_state = preprocess_state(new_state_no)

        # calculate error for back propagation with the Bellman equation:
        # only the taken action gets a non-zero error, and (not end) removes
        # the bootstrap term on the final transition of an episode
        td_err = np.zeros_like(q_values)
        next_q_values, _ = model_forward(new_state)
        td_err[action] = (reward + discount * np.max(next_q_values) * (not end) - q_values[action])

        # back propagation with td error
        grads = back_propagation(td_err, hidden_layer_input_buf)

        # update model with gradients
        update_model(grads)

    # update epsilon: exponential decay from max_epsilon towards min_epsilon
    epsilon = min_epsilon + (max_epsilon - min_epsilon)*np.exp(-epsilon_decay*ep)

    # record step counts and rewards
    step_count_list.append(step_count)
    total_reward_list.append(total_reward)
    # print progress averaged over the last print_step episodes, then reset the window
    if (ep)%print_step == 0:
        print('In episode {}, avg {} steps are used, avg total reward is {}.'.format(
            ep, sum(step_count_list)/print_step, sum(total_reward_list)/print_step))
        step_count_list.clear()
        total_reward_list.clear()
    

In episode 1000, avg 11.014 steps are used, avg total reward is 0.037.
In episode 2000, avg 21.638 steps are used, avg total reward is 0.18.
In episode 3000, avg 32.584 steps are used, avg total reward is 0.427.
In episode 4000, avg 39.059 steps are used, avg total reward is 0.532.
In episode 5000, avg 42.098 steps are used, avg total reward is 0.611.
In episode 6000, avg 43.445 steps are used, avg total reward is 0.673.
In episode 7000, avg 44.35 steps are used, avg total reward is 0.638.
In episode 8000, avg 43.389 steps are used, avg total reward is 0.674.
In episode 9000, avg 43.227 steps are used, avg total reward is 0.688.
In episode 10000, avg 41.921 steps are used, avg total reward is 0.688.
In episode 11000, avg 43.802 steps are used, avg total reward is 0.66.
In episode 12000, avg 42.588 steps are used, avg total reward is 0.717.
In episode 13000, avg 42.462 steps are used, avg total reward is 0.682.
In episode 14000, avg 42.627 steps are used, avg total reward is 0.71.
In ep

Observe the model

In [22]:
# render the environment in the state the last episode left it in
env.render()

  (Down)
SFFF
FHFH
FFFH
HFF[41mG[0m


In [23]:
# Greedy action for every state: each row of the identity matrix is one
# one-hot state, so a single batched forward pass scores all of them.
# NOTE: the 16 states and the (4,4) layout are hard-coded for a 4x4 grid.
q_values, _ = model_forward(np.identity(16))
np.argmax(q_values, axis=1).reshape(4,4)

array([[0, 3, 3, 3],
       [0, 2, 0, 3],
       [3, 1, 0, 1],
       [3, 2, 1, 0]], dtype=int32)

Run with the trained model

In [24]:
# number of evaluation episodes
test_episode = 10000

In [25]:
# Evaluate the current model with a purely greedy policy (always argmax, no
# exploration) and report the reward averaged over test_episode episodes.
total_reward = 0
for _ in range(test_episode):
    end = False
    state = preprocess_state(env.reset())
    while not end:
        q_values, _ = model_forward(state)
        action = np.argmax(q_values)
        state_no, reward, end, _ = env.step(action)
        state = preprocess_state(state_no)
        # rewards are summed across all episodes
        total_reward += reward
print('average reward: {}'.format(total_reward/test_episode))

average reward: 0.7376


---

### learning agent - experience replay with mini batch

In [26]:
from collections import deque

#### hyperparameters

In [27]:
# number of training episodes
episodes = 10000
# step size applied to the gradients in update_model
learning_rate = 0.1
# discount factor applied to the best next-state Q value
discount = 0.99

# epsilon-greedy exploration: epsilon starts at max_epsilon and, after each
# episode, is set to min_epsilon + (max_epsilon - min_epsilon) * exp(-epsilon_decay * ep)
max_epsilon = 1
min_epsilon = 0.01
epsilon_decay = 0.01
epsilon = max_epsilon

# report averaged statistics every print_step episodes
print_step = 1000

In [28]:
# number of transitions sampled from the experience buffer per training step
batch_size = 50
# capacity of the experience buffer (used as the deque's maxlen)
max_buf_size = 10000

In [29]:
# start from a freshly initialised network (model_init clears the existing model first)
model_init(env.observation_space.n, model_define)

In [30]:
# Q-learning with experience replay: every transition is stored, and after
# each environment step the network is trained on a random mini batch drawn
# from the stored transitions.
# Per-episode statistics, accumulated between two progress reports.
step_count_list = []
total_reward_list = []

# bounded buffer of (state, action, reward, new_state, end) tuples
experience_buf = deque(maxlen=max_buf_size)

for ep in range(1, episodes+1):
    end = False
    step_count = 0
    total_reward = 0

    # startup state
    new_state = preprocess_state(env.reset())

    # run until game end
    while not end:
        # predict with the latest model
        state = new_state
        q_values, _ = model_forward(state)

        # epsilon-greedy action selection: greedy with probability 1 - epsilon
        if np.random.rand() > epsilon:
            action = np.argmax(q_values)
        else:
            action = env.action_space.sample()

        # step forward
        new_state_no, reward, end, _ = env.step(action)
        step_count += 1
        total_reward += reward

        # save the (preprocessed) new state for the next step
        new_state = preprocess_state(new_state_no)

        # record to the experience buf
        experience_buf.append((state, action, reward, new_state, end))

        # sample and mini batch training, once the buffer holds more than one batch
        if len(experience_buf) > batch_size:
            # draw batch_size distinct buffer positions
            batch_samples_idx = np.random.choice(np.arange(len(experience_buf)), 
                                                 size=batch_size, replace=False)
            batch_samples = [experience_buf[i] for i in batch_samples_idx]
            # split the sampled tuples into one array per field
            batch_states = np.array([sample[0] for sample in batch_samples])
            batch_action = np.array([sample[1] for sample in batch_samples])
            batch_reward = np.array([sample[2] for sample in batch_samples])
            batch_new_states = np.array([sample[3] for sample in batch_samples])
            batch_end = np.array([sample[4] for sample in batch_samples])
            # Q values of the sampled states (with layer inputs for back
            # propagation) and of their successor states
            batch_q_values, batch_hidden_layers = model_forward(batch_states)
            batch_next_q_values, _ = model_forward(batch_new_states)

            # calculate error for back propagation with the Bellman equation
            batch_td_err = calculate_batch_td_errs(batch_action,
                                                   batch_q_values, 
                                                   batch_next_q_values, 
                                                   batch_reward, 
                                                   batch_end)

            # back propagation with td error
            grads = back_propagation(batch_td_err, batch_hidden_layers)

            # update model with gradients
            update_model(grads)

    # update epsilon: exponential decay from max_epsilon towards min_epsilon
    epsilon = min_epsilon + (max_epsilon - min_epsilon)*np.exp(-epsilon_decay*ep)

    # record step counts and rewards
    step_count_list.append(step_count)
    total_reward_list.append(total_reward)
    # print progress averaged over the last print_step episodes, then reset the window
    if (ep)%print_step == 0:
        print('In episode {}, avg {} steps are used, avg total reward is {}.'.format(
            ep, sum(step_count_list)/print_step, sum(total_reward_list)/print_step))
        step_count_list.clear()
        total_reward_list.clear()
    

In episode 1000, avg 35.227 steps are used, avg total reward is 0.509.
In episode 2000, avg 41.4 steps are used, avg total reward is 0.681.
In episode 3000, avg 43.389 steps are used, avg total reward is 0.667.
In episode 4000, avg 43.945 steps are used, avg total reward is 0.685.
In episode 5000, avg 44.573 steps are used, avg total reward is 0.669.
In episode 6000, avg 41.986 steps are used, avg total reward is 0.667.
In episode 7000, avg 42.284 steps are used, avg total reward is 0.699.
In episode 8000, avg 43.899 steps are used, avg total reward is 0.657.
In episode 9000, avg 43.636 steps are used, avg total reward is 0.713.
In episode 10000, avg 41.808 steps are used, avg total reward is 0.656.


Observe the model

In [31]:
# render the environment in the state the last episode left it in
env.render()

  (Right)
SFFF
FHF[41mH[0m
FFFH
HFFG


In [32]:
# Greedy action for every state: each row of the identity matrix is one
# one-hot state, so a single batched forward pass scores all of them.
# NOTE: the 16 states and the (4,4) layout are hard-coded for a 4x4 grid.
q_values, _ = model_forward(np.identity(16))
np.argmax(q_values, axis=1).reshape(4,4)

array([[0, 3, 3, 3],
       [0, 0, 2, 2],
       [3, 1, 0, 3],
       [0, 2, 1, 2]], dtype=int32)

Run with the trained model

In [35]:
# number of evaluation episodes
test_episode = 10000

In [36]:
# Evaluate the current model with a purely greedy policy (always argmax, no
# exploration) and report the reward averaged over test_episode episodes.
total_reward = 0
for _ in range(test_episode):
    end = False
    state = preprocess_state(env.reset())
    while not end:
        q_values, _ = model_forward(state)
        action = np.argmax(q_values)
        state_no, reward, end, _ = env.step(action)
        state = preprocess_state(state_no)
        # rewards are summed across all episodes
        total_reward += reward
print('average reward: {}'.format(total_reward/test_episode))

average reward: 0.7393
