In [1]:
import numpy as np
import gym

In [2]:
# Report every floating-point error category as a warning. np.seterr returns
# the settings that were active before the call; they are kept in old_settings
# (nothing in the visible cells restores them).
old_settings = np.seterr(all='warn')

In [3]:
# Turn divide/overflow/invalid into FloatingPointError, but only warn on
# underflow. The dict echoed as cell output is the previous configuration.
np.seterr(all='raise', under='warn')

{'divide': 'warn', 'invalid': 'warn', 'over': 'warn', 'under': 'warn'}

gym environment

In [4]:
from gym.envs.registration import register

# Register a 4x4 FrozenLake variant with slipping disabled under its own id.
register(
    id='FrozenLakeNotSlippery-v0',
    entry_point='gym.envs.toy_text:FrozenLakeEnv',
    kwargs=dict(map_name='4x4', is_slippery=False),
)

In [5]:
# Environment id handed to gym.make; swap the two lines to use the
# non-slippery variant instead.
env_name = 'FrozenLake-v0'
# env_name = 'FrozenLakeNotSlippery-v0'

In [6]:
# Build the environment selected by env_name.
env = gym.make(env_name)

behavior definitions

In [7]:
# one-hot ("binary expand") encoding of a categorical state for the network input
def binary_expand(n, idx=None):
    """Return a 0/1 float vector marking where the category labels equal ``n``.

    Parameters
    ----------
    n : int
        Category (state number) to encode.
    idx : int or array-like, optional
        Either the number of categories (any Python or NumPy integer), which
        is expanded to the labels ``0 .. idx-1``, or an explicit array of
        category labels. When omitted, ``env.observation_space.n`` is looked
        up at call time.

    Returns
    -------
    numpy.ndarray
        Float array shaped like the label array: 1.0 where the label equals
        ``n`` and 0.0 everywhere else.
    """
    if idx is None:
        idx = env.observation_space.n
    labels = np.asarray(idx)
    if labels.ndim == 0:
        # A scalar is a category count. Testing `type(idx) is int` missed
        # NumPy integers (e.g. np.int64), which then produced a 0-d result.
        labels = np.arange(int(labels))
    res = np.zeros(labels.shape)
    res[labels == n] = 1
    return res

preprocess_func = binary_expand

In [8]:
# activation function: sigmoid
def sigmoid(v):
    """Element-wise logistic function of the array ``v``.

    The input is limited to [-500, 500] before exponentiation so that
    ``np.exp`` stays within floating-point range.
    """
    bounded = v.clip(min=-500, max=500)
    denominator = 1.0 + np.exp(-bounded)
    return 1.0 / denominator

def sigmoid_derivative(v):
    """Return ``v * (1 - v)``.

    This equals the slope of the logistic function when ``v`` is the
    sigmoid's output (not its input).
    """
    complement = 1 - v
    return v * complement

In [9]:
# activation function: ReLU
def relu(v):
    """Element-wise rectifier: negative entries of the array ``v`` become 0."""
    rectified = v.clip(min=0)
    return rectified

def relu_derivative(v):
    """Return 1 where ``v`` is strictly positive and 0 elsewhere."""
    is_positive = v > 0
    return np.where(is_positive, 1, 0)

In [46]:
# Network definition, one tuple per layer:
# (node_count, activation_function, activation_derivative)
# A None activation leaves the layer linear (model_forward skips the call).
model_define = [
#     (20, relu, relu_derivative),
#     (env.action_space.n, sigmoid, sigmoid_derivative),
    (50, None, None),
    (env.action_space.n, None, None),
]

hyperparameters

In [47]:
episodes = 30000       # number of training episodes
learning_rate = 0.03   # factor applied to the gradients in update_model
discount = 0.99        # weight of the next state's best Q-value in the TD target

# Exploration schedule, recomputed after every episode as
# min_epsilon + (max_epsilon - min_epsilon) * exp(-epsilon_decay * episode)
max_epsilon = 1
min_epsilon = 0.01
epsilon_decay = 0.001
epsilon = max_epsilon  # probability of a random action; starts at max_epsilon

print_step = 1000      # report averaged statistics every print_step episodes

deep Q engine

In [12]:
# Function that turns a raw environment state into the network input vector;
# currently an alias for preprocess_func.
preprocess_state = preprocess_func

In [13]:
# forward pass: compute the network output for a state with the current model
def model_forward(state):
    """Propagate ``state`` through every layer of the global ``model``.

    Returns
    -------
    tuple
        ``(output, layer_inputs)`` where ``output`` is the final layer's
        result and ``layer_inputs[i]`` is the vector that was fed into layer
        ``i`` (needed later by back propagation).
    """
    layer_inputs = []
    signal = state
    for weights, activate, _ in model:
        layer_inputs.append(signal)
        pre_activation = np.dot(signal, weights)
        # layers without an activation function stay linear
        signal = activate(pre_activation) if activate else pre_activation
    return signal, layer_inputs

In [14]:
# back propagation: turn the output error into one gradient per layer
def back_propagation(td_err, hidden_layer_input_buf, layers=None):
    """Compute per-layer weight gradients for the error ``td_err``.

    Parameters
    ----------
    td_err : numpy.ndarray
        Error signal with respect to the network output (one entry per
        output node).
    hidden_layer_input_buf : list of numpy.ndarray
        Input vector of every layer, as returned by ``model_forward``.
    layers : list, optional
        Sequence of ``[weights, activation, activation_derivative]`` entries.
        Defaults to the global ``model``.

    Returns
    -------
    list of numpy.ndarray
        ``grads[i]`` has the shape of layer ``i``'s weight matrix.

    Notes
    -----
    Activation derivatives are evaluated on the layer's *output*, which is
    the form ``sigmoid_derivative`` expects. The previous version applied
    layer ``i``'s derivative to layer ``i``'s input (i.e. to the output of
    layer ``i-1``) and never applied the output layer's own derivative, so
    it was only correct for purely linear networks.
    """
    if layers is None:
        layers = model
    if not layers or not hidden_layer_input_buf:
        return []

    # The output of layer i is the input of layer i+1; only the final output
    # is missing from the buffer and is recomputed when its derivative is needed.
    last_weights, last_activation, last_derivative = layers[-1]
    final_out = None
    if last_derivative:
        final_out = np.dot(hidden_layer_input_buf[-1], last_weights)
        if last_activation:
            final_out = last_activation(final_out)
    layer_outputs = list(hidden_layer_input_buf[1:]) + [final_out]

    grads = []
    delta = td_err
    for layer_in, layer_out, (weights, _, activation_derivative) in zip(
            reversed(hidden_layer_input_buf),
            reversed(layer_outputs),
            reversed(layers)):
        if activation_derivative:
            # move the error from the layer's output to its pre-activation
            delta = delta * activation_derivative(layer_out)
        grads.append(np.outer(layer_in, delta))
        # propagate the error to the layer's input (previous layer's output)
        delta = np.dot(weights, delta)
    grads.reverse()
    return grads

In [15]:
# apply one gradient step to every layer's weight matrix
def update_model(grads):
    """Add ``learning_rate * grads[i]`` to the weights of layer ``i``.

    Each weight matrix in the global ``model`` is replaced by a new array;
    the previous array object is not modified in place.
    """
    for position, layer in enumerate(model):
        layer[0] = layer[0] + learning_rate * grads[position]

In [48]:
# Build the network: one [weights, activation, derivative] entry per layer.
model = []
prev_node_count = env.observation_space.n  # input width of the first layer
for node_count, activation_func, activation_derivative in model_define:
    # random normal weights scaled by 1/sqrt(number of inputs to the layer)
    weights = np.random.randn(prev_node_count, node_count) / np.sqrt(prev_node_count)
    model.append([weights, activation_func, activation_derivative])
    # this layer's width is the next layer's input width
    prev_node_count = node_count

In [49]:
# Online Q-learning: the model is updated after every single environment step.
# Statistics of the current reporting window (cleared every print_step episodes).
step_count_list = []
total_reward_list = []
# One list per episode holding a (reward, td_err) pair for every step taken.
reward_ob = []
for ep in range(1, episodes+1):
    end = False
    step_count = 0
    total_reward = 0
    reward_list = []
    
    # startup state
    new_state = preprocess_state(env.reset())
    
    # run until game end
    while not end:
        # predict with the latest model
        state = new_state
        q_values, hidden_layer_input_buf = model_forward(state)
        
        # epsilon-greedy action selection: greedy with probability 1 - epsilon,
        # otherwise an action sampled from the action space
        if np.random.rand() > epsilon:
            action = np.argmax(q_values)
        else:
            action = env.action_space.sample()

        # step forward
        new_state_no, reward, end, _ = env.step(action)
        step_count += 1
        total_reward += reward
        
        # encode the new state; it becomes `state` at the top of the next iteration
        new_state = preprocess_state(new_state_no)
        
        # calculate error for back propagation with the Bellman equation:
        # only the chosen action gets a non-zero error, and `(not end)` drops
        # the bootstrapped next-state value on the episode's final step
        td_err = np.zeros_like(q_values)
        next_q_values, _ = model_forward(new_state)
        td_err[action] = reward + discount * np.max(next_q_values) * (not end) - q_values[action]
        
        reward_list.append((reward, td_err))
        
        # back propagation with td error
        grads = back_propagation(td_err, hidden_layer_input_buf)
        
        # update model with gradients
        update_model(grads)
        
    reward_ob.append(reward_list)
    
    # update epsilon: exponential decay from max_epsilon towards min_epsilon
    epsilon = min_epsilon + (max_epsilon - min_epsilon)*np.exp(-epsilon_decay*ep)
    
    # record step counts and total reward of this episode
    step_count_list.append(step_count)
    total_reward_list.append(total_reward)
    # print averages over the last print_step episodes, then reset the window
    if (ep)%print_step == 0:
        print('In episode {}, avg {} steps are used, avg total reward is {}.'.format(
            ep, sum(step_count_list)/print_step, sum(total_reward_list)/print_step))
        step_count_list.clear()
        total_reward_list.clear()
    

In episode 1000, avg 11.309 steps are used, avg total reward is 0.045.
In episode 2000, avg 21.502 steps are used, avg total reward is 0.205.
In episode 3000, avg 32.013 steps are used, avg total reward is 0.409.
In episode 4000, avg 35.698 steps are used, avg total reward is 0.564.
In episode 5000, avg 39.175 steps are used, avg total reward is 0.597.
In episode 6000, avg 41.136 steps are used, avg total reward is 0.647.
In episode 7000, avg 41.735 steps are used, avg total reward is 0.635.
In episode 8000, avg 38.513 steps are used, avg total reward is 0.676.
In episode 9000, avg 42.399 steps are used, avg total reward is 0.686.
In episode 10000, avg 42.367 steps are used, avg total reward is 0.628.
In episode 11000, avg 43.11 steps are used, avg total reward is 0.66.
In episode 12000, avg 43.52 steps are used, avg total reward is 0.657.
In episode 13000, avg 41.509 steps are used, avg total reward is 0.614.
In episode 14000, avg 36.874 steps are used, avg total reward is 0.65.
In ep

run with model

In [50]:
# Play one episode greedily (always the highest-valued action, no exploration,
# no learning) and render the board after every step.
end = False
state = preprocess_state(env.reset())
env.render()
while not end:
    q_values, _ = model_forward(state)
    action = np.argmax(q_values)
    # reward and info are discarded; only the new state and the end flag are used
    state_no, _, end, _ = env.step(action)
    env.render()
    state = preprocess_state(state_no)


[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
SFFF
[41mF[0mHFH
FFFH
HFFG
  (Left)
SFFF
FHFH
[41mF[0mFFH
HFFG
  (Up)
SFFF
[41mF[0mHFH
FFFH
HFFG
  (Left)
SFFF
[41mF[0mHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
SFFF
[41mF[0mHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
SFFF
[41mF[0mHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
SFFF
[41mF[0mHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
SFFF
[41mF[0mHFH
FFFH
HFFG
  (Left)
SFFF
FHFH
[41mF[0mFFH
HFFG
  (Up)
SFFF
FHFH
F[41mF[0mFH
HFFG
  (Down)
SFFF
FHFH
FF[41mF[0mH
HFFG
  (Left)
SFFF
FHFH
F[41mF[0mFH
HFFG
  (Down)
SFFF
FHFH
FFFH


---

observations

In [51]:
# Render the environment's current board position.
env.render()

  (Down)
SFFF
FHFH
FFFH
HFF[41mG[0m


In [52]:
# Print the predicted Q-values (one entry per action) for every state.
# The state count comes from the environment instead of a hard-coded 16,
# so the loop also fits maps other than 4x4.
for i in range(env.observation_space.n):
    q_values, _ = model_forward(preprocess_state(i))
    print(q_values)

[0.44176123 0.39228165 0.39312355 0.38193445]
[0.29775156 0.31812821 0.26201089 0.37133117]
[0.31341036 0.25036357 0.26266108 0.34567234]
[0.1556511  0.218531   0.20644806 0.33812346]
[0.45459239 0.40034054 0.37292717 0.28068976]
[ 0.0250409  -0.04699562  0.12119069  0.30352165]
[0.17927158 0.13745011 0.24337014 0.07157744]
[0.19761918 0.26500022 0.12002214 0.2486737 ]
[0.3059476  0.4097947  0.32701651 0.5060742 ]
[0.47312837 0.61231524 0.43795595 0.38931995]
[0.5512142  0.40252655 0.3467512  0.27015182]
[0.08836059 0.02600961 0.00816638 0.22208995]
[ 0.05163072 -0.00270992  0.11372     0.3796335 ]
[0.39521913 0.49691806 0.72268208 0.50598365]
[0.62918252 0.83175076 0.71998892 0.72762153]
[ 0.08592947 -0.21484125  0.05120207  0.01428415]


---

test nn