In [1]:
import numpy as np
import gym

In [2]:
# Set every floating-point error category to 'warn'; np.seterr returns the
# settings that were active before the call, kept here so they can be restored.
old_settings = np.seterr(all='warn')

In [3]:
# Turn divide/overflow/invalid floating-point errors into exceptions so a
# numerical blow-up stops execution; underflow only emits a warning.
np.seterr(all='raise', under='warn')

{'divide': 'warn', 'invalid': 'warn', 'over': 'warn', 'under': 'warn'}

deep Q model

In [4]:
# model: one entry per layer, filled in by model_init as
# [weights, bias, activation_func, activation_derivative]
model = []

In [5]:
# init model
def model_init(input_size, model_define):
    """Rebuild the global ``model`` in place from a layer specification.

    Args:
        input_size: number of input features fed to the first layer.
        model_define: iterable of ``(node_count, activation_func,
            activation_derivative)`` tuples, one per layer, in forward order.

    Each layer is stored as ``[weights, bias, activation_func,
    activation_derivative]``. Weights have shape ``(fan_in, node_count)`` and
    the bias has shape ``(node_count,)``; both are drawn from a standard
    normal distribution and divided by ``sqrt(fan_in)``.
    """
    model.clear()
    fan_in = input_size
    for fan_out, act, act_deriv in model_define:
        scale = np.sqrt(fan_in)
        # Draw the weights before the bias so the random stream is consumed
        # in a fixed order.
        weights = np.random.randn(fan_in, fan_out) / scale
        bias = np.random.randn(fan_out) / scale
        model.append([weights, bias, act, act_deriv])
        fan_in = fan_out

In [6]:
# forward pass, calculate predict value with current model
def model_forward(state):
    """Run ``state`` through every layer of the global ``model``.

    Returns:
        ``(output, layer_inputs)``: the last layer's output and a list with
        the input that was fed to each layer, in forward order. The list is
        what ``back_propagation`` expects as ``hidden_layer_input_buf``.
    """
    layer_inputs = []
    activation = state
    for weights, bias, act, _ in model:
        layer_inputs.append(activation)
        pre_activation = np.dot(activation, weights)
        # np.dot produced a fresh array, so adding the bias in place cannot
        # touch the recorded layer input.
        pre_activation += bias
        activation = act(pre_activation) if act else pre_activation
    return activation, layer_inputs

In [7]:
# back propagation to update model
def back_propagation(td_err, hidden_layer_input_buf):
    """Back-propagate the output error through the global ``model``.

    Args:
        td_err: error signal at the network output (target - prediction),
            one entry per output node.
        hidden_layer_input_buf: per-layer inputs recorded by ``model_forward``
            for the same forward pass; the model must not have been updated
            in between.

    Returns:
        List with one ``(weight_grad, bias_grad)`` tuple per layer, in forward
        order, as consumed by ``update_model``.

    Activation derivatives are called with the layer's *activated output*
    (the convention ``sigmoid_derivative`` uses).
    """
    grads = []
    delta = td_err

    # The error arrives at the activated output of the last layer, so it has
    # to pass through that layer's own activation derivative first. The
    # output is not handed in, so rebuild it from the recorded layer input.
    if model and hidden_layer_input_buf:
        last_weight, last_bias, last_func, last_derivative = model[-1]
        if last_derivative:
            last_out = np.dot(hidden_layer_input_buf[-1], last_weight) + last_bias
            if last_func:
                last_out = last_func(last_out)
            delta = delta * last_derivative(last_out)

    # Walk the layers from last to first, aligned from the end of both lists.
    depth = min(len(model), len(hidden_layer_input_buf))
    for back_idx in range(1, depth + 1):
        layer_in = hidden_layer_input_buf[-back_idx]
        layer_weight = model[-back_idx][0]
        grads.append((np.outer(layer_in, delta), delta))
        if back_idx == depth:
            # Nothing below the first layer needs a delta.
            break
        # Move the error to this layer's input, which is the activated output
        # of the layer below, so that layer's derivative is the one to apply.
        delta = np.dot(layer_weight, delta)
        below_derivative = model[-back_idx - 1][3]
        if below_derivative:
            delta = delta * below_derivative(layer_in)
    grads.reverse()
    return grads

In [8]:
# update model with gradients
def update_model(grads):
    """Move every layer of the global ``model`` one step along ``grads``.

    ``grads[i]`` holds the ``(weight_grad, bias_grad)`` pair for layer ``i``;
    each is scaled by the global ``learning_rate`` and added to the layer's
    parameters.
    """
    for idx, layer in enumerate(model):
        layer_grads = grads[idx]
        layer[0] = layer[0] + learning_rate * layer_grads[0]
        layer[1] = layer[1] + learning_rate * layer_grads[1]

gym environment

In [9]:
from gym.envs.registration import register

# Register a 4x4 FrozenLake variant with is_slippery=False under its own id,
# so it can be selected through env_name.
register(
    id='FrozenLakeNotSlippery-v0',
    entry_point='gym.envs.toy_text:FrozenLakeEnv',
    kwargs=dict(map_name='4x4', is_slippery=False),
)

In [10]:
# Environment id handed to gym.make; swap the two lines to use the
# non-slippery variant registered in this notebook instead.
env_name = 'FrozenLake-v0'
# env_name = 'FrozenLakeNotSlippery-v0'

In [11]:
# Build the environment selected by env_name.
env = gym.make(env_name)

behavior definition

In [12]:
# binary expand categorical type to preprocess states
def binary_expand(n, idx=None):
    """One-hot encode the categorical value ``n``.

    Args:
        n: category to encode (e.g. a discrete state number).
        idx: either the number of categories (encoded against the labels
            ``0 .. idx-1``) or an explicit array of category labels.
            Defaults to ``env.observation_space.n``, looked up at call time
            so the encoding follows ``env`` if it is rebuilt.

    Returns:
        Float array shaped like the label array, 1 where the label equals
        ``n`` and 0 elsewhere.
    """
    if idx is None:
        idx = env.observation_space.n
    if np.ndim(idx) == 0:
        # A bare count: accept NumPy integer scalars as well as Python ints.
        idx = np.arange(int(idx))
    else:
        idx = np.asarray(idx)
    res = np.zeros(idx.shape)
    res[idx == n] = 1
    return res

preprocess_func = binary_expand

In [13]:
# activation function: sigmoid
def sigmoid(v):
    """Element-wise logistic function of the array ``v``.

    The input is clipped to [-500, 500] before exponentiation so that
    ``np.exp`` cannot overflow.
    """
    bounded = v.clip(min=-500, max=500)
    denominator = 1.0 + np.exp(-bounded)
    return 1.0 / denominator

def sigmoid_derivative(sig_v):
    """Derivative of the sigmoid, expressed through its output ``sig_v``."""
    complement = 1 - sig_v
    return sig_v * complement

In [14]:
# activation function: ReLU
def relu(v):
    """Element-wise rectifier: entries of the array ``v`` below 0 become 0."""
    lower_bound = 0
    return v.clip(min=lower_bound)

def relu_derivative(v):
    """Return 1 where ``v`` is strictly positive and 0 elsewhere."""
    is_positive = v > 0
    return np.where(is_positive, 1, 0)

In [15]:
# nn model defined by layers, in forward order:
# (node_count, activation_function, activation_derivative)
# None for the last two entries means the layer has no activation.
# The commented-out entries are an alternative ReLU/sigmoid configuration.
model_define = [
#     (20, relu, relu_derivative),
#     (env.action_space.n, sigmoid, sigmoid_derivative),
    (20, None, None),
    (env.action_space.n, None, None),
]

hyperparameters

In [16]:
# number of training episodes
episodes = 30000
# step size applied to the gradients in update_model
learning_rate = 0.03
# discount factor applied to the next state's best Q-value
discount = 0.99

# epsilon-greedy exploration: epsilon starts at max_epsilon and decays
# exponentially with the episode number towards min_epsilon
max_epsilon = 1
min_epsilon = 0.01
epsilon_decay = 0.001
epsilon = max_epsilon

# report averaged statistics every print_step episodes
print_step = 1000

learning agent

In [17]:
# preprocess state information into input parameters
# (preprocess_func is bound to binary_expand, i.e. one-hot encoding)
preprocess_state = preprocess_func

In [18]:
# (Re)initialise the network: one input per discrete state, layers as listed
# in model_define. Re-running this cell discards any trained weights.
model_init(env.observation_space.n, model_define)

In [19]:
# Online Q-learning: after every environment step, take one gradient step on
# the TD error of the action that was played.
step_count_list = []
total_reward_list = []
# per-episode list of (reward, td_err) pairs, kept for later inspection
reward_ob = []
for ep in range(1, episodes+1):
    end = False
    step_count = 0
    total_reward = 0
    reward_list = []

    # startup state
    new_state = preprocess_state(env.reset())

    # run until game end
    while not end:
        # predict with the latest model
        state = new_state
        q_values, hidden_layer_input_buf = model_forward(state)

        # epsilon-greedy action selection
        if np.random.rand() > epsilon:
            action = np.argmax(q_values)
        else:
            action = env.action_space.sample()

        # step forward
        new_state_no, reward, end, info = env.step(action)
        step_count += 1
        total_reward += reward

        # `end` is also set when gym's TimeLimit wrapper cuts the episode
        # off. Such a state is not terminal, so its value must still be
        # bootstrapped. The wrapper marks these steps in `info`; when the key
        # is absent this falls back to treating `end` as terminal.
        terminal = end and not info.get('TimeLimit.truncated', False)

        # save new state for the next step
        new_state = preprocess_state(new_state_no)

        # calculate error for back propagation with Bellman equation;
        # only the played action gets a non-zero error
        td_err = np.zeros_like(q_values)
        next_q_values, _ = model_forward(new_state)
        td_err[action] = reward + discount * np.max(next_q_values) * (not terminal) - q_values[action]

        reward_list.append((reward, td_err))

        # back propagation with td error
        grads = back_propagation(td_err, hidden_layer_input_buf)

        # update model with gradients
        update_model(grads)

    reward_ob.append(reward_list)

    # update epsilon
    epsilon = min_epsilon + (max_epsilon - min_epsilon)*np.exp(-epsilon_decay*ep)

    # record step counts
    step_count_list.append(step_count)
    total_reward_list.append(total_reward)
    # print statistics averaged over the last print_step episodes
    if (ep)%print_step == 0:
        print('In episode {}, avg {} steps are used, avg total reward is {}.'.format(
            ep, sum(step_count_list)/print_step, sum(total_reward_list)/print_step))
        step_count_list.clear()
        total_reward_list.clear()
    

In episode 1000, avg 12.29 steps are used, avg total reward is 0.046.
In episode 2000, avg 20.331 steps are used, avg total reward is 0.161.
In episode 3000, avg 32.064 steps are used, avg total reward is 0.377.
In episode 4000, avg 39.034 steps are used, avg total reward is 0.532.
In episode 5000, avg 41.24 steps are used, avg total reward is 0.654.
In episode 6000, avg 43.841 steps are used, avg total reward is 0.658.
In episode 7000, avg 42.759 steps are used, avg total reward is 0.664.
In episode 8000, avg 42.315 steps are used, avg total reward is 0.671.
In episode 9000, avg 43.171 steps are used, avg total reward is 0.676.
In episode 10000, avg 44.142 steps are used, avg total reward is 0.678.
In episode 11000, avg 42.934 steps are used, avg total reward is 0.674.
In episode 12000, avg 44.728 steps are used, avg total reward is 0.667.
In episode 13000, avg 43.965 steps are used, avg total reward is 0.666.
In episode 14000, avg 43.069 steps are used, avg total reward is 0.665.
In 

run with model

In [20]:
# Greedy rollout: play one episode always taking the highest-valued action,
# rendering the board before the first move and after every step.
end = False
state = preprocess_state(env.reset())
env.render()
while not end:
    q_values, _ = model_forward(state)
    action = np.argmax(q_values)
    state_no, _, end, _ = env.step(action)
    env.render()
    state = preprocess_state(state_no)


[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
SFFF
[41mF[0mHFH
FFFH
HFFG
  (Left)
SFFF
FHFH
[41mF[0mFFH
HFFG
  (Up)
SFFF
FHFH
[41mF[0mFFH
HFFG
  (Up)
SFFF
FHFH
[41mF[0mFFH
HFFG
  (Up)
SFFF
FHFH
F[41mF[0mFH
HFFG
  (Down)
SFFF
FHFH
FF[41mF[0mH
HFFG
  (Left)
SFFF
FHFH
FFFH
HF[41mF[0mG
  (Down)
SFFF
FHFH
FFFH
HF[41mF[0mG
  (Down)
SFFF
FHFH
FFFH
H[41mF[0mFG
  (Right)
SFFF
FHFH
FFFH
H[41mF[0mFG
  (Right)
SFFF
FHFH
F[41mF[0mFH
HFFG
  (Down)
SFFF
FHFH
FFFH
H[41mF[0mFG
  (Right)
SFFF
FHFH
FFFH
H[41mF[0mFG
  (Right)
SFFF
FHFH
FFFH
HF[41mF[0mG
  (Down)
SFFF
FHFH
FFFH
HF[41mF[0mG
  (Down)
SFFF
FHFH
FFFH
H[41mF[0mFG
  (Right)
SFFF
FHFH
F[41mF[0mFH
HFFG
  (Down)
SFFF
FHFH
FFFH
H[41mF[0mFG
  (Right)
SFFF
FHFH
F[41mF[0mFH
HFFG
  (Down)
SFFF
FHFH
FF[41mF[0mH
HFFG
  (Left)
SFFF
FHFH
FFFH
HF[41mF[0mG
  (Down)
SFFF
FHFH
F

---

observation

In [21]:
# Render the environment's current state (where the last episode stopped).
env.render()

  (Left)
SFFF
F[41mH[0mFH
FFFH
HFFG


In [22]:
# Print the learned Q-values (one row per state, one column per action).
# Iterate over the environment's actual state count instead of a hard-coded
# 16 so the cell keeps working for other map sizes.
for i in range(env.observation_space.n):
    q_values, _ = model_forward(preprocess_state(i))
    print(q_values)

[0.5630969  0.50566662 0.46317781 0.47563946]
[0.4106695  0.40228291 0.30889418 0.48346134]
[0.46073009 0.38687891 0.34426882 0.45141782]
[0.39939655 0.31686998 0.29295624 0.43066903]
[0.58253161 0.36637333 0.34773745 0.33504698]
[0.46218707 0.41620034 0.54831936 0.57040386]
[0.45226855 0.29203177 0.29349483 0.20992347]
[0.41671437 0.6447355  0.86118689 0.82397516]
[0.40371493 0.44696051 0.44072058 0.61950365]
[0.50545381 0.69777568 0.47948539 0.44467505]
[0.69990072 0.53748153 0.43469887 0.36598947]
[0.50282117 0.32354417 0.35526406 0.16985848]
[0.64575089 0.51970965 0.52211683 0.16347403]
[0.55105429 0.58008346 0.74570892 0.48268673]
[0.69054793 0.91129654 0.77078684 0.77018467]
[0.59322792 0.64340212 0.58019897 0.48540829]


---

test nn

test nn with sklearn.datasets.load_breast_cancer