In [1]:
import numpy as np
import gym

In [2]:
# Switch every floating-point error category to 'warn'. np.seterr returns the
# settings that were active before the call; they are kept in old_settings
# (not read again in the visible cells).
old_settings = np.seterr(all='warn')

In [3]:
# Tighten numeric error handling: raise on floating-point errors, but only warn
# on underflow. The return value (the previous settings) is the cell's output.
np.seterr(all='raise', under='warn')

{'divide': 'warn', 'invalid': 'warn', 'over': 'warn', 'under': 'warn'}

deep Q model

In [4]:
# Global network container shared by model_init, model_forward,
# back_propagation and update_model. Each entry describes one layer as
# [weights, biases, activation_func, activation_derivative].
model = []

In [5]:
# init model
def model_init(input_size, model_define):
    """Reset the global ``model`` and fill it with randomly initialised layers.

    Args:
        input_size: width of the network input (fan-in of the first layer).
        model_define: iterable of ``(node_count, activation_func,
            activation_derivative)`` tuples, one per layer, in forward order.

    Each layer is stored as the list ``[weights, biases, activation_func,
    activation_derivative]``. Weights have shape ``(fan_in, node_count)`` and
    biases shape ``(node_count,)``; both are standard-normal draws divided by
    ``sqrt(fan_in)``.
    """
    model.clear()
    fan_in = input_size
    for width, act, act_grad in model_define:
        scale = np.sqrt(fan_in)
        # Draw weights before biases so the random stream is consumed in a fixed order.
        weights = np.random.randn(fan_in, width) / scale
        biases = np.random.randn(width) / scale
        model.append([weights, biases, act, act_grad])
        fan_in = width

In [6]:
# batch forward pass, calculate predict value with current model
def model_forward(states):
    """Run a batch of states through every layer of the global ``model``.

    Args:
        states: a single state vector or a 2-D batch of them; it is promoted
            to 2-D with ``np.atleast_2d``.

    Returns:
        A tuple ``(output, layer_inputs)`` where ``output`` is the result of
        the last layer and ``layer_inputs[i]`` is the 2-D array that was fed
        into layer ``i`` (needed later by back_propagation).
    """
    activations = np.atleast_2d(states)
    layer_inputs = []
    for weights, biases, squash, _unused in model:
        # Remember what went into this layer before transforming it.
        layer_inputs.append(activations)
        pre_activation = np.dot(activations, weights)
        pre_activation += biases
        # Layers without an activation function stay linear.
        activations = squash(pre_activation) if squash else pre_activation
    return activations, layer_inputs

In [7]:
# batch back propagation to update model
def back_propagation(td_err, hidden_layer_input_buf):
    """Back-propagate an output-side error through the global ``model``.

    Args:
        td_err: error signal at the network output, one row per sample
            (promoted to 2-D with ``np.atleast_2d``).
        hidden_layer_input_buf: list returned by model_forward; entry ``i`` is
            the array that was fed into layer ``i``.

    Returns:
        A list with one ``(weight_grad, bias_grad)`` tuple per layer, in
        forward order. Both gradients are averaged over the batch.

    Each layer's ``activation_derivative`` is evaluated on that layer's own
    activation output (the convention used by ``sigmoid_derivative(sig_v)``)
    and applied to the error *before* the layer's gradients are computed.
    The output is recomputed here from the buffered input because the buffer
    only stores layer inputs. Previously the derivative was evaluated on the
    layer's input and applied one layer too late, which produced wrong
    gradients for any non-linear layer; purely linear layers are unaffected.
    """
    grads = []
    delta = np.atleast_2d(td_err)
    for layer_in, (layer_weight, inter_weight, activation_func, activation_derivative) \
            in zip(reversed(hidden_layer_input_buf), reversed(model)):
        layer_in = np.atleast_2d(layer_in)
        if activation_derivative:
            # Rebuild this layer's output so the derivative is taken at the right point.
            layer_output = np.dot(layer_in, layer_weight) + inter_weight
            if activation_func:
                layer_output = activation_func(layer_output)
            delta = delta * activation_derivative(layer_output)
        # Batch-averaged gradients for this layer's weights and biases.
        grads.append((np.dot(layer_in.T, delta) / delta.shape[0],
                      delta.mean(axis=0)))
        # Push the error back to this layer's input for the layer before it.
        delta = np.dot(delta, layer_weight.T)
    grads.reverse()
    return grads

In [8]:
# update model with gradients
def update_model(grads):
    """Apply one update step to every layer of the global ``model``.

    Args:
        grads: sequence indexed like ``model``; ``grads[i][0]`` is added to
            layer ``i``'s weights and ``grads[i][1]`` to its biases, each
            scaled by the global ``learning_rate``.

    The layer entries are rebound to new arrays rather than modified in place.
    """
    for layer_idx, layer in enumerate(model):
        layer_grads = grads[layer_idx]
        layer[0] = layer[0] + learning_rate * layer_grads[0]
        layer[1] = layer[1] + learning_rate * layer_grads[1]

gym environment

In [9]:
# Register a FrozenLake variant on the 4x4 map with is_slippery=False under
# its own environment id, so it can be created through gym.make().
# NOTE(review): re-running this cell in the same kernel may be rejected by gym
# as a duplicate registration — confirm for the installed gym version.
from gym.envs.registration import register
register(
    id='FrozenLakeNotSlippery-v0',
    entry_point='gym.envs.toy_text:FrozenLakeEnv',
    kwargs={'map_name' : '4x4', 'is_slippery': False},
)

In [10]:
# Environment id handed to gym.make() in the next cell.
env_name = 'FrozenLake-v0'
# Alternative: the non-slippery variant registered in the previous cell.
# env_name = 'FrozenLakeNotSlippery-v0'

In [11]:
# Instantiate the environment selected by env_name; later cells read its
# observation_space.n and action_space.n to size the network.
env = gym.make(env_name)

behavior definitions

In [12]:
# binary expand categorical type to preprocess states
def binary_expand(n, idx=env.observation_space.n):
    """One-hot encode a categorical value.

    Args:
        n: the category value to encode.
        idx: either the number of categories (an integer, meaning the
            categories are ``0 .. idx-1``) or an array-like of the category
            values themselves. Defaults to the environment's state count,
            captured when this function is defined.

    Returns:
        A float array with the same shape as the category array, holding 1
        at every position whose category equals ``n`` and 0 elsewhere.
    """
    # isinstance (rather than an exact type check) also accepts numpy integer
    # scalars, which would otherwise fall through and fail on `.shape`.
    if isinstance(idx, (int, np.integer)):
        idx = np.arange(idx)
    else:
        idx = np.asarray(idx)
    res = np.zeros(idx.shape)
    res[n == idx] = 1
    return res

# State preprocessing used by the agent: one-hot encoding of the state number.
preprocess_func = binary_expand

In [39]:
def binary_expand_mod(n, idx=env.observation_space.n):
    """One-hot encode a categorical value by selecting identity-matrix rows.

    Args:
        n: the category value to encode.
        idx: either the number of categories (an integer, meaning the
            categories are ``0 .. idx-1``) or an array-like of the category
            values themselves. Defaults to the environment's state count,
            captured when this function is defined.

    Returns:
        For a 1-D category array, a 2-D array with one one-hot row of length
        ``len(idx)`` per position whose category equals ``n`` (so shape
        ``(1, len(idx))`` when ``n`` matches exactly once).
    """
    # isinstance (rather than an exact type check) also accepts numpy integer
    # scalars, which would otherwise fall through and fail on `len()`.
    if isinstance(idx, (int, np.integer)):
        idx = np.arange(idx)
    else:
        idx = np.asarray(idx)
    return np.identity(len(idx))[np.where(n == idx)]

In [13]:
# activation function: sigmoid
def sigmoid(v):
    """Element-wise logistic function of an array.

    The input is first clipped to [-500, 500] so the exponential stays within
    floating-point range.
    """
    bounded = v.clip(min=-500, max=500)
    return 1.0 / (1.0 + np.exp(-bounded))

def sigmoid_derivative(sig_v):
    """Derivative of the sigmoid, expressed in terms of the sigmoid's output.

    Args:
        sig_v: value(s) already produced by the sigmoid, not the raw input.
    """
    complement = 1 - sig_v
    return sig_v * complement

In [14]:
# activation function: ReLU
def relu(v):
    """Rectified linear unit: negative entries of the array become 0."""
    lower_bound = 0
    return v.clip(min=lower_bound)

def relu_derivative(v):
    """Derivative of ReLU: 1 where the input is strictly positive, else 0."""
    is_positive = v > 0
    return np.where(is_positive, 1, 0)

In [15]:
# Network definition, one tuple per layer in forward order:
#   (node_count, activation_func, activation_derivative)
# A None activation/derivative leaves the layer linear. The commented-out
# entries are an alternative ReLU-hidden / sigmoid-output configuration.
model_define = [
#     (20, relu, relu_derivative),
#     (env.action_space.n, sigmoid, sigmoid_derivative),
    (20, None, None),
    (env.action_space.n, None, None),
]

hyperparameters

In [16]:
episodes = 30000       # number of training episodes
learning_rate = 0.03   # step size applied to the gradients in update_model
discount = 0.99        # weight of the next state's best Q-value in the TD target

# Epsilon-greedy exploration schedule: epsilon starts at max_epsilon and is
# decayed exponentially toward min_epsilon after every episode.
max_epsilon = 1
min_epsilon = 0.01
epsilon_decay = 0.001
epsilon = max_epsilon

print_step = 1000      # report averaged statistics every print_step episodes

learning agent

In [17]:
# Function that converts a raw environment state into the network's input
# vector; bound to whatever encoder was selected as preprocess_func.
preprocess_state = preprocess_func

In [20]:
# (Re)build the network: input width is the number of discrete states, and the
# layers come from model_define. This discards any previously trained weights.
model_init(env.observation_space.n, model_define)

In [21]:
# Online Q-learning: play `episodes` episodes, updating the network after
# every single environment step from the TD error of the action taken.
step_count_list = []    # steps per episode since the last report
total_reward_list = []  # total reward per episode since the last report
reward_ob = []          # per-episode list of (reward, td_err) pairs, kept for inspection
for ep in range(1, episodes+1):
    end = False
    step_count = 0
    total_reward = 0
    reward_list = []
    
    # startup state
    new_state = preprocess_state(env.reset())
    
    # run until game end
    while not end:
        # predict with the latest model
        state = new_state
        q_values, hidden_layer_input_buf = model_forward(state)
        
        # epsilon-greedy action selection
        if np.random.rand() > epsilon:
            action = np.argmax(q_values)
        else:
            action = env.action_space.sample()

        # step forward
        new_state_no, reward, end, _ = env.step(action)
        step_count += 1
        total_reward += reward
        
        # update q values with actual returns
        # save new state for the next step
        new_state = preprocess_state(new_state_no)
        
        # calculate error for back propagation with Bellman equation
        # TODO: need to modify error calculation to apply batch functions
        # Only the entry of the action actually taken carries an error; the
        # bootstrap term is dropped (multiplied by False) when the episode ended.
        td_err = np.zeros_like(q_values)
        next_q_values, _ = model_forward(new_state)
        td_err[0][action] = reward + discount * np.max(next_q_values) * (not end) - q_values[0][action]
        
        reward_list.append((reward, td_err))
        
        # back propagation with td error
        # (uses the layer inputs buffered by the forward pass on `state` above)
        grads = back_propagation(td_err, hidden_layer_input_buf)
        
        # update model with gradients
        update_model(grads)
        
    reward_ob.append(reward_list)
    
    # update epsilon
    # exponential decay from max_epsilon toward min_epsilon as episodes accumulate
    epsilon = min_epsilon + (max_epsilon - min_epsilon)*np.exp(-epsilon_decay*ep)
    
    # record step counts
    step_count_list.append(step_count)
    total_reward_list.append(total_reward)
    # print information
    # averages cover the last print_step episodes; the buffers are then emptied
    if (ep)%print_step == 0:
        print('In episode {}, avg {} steps are used, avg total reward is {}.'.format(
            ep, sum(step_count_list)/print_step, sum(total_reward_list)/print_step))
        step_count_list.clear()
        total_reward_list.clear()
    

In episode 1000, avg 11.063 steps are used, avg total reward is 0.036.
In episode 2000, avg 20.939 steps are used, avg total reward is 0.183.
In episode 3000, avg 31.276 steps are used, avg total reward is 0.4.
In episode 4000, avg 40.734 steps are used, avg total reward is 0.548.
In episode 5000, avg 40.819 steps are used, avg total reward is 0.656.
In episode 6000, avg 41.39 steps are used, avg total reward is 0.649.
In episode 7000, avg 43.764 steps are used, avg total reward is 0.654.
In episode 8000, avg 43.723 steps are used, avg total reward is 0.672.
In episode 9000, avg 42.275 steps are used, avg total reward is 0.666.
In episode 10000, avg 42.576 steps are used, avg total reward is 0.696.
In episode 11000, avg 42.85 steps are used, avg total reward is 0.671.
In episode 12000, avg 43.634 steps are used, avg total reward is 0.682.
In episode 13000, avg 42.544 steps are used, avg total reward is 0.685.
In episode 14000, avg 43.188 steps are used, avg total reward is 0.692.
In ep

run with model

In [22]:
# Play one episode acting greedily (no exploration) with the trained network,
# rendering the board before the first move and after every step.
end = False
state = preprocess_state(env.reset())
env.render()
while not end:
    q_values, _ = model_forward(state)
    # always take the highest-valued action
    action = np.argmax(q_values)
    state_no, _, end, _ = env.step(action)
    env.render()
    state = preprocess_state(state_no)


[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
SFFF
[41mF[0mHFH
FFFH
HFFG
  (Left)
SFFF
[41mF[0mHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
SFFF
[41mF[0mHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
SFFF
[41mF[0mHFH
FFFH
HFFG
  (Left)
SFFF
[41mF[0mHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
SFFF
[41mF[0mHFH
FFFH
HFFG
  (Left)
SFFF
FHFH
[41mF[0mFFH
HFFG
  (Up)
SFFF
FHFH
[41mF[0mFFH
HFFG
  (Up)
SFFF
FHFH
F[41mF[0mFH
HFFG
  (Down)
SFFF
FHFH
FF[41mF[0mH
HFFG
  (Left)
SFFF
FHFH
FFFH
HF[41mF[0mG
  (Down)
SFFF
FHFH
FFFH
H[41mF[0mFG
  (Right)
SFFF
FHFH
FFFH

---

observations

In [23]:
# Render the environment's current state (whatever the last env.step left it in).
env.render()

  (Down)
SFFF
FHFH
FFFH
HFF[41mG[0m


In [29]:
# Q-values for every state at once: row i of the identity matrix is the
# one-hot encoding of state i, so row i of the output holds that state's
# Q-values (one column per action). The state count is read from the
# environment instead of being hard-coded, so other map sizes also work.
q_values, _ = model_forward(np.identity(env.observation_space.n))
print(q_values)

[[0.51473984 0.48080445 0.42777855 0.47582202]
 [0.3368716  0.38301985 0.31254668 0.4699393 ]
 [0.36348846 0.36301077 0.31556663 0.4428129 ]
 [0.31522579 0.3370004  0.22888494 0.41972735]
 [0.53462124 0.37436321 0.37029661 0.3415028 ]
 [0.33410536 0.67124027 0.44401701 0.44399237]
 [0.25783028 0.25147451 0.35105163 0.11138381]
 [0.37685194 0.44877215 0.33855867 0.39988035]
 [0.33778966 0.43612407 0.37178573 0.58912297]
 [0.43508694 0.64265339 0.4294603  0.3711778 ]
 [0.60241005 0.52463266 0.39626051 0.33004984]
 [0.40077839 0.51634073 0.28478198 0.39648419]
 [0.51114158 0.36385602 0.43240276 0.55760059]
 [0.40719497 0.58140697 0.71125142 0.38172757]
 [0.6233313  0.89722137 0.73812503 0.75215066]
 [0.60184097 0.64187536 0.42983917 0.46089344]]


---

test nn

test nn with sklearn.datasets.load_breast_cancer