# Bellman Equation

![](bellman01.png)
![](bellman02.png)
![](bellman03.png)

------------------------

## Reinforcement Learning: Q-table to solve Maze 

[reference](https://github.com/simoninithomas/Deep_reinforcement_learning_Course/blob/master/Q%20learning/FrozenLake/Q%20Learning%20with%20FrozenLake.ipynb)

In [33]:
# reinforcement learning: Maze

MAZE = [
    [0, 0, 1, 0, 0, 0],
    [1, 0, 0, 0, 1, 0],
    [1, 1, 1, 0, 1, 0],
    [0, 1, 0, 0, 0, 0],
    [0, 0, 1, 1, 0, 1],
    [0, 0, 0, 0, 0, 0],
]


import numpy as np
import random

R, C = len(MAZE), len(MAZE[0])

# Define the rewards for every step
ACTIONS = [[1,0], [-1,0], [0,-1], [0,1]]
ACTIONS_STR = ['v', '^', '<', '>']
REWARDS = np.zeros((R, C))
for i in range(R):
    for j in range(C):
        if MAZE[i][j] == 0:
            r = -1
        elif MAZE[i][j] == 1:
            r = -100
        REWARDS[i][j] = r
REWARDS[R-1][C-1] = 10000

def go_one_step(state:int, action:int):
    i, j = state//C, state%C
    ii, jj = i + ACTIONS[action][0], j + ACTIONS[action][1]
    new_state = ii * C + jj
    r = REWARDS[ii][jj]
    done = False
    if ii == R-1 and jj == C-1:
        done = True
    return new_state, r, done

def avail_actions(state:int):
    res = []
    i, j = state//C, state%C
    for a in range(len(ACTIONS)):
        ii, jj = i + ACTIONS[a][0], j + ACTIONS[a][1]
        if ii >= 0 and ii < R and jj >= 0 and jj < C:
            res.append(a)
    return res
            
        
        
action_size = 4 # ^ v < >
state_size = R * C

qtable = np.zeros((state_size, action_size))

for state in range(state_size):
    acts = avail_actions(state)
    for a in range(4):
        if a not in acts:
            qtable[state, a] = -10000


In [34]:
# Hyperparameters
total_episodes = 15000 # Total episodes
learning_rate = 0.8 # Learning rate
max_step = 99 # Max step per episode
gamma = 0.95 # Discounting rate

# Exploration parameters
epsilon = 1.0
max_epsilon = 1.0
min_epsilon = 0.01
decay_rate = 0.005


In [35]:
# Train the Q-table
rewards = []

for episode in range(total_episodes):
    done = False
    total_rewards = 0
    state = 0
    
    for step in range(max_step):
        rnd = random.uniform(0,1)
        if rnd > epsilon:
            action = np.argmax(qtable[state, :])
        else:
            action = random.sample(avail_actions(state),1)[0]
            
        new_state, reward, done = go_one_step(state, action)
        
        ## Update Q(s,a):= Q(s,a) + lr [R(s,a) + gamma * max Q(s',a') - Q(s,a)]
        qtable[state, action] = qtable[state, action] + learning_rate * (reward + gamma * np.max(qtable[new_state, :]) - qtable[state, action])
        
        total_rewards += reward
        state = new_state
        
        if done:
            break
    epsilon = min_epsilon + (max_epsilon - min_epsilon)*np.exp(-decay_rate*episode)
    rewards.append(total_rewards)
    
print("Score over time: " + str(sum(rewards)/total_episodes))
print(qtable)
        
            
        

Score over time: 9965.107866666667
[[  6196.09908544 -10000.         -10000.           6295.09908544]
 [  6627.47272152 -10000.           5979.34413117   6528.47272152]
 [  6977.33970686 -10000.           6295.09908544   6977.33970686]
 [  7345.62074406 -10000.           6528.47272073   7269.01643065]
 [  7553.64775291 -10000.           6977.33970485   7652.648875  ]
 [  8056.4725     -10000.           7266.68706381 -10000.        ]
 [  6528.47272152   5979.34413117 -10000.           6627.47272152]
 [  6784.28970686   6295.09908544   6196.09908544   6977.33970686]
 [  7246.62074406   6528.47272152   6627.47272152   7345.62074406]
 [  7733.28499375   6977.33970686   6977.33970686   7553.648875  ]
 [  8042.35262498   7269.01438042   7345.62074357   8056.4725    ]
 [  8481.55         7652.64887434   7553.648875   -10000.        ]
 [  6977.33970686   6196.09811054 -10000.           6784.2562922 ]
 [  7246.62074406   6627.47272151   6528.4727049    7246.62064866]
 [  7733.28499375   6977.33

In [36]:
# Solve the Maze

for episode in range(1):
    state = 0
    done = False
    actions = []
    
    for step in range(max_step):
        action = np.argmax(qtable[state, :])
        
        new_state, reward, done = go_one_step(state, action)
        if done:
            break
        
        state = new_state
        actions.append(action)
        
    print(list(map(lambda a: ACTIONS_STR[a], actions)))
        

['>', 'v', '>', '>', 'v', 'v', '>', 'v', 'v']


## Reinforcement Learning: Deep Q Network

[REF 1](https://github.com/mswang12/minDQN/blob/main/minDQN.ipynb)
[REF 2](https://zhuanlan.zhihu.com/p/110769361)

## DQN 三个优化

![](dqn01.png)
![](dqn02.png)
![](dqn03.png)