# Bellman Equation

![](bellman01.png)
![](bellman02.png)
![](bellman03.png)

------------------------

## Reinforcement Learning: Solving a Maze with a Q-table

[reference](https://github.com/simoninithomas/Deep_reinforcement_learning_Course/blob/master/Q%20learning/FrozenLake/Q%20Learning%20with%20FrozenLake.ipynb)

In [174]:
# reinforcement learning: Maze
#
# Each layout is a rectangular grid of cell codes. Codes 0 and 1 are mapped to
# different per-step rewards further down the file; the bottom-right cell is
# the goal.

# 6x6 layout, kept for experiments. It is not used unless MAZE is pointed at it.
MAZE_LARGE = [
    [0, 0, 1, 0, 0, 0],
    [1, 0, 0, 0, 1, 0],
    [1, 1, 1, 0, 1, 0],
    [0, 1, 0, 0, 0, 0],
    [0, 0, 1, 1, 0, 1],
    [0, 0, 0, 0, 0, 0],
]

# 4x3 layout used by the rest of the file.
MAZE_SMALL = [
    [0, 0, 1],
    [1, 0, 0],
    [1, 0, 1],
    [0, 0, 0],
]

# Active layout. MAZE used to be assigned twice in a row, which silently
# discarded the 6x6 grid; selecting it by name makes the choice explicit.
MAZE = MAZE_SMALL


import numpy as np
import random

# Grid dimensions: R rows by C columns.
R, C = len(MAZE), len(MAZE[0])

# Define the rewards for every step
# ACTIONS[k] is a [row delta, column delta] move; ACTIONS_STR[k] is its arrow.
ACTIONS = [[1,0], [-1,0], [0,-1], [0,1]]
ACTIONS_STR = ['v', '^', '<', '>']

# Reward for entering a cell, keyed by the cell's code in MAZE.
_CELL_REWARD = {0: -0.1, 1: -1}

REWARDS = np.zeros((R, C))
for i in range(R):
    for j in range(C):
        cell = MAZE[i][j]
        if cell not in _CELL_REWARD:
            # Without this check an unknown code would silently reuse the
            # previous cell's reward (or fail with NameError on the first cell).
            raise ValueError(
                "unsupported MAZE value %r at row %d, column %d" % (cell, i, j)
            )
        REWARDS[i][j] = _CELL_REWARD[cell]

# The bottom-right cell is the goal and overrides whatever code it holds.
REWARDS[R-1][C-1] = 100

def go_one_step(state:int, action:int):
    """Apply one move and report what happened.

    Args:
        state: flat cell index, ``row * C + col``.
        action: index into ``ACTIONS``.

    Returns:
        ``(new_state, reward, done)``. A move that would leave the grid keeps
        the agent in ``state`` with reward -1 and ``done`` False. Any other
        move lands on the target cell (cells coded 1 are not blocked, only
        penalised through ``REWARDS``) and ``done`` is True exactly when that
        cell is the bottom-right corner.
    """
    row, col = divmod(state, C)
    d_row, d_col = ACTIONS[action]
    next_row, next_col = row + d_row, col + d_col

    # Guard clause: stepping off the grid is a penalised no-op.
    if not (0 <= next_row < R and 0 <= next_col < C):
        return state, -1, False

    reached_goal = (next_row == R-1 and next_col == C-1)
    return next_row * C + next_col, REWARDS[next_row][next_col], reached_goal

def avail_actions(state:int):
    """Return the indices into ``ACTIONS`` that keep the agent on the grid.

    ``state`` is a flat cell index (``row * C + col``). The result is in
    ascending action-index order; only the grid bounds are checked, not the
    cell codes in ``MAZE``.
    """
    row, col = divmod(state, C)
    return [
        idx
        for idx, (d_row, d_col) in enumerate(ACTIONS)
        if 0 <= row + d_row < R and 0 <= col + d_col < C
    ]
            
        
        
action_size = 4 # one column per ACTIONS entry, in order: v ^ < >
state_size = R * C

# One row per flat cell index, one column per action.
qtable = np.zeros((state_size, action_size))

# Moves that would leave the grid start at -1 so that the all-zero in-grid
# moves win the initial argmax.
for state in range(state_size):
    legal = avail_actions(state)
    for a in range(action_size):
        if a in legal:
            continue
        qtable[state, a] = -1


In [175]:
# Hyperparameters
total_episodes = 10000  # number of training episodes
max_step = 99           # upper bound on steps taken within one episode
learning_rate = 0.8     # weight of the TD error in each Q update
gamma = 0.95            # discount applied to the best next-state value

# Exploration parameters (epsilon-greedy)
max_epsilon = 1.0       # exploration probability at episode 0
min_epsilon = 0.01      # floor the exploration probability decays towards
decay_rate = 0.005      # exponential decay constant, applied per episode
epsilon = 1.0           # current exploration probability; updated after each episode


In [176]:
# Train the Q-table
rewards = []  # total reward collected in each episode

for episode in range(total_episodes):
    # Every episode restarts from the top-left cell (flat index 0).
    state = 0
    done = False
    total_rewards = 0

    for step in range(max_step):
        # Epsilon-greedy: explore among in-grid moves with probability
        # epsilon, otherwise take the best-valued action of the current row.
        rnd = random.uniform(0,1)
        if rnd <= epsilon:
            action = random.sample(avail_actions(state),1)[0]
        else:
            action = np.argmax(qtable[state, :])

        new_state, reward, done = go_one_step(state, action)

        # Q(s,a) := Q(s,a) + lr * [R(s,a) + gamma * max_a' Q(s',a') - Q(s,a)]
        td_target = reward + gamma * np.max(qtable[new_state, :])
        qtable[state, action] += learning_rate * (td_target - qtable[state, action])

        total_rewards += reward
        state = new_state

        if done:
            break

    # Decay exploration exponentially from max_epsilon towards min_epsilon.
    epsilon = min_epsilon + (max_epsilon - min_epsilon)*np.exp(-decay_rate*episode)
    rewards.append(total_rewards)

print("Score over time: " + str(sum(rewards)/total_episodes))
print(qtable)
        
            
        

Score over time: 99.48714999998464
[[ 80.1796375   -1.          -1.          81.0796375 ]
 [ 85.45225     -1.          76.92565563  83.74      ]
 [ 89.2         -1.          81.0796375   -1.        ]
 [ 84.55225     76.92565563  -1.          85.45225   ]
 [ 90.055       81.0796375   80.1796375   89.2       ]
 [ 94.          83.74        85.45225     -1.        ]
 [ 90.055       80.1796375   -1.          90.055     ]
 [ 94.9         85.45225     84.55225     94.        ]
 [100.          89.2         90.055       -1.        ]
 [ -1.          84.55225     -1.          94.9       ]
 [ -1.          90.055       90.055      100.        ]
 [ -1.           0.           0.          -1.        ]]


In [177]:
# Solve the Maze
# Follow the learned table greedily from the start cell and print the arrows.

for episode in range(1):
    state, done = 0, False
    actions = []

    for step in range(max_step):
        # Pure exploitation: no exploration during the rollout.
        action = np.argmax(qtable[state, :])
        new_state, reward, done = go_one_step(state, action)

        actions.append(action)
        state = new_state

        if done:
            break

    print([ACTIONS_STR[a] for a in actions])
        

['>', 'v', 'v', 'v', '>']


## Reinforcement Learning: Deep Q Network

- [REF 1](https://github.com/mswang12/minDQN/blob/main/minDQN.ipynb)
- [REF 2](https://zhuanlan.zhihu.com/p/110769361)

## DQN

1. 只需要用DNN代替前面的Q table即可
2. 经测试发现，replay batch train特别重要

In [178]:
# Hyperparameters
total_episodes = 100    # number of DQN training episodes
max_step = 99           # step cap for the greedy rollout; DQN training episodes are uncapped
learning_rate = 0.8     # blend weight between the old Q estimate and the new target
gamma = 0.95            # discount applied to the target network's best next-state value

# Exploration parameters (epsilon-greedy)
max_epsilon = 1.0       # exploration probability at episode 0
min_epsilon = 0.01      # floor the exploration probability decays towards
decay_rate = 0.005      # exponential decay constant, applied per episode
epsilon = 1.0           # current exploration probability; updated after each episode

In [179]:
# Net: QTable
import torch
import torch.nn.functional as F

class Net(torch.nn.Module):
    """Two-layer perceptron: Linear -> ReLU -> Linear.

    Args:
        n_feature: size of the input vector.
        n_hidden: width of the hidden layer.
        n_output: size of the output vector.
    """

    def __init__(self, n_feature, n_hidden, n_output):
        super().__init__()
        # Creation order (hidden, then predict) is kept so that the random
        # weight initialisation consumes the RNG in the same sequence.
        self.hidden = torch.nn.Linear(in_features=n_feature, out_features=n_hidden)
        self.predict = torch.nn.Linear(in_features=n_hidden, out_features=n_output)

    def forward(self, x):
        """Return the raw (un-activated) output for input ``x``."""
        activated = F.relu(self.hidden(x))
        return self.predict(activated)

# Two networks with the same shape: one scalar input (the flat state index),
# 100 hidden units, and one output per action.
# dqn_target supplies the next-state values when training targets are built;
# it is overwritten with dqn's weights via load_state_dict elsewhere in the file.
dqn_target = Net(n_feature=1, n_hidden=100, n_output=4)    
# dqn is the network that trainDQN optimises and that picks actions in training.
dqn = Net(n_feature=1, n_hidden=100, n_output=4)

# Experience buffer of [state, action, reward, new_state, done] records.
replay_memory = []

def trainDQN(x, y, n_iters=500, lr=0.01):
    """Fit the module-level ``dqn`` network to map ``x`` onto ``y``.

    Runs ``n_iters`` full-batch Adam steps that minimise the mean squared
    error between ``dqn(x)`` and ``y``. A fresh optimizer is created on every
    call, so Adam's moment estimates do not carry over between calls.

    Args:
        x: input tensor fed to ``dqn``.
        y: target tensor with the same shape as ``dqn(x)``.
        n_iters: number of gradient steps (default 500, the former constant).
        lr: Adam learning rate (default 0.01, the former constant).

    Returns:
        None. ``dqn`` is updated in place.
    """
    optimizer = torch.optim.Adam(dqn.parameters(), lr=lr)
    loss_func = torch.nn.MSELoss()

    for _ in range(n_iters):
        py = dqn(x)
        loss = loss_func(py, y)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        
        
        
def trainDQNBatch():
    """Train ``dqn`` on a random mini-batch drawn from ``replay_memory``.

    For each sampled ``(state, action, reward, new_state, done)`` record the
    current ``dqn`` output for ``state`` is copied, and only the entry of the
    taken action is moved towards the target ``reward + gamma * max
    dqn_target(new_state)`` (just ``reward`` when ``done``), blended with
    ``learning_rate``. The resulting (state, Q-vector) pairs are handed to
    ``trainDQN``.

    Side effects: trims the global ``replay_memory`` to its most recent
    100000 records and updates ``dqn`` in place. Does nothing when the buffer
    is empty.
    """
    global replay_memory
    max_memory = 100000
    if len(replay_memory) > max_memory:
        # Keep the NEWEST max_memory transitions. The former slice
        # ``[:-100000]`` did the opposite: it dropped the newest 100000
        # records and kept only the oldest overflow.
        replay_memory = replay_memory[-max_memory:]

    if not replay_memory:
        # Nothing to sample; an empty batch cannot be fed to the network.
        return

    batch_size = min(64*2, len(replay_memory))
    mini_batch = random.sample(replay_memory, batch_size)

    X, Y = [], []
    for state, action, reward, new_state, done in mini_batch:
        x = torch.from_numpy(np.array([state])).type(torch.FloatTensor)
        new_x = torch.from_numpy(np.array([new_state])).type(torch.FloatTensor)
        # Detached copies: the targets must not carry gradients.
        qs = dqn(x).detach().numpy()
        new_qs = dqn_target(new_x).detach().numpy()
        if not done:
            max_future_q = reward + gamma * np.max(new_qs)
        else:
            # Terminal transition: there is no future value to bootstrap from.
            max_future_q = reward

        # Only the taken action's estimate moves; the others keep dqn's
        # current prediction so they contribute no error.
        qs[action] = (1-learning_rate)*qs[action] + learning_rate*max_future_q

        X.append([state])
        Y.append(qs)

    X = torch.from_numpy(np.array(X)).type(torch.FloatTensor)
    Y = torch.from_numpy(np.array(Y)).type(torch.FloatTensor)

    trainDQN(X, Y)
        
        


In [180]:
# pre train
# Regress all four action values towards 0 on randomly drawn states, so the
# network starts from roughly the same all-zero estimates as a fresh Q-table.
for i in range(1):
    n = 1000
    # randrange excludes its upper bound, so only the valid state ids
    # 0 .. R*C-1 are drawn. randint(0, R*C) is inclusive and also produced
    # the non-existent state R*C.
    x = torch.from_numpy(np.array([[random.randrange(R*C)] for _ in range(n)])).type(torch.FloatTensor)
    y = torch.from_numpy(np.array([[0,0,0,0] for _ in range(n)])).type(torch.FloatTensor)
    trainDQN(x, y)

# Start the target network from the pre-trained weights.
dqn_target.load_state_dict(dqn.state_dict())

# Print the pre-trained action values of every state for inspection.
for state in range(R*C):
    x = torch.from_numpy(np.array([[state]])).type(torch.FloatTensor)
    y = dqn(x).data.numpy()[0]
    print(y)

[-1.6260892e-06  4.9564987e-06  1.1920929e-07  6.0424209e-06]
[-4.52343374e-05 -1.04030594e-04  7.54594803e-05 -3.98976728e-04]
[ 0.00075101  0.00075611 -0.00104138  0.00476136]
[-0.00040017 -0.00029201  0.00043276 -0.00186353]
[-0.00030367 -0.00023956  0.00035611 -0.00154104]
[-0.00021518 -0.00018693  0.0002754  -0.00121599]
[-0.00012655 -0.00013346  0.00019464 -0.00089099]
[-3.806688e-05 -8.059107e-05  1.141727e-04 -5.663242e-04]
[ 5.0654635e-05 -2.7602538e-05  3.3229589e-05 -2.4106167e-04]
[ 1.3906322e-04  2.5326386e-05 -4.7355890e-05  8.3724037e-05]
[ 2.2778474e-04  7.7897683e-05 -1.2782216e-04  4.0868856e-04]
[ 0.00031625  0.0001313  -0.00020888  0.00073359]


In [None]:
# Train
rewards = []  # total reward collected in each episode

for episode in range(total_episodes):
    # Every episode restarts from the top-left cell (flat index 0).
    state, step = 0, 0
    done = False
    total_rewards = 0

    # No step cap here: an episode only ends when the goal cell is reached.
    while True:
        step += 1
        rnd = random.uniform(0,1)
        x = torch.tensor([[state]], dtype=torch.float32)

        qs = dqn(x).detach().numpy()[0]

        if rnd <= epsilon:
            # Explore: uniform choice among the in-grid moves.
            action = random.sample(avail_actions(state),1)[0]
        else:
            # Exploit: best-valued in-grid move; on ties the later one wins.
            action, mx = 0, -100000
            for a in avail_actions(state):
                if qs[a] >= mx:
                    action, mx = a, qs[a]

        new_state, reward, done = go_one_step(state, action)

        replay_memory.append([state, action, reward, new_state, done])

        # Fit the network on a replay mini-batch every fourth step.
        if step % 4 == 0:
            trainDQNBatch()

        total_rewards += reward
        state = new_state

        if done:
            break

    # Decay exploration exponentially from max_epsilon towards min_epsilon.
    epsilon = min_epsilon + (max_epsilon - min_epsilon)*np.exp(-decay_rate*episode)
    rewards.append(total_rewards)

    print(total_rewards)

    # Sync the target network; with a modulus of 1 this runs after every episode.
    if episode % 1 == 0:
        dqn_target.load_state_dict(dqn.state_dict())

print("Score over time: " + str(sum(rewards)/total_episodes))


In [188]:
# Solve the Maze
# Greedy rollout using the target network's value estimates.

for episode in range(1):
    state, done = 0, False
    actions = []

    for step in range(max_step):
        x = torch.tensor([[state]], dtype=torch.float32)
        qs = dqn_target(x).detach().numpy()[0]

        # Best-valued in-grid move; on ties the later action index wins.
        action, mx = 0, -100000
        for a in avail_actions(state):
            if qs[a] >= mx:
                action, mx = a, qs[a]

        new_state, reward, done = go_one_step(state, action)

        # Trace: state before, state after, chosen action, reward, Q estimates.
        print(state, new_state, action, reward, qs)

        actions.append(action)
        state = new_state

        if done:
            break

    print([ACTIONS_STR[a] for a in actions])

0 1 3 -0.1 [ 80.17061  -17.889841  71.95214   81.083496]
1 4 0 -0.1 [85.46958  13.561689 76.93455  83.75208 ]
4 7 0 -0.1 [90.06905  81.09974  80.200005 89.2372  ]
7 10 0 -0.1 [94.94379 85.48391 84.69894 93.9384 ]
10 11 3 100.0 [ 6.709077 90.10781  90.137184 99.97809 ]
['>', 'v', 'v', 'v', '>']


## DQN 三个优化

![](dqn01.png)
![](dqn02.png)
![](dqn03.png)