# Bellman Equation

![](bellman01.png)
![](bellman02.png)
![](bellman03.png)

------------------------

## Reinforcement Learning: Q-table to solve Maze 

[reference](https://github.com/simoninithomas/Deep_reinforcement_learning_Course/blob/master/Q%20learning/FrozenLake/Q%20Learning%20with%20FrozenLake.ipynb)

In [28]:
# reinforcement learning: Maze

MAZE = [
    [0, 0, 1, 0, 0, 0],
    [1, 0, 0, 0, 1, 0],
    [1, 1, 1, 0, 1, 0],
    [0, 1, 0, 0, 0, 0],
    [0, 0, 1, 1, 0, 1],
    [0, 0, 0, 0, 0, 0],
]


import numpy as np
import random

R, C = len(MAZE), len(MAZE[0])

# Define the rewards for every step
ACTIONS = [[1,0], [-1,0], [0,-1], [0,1]]
ACTIONS_STR = ['v', '^', '<', '>']
REWARDS = np.zeros((R, C))
for i in range(R):
    for j in range(C):
        if MAZE[i][j] == 0:
            r = -1
        elif MAZE[i][j] == 1:
            r = -100
        REWARDS[i][j] = r
REWARDS[R-1][C-1] = 10000

def go_one_step(state:int, action:int):
    i, j = state//C, state%C
    ii, jj = i + ACTIONS[action][0], j + ACTIONS[action][1]
    new_state = ii * C + jj

    done = False
    if ii == R-1 and jj == C-1:
        done = True
        
    if ii < 0 or ii >= R or jj < 0 or jj >= C:
        r = -100000
        new_state = state
    else:
        r = REWARDS[ii][jj]
    
    return new_state, r, done

def avail_actions(state:int):
    res = []
    i, j = state//C, state%C
    for a in range(len(ACTIONS)):
        ii, jj = i + ACTIONS[a][0], j + ACTIONS[a][1]
        if ii >= 0 and ii < R and jj >= 0 and jj < C:
            res.append(a)
    return res
            
        
        
action_size = 4 # ^ v < >
state_size = R * C

qtable = np.zeros((state_size, action_size))

for state in range(state_size):
    acts = avail_actions(state)
    for a in range(4):
        if a not in acts:
            qtable[state, a] = -10000


In [10]:
# Hyperparameters
total_episodes = 15000 # Total episodes
learning_rate = 0.8 # Learning rate
max_step = 99 # Max step per episode
gamma = 0.95 # Discounting rate

# Exploration parameters
epsilon = 1.0
max_epsilon = 1.0
min_epsilon = 0.01
decay_rate = 0.005


In [11]:
# Train the Q-table
rewards = []

for episode in range(total_episodes):
    done = False
    total_rewards = 0
    state = 0
    
    for step in range(max_step):
        rnd = random.uniform(0,1)
        if rnd > epsilon:
            action = np.argmax(qtable[state, :])
        else:
            action = random.sample(avail_actions(state),1)[0]
            
        new_state, reward, done = go_one_step(state, action)
        
        ## Update Q(s,a):= Q(s,a) + lr [R(s,a) + gamma * max Q(s',a') - Q(s,a)]
        qtable[state, action] = qtable[state, action] + learning_rate * (reward + gamma * np.max(qtable[new_state, :]) - qtable[state, action])
        
        total_rewards += reward
        state = new_state
        
        if done:
            break
    epsilon = min_epsilon + (max_epsilon - min_epsilon)*np.exp(-decay_rate*episode)
    rewards.append(total_rewards)
    
print("Score over time: " + str(sum(rewards)/total_episodes))
print(qtable)
        
            
        

Score over time: 9961.543266666667
[[  6196.09908544 -10000.         -10000.           6295.09908544]
 [  6627.47272152 -10000.           5979.34413117   6528.47272152]
 [  6977.33970686 -10000.           6295.09908544   6977.33970686]
 [  7345.62074406 -10000.           6528.47272152   7269.01638346]
 [  7553.64413106 -10000.           6977.33970659   7652.648875  ]
 [  8056.4725     -10000.           7269.01627926 -10000.        ]
 [  6528.47271727   5979.34413117 -10000.           6627.47272152]
 [  6784.28970686   6295.09908544   6196.09908544   6977.33970686]
 [  7246.62074406   6528.47272152   6627.47272152   7345.62074406]
 [  7733.28499375   6977.33970686   6977.33970686   7553.648875  ]
 [  8042.352625     7269.01643094   7345.62074406   8056.4725    ]
 [  8481.55         7652.64887428   7553.648875   -10000.        ]
 [  6977.33970683   6196.08148003 -10000.           6784.28970686]
 [  7246.62074406   6627.47272147   6528.47271302   7246.62074406]
 [  7733.28499375   6977.33

In [12]:
# Solve the Maze

for episode in range(1):
    state = 0
    done = False
    actions = []
    
    for step in range(max_step):
        action = np.argmax(qtable[state, :])
        
        new_state, reward, done = go_one_step(state, action)
        if done:
            break
        
        state = new_state
        actions.append(action)
        
    print(list(map(lambda a: ACTIONS_STR[a], actions)))
        

['>', 'v', '>', '>', 'v', 'v', '>', 'v', 'v']


## Reinforcement Learning: Deep Q Network

[REF 1](https://github.com/mswang12/minDQN/blob/main/minDQN.ipynb)
[REF 2](https://zhuanlan.zhihu.com/p/110769361)

## 简单的DQN

只需要用DNN代替前面的Q table即可

In [15]:
# Net: QTable
import torch
import torch.nn.functional as F

class Net(torch.nn.Module):
    def __init__(self, n_feature, n_hidden, n_output):
        super(Net, self).__init__()
        self.hidden = torch.nn.Linear(n_feature, n_hidden)
        self.predict = torch.nn.Linear(n_hidden, n_output)
        
    def forward(self, x):
        x = F.relu(self.hidden(x))
        x = self.predict(x)
        return x
    
dqn = Net(n_feature=1, n_hidden=20, n_output=4)


def trainDQN(x, y):
    optimizer = torch.optim.SGD(dqn.parameters(), lr=0.02)
    loss_func = torch.nn.MSELoss()
    
    for t in range(100):
        py = dqn(x)
        loss = loss_func(py, y)
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()        

In [32]:
# Train
rewards = []

for episode in range(50):
    done = False
    total_rewards = 0
    state = 0

    for step in range(max_step):
        rnd = random.uniform(0,1)
        # 
        x = torch.from_numpy(np.array([[state]])).type(torch.FloatTensor)
    
        qs = dqn(x).data.numpy()[0]
        
        
        if rnd > epsilon:
            action = np.argmax(qs)
        else:
            action = random.sample(avail_actions(state),1)[0]
            
        new_state, reward, done = go_one_step(state, action)
        
        ## Update Q(s,a):= Q(s,a) + lr [R(s,a) + gamma * max Q(s',a') - Q(s,a)]
        qs[action] = qs[action] + learning_rate*(reward+gamma*np.max(qs) - qs[action])
        
        
        # update DQN
        x = torch.from_numpy(np.array([[state]])).type(torch.FloatTensor)
        y = torch.from_numpy(np.array([qs])).type(torch.FloatTensor)
        trainDQN(x, y)    
    
        total_rewards += reward
        state = new_state
        
        if done:
            break
    epsilon = min_epsilon + (max_epsilon - min_epsilon)*np.exp(-decay_rate*episode)
    rewards.append(total_rewards)
    
print("Score over time: " + str(sum(rewards)/total_episodes))


Score over time: -148.84646666666666


In [33]:
# Solve the Maze

for episode in range(1):
    state = 0
    done = False
    actions = []
    
    for step in range(max_step):
        qs = dqn(x).data.numpy()[0]
        action = np.argmax(qs)
        
        new_state, reward, done = go_one_step(state, action)
        if done:
            break
        
        state = new_state
        actions.append(action)
        
    print(list(map(lambda a: ACTIONS_STR[a], actions)))

['v', 'v', 'v', 'v', 'v', 'v', 'v', 'v', 'v', 'v', 'v', 'v', 'v', 'v', 'v', 'v', 'v', 'v', 'v', 'v', 'v', 'v', 'v', 'v', 'v', 'v', 'v', 'v', 'v', 'v', 'v', 'v', 'v', 'v', 'v', 'v', 'v', 'v', 'v', 'v', 'v', 'v', 'v', 'v', 'v', 'v', 'v', 'v', 'v', 'v', 'v', 'v', 'v', 'v', 'v', 'v', 'v', 'v', 'v', 'v', 'v', 'v', 'v', 'v', 'v', 'v', 'v', 'v', 'v', 'v', 'v', 'v', 'v', 'v', 'v', 'v', 'v', 'v', 'v', 'v', 'v', 'v', 'v', 'v', 'v', 'v', 'v', 'v', 'v', 'v', 'v', 'v', 'v', 'v', 'v', 'v', 'v', 'v', 'v']


## DQN 三个优化

![](dqn01.png)
![](dqn02.png)
![](dqn03.png)