# **一个NatureDQN算法的简单实现**
## **算法概述**
- 普通DQN算法的改进版
- 使用了两个网络，目标网络和策略网络
    - 目标网络用于提供策略网络需要学习的Q值目标(Qmax(S',a))，策略网络则用于学习更新
    - 两个网络的结构完全相同，目标网络参数固定，每隔X步将策略网络的参数更新到目标网络
- 传统DQN使用单个网络同时进行价值评估和网络参数更新，这会导致两者之间的相关性过强，不利于算法收敛；Nature DQN使用两个网络来解耦这种关联性
- off-policy算法，value-based算法

论文链接：*https://docs.popo.netease.com/docs/cbb9344e20ca49578e9d107a26c2ba17*

In [1]:
import torch
import random
import numpy as np
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from collections import deque
import copy

In [2]:
# Discount factor applied to the bootstrapped next-state value.
GAMMA = 0.9

# Exploration: `epsilon` is the probability of taking a random action.
# `start_epsilon` / `end_epsilon` are the endpoints of an annealing schedule;
# NOTE(review): nothing visible in this notebook reads them yet.
start_epsilon = 0.5
end_epsilon = 0.01
epsilon = 0.5

# Replay-memory capacity and the number of transitions drawn per update.
replay_size = 10_000
batch_size = 32

# Run on the GPU when PyTorch can see one, otherwise on the CPU.
device = (
    torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
)

In [3]:
class NatureDQN(nn.Module):
    """Two-layer Q-network bundled with its replay buffer and update rule.

    One instance acts as the policy (online) network and is trained; a second
    instance, attached through `get_target_network`, supplies the bootstrap
    targets and is refreshed with `update_target_network`.

    The class reads the module-level settings `GAMMA`, `epsilon`,
    `replay_size`, `batch_size` and `device`.
    """

    def __init__(self, env):
        """Build the network for `env`.

        Args:
            env: Environment exposing `observation_space.shape` and
                `action_space.n` (a discrete action space).
        """
        super().__init__()
        # Bounded FIFO: once `replay_size` transitions are stored, appending
        # a new one silently evicts the oldest.
        self.replay_buffer = deque(maxlen=replay_size)
        self.state_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.n
        self.create_training_network()
        self.create_training_method()
        self.to(device)

    def forward(self, state):
        """Return the Q-value of every action for `state` (single or batch)."""
        hidden = F.relu(self.fc1(state))
        # The latest output is also kept on the instance, as the original
        # implementation did, so existing readers of this attribute still work.
        self.state_action_value = self.fc2(hidden)
        return self.state_action_value

    def create_training_network(self):
        """Create the layers: state_dim -> 20 -> action_dim."""
        self.fc1 = nn.Linear(self.state_dim, 20)
        self.fc2 = nn.Linear(20, self.action_dim)

    def create_training_method(self):
        """Create the Adam optimizer and select the MSE loss."""
        self.optimizer = optim.Adam(self.parameters(), lr=0.0001)
        self.loss_cal = F.mse_loss

    def get_target_network(self, target_network):
        """Attach the network used to compute bootstrap targets.

        Because `target_network` is an `nn.Module`, assigning it here
        registers it as a sub-module of this network. The optimizer only
        holds the parameters that existed when `create_training_method` ran.
        """
        self.target_network = target_network

    def _as_input(self, array):
        """Convert an observation (or a stacked batch) to a float32 tensor."""
        return torch.as_tensor(array, dtype=torch.float32, device=device)

    def train_loop(self):
        """Run one gradient step on a minibatch sampled from the replay buffer.

        Target: y = r                              if the transition is terminal
                y = r + GAMMA * max_a Q_target(s', a)   otherwise
        The loss is the MSE between Q(s, a) and y.
        """
        minibatch = random.sample(self.replay_buffer, batch_size)
        states, actions, rewards, next_states, dones = zip(*minibatch)

        state = self._as_input(np.stack(states))
        next_state = self._as_input(np.stack(next_states))
        # gather() needs an int64 index with one column per sample.
        action = torch.tensor(
            actions, dtype=torch.long, device=device
        ).unsqueeze(-1)
        reward = torch.tensor(rewards, dtype=torch.float32, device=device)
        done = torch.tensor(dones, dtype=torch.bool, device=device)

        # Targets come from the frozen network and must not carry gradients.
        with torch.no_grad():
            next_q_max = self.target_network(next_state).max(dim=1).values
            target_q = torch.where(done, reward, reward + GAMMA * next_q_max)

        predicted_q = self(state).gather(1, action).squeeze(-1)
        loss = self.loss_cal(predicted_q, target_q)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

    def epsilon_greedy(self, state):
        """Pick an action: random with probability `epsilon`, else greedy.

        NOTE: `epsilon` is the module-level value and is not annealed here.
        """
        if random.random() > epsilon:
            return self.action(state)
        # Explore uniformly over all actions of the environment (the previous
        # version hard-coded two actions).
        return np.random.randint(self.action_dim)

    def action(self, state):
        """Return the greedy action (index of the largest Q-value) for `state`."""
        with torch.no_grad():
            q_values = self(self._as_input(state))
            return torch.argmax(q_values).item()

    def perceive(self, state, action, reward, next_state, done):
        """Store a transition and train once enough samples are available."""
        self.replay_buffer.append((state, action, reward, next_state, done))
        if len(self.replay_buffer) > batch_size:
            self.train_loop()

    def update_target_network(self):
        """Copy this network's weights into the target network."""
        # The target network is a registered sub-module, so state_dict() also
        # contains "target_network.*" entries; strict=False lets the target
        # ignore those keys and load only the matching fc1/fc2 tensors.
        self.target_network.load_state_dict(self.state_dict(), strict=False)

In [4]:
import gym

# Environment and the policy (online) network that is actually trained.
env_name = "CartPole-v1"
env = gym.make(env_name)
agent = NatureDQN(env)
agent.train()

# The target network has the same architecture as the policy network.
target_network = NatureDQN(env)
agent.get_target_network(target_network)
# A freshly built network has its own random weights. Copy the policy weights
# over now so the first bootstrap targets are not produced by an unrelated
# network until the first periodic update.
agent.update_target_network()

In [5]:
def _evaluate(eval_episodes, max_steps):
    """Run greedy rollouts and return the mean environment reward per episode."""
    total_reward = 0
    for _ in range(eval_episodes):
        state, _info = env.reset()
        for _step in range(max_steps):
            action = agent.action(state)
            state, reward, terminated, truncated, _info = env.step(action)
            total_reward += reward
            if terminated or truncated:
                break
    return total_reward / eval_episodes


def main(num_episodes=3000, max_steps=300, eval_interval=100,
         eval_episodes=10, target_update_interval=100):
    """Train the module-level `agent` on the module-level `env`.

    Args:
        num_episodes: Number of training episodes.
        max_steps: Step cap per episode, for both training and evaluation.
        eval_interval: Evaluate the greedy policy every this many episodes.
        eval_episodes: Number of rollouts averaged per evaluation.
        target_update_interval: Copy the policy weights into the target
            network every this many episodes (skipping episode 0).
    """
    for episode in range(num_episodes):
        print(f"trainging episode {episode}")
        state, _info = env.reset()
        for _step in range(max_steps):
            action = agent.epsilon_greedy(state)
            next_state, _env_reward, terminated, truncated, _info = env.step(action)
            # Shaped reward: small bonus per surviving step, penalty on failure.
            reward = -1 if terminated else 0.01
            # Only a real termination is stored as `done`; a time-limit
            # truncation must still bootstrap from the next state.
            agent.perceive(state, action, reward, next_state, terminated)
            if terminated or truncated:
                break
            state = next_state

        if episode % eval_interval == 0:
            average_reward = _evaluate(eval_episodes, max_steps)
            print(f"average reward is {average_reward}")
        if episode % target_update_interval == 0 and episode != 0:
            agent.update_target_network()

In [None]:
# Start training only when this file is executed directly, not when imported.
if __name__ == '__main__':
    main()

trainging episode 0


  if not isinstance(terminated, (bool, np.bool8)):


average reward is 11.4
trainging episode 1
trainging episode 2
trainging episode 3
trainging episode 4
trainging episode 5
trainging episode 6
trainging episode 7
trainging episode 8
trainging episode 9
trainging episode 10
trainging episode 11
trainging episode 12
trainging episode 13
trainging episode 14
trainging episode 15
trainging episode 16
trainging episode 17
trainging episode 18
trainging episode 19
trainging episode 20
trainging episode 21
trainging episode 22
trainging episode 23
trainging episode 24
trainging episode 25
trainging episode 26
trainging episode 27
trainging episode 28
trainging episode 29
trainging episode 30
trainging episode 31
trainging episode 32
trainging episode 33
trainging episode 34
trainging episode 35
trainging episode 36
trainging episode 37
trainging episode 38
trainging episode 39
trainging episode 40
trainging episode 41
trainging episode 42
trainging episode 43
trainging episode 44
trainging episode 45
trainging episode 46
trainging episode 47