# **一个Dueling DQN算法的简单实现**
## **算法概述**
- Nature DQN的改进版。
- 改进了网络结构：原本的DQN只有一个输出头，输出各动作的Q值；Dueling DQN有两个输出头，一个输出与动作无关的状态价值（函数），另一个输出与动作有关的优势（函数）。
- 将原来的一个输出分解为两个，价值函数专注于预测当前状态的好坏，而优势函数专注于预测当前状态下每个待执行动作的好坏；两者按 $Q(s,a) = V(s) + \big(A(s,a) - \frac{1}{|\mathcal{A}|}\sum_{a'} A(s,a')\big)$ 合成最终的Q值。
- off-policy算法，value-based算法。

论文链接：*https://docs.popo.netease.com/docs/dcc8c9f994bc441bab9fc84be9572bb7*

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import random
from collections import deque
import numpy as np

In [2]:
# Discount factor applied to the bootstrapped target Q-value.
GAMMA = 0.9
# Current exploration probability; mutated in place (as a global) by
# DuleDQN.episode_greedy on every call.
epsilon = 0.5
# Endpoints of the exploration schedule; their difference sets the per-step
# decay applied to `epsilon`.
start_epsilon = 0.5
end_epsilon = 0.01
# Number of transitions sampled from the replay buffer per gradient step.
batch_size = 32
# Maximum number of transitions kept in the replay buffer.
replay_size = 10000
# Run on the GPU when one is available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
class DuleDQN(nn.Module):
    """Dueling DQN agent for environments with a discrete action space.

    A single shared hidden layer feeds two heads: a scalar state-value head
    and a per-action advantage head.  The two are combined into Q-values by
    `calculate_dule_Q`.

    The class relies on module-level names defined elsewhere in the file:
    `device`, `GAMMA`, `batch_size`, `replay_size`, `start_epsilon`,
    `end_epsilon` and the mutable global `epsilon`.
    """

    def __init__(self, env):
        """Build the network, optimizer and replay buffer for `env`.

        Args:
            env: environment exposing `observation_space.shape[0]` and
                `action_space.n`.
        """
        super(DuleDQN, self).__init__()
        self.state_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.n
        self.create_training_network()
        self.create_training_method()
        # A bounded deque drops the oldest transition automatically once
        # `replay_size` entries are stored.
        self.replay_buffer = deque(maxlen=replay_size)
        self.to(device)

    def create_training_network(self):
        """Create the shared hidden layer and the value / advantage heads."""
        self.fc1 = nn.Linear(self.state_dim, 20)
        self.value_func = nn.Linear(20, 1)
        self.advantage_func = nn.Linear(20, self.action_dim)

    def create_training_method(self):
        """Create the Adam optimizer and select MSE as the TD loss."""
        self.optim = optim.Adam(self.parameters(), lr=0.001)
        self.loss_cal = F.mse_loss

    def get_target_network(self, target_network):
        """Attach the network used to compute bootstrapped targets.

        Assigning a module as an attribute registers it as a submodule, so
        its weights appear in `self.state_dict()` under the
        ``target_network.`` prefix.  The optimizer was already built in
        `__init__`, so the target's parameters are not optimized.
        """
        self.target_network = target_network

    def forward(self, input):
        """Return ``(value, advantage)`` for a state or a batch of states."""
        x = F.relu(self.fc1(input))
        value = self.value_func(x)
        advantage = self.advantage_func(x)
        return value, advantage

    def calculate_dule_Q(self, value, advantage):
        """Combine the two heads: ``Q = V + (A - mean_a A)``.

        Args:
            value: state values with a trailing dimension of size 1 (for a
                2-D `advantage`, any tensor with one element per row).
            advantage: per-action advantages; the last dimension indexes
                actions.

        Returns:
            Tensor of Q-values with the same shape as `advantage`.
        """
        if advantage.dim() == 2:
            # Force a [B, 1] column so the value broadcasts across actions.
            value = value.view(-1, 1)
        # Subtracting the mean advantage makes the V/A split identifiable.
        centered = advantage - torch.mean(advantage, dim=-1, keepdim=True)
        return value + centered

    def episode_greedy(self, state):
        """Pick an epsilon-greedy action and decay the global `epsilon`.

        Args:
            state: a single observation (array-like of length `state_dim`).

        Returns:
            The chosen action as an int.
        """
        global epsilon
        if random.random() > epsilon:
            action = self.action(state)
        else:
            # Sample uniformly over every available action rather than
            # assuming the environment has exactly two.
            action = np.random.randint(0, self.action_dim)
        # Linear decay that never drops below the configured floor.
        epsilon = max(end_epsilon, epsilon - (start_epsilon - end_epsilon) / 10000)
        return action

    def action(self, state):
        """Return the greedy action (argmax of Q) for a single observation."""
        with torch.no_grad():
            # Explicit float32 so the input matches the Linear weights
            # regardless of the dtype the environment returns.
            state = torch.tensor(state, dtype=torch.float32, device=device)
            value, advantage = self(state)
            state_action_value = self.calculate_dule_Q(value, advantage)
            return torch.argmax(state_action_value).item()

    def perceive(self, state, action, reward, next_state, done):
        """Store one transition and run a training step once enough exist."""
        self.replay_buffer.append((state, action, reward, next_state, done))
        if len(self.replay_buffer) >= batch_size:
            self.train_loop()

    def train_loop(self):
        """Run one gradient step on a minibatch sampled from the replay buffer.

        Requires `get_target_network` to have been called beforehand.
        """
        minibatch = random.sample(self.replay_buffer, batch_size)
        states, actions, rewards, next_states, dones = zip(*minibatch)

        state_batch = torch.tensor(np.stack(states), dtype=torch.float32, device=device)
        next_state_batch = torch.tensor(np.stack(next_states), dtype=torch.float32, device=device)
        # gather() needs int64 indices shaped [B, 1].
        action_batch = torch.tensor(np.stack(actions), dtype=torch.int64, device=device).view(-1, 1)
        reward_batch = torch.tensor(np.stack(rewards), dtype=torch.float32, device=device)
        # torch.where() needs a boolean condition.
        done_batch = torch.tensor(np.stack(dones), dtype=torch.bool, device=device)

        with torch.no_grad():
            next_value, next_advantage = self.target_network(next_state_batch)
            next_q = self.calculate_dule_Q(next_value, next_advantage)
            target_Q_value = torch.max(next_q, dim=1)[0]
            # No bootstrapping from the next state when the episode ended.
            bootstrap = torch.where(
                done_batch, torch.zeros_like(target_Q_value), GAMMA * target_Q_value
            )
            y_batch = bootstrap + reward_batch

        value, advantage = self(state_batch)
        Q_value = self.calculate_dule_Q(value, advantage)
        # Keep only the Q-value of the action that was actually taken.
        Q_batch = torch.gather(Q_value, 1, action_batch).squeeze(-1)

        loss = self.loss_cal(Q_batch, y_batch)
        self.optim.zero_grad()
        loss.backward()
        self.optim.step()

    def update_target_network(self):
        """Copy the online weights into the target network."""
        # strict=False: this module's state_dict also carries the
        # ``target_network.*`` entries (the target is a registered
        # submodule), which the target itself has no matching keys for.
        self.target_network.load_state_dict(self.state_dict(), strict=False)

In [4]:
import gym
# Build the environment, the online agent and its target network.
env_name = "CartPole-v1"
env = gym.make(env_name)
agent = DuleDQN(env)
agent.train()
target_network = DuleDQN(env)
agent.get_target_network(target_network)
# Start the target network from the online weights; otherwise the TD targets
# come from an unrelated random initialisation until the first periodic sync.
agent.update_target_network()

In [5]:
def _evaluate_agent(num_episodes, max_steps):
    """Return the mean environment reward of the greedy policy per episode."""
    total_reward = 0.0
    for _episode in range(num_episodes):
        state, _info = env.reset()
        for _step in range(max_steps):
            action = agent.action(state)
            state, reward, terminated, truncated, _info = env.step(action)
            total_reward += reward
            if terminated or truncated:
                break
    return total_reward / num_episodes


def main(num_episodes=3000, max_steps=300, eval_interval=100, eval_episodes=10):
    """Train the global `agent` on the global `env`, evaluating periodically.

    Args:
        num_episodes: number of training episodes to run.
        max_steps: step cap per episode, for both training and evaluation.
        eval_interval: evaluate and sync the target network every this many
            episodes (episode 0 is skipped).
        eval_episodes: number of greedy episodes averaged per evaluation.
    """
    for episode in range(num_episodes):
        state, _info = env.reset()
        for _step in range(max_steps):
            action = agent.episode_greedy(state)
            next_state, _reward, terminated, truncated, _info = env.step(action)
            # Shaped reward: small bonus per surviving step, penalty on failure.
            reward = -1 if terminated else 0.01
            # Only a true termination is stored as `done`; a time-limit
            # truncation must still bootstrap from the next state.
            agent.perceive(state, action, reward, next_state, terminated)
            state = next_state
            if terminated or truncated:
                break
        # for evaluation
        if episode % eval_interval == 0 and episode != 0:
            average_reward = _evaluate_agent(eval_episodes, max_steps)
            print(f"episode {episode} total score is {average_reward}")
            agent.update_target_network()

In [None]:
# Run training only when executed as a script / notebook cell, not on import.
if __name__ == "__main__":
    main()

  if not isinstance(terminated, (bool, np.bool8)):


episode 100 total score is 9.2
episode 200 total score is 11.1
episode 300 total score is 14.4
episode 400 total score is 31.2
episode 500 total score is 64.4
episode 600 total score is 86.4
episode 700 total score is 130.3
episode 800 total score is 255.3
episode 900 total score is 95.2
episode 1000 total score is 144.7
episode 1100 total score is 86.5
episode 1200 total score is 296.4
episode 1300 total score is 275.3
episode 1400 total score is 298.5
episode 1500 total score is 232.0
episode 1600 total score is 300.0
episode 1700 total score is 110.2
episode 1800 total score is 103.8
episode 1900 total score is 107.4
episode 2000 total score is 158.3
episode 2100 total score is 117.5
episode 2200 total score is 105.6
episode 2300 total score is 153.2
episode 2400 total score is 214.2
episode 2500 total score is 90.8
episode 2600 total score is 91.9
episode 2700 total score is 104.2
episode 2800 total score is 110.7
episode 2900 total score is 143.2
