# **一个Double DQN算法的简单实现**
## **算法概述**
- Nature DQN算法的改进版
- 使用了两个网络，目标网络和策略网络
    - 目标网络用于提供策略网络需要学习的Q值目标(Qmax(S',a))，策略网络则用于学习更新
    - 两个网络的结构完全相同，目标网络参数固定，每隔X步将策略网络的参数更新到目标网络
- 与Nature DQN不同的是，Nature DQN中目标网络输出N个Q值（如果有N个动作的话，每个动作一个Q值），选取最大的Q值作为目标Q值的计算项；而在Double DQN中，需要先在策略网络中选择Q值最大的动作，而后在目标网络中选取该动作对应的Q值，并将该Q值作为目标Q值的计算项。
- 此举是因为Nature DQN用同一个目标网络既选择动作又评估该动作的Q值，取最大值的操作容易导致Q值过估计，最终的结果会有较大的偏差；而将动作的选择和Q值的计算解耦（策略网络选动作，目标网络给出Q值）可以缓解过估计的问题
- off-policy算法，value-based算法

论文链接：*https://docs.popo.netease.com/docs/0f6d78d83bfa4d63baa73f99cf622543*

In [1]:
import torch
import random
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from collections import deque

In [2]:
# --- Q-learning target ---
GAMMA = 0.9  # discount factor applied to the bootstrapped next-state Q value

# --- Exploration schedule (epsilon-greedy) ---
start_epsilon = 0.5  # initial exploration probability of a freshly built agent
end_epsilon = 0.01  # lower end of the range used to size the per-step decay
epsilon = start_epsilon  # module-level value; not read by the code visible in this file

# --- Experience replay ---
batch_size = 32  # number of transitions sampled for each gradient step
replay_size = 10000  # maximum number of transitions kept in the buffer

# --- Environment / hardware ---
N_ACTIONS = 2  # number of discrete actions used for random exploration
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [3]:
class DDQN(nn.Module):
    """Double DQN agent with a two-layer MLP Q-network and a replay buffer.

    The instance itself is the policy (online) network. A second instance is
    attached through ``get_target_network`` and serves as the target network:
    the policy network picks the greedy next action, the target network
    supplies that action's Q value for the TD target.

    Args:
        env: environment exposing ``observation_space.shape[0]`` (state
            dimension) and ``action_space.n`` (number of discrete actions).
    """

    # Number of ``epsilon_greedy`` calls over which epsilon is annealed from
    # ``start_epsilon`` down to ``end_epsilon``.
    EPSILON_DECAY_STEPS = 10000

    def __init__(self, env):
        super(DDQN, self).__init__()
        # Bounded FIFO: once full, appending drops the oldest transition.
        self.replay_buffer = deque(maxlen=replay_size)
        self.epsilon = start_epsilon
        self.state_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.n
        self.create_training_network()
        self.create_training_method()
        self.to(device)

    def forward(self, state):
        """Return the Q value of every action for ``state`` (last dim = actions)."""
        x = F.relu(self.fc1(state))
        # The latest output is also kept on the instance, as before.
        self.state_action_value = self.fc2(x)
        return self.state_action_value

    def create_training_network(self):
        """Build the Q-network layers: state_dim -> 20 -> action_dim."""
        self.fc1 = nn.Linear(self.state_dim, 20)
        self.fc2 = nn.Linear(20, self.action_dim)

    def create_training_method(self):
        """Create the optimizer and loss function.

        Must run before a target network is attached: the optimizer captures
        ``self.parameters()`` at this point, i.e. only the policy layers.
        """
        self.optimizer = optim.Adam(self.parameters(), lr=1e-4)
        self.loss_cal = F.mse_loss

    def get_target_network(self, target_network):
        """Attach ``target_network`` and initialise it with the policy weights."""
        self.target_network = target_network
        self.update_target_network()

    def train_loop(self):
        """Run one gradient step on a random minibatch from the replay buffer."""
        minibatch = random.sample(self.replay_buffer, batch_size)
        states, actions, rewards, next_states, dones = zip(*minibatch)

        # Explicit dtypes keep the batch compatible with the float32 layers
        # regardless of the dtype the environment or the reward shaping yields.
        state = torch.tensor(np.stack(states), dtype=torch.float32, device=device)
        next_state = torch.tensor(
            np.stack(next_states), dtype=torch.float32, device=device
        )
        action = torch.tensor(actions, dtype=torch.int64, device=device).unsqueeze(-1)
        reward = torch.tensor(rewards, dtype=torch.float32, device=device)
        done = torch.tensor(dones, dtype=torch.bool, device=device)

        with torch.no_grad():
            # Policy network selects the greedy next action ...
            action_max_Q = torch.argmax(self(next_state), dim=1).unsqueeze(-1)
            # ... and the target network evaluates that action.
            Q_max_value_batch = (
                self.target_network(next_state).gather(1, action_max_Q).squeeze(-1)
            )

        # TD target: no bootstrapping from next states flagged as done.
        y_batch = torch.where(done, reward, reward + GAMMA * Q_max_value_batch)
        Q_batch = self(state).gather(1, action).squeeze(-1)

        self.optimizer.zero_grad()
        loss = self.loss_cal(Q_batch, y_batch)
        loss.backward()
        self.optimizer.step()

    def epsilon_greedy(self, state):
        """Pick an action epsilon-greedily and anneal epsilon by one step.

        With probability ``self.epsilon`` a uniformly random action is
        returned, otherwise the greedy one. Epsilon never drops below
        ``end_epsilon``.
        """
        if random.random() > self.epsilon:
            action = self.action(state)
        else:
            # Sample over the environment's own action count.
            action = np.random.randint(0, self.action_dim)
        decay = (start_epsilon - end_epsilon) / self.EPSILON_DECAY_STEPS
        # Clamp so the schedule stops at end_epsilon instead of going negative.
        self.epsilon = max(end_epsilon, self.epsilon - decay)
        return action

    def action(self, state):
        """Return the greedy action (index of the largest Q value) for ``state``."""
        with torch.no_grad():
            state = torch.tensor(state, dtype=torch.float32, device=device)
            return torch.argmax(self(state), dim=-1).item()

    def perceive(self, state, action, reward, next_state, done):
        """Store one transition and train once the buffer exceeds a batch."""
        self.replay_buffer.append((state, action, reward, next_state, done))
        if len(self.replay_buffer) > batch_size:
            self.train_loop()

    def update_target_network(self):
        """Copy the current policy weights into the attached target network."""
        # The target network is a registered sub-module, so this state dict
        # also holds "target_network.*" keys; strict=False lets the target
        # ignore them and load only the matching fc1/fc2 entries.
        self.target_network.load_state_dict(self.state_dict(), strict=False)

In [4]:
import gym
# Id of the gym environment used for both training and evaluation.
env_name = "CartPole-v1"
env = gym.make(env_name)
# Policy (online) network: the one that acts and is optimised.
agent = DDQN(env)
# Second network of identical structure, used as the target network.
target_network = DDQN(env)
# Attach the target network to the agent and copy the agent's weights into it.
agent.get_target_network(target_network)

In [5]:
def _evaluate(eval_episodes, max_steps):
    """Run greedy episodes on ``env`` and return the mean undiscounted return."""
    total_reward = 0
    for _ in range(eval_episodes):
        state, _ = env.reset()
        for _ in range(max_steps):
            action = agent.action(state)
            state, reward, terminated, truncated, _ = env.step(action)
            total_reward += reward
            if terminated or truncated:
                break
    return total_reward / eval_episodes


def main(num_episodes=3000, max_steps=300, eval_interval=100, eval_episodes=10):
    """Train the module-level ``agent`` on ``env`` with periodic evaluation.

    Args:
        num_episodes: number of training episodes.
        max_steps: step cap per episode (training and evaluation).
        eval_interval: every this many episodes the greedy policy is
            evaluated and, except at episode 0, the target network is synced.
        eval_episodes: number of greedy episodes averaged per evaluation.
    """
    for episode in range(num_episodes):
        state, _ = env.reset()
        for _ in range(max_steps):
            action = agent.epsilon_greedy(state)
            next_state, _, terminated, truncated, _ = env.step(action)
            # Shaped reward: small bonus per surviving step, penalty on failure.
            reward = -1 if terminated else 0.01
            # Only true termination is stored as "done": a time-limit
            # truncation must still bootstrap from the next state.
            agent.perceive(state, action, reward, next_state, terminated)
            if terminated or truncated:
                break
            state = next_state

        if episode % eval_interval == 0:
            print(f"testing episode {episode / eval_interval}")
            average_reward = _evaluate(eval_episodes, max_steps)
            print(f"average reward is {average_reward}")
            if episode != 0:
                agent.update_target_network()

In [None]:
# Start training only when this file is executed as a script (or notebook cell),
# not when it is imported.
if __name__ == "__main__":
    main()

  if not isinstance(terminated, (bool, np.bool8)):


testing episode 0.0
average reward is 15.2
testing episode 1.0
average reward is 12.7
testing episode 2.0
average reward is 27.0
testing episode 3.0
average reward is 65.5
testing episode 4.0
average reward is 47.8
testing episode 5.0
average reward is 169.4
testing episode 6.0
average reward is 208.5
testing episode 7.0
average reward is 267.5
testing episode 8.0
average reward is 29.9
testing episode 9.0
average reward is 300.0
testing episode 10.0
average reward is 60.6
testing episode 11.0
average reward is 300.0
testing episode 12.0
average reward is 114.1
testing episode 13.0
average reward is 298.5
testing episode 14.0
average reward is 173.5
testing episode 15.0
average reward is 141.0
testing episode 16.0
average reward is 166.0
testing episode 17.0
average reward is 99.6
testing episode 18.0
average reward is 113.9
testing episode 19.0
average reward is 132.3
testing episode 20.0
average reward is 117.2
testing episode 21.0
average reward is 127.5
testing episode 22.0
average