# **一个普通DQN算法的简单实现**
## **算法概述**
- Q-learning算法的深度学习版
- 将传统Q-Learning中的Q表替换为了神经网络
    - 神经网络输入当前的状态（状态特征），输出各个动作的Q值
    - 仅支持离散动作空间的环境（网络为每个离散动作各输出一个Q值）
- off-policy算法，value-based算法

论文链接：*https://docs.popo.netease.com/docs/66926f6c39134373894976456da3132b*

In [1]:
import torch
import random
import numpy as np
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from collections import deque

In [2]:
# --- Temporal-difference target ---
GAMMA = 0.9  # discount factor applied to the bootstrapped next-state value

# --- Epsilon-greedy exploration ---
# start/end bound an annealing schedule that is currently disabled in the
# agent, so `epsilon` stays at its starting value.
start_epsilon = 0.5
end_epsilon = 0.01
epsilon = start_epsilon  # probability threshold for taking a random action

# --- Experience replay ---
replay_size = 10_000  # maximum number of stored transitions
batch_size = 32  # transitions sampled per gradient step

# Use the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [3]:
class DQN(nn.Module):
    """Minimal DQN agent: a two-layer Q-network with its replay buffer and optimizer.

    The network maps a state vector to one Q-value per discrete action. The
    hyperparameters ``GAMMA``, ``epsilon``, ``replay_size`` and ``batch_size``
    as well as ``device`` are read from module-level globals at call time.
    The same network produces both the predictions and the bootstrapped
    targets (there is no separate target network).
    """

    def __init__(self, env):
        """Create the network, optimizer and an empty replay buffer.

        Args:
            env: Object exposing ``observation_space.shape[0]`` (state size)
                and ``action_space.n`` (number of discrete actions).
        """
        super().__init__()
        self.replay_buffer = deque()
        self.state_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.n
        self.create_training_network()
        self.create_training_method()
        self.to(device)

    @staticmethod
    def _as_float_tensor(array):
        """Convert array-like data to a float32 tensor on ``device``.

        The linear layers hold float32 weights, so observations delivered as
        float64 must be cast before they are fed to the network.
        """
        return torch.as_tensor(np.asarray(array), dtype=torch.float32, device=device)

    def forward(self, state):
        """Return the Q-values of every action for ``state``.

        Args:
            state: Float tensor whose last dimension is ``state_dim``.

        Returns:
            Tensor whose last dimension is ``action_dim``. The result is also
            kept on ``self.state_action_value``.
        """
        x = F.relu(self.fc1(state))
        self.state_action_value = self.fc2(x)
        return self.state_action_value

    def create_training_network(self):
        """Define the layers: ``state_dim -> 20 -> action_dim``."""
        self.fc1 = nn.Linear(self.state_dim, 20)
        self.fc2 = nn.Linear(20, self.action_dim)

    def create_training_method(self):
        """Attach the Adam optimizer and the mean-squared-error loss."""
        self.optimizer = optim.Adam(self.parameters(), lr=0.0001)
        self.loss_cal = F.mse_loss

    def train_loop(self):
        """Run one gradient step on a random minibatch from the replay buffer.

        Implements the Q-learning update
        ``Q(S,A) <- Q(S,A) + alpha * (R + gamma * max_a Q(S',a) - Q(S,A))``
        by regressing ``Q(S,A)`` onto the target ``R + gamma * max_a Q(S',a)``
        (just ``R`` for transitions flagged as done).

        Raises:
            ValueError: If the buffer holds fewer than ``batch_size`` items
                (raised by ``random.sample``).
        """
        minibatch = random.sample(self.replay_buffer, batch_size)
        states, actions, rewards, next_states, dones = zip(*minibatch)

        # Explicit dtypes: a batch whose rewards are all ints, or observations
        # stored as float64, must not change the arithmetic below.
        state = self._as_float_tensor(np.stack(states))
        next_state = self._as_float_tensor(np.stack(next_states))
        action = torch.as_tensor(
            np.asarray(actions), dtype=torch.int64, device=device
        ).unsqueeze(-1)  # A, shaped (batch, 1) for gather
        reward = self._as_float_tensor(rewards)  # R
        done = torch.as_tensor(np.asarray(dones), dtype=torch.bool, device=device)

        # The target is treated as a constant: no gradient flows through it.
        with torch.no_grad():
            q_next_max = self(next_state).max(dim=1)[0]  # max_a Q(S', a)
        # Target Q-value: no bootstrapping past a terminal transition.
        y_batch = torch.where(done, reward, reward + GAMMA * q_next_max)

        q_batch = self(state).gather(1, action).squeeze(-1)  # Q(S, A)
        loss = self.loss_cal(q_batch, y_batch)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

    def epsilon_greedy(self, state):
        """Pick an action for training: greedy, or uniformly random.

        A random action is taken when ``random.random() <= epsilon``.

        Args:
            state: Observation as a NumPy array of length ``state_dim``.

        Returns:
            int: Action index in ``[0, action_dim)``.
        """
        if random.random() > epsilon:
            return self.action(state)
        # Sample over the environment's real action count rather than a
        # hard-coded two actions.
        return np.random.randint(0, self.action_dim)

    def action(self, state):
        """Return the greedy action (arg-max Q-value) for ``state``.

        Args:
            state: Observation as a NumPy array of length ``state_dim``.

        Returns:
            int: Action index in ``[0, action_dim)``.
        """
        with torch.no_grad():
            q_values = self(self._as_float_tensor(state))
            return torch.argmax(q_values).item()

    def perceive(self, state, action, reward, next_state, done):
        """Store one transition and, once enough are stored, train on a batch.

        Args:
            state: Observation before the step.
            action: Index of the action taken.
            reward: Reward received for the step.
            next_state: Observation after the step.
            done: Whether the transition ended the episode (disables
                bootstrapping for this sample).
        """
        self.replay_buffer.append((state, action, reward, next_state, done))
        # Drop the oldest transition once the buffer exceeds its capacity.
        if len(self.replay_buffer) > replay_size:
            self.replay_buffer.popleft()

        if len(self.replay_buffer) > batch_size:
            self.train_loop()

In [4]:
# Build the environment and the agent that main() trains and evaluates.
import gym
env_name = "CartPole-v1"
env = gym.make(env_name)
# DQN sizes its network from env.observation_space and env.action_space.
agent = DQN(env)

In [5]:
def main(episodes=3000, max_steps=300, eval_interval=100, eval_episodes=10):
    """Train the global ``agent`` on the global ``env`` and report progress.

    Every episode the agent acts epsilon-greedily and learns from each
    transition via ``agent.perceive``. Every ``eval_interval`` episodes the
    greedy policy is run for ``eval_episodes`` episodes and the mean of the
    environment's own rewards is printed.

    Args:
        episodes: Number of training episodes.
        max_steps: Step cap for each training and evaluation episode.
        eval_interval: Evaluate whenever ``episode % eval_interval == 0``.
        eval_episodes: Number of greedy episodes averaged per evaluation.
    """
    for episode in range(episodes):
        agent.train()
        state, _ = env.reset()
        for _step in range(max_steps):
            action = agent.epsilon_greedy(state)
            next_state, _, terminated, truncated, _info = env.step(action)
            # Shaped reward replaces the environment's: a small bonus per
            # surviving step and a penalty when the episode terminates.
            reward = -1 if terminated else 0.01
            # Only true termination is stored as "done", so a time-limit
            # cut-off still bootstraps from the next state.
            agent.perceive(state, action, reward, next_state, terminated)
            # Stop stepping once the episode is over for either reason.
            if terminated or truncated:
                break
            state = next_state

        if episode % eval_interval == 0:
            total_reward = 0
            for _trial in range(eval_episodes):
                state, _ = env.reset()
                for _step in range(max_steps):
                    action = agent.action(state)
                    state, reward, terminated, truncated, _info = env.step(action)
                    total_reward += reward
                    if terminated or truncated:
                        break
            total_reward /= eval_episodes
            print(f"average reward is {total_reward}")

In [6]:
# Start training only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()

  if not isinstance(terminated, (bool, np.bool8)):


average reward is 9.1
average reward is 17.1
average reward is 51.1
average reward is 167.3
average reward is 194.2
average reward is 299.1
average reward is 294.9
average reward is 294.5


KeyboardInterrupt: 