In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F

import gym
import random
import torch.optim as optim
import torch
from collections import deque

## 1.DDQN

In [2]:
class DQN(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(DQN, self).__init__()
        self.fc1 = nn.Linear(input_dim, 64)
        self.fc2 = nn.Linear(64, 64)
        self.fc3 = nn.Linear(64, output_dim)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        return self.fc3(x)

In [3]:
class DQNTrainer:
    '''
    DDQN的训练器
    '''
    def __init__(self, env, policy_net, target_net, optimizer, device='cpu'):
        self.env = env
        self.policy_net = policy_net  # policy_net的参数会被target_net的参数
        self.target_net = target_net  # target_net的参数会被policy_net的参数
        self.optimizer = optimizer
        self.device = device
        self.memory = deque(maxlen=10000)  # 经验回放池
        self.batch_size = 128  # batch大小
        self.gamma = 0.999  # 折扣因子
        self.epsilon = 0.9  # epsilon-greedy策略中的epsilon
        self.epsilon_decay = 0.995  # epsilon的衰减率
        self.min_epsilon = 0.05  # epsilon的最小值
        self.num_episodes = 1000  # 训练的回合数

    def select_action(self, state):  # epsilon-greedy策略
        if random.random() > self.epsilon:
            with torch.no_grad():
                return self.policy_net(state).max(1)[1].view(1, 1).to(self.device)
        else:
            return torch.tensor([[random.randrange(self.env.action_space.n)]], device=self.device, dtype=torch.long)

    def optimize_model(self):  # 利用Q-learning更新网络参数
        if len(self.memory) < self.batch_size:  # 如果记忆中的样本数量小于batch_size，不优化，直接返回
            return
        
        transitions = random.sample(self.memory, self.batch_size)  # 从记忆中随机采样
        batch = tuple(zip(*transitions))

        # 非最终状态的mask: 以下两行代码的目的是创建一个布尔类型的张量 non_final_mask，
        # 用于表示批处理中的状态是否为非最终状态。
        # 它通过对批处理中的每个状态进行检查，如果状态不为 None，则对应的张量元素为 True，否则为 False。
        # 这个张量将在强化学习的深度 Q 网络 (DQN) 中使用，用于选择下一个动作和计算目标 Q 值。
        non_final_mask = torch.tensor(tuple(map(lambda s: s is not None, batch[3])), device=self.device, dtype=torch.bool)
        non_final_next_states = torch.cat([s for s in batch[3] if s is not None])

        # 将batch转换为张量
        state_batch = torch.cat(batch[0])  # 将batch中的state拼接成一个张量
        action_batch = torch.cat(batch[1])  # 将batch中的action拼接成一个张量
        reward_batch = torch.cat(batch[2])  # 将batch中的reward拼接成一个张量

        # 计算hat_q的值
        state_action_values = self.policy_net(state_batch).gather(1, action_batch)
        # 计算Q(s_{t+1}, a)的值
        next_state_values = torch.zeros(self.batch_size, device=self.device)
        next_state_values[non_final_mask] = self.target_net(non_final_next_states).max(1)[0].detach()
        # 计算hat_y的值
        expected_state_action_values = (next_state_values * self.gamma) + reward_batch

        # 计算损失
        loss = F.mse_loss(state_action_values, expected_state_action_values.unsqueeze(1))
        self.optimizer.zero_grad()
        loss.backward()
        # 更新参数
        self.optimizer.step()


    def train(self):  # 训练
        for i_episode in range(self.num_episodes):
            state, _ = self.env.reset()
            state = torch.tensor([state], device=self.device, dtype=torch.float32)  # 直接转换为张量
            sum_reward = 0
            for _ in range(1000):  # 假设每个回合的最大步数为1000
                action = self.select_action(state)
                next_state, reward, done, _, _ = self.env.step(action.item())  # 执行动作
                reward = torch.tensor([reward], device=self.device, dtype=torch.float32)
                if done:
                    next_state = None
                else:
                    next_state = torch.tensor([next_state], device=self.device, dtype=torch.float32)
                
                self.memory.append((state, action, reward, next_state))
                state = next_state if not done else None
                self.optimize_model()
                if done:
                    break
                
                sum_reward += reward.item()
            
            # 更新target_net的参数
            if i_episode % 10 == 0:
                self.target_net.load_state_dict(self.policy_net.state_dict())
            
            # 打印每个回合的reward
            if (i_episode + 1) % 50 == 0:
                print('Episode {} reward: {}'.format(i_episode + 1, sum_reward))
            
            # 更新epsilon
            self.epsilon = max(self.min_epsilon, self.epsilon * self.epsilon_decay)

- cartpole游戏环境

In [4]:
# Setting up the environment and the trainer
env = gym.make('CartPole-v1')
state = env.reset()
print(state)

action = env.action_space.sample()
print(action)

next_state, reward, done, truancated, _ = env.step(env.action_space.sample())
print(next_state, reward, done, truancated)

(array([ 0.00906778,  0.01233419, -0.01864976,  0.00012788], dtype=float32), {})
1
[ 0.00931446  0.20771858 -0.0186472  -0.29838043] 1.0 False False


  if not isinstance(terminated, (bool, np.bool8)):


- 训练agent

In [5]:
env = gym.make('CartPole-v1')
input_dim = env.observation_space.shape[0]
output_dim = env.action_space.n
policy_net = DQN(input_dim, output_dim).to('cpu')
target_net = DQN(input_dim, output_dim).to('cpu')
target_net.load_state_dict(policy_net.state_dict())
target_net.eval()
optimizer = optim.Adam(policy_net.parameters())

trainer = DQNTrainer(env, policy_net, target_net, optimizer)
trainer.train()
env.close()

  state = torch.tensor([state], device=self.device, dtype=torch.float32)  # 直接转换为张量


Episode 50 reward: 36.0
Episode 100 reward: 59.0
Episode 150 reward: 56.0
Episode 200 reward: 240.0
Episode 250 reward: 141.0
Episode 300 reward: 90.0
Episode 350 reward: 276.0
Episode 400 reward: 161.0
Episode 450 reward: 284.0
Episode 500 reward: 343.0
Episode 550 reward: 148.0
Episode 600 reward: 1000.0
Episode 650 reward: 129.0
Episode 700 reward: 136.0
Episode 750 reward: 343.0
Episode 800 reward: 1000.0
Episode 850 reward: 464.0
Episode 900 reward: 482.0
Episode 950 reward: 241.0
Episode 1000 reward: 89.0
Episode 1050 reward: 103.0
Episode 1100 reward: 93.0
Episode 1150 reward: 94.0
Episode 1200 reward: 97.0
Episode 1250 reward: 483.0
Episode 1300 reward: 301.0
Episode 1350 reward: 106.0
Episode 1400 reward: 345.0
Episode 1450 reward: 405.0
Episode 1500 reward: 324.0
Episode 1550 reward: 93.0
Episode 1600 reward: 54.0
Episode 1650 reward: 501.0
Episode 1700 reward: 103.0
Episode 1750 reward: 243.0
Episode 1800 reward: 1000.0
Episode 1850 reward: 1000.0
Episode 1900 reward: 11.0
E

- 测试agent的表现

In [22]:
def evaluate_agent(env, policy_net, num_episodes=100):
    policy_net.eval()  # 设置为评估模式
    total_rewards = []
    
    for _ in range(num_episodes):
        state, _ = env.reset()
        state = torch.tensor([state], dtype=torch.float32)
        episode_reward = 0
        while True:
            # 选择最佳动作
            action = policy_net(state).max(1)[1].view(1, 1)
            next_state, reward, done, truancated, _ = env.step(action.item())
            episode_reward += reward
            state = torch.tensor([next_state], dtype=torch.float32)
            if done:
                break
        
        total_rewards.append(episode_reward)
    
    avg_reward = sum(total_rewards) / len(total_rewards)
    print(f'平均奖励：{avg_reward:.2f}')
    return total_rewards

In [18]:
test_rewards = evaluate_agent(env, policy_net)

平均奖励：203.97


## 2. 对决网络

对决网络结构（Dueling Network Architecture），分别估计状态值（V）和优势函数（A）。在对决网络中，网络的输出被分解为两个部分：一个是单一的状态值 \( V(s) \)，另一个是每个动作的优势 \( A(s, a) \)。最终的动作值 \( Q(s, a) \) 通过合并这两部分得到：\( Q(s, a) = V(s) + (A(s, a) - \text{mean}(A(s, a))) \)，其中 \(\text{mean}(A(s, a))\) 是所有动作的平均优势，用来稳定训练。

In [19]:
class DuelingDQN(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(DuelingDQN, self).__init__()
        # Common feature layer
        self.fc1 = nn.Linear(input_dim, 64)
        self.fc2 = nn.Linear(64, 64)
        # Value stream
        self.fc_value = nn.Linear(64, 1)
        # Advantage stream
        self.fc_advantage = nn.Linear(64, output_dim)

    def forward(self, x):
        # Common layers
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        # Value and Advantage streams
        value = self.fc_value(x)
        advantage = self.fc_advantage(x)
        # Subtract the mean of the advantage to stabilize training
        Q = value + (advantage - advantage.mean(dim=1, keepdim=True))
        
        return Q

### 说明：
- **共享层（Shared Layers）**：`fc1` 和 `fc2` 用于从输入状态提取特征，这些特征被用于后续的价值（V）和优势（A）计算。
- **价值流（Value Stream）**：一个输出层 `fc_value`，用于计算状态值 \( V(s) \)，这是一个标量，表示在给定状态下的预期回报。
- **优势流（Advantage Stream）**：一个输出层 `fc_advantage`，输出每个动作的优势值 \( A(s, a) \)。优势函数表示选择每个动作相对于平均水平的额外价值。
- **合并Q值（Combining Q Values）**：在输出时，动作的总Q值由状态值和优势值合成，通过从每个动作的优势中减去所有动作优势的平均值来保持数值稳定。

这种架构可以更有效地学习在不同状态下哪些动作更重要，因为它区分了状态的整体价值和每个动作相对于其他动作的额外价值。这通常可以在需要评估大量潜在动作的复杂环境中提高性能。

In [20]:
env = gym.make('CartPole-v1')
input_dim = env.observation_space.shape[0]
output_dim = env.action_space.n
policy_net = DuelingDQN(input_dim, output_dim).to('cpu')  # policy_net
target_net = DuelingDQN(input_dim, output_dim).to('cpu')  # target_net
target_net.load_state_dict(policy_net.state_dict())
target_net.eval()
optimizer = optim.Adam(policy_net.parameters())

trainer = DQNTrainer(env, policy_net, target_net, optimizer)
trainer.train()
env.close()

Episode 50 reward: 26.0
Episode 100 reward: 45.0
Episode 150 reward: 90.0
Episode 200 reward: 65.0
Episode 250 reward: 29.0
Episode 300 reward: 120.0
Episode 350 reward: 101.0
Episode 400 reward: 613.0
Episode 450 reward: 1000.0
Episode 500 reward: 127.0
Episode 550 reward: 122.0
Episode 600 reward: 1000.0
Episode 650 reward: 213.0
Episode 700 reward: 234.0
Episode 750 reward: 40.0
Episode 800 reward: 543.0
Episode 850 reward: 156.0
Episode 900 reward: 97.0
Episode 950 reward: 34.0
Episode 1000 reward: 230.0
Episode 1050 reward: 107.0
Episode 1100 reward: 129.0
Episode 1150 reward: 1000.0
Episode 1200 reward: 134.0
Episode 1250 reward: 303.0
Episode 1300 reward: 23.0
Episode 1350 reward: 12.0
Episode 1400 reward: 169.0
Episode 1450 reward: 293.0
Episode 1500 reward: 1000.0
Episode 1550 reward: 237.0
Episode 1600 reward: 1000.0
Episode 1650 reward: 1000.0
Episode 1700 reward: 1000.0
Episode 1750 reward: 86.0
Episode 1800 reward: 44.0
Episode 1850 reward: 48.0
Episode 1900 reward: 32.0
E

In [23]:
test_rewards = evaluate_agent(env, policy_net)

平均奖励：115.56


## 3. 噪声网络

为了将传统的DQN模型改造为噪声`DQN（Noisy DQN）`，我们需要引入带有参数化噪声的网络层。这样的层可以直接整合到策略网络中，以代替传统的`ε-greedy`策略进行探索。在`Noisy DQN`中，网络本身会预测在决策过程中应该添加的噪声量，这样可以更有效地探索动作空间而不依赖于外部的随机决策过程。

- 实现Noisy Layer

首先，我们定义一个噪声层，该层将在每次调用时生成新的噪声。我们可以通过扩展torch.nn.Linear来创建这样的层，使其包含噪声参数。

In [36]:
import math

class NoisyLinear(nn.Linear):
    def __init__(self, in_features, out_features, sigma_init=0.017, bias=True):
        # 先调用super初始化基础功能
        super(NoisyLinear, self).__init__(in_features, out_features, bias=bias)

        # 然后添加sigma参数和epsilon缓存
        self.sigma_weight = nn.Parameter(torch.full((out_features, in_features), sigma_init))
        self.register_buffer("epsilon_weight", torch.zeros(out_features, in_features))
        if bias:
            self.sigma_bias = nn.Parameter(torch.full((out_features,), sigma_init))
            self.register_buffer("epsilon_bias", torch.zeros(out_features))

        # 重置所有参数
        self.reset_parameters()

    def reset_parameters(self):
        super(NoisyLinear, self).reset_parameters()
        std = 0.5 / math.sqrt(self.weight.size(1))
        nn.init.uniform_(self.weight, -std, std)
        if self.bias is not None:
            nn.init.uniform_(self.bias, -std, std)
            
        nn.init.constant_(self.sigma_weight, 0.017)
        if self.bias is not None:
            nn.init.constant_(self.sigma_bias, 0.017)

    def forward(self, x):
        self.epsilon_weight.normal_()
        self.epsilon_bias.normal_()
        
        return F.linear(x, self.weight + self.sigma_weight * self.epsilon_weight,
                        self.bias + self.sigma_bias * self.epsilon_bias if self.bias is not None else None)


- 集成噪声层道DQN模型中

In [37]:
class NoisyDQN(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(NoisyDQN, self).__init__()
        # 使用NoisyLinear替换标准全连接层
        self.fc1 = NoisyLinear(input_dim, 64)
        self.fc2 = NoisyLinear(64, 64)
        self.fc3 = NoisyLinear(64, output_dim)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        return self.fc3(x)

    def reset_noise(self):
        """重置所有噪声层的噪声参数"""
        self.fc1.reset_parameters()
        self.fc2.reset_parameters()
        self.fc3.reset_parameters()

- 训练

In [31]:
class NoisyDQNTrainer:
    def __init__(self, env, policy_net, target_net, optimizer, device='cpu'):
        self.env = env
        self.policy_net = policy_net
        self.target_net = target_net
        self.optimizer = optimizer
        self.device = device
        self.memory = deque(maxlen=10000)
        self.batch_size = 128
        self.gamma = 0.999
        self.num_episodes = 5000

    def select_action(self, state):
        with torch.no_grad():
            return self.policy_net(state).max(1)[1].view(1, 1).to(self.device)

    def optimize_model(self):
        if len(self.memory) < self.batch_size:
            return
        
        transitions = random.sample(self.memory, self.batch_size)
        batch = tuple(zip(*transitions))

        non_final_mask = torch.tensor(tuple(map(lambda s: s is not None, batch[3])), device=self.device, dtype=torch.bool)
        non_final_next_states = torch.cat([s for s in batch[3] if s is not None])

        state_batch = torch.cat(batch[0])
        action_batch = torch.cat(batch[1])
        reward_batch = torch.cat(batch[2])

        state_action_values = self.policy_net(state_batch).gather(1, action_batch)
        next_state_values = torch.zeros(self.batch_size, device=self.device)
        next_state_values[non_final_mask] = self.target_net(non_final_next_states).max(1)[0].detach()
        expected_state_action_values = (next_state_values * self.gamma) + reward_batch

        loss = F.mse_loss(state_action_values, expected_state_action_values.unsqueeze(1))
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

    def train(self):
        for i_episode in range(self.num_episodes):
            self.policy_net.reset_noise()  # Reset noise at the beginning of each episode
            state = self.env.reset()
            state = torch.tensor([state], device=self.device, dtype=torch.float32)
            for t in range(1000):  # Assume max steps per episode is 1000
                action = self.select_action(state)
                next_state, reward, done, _ = self.env.step(action.item())
                reward = torch.tensor([reward], device=self.device, dtype=torch.float32)
                
                if done:
                    next_state = None
                else:
                    next_state = torch.tensor([next_state], device=self.device, dtype=torch.float32)
                
                self.memory.append((state, action, reward, next_state))
                state = next_state if not done else None
                self.optimize_model()
                
                if done:
                    break
            
            # Update the target network
            if i_episode % 10 == 0:
                self.target_net.load_state_dict(self.policy_net.state_dict())
            
            # Print episode stats
            if (i_episode + 1) % 100 == 0:
                print(f'Episode {i_episode + 1} completed')

    def evaluate(self, num_episodes=10):
        self.policy_net.eval()  # Set the network to evaluation mode
        total_rewards = []
        for i_episode in range(num_episodes):
            state = self.env.reset()
            state = torch.tensor([state], device=self.device, dtype=torch.float32)
            total_reward = 0
            while True:
                action = self.select_action(state)
                state, reward, done, _ = self.env.step(action.item())
                total_reward += reward
                state = torch.tensor([state], device=self.device, dtype=torch.float32)
                if done:
                    break
            total_rewards.append(total_reward)
            print(f'Episode {i_episode + 1}: Total Reward = {total_reward}')
        self.policy_net.train()  # Set the network back to train mode
        return total_rewards


- 训练

In [38]:
env = gym.make('CartPole-v1')
input_dim = env.observation_space.shape[0]
output_dim = env.action_space.n

policy_net = NoisyDQN(input_dim, output_dim).to('cpu')
target_net = NoisyDQN(input_dim, output_dim).to('cpu')
target_net.load_state_dict(policy_net.state_dict())
target_net.eval()

optimizer = optim.Adam(policy_net.parameters())
trainer = NoisyDQNTrainer(env, policy_net, target_net, optimizer)
trainer.train()
env.close()

AttributeError: 'NoisyLinear' object has no attribute 'sigma_weight'