尝试使用DDPG解决pendulum-v1

In [7]:
import gymnasium as gym
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from collections import deque
import random
import matplotlib.pyplot as plt
from torch.optim.lr_scheduler import StepLR

device=torch.device('cuda')

class Actor(nn.Module):
    def __init__(self, state_dim, action_dim, max_action):
        super(Actor, self).__init__()
        # 使用较小规模的网络
        self.fc1 = nn.Linear(state_dim, 256)
        self.ln1 = nn.LayerNorm(256)  # 添加LayerNorm提升训练稳定性
        self.fc2 = nn.Linear(256, 128)
        self.ln2 = nn.LayerNorm(128)
        self.fc3 = nn.Linear(128, action_dim)
        self.max_action = max_action
    
    def forward(self, state):
        x = F.relu(self.ln1(self.fc1(state)))
        x = F.relu(self.ln2(self.fc2(x)))
        action = torch.tanh(self.fc3(x)) * self.max_action
        return action

class Critic(nn.Module):
    def __init__(self, state_dim, action_dim):
        super(Critic, self).__init__()
        self.fc1 = nn.Linear(state_dim + action_dim, 256)
        self.ln1 = nn.LayerNorm(256)
        self.fc2 = nn.Linear(256, 128)
        self.ln2 = nn.LayerNorm(128)
        self.fc3 = nn.Linear(128, 1)
    
    def forward(self, state, action):
        x = torch.cat([state, action], 1)
        x = F.relu(self.ln1(self.fc1(x)))
        x = F.relu(self.ln2(self.fc2(x)))
        q_value = self.fc3(x)
        return q_value

class OUNoise:
    #Ornstein-Uhlenbeck过程噪声，更适合惯性系统
    def __init__(self, action_dim, mu=0, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(action_dim)
        self.theta = theta
        self.sigma = sigma
        self.action_dim = action_dim
        self.reset()
    
    def reset(self):
        self.state = np.copy(self.mu)
    
    def sample(self):
        dx = self.theta * (self.mu - self.state) + self.sigma * np.random.randn(self.action_dim)
        self.state += dx
        return self.state

class ReplayBuffer:
    def __init__(self, capacity=100000):
        self.buffer = deque(maxlen=capacity)
    
    def push(self, state, action, reward, next_state, done):
        self.buffer.append((state, action, reward, next_state, done))
    
    def sample(self, batch_size):
        batch = random.sample(self.buffer, batch_size)
        state = np.array([e[0] for e in batch], dtype=np.float32)
        action = np.array([e[1] for e in batch], dtype=np.float32)
        reward = np.array([e[2] for e in batch], dtype=np.float32)
        next_state = np.array([e[3] for e in batch], dtype=np.float32)
        done = np.array([e[4] for e in batch], dtype=np.float32)
        return state, action, reward, next_state, done
    
    def __len__(self):
        return len(self.buffer)

class DDPG:
    def __init__(self, state_dim, action_dim, max_action):
        # 演员网络
        self.actor = Actor(state_dim, action_dim, max_action).to(device)
        self.actor_target = Actor(state_dim, action_dim, max_action).to(device)
        self.actor_target.load_state_dict(self.actor.state_dict())
        self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=1e-4)
        self.actor_scheduler = StepLR(self.actor_optimizer, step_size=1000, gamma=0.95)  # 学习率调度
        
        # 评论家网络
        self.critic = Critic(state_dim, action_dim).to(device)
        self.critic_target = Critic(state_dim, action_dim).to(device)
        self.critic_target.load_state_dict(self.critic.state_dict())
        self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=1e-3)
        self.critic_scheduler = StepLR(self.critic_optimizer, step_size=1000, gamma=0.95)
        
        # 经验回放
        self.replay_buffer = ReplayBuffer()
        
        # OU噪声
        self.ou_noise = OUNoise(action_dim)
        
        # 超参数
        self.gamma = 0.99
        self.tau = 0.005
        self.batch_size = 64
        self.max_action = max_action
        self.exploration_noise = 0.5  # 初始探索噪声
        self.noise_decay = 0.9995    # 噪声衰减率
        
        # 训练计数器
        self.train_step = 0
    
    def shaped_reward(self, state, action, reward, next_state):
        """
        reward reshaping
        原始奖励: reward = -(θ² + 0.1*θ_dot² + 0.001*torque²)
        """
       
        cos_theta, sin_theta, theta_dot = state
        theta = np.arctan2(sin_theta, cos_theta)
        
        # 强调直立状态
        upright_bonus = 2.0 * (cos_theta + 1)  # 加倍奖励
        
        # 角度惩罚（更温和）
        angle_penalty = -0.1 * (theta ** 2)
        
        # 角速度惩罚（鼓励更稳定）
        angular_penalty = -0.05 * (theta_dot ** 2)
        
        # 扭矩惩罚（保持节能）
        torque_penalty = -0.001 * (action[0] ** 2)
        
        # 组合奖励
        new_reward = upright_bonus + angle_penalty + angular_penalty + torque_penalty
        
        return reward
    def select_action(self, state, add_noise=True):
        state = torch.FloatTensor(state.reshape(1, -1)).to(device)
        action = self.actor(state).cpu().data.numpy().flatten()
        
        if add_noise:
            # 使用OU噪声[8,9]
            noise = self.exploration_noise * self.ou_noise.sample()
            action = (action + noise).clip(-self.max_action, self.max_action)
            # 衰减探索噪声
            self.exploration_noise *= self.noise_decay
            self.exploration_noise = max(self.exploration_noise, 0.1)  # 保持最小探索
        
        return action
    
    def train(self):
        if len(self.replay_buffer) < self.batch_size:
            return
        
        # 采样批次
        state, action, reward, next_state, done = self.replay_buffer.sample(self.batch_size)
        
        state = torch.FloatTensor(state).to(device)
        action = torch.FloatTensor(action).to(device)
        reward = torch.FloatTensor(reward).unsqueeze(1).to(device)
        next_state = torch.FloatTensor(next_state).to(device)
        done = torch.FloatTensor(done).unsqueeze(1).to(device)
        
        # Critic更新
        with torch.no_grad():
            next_action = self.actor_target(next_state)
            target_Q = self.critic_target(next_state, next_action)
            target_Q = reward + (1 - done) * self.gamma * target_Q
        
        current_Q = self.critic(state, action)
        critic_loss = F.mse_loss(current_Q, target_Q)
        
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        # 梯度裁剪防止爆炸
        torch.nn.utils.clip_grad_norm_(self.critic.parameters(), 1.0)
        self.critic_optimizer.step()
        
        # Actor更新
        actor_loss = -self.critic(state, self.actor(state)).mean()
        
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.actor.parameters(), 1.0)
        self.actor_optimizer.step()
        
        # 软更新目标网络
        for param, target_param in zip(self.critic.parameters(), self.critic_target.parameters()):
            target_param.data.copy_(self.tau * param.data + (1 - self.tau) * target_param.data)
        
        for param, target_param in zip(self.actor.parameters(), self.actor_target.parameters()):
            target_param.data.copy_(self.tau * param.data + (1 - self.tau) * target_param.data)
        
        # 更新学习率
        self.actor_scheduler.step()
        self.critic_scheduler.step()
        
        self.train_step += 1
    
    def save(self, filename):
        torch.save(self.actor.state_dict(), filename + "_actor.pth")
        torch.save(self.critic.state_dict(), filename + "_critic.pth")
        print(f"模型已保存: {filename}_actor.pth 和 {filename}_critic.pth")
    
    def load(self, filename):
        self.actor.load_state_dict(torch.load(filename + "_actor.pth"))
        self.critic.load_state_dict(torch.load(filename + "_critic.pth"))
        print(f"模型已加载: {filename}_actor.pth 和 {filename}_critic.pth")

def train_pendulum():
    env = gym.make('Pendulum-v1')
    
    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]
    max_action = float(env.action_space.high[0])
    
    print(f"环境: Pendulum-v1")
    print(f"状态维度: {state_dim}, 动作维度: {action_dim}, 最大动作: {max_action}")
    
    agent = DDPG(state_dim, action_dim, max_action)
    
    num_episodes = 100
    all_rewards = []
    moving_avg_rewards = []
    
    # 训练循环
    for episode in range(num_episodes):
        state, info = env.reset()
        episode_reward = 0
        agent.ou_noise.reset()  # 每回合重置OU噪声
        
        for step in range(200):
            action = agent.select_action(state, add_noise=True)
            next_state, raw_reward, terminated, truncated, info = env.step(action)
            done = terminated or truncated
            
            # 使用重塑后的奖励
            shaped_reward = agent.shaped_reward(state, action, raw_reward, next_state)
            
            agent.replay_buffer.push(state, action, shaped_reward, next_state, done)
            
            state = next_state
            episode_reward += raw_reward  # 记录原始奖励用于评估
            
            # 延迟训练
            if len(agent.replay_buffer) > 1000:
                for _ in range(2):  # 每步训练2次
                    agent.train()
            
            if done:
                break
        
        all_rewards.append(episode_reward)
        
        # 计算移动平均
        if episode < 10:
            moving_avg = np.mean(all_rewards)
        else:
            moving_avg = np.mean(all_rewards[-10:])
        moving_avg_rewards.append(moving_avg)
        
        if episode % 10 == 0:
            print(f"回合 {episode:3d} | 奖励: {episode_reward:7.2f} | 平均奖励(10): {moving_avg:7.2f} | 噪声: {agent.exploration_noise:.3f}")
    
    # # 绘制训练曲线，但是torch和matplotlib前后端冲突所以先注释掉
    # plt.figure(figsize=(12, 6))
    # plt.plot(all_rewards, alpha=0.6, label='每回合奖励')
    # plt.plot(moving_avg_rewards, 'r-', linewidth=2, label='移动平均 (10回合)')
    # plt.xlabel('回合')
    # plt.ylabel('奖励')
    # plt.title('DDPG 在 Pendulum-v1 上的训练性能')
    # plt.legend()
    # plt.grid(True)
    # plt.savefig('ddpg_training_performance.png', dpi=300, bbox_inches='tight')
    # plt.show()
    
    # 保存模型
    agent.save("ddpg_pendulum")
    
    # 测试训练好的策略
    print("\n测试策略...")
    test_rewards = []
    for i in range(5):
        state, info = env.reset()
        test_reward = 0
        for _ in range(200):
            action = agent.select_action(state, add_noise=False)  # 测试时不加噪声
            state, reward, terminated, truncated, info = env.step(action)
            test_reward += reward
            if terminated or truncated:
                break
        test_rewards.append(test_reward)
        print(f"测试 {i+1}: {test_reward:.2f}")
    
    print(f"\n最终性能: {np.mean(test_rewards):.2f} ± {np.std(test_rewards):.2f}")
    np.save('training_rewards.npy', np.array(all_rewards))
    np.save('moving_avg_rewards.npy', np.array(moving_avg_rewards))
    # 训练总结
    print("\n训练总结")
    print(f"初始性能: {np.mean(all_rewards[:10]):.2f}")
    print(f"最终性能: {np.mean(all_rewards[-10:]):.2f}")
    print(f"最佳回合: {max(all_rewards):.2f}")
    
    env.close()
    return agent, all_rewards

# if __name__ == "__main__":
#     agent, rewards = train_pendulum()

In [None]:

def plot_results_offline():
    try:
        import matplotlib.pyplot as plt
        all_rewards = np.load('training_rewards.npy')
        moving_avg_rewards = np.load('moving_avg_rewards.npy')
        
        plt.figure(figsize=(12, 6))
        plt.plot(all_rewards, alpha=0.6, label='每回合奖励')
        plt.plot(moving_avg_rewards, 'r-', linewidth=2, label='移动平均 (10回合)')
        plt.xlabel('回合')
        plt.ylabel('奖励')
        plt.title('DDPG 在 Pendulum-v1 上的训练性能')
        plt.legend()
        plt.grid(True)
        plt.savefig('ddpg_training_performance.png', dpi=300, bbox_inches='tight')
        print("图表已保存为 ddpg_training_performance.png")
        
    except Exception as e:
        print(f"绘图时出错: {e}")



if __name__ == "__main__":
    agent, rewards = train_pendulum()
    # 训练完成后，在单独的进程中绘图
    plot_results_offline()

环境: Pendulum-v1
状态维度: 3, 动作维度: 1, 最大动作: 2.0
回合   0 | 奖励: -1783.71 | 平均奖励(10): -1783.71 | 噪声: 0.452
回合  10 | 奖励: -1170.29 | 平均奖励(10): -1300.39 | 噪声: 0.166
回合  20 | 奖励: -1047.06 | 平均奖励(10): -775.33 | 噪声: 0.100
回合  30 | 奖励: -125.25 | 平均奖励(10): -407.57 | 噪声: 0.100
回合  40 | 奖励:   -3.70 | 平均奖励(10): -163.62 | 噪声: 0.100
回合  50 | 奖励: -122.10 | 平均奖励(10): -142.52 | 噪声: 0.100
回合  60 | 奖励: -121.53 | 平均奖励(10): -155.20 | 噪声: 0.100
回合  70 | 奖励:   -7.40 | 平均奖励(10): -144.88 | 噪声: 0.100
回合  80 | 奖励:  -13.98 | 平均奖励(10): -117.56 | 噪声: 0.100
回合  90 | 奖励: -129.28 | 平均奖励(10): -148.91 | 噪声: 0.100
模型已保存: ddpg_pendulum_actor.pth 和 ddpg_pendulum_critic.pth

测试策略...
测试 1: -137.32
测试 2: -130.14
测试 3: -132.34
测试 4: -20.25
测试 5: -134.21

最终性能: -110.85 ± 45.36

训练总结
初始性能: -1361.73
最终性能: -158.00
最佳回合: -2.67


最终的训练结果不尽如人意，应该需要进一步调参、改奖励函数  但是真的已经燃尽了，入门三月拼尽全力无法战胜...