# 经典控制环境深度解析

---

## 核心思想

经典控制环境是一组基于**控制论经典问题**设计的低维环境。

## 本节内容

1. **CartPole**: 倒立摆平衡 - 欠驱动系统控制
2. **MountainCar**: 爬山车 - 稀疏奖励与探索
3. **Pendulum**: 单摆 - 连续控制入门
4. **环境对比总结**: 状态维度、动作空间与推荐算法对照

In [None]:
import gymnasium as gym
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.patches import FancyBboxPatch, Circle

# Default figure size for every plot created later in the notebook.
plt.rcParams['figure.figsize'] = (12, 6)
# Print the installed Gymnasium version so results can be tied to a release.
print(f"Gymnasium 版本: {gym.__version__}")

---

## 1. CartPole - 倒立摆平衡

### 运动方程

$$\ddot{\theta} = \frac{g\sin\theta + \cos\theta \cdot \frac{-F - m_p l \dot{\theta}^2 \sin\theta}{m_c + m_p}}{l\left(\frac{4}{3} - \frac{m_p \cos^2\theta}{m_c + m_p}\right)}$$

In [None]:
env = gym.make("CartPole-v1")

banner = "=" * 70
print(banner)
print("CartPole-v1 环境详解")
print(banner)

print("\n【状态空间】")
# Pair each label with the lower/upper bound of the matching observation
# component; the labels are listed in observation-index order (0..3).
labels = ("小车位置 x", "小车速度 ẋ", "摆杆角度 θ", "摆杆角速度 θ̇")
obs_space = env.observation_space
state_vars = list(zip(labels, obs_space.low, obs_space.high))
for name, low, high in state_vars:
    print(f"  {name}: [{low:.2f}, {high:.2f}]")

env.close()

### CartPole 系统可视化

In [None]:
# Static schematic of the cart-pole system (not driven by the simulator).
fig, ax = plt.subplots(figsize=(10, 6))
ax.set_xlim(-3, 3)
ax.set_ylim(-0.5, 3)
# Equal aspect keeps the wheels circular and the pole angle undistorted.
ax.set_aspect('equal')

# Ground: a thick line at y = 0 plus a translucent band below it.
ax.axhline(y=0, color='brown', linewidth=4)
ax.fill_between([-3, 3], [-0.3], [0], color='brown', alpha=0.3)

# Cart: rounded box centred on x = 0, spanning y = 0.05 .. 0.35.
cart = FancyBboxPatch((-0.4, 0.05), 0.8, 0.3, boxstyle="round,pad=0.02",
                       facecolor='steelblue', edgecolor='navy', linewidth=2)
ax.add_patch(cart)

# Wheels: added after the cart so they are drawn on top of it.
for wx in [-0.25, 0.25]:
    wheel = Circle((wx, 0.05), 0.08, facecolor='gray', edgecolor='black')
    ax.add_patch(wheel)

# Pole: starts at the top of the cart (y = 0.35) and is tilted by theta
# radians away from the vertical (x uses sin, y uses cos).
theta = 0.2
pole_length = 2.0  # drawing length in axis units, chosen for the illustration
pole_x = pole_length * np.sin(theta)
pole_y = pole_length * np.cos(theta)
ax.plot([0, pole_x], [0.35, 0.35 + pole_y], 'r-', linewidth=10, solid_capstyle='round')
# Round marker at the pole tip.
ax.plot(pole_x, 0.35 + pole_y, 'ro', markersize=18)

ax.set_title('CartPole 系统示意图', fontsize=14)
# Hide ticks and frame: this is a diagram, not a data plot.
ax.axis('off')
plt.show()

### CartPole 策略实现

In [None]:
def random_policy(obs):
    """Ignore the observation and draw action 0 or 1 from NumPy's global RNG."""
    n_actions = 2
    return np.random.randint(0, n_actions)

def angle_policy(obs):
    """Choose the action from the sign of the pole angle alone.

    Returns 1 when ``obs[2]`` (the pole angle) is strictly positive and 0
    otherwise, including when the angle is exactly zero.
    """
    pole_angle = obs[2]
    if pole_angle > 0:
        return 1
    return 0

def pd_policy(obs):
    """Pick a discrete action from a weighted sum of the four state variables.

    The pole terms (gains 50 and 10 on angle and angular velocity) dominate
    the cart terms (gains 0.5 and 1.0 on position and velocity). Returns 1
    when the combined signal is strictly positive, otherwise 0.
    """
    cart_pos, cart_vel, pole_angle, pole_rate = obs
    pole_term = 50 * pole_angle + 10 * pole_rate
    cart_term = 0.5 * cart_pos + 1.0 * cart_vel
    if pole_term + cart_term > 0:
        return 1
    return 0

In [None]:
def evaluate_policy(env_id, policy, n_episodes=50, seed=42):
    """Roll out ``policy`` for several episodes and return the episode returns.

    Args:
        env_id: Environment ID passed to ``gym.make``.
        policy: Callable mapping an observation to an action.
        n_episodes: Number of episodes to run.
        seed: Base seed; episode ``i`` is reset with ``seed + i`` so each
            episode starts from a reproducible initial state.

    Returns:
        A list with one undiscounted total reward per episode.
    """
    env = gym.make(env_id)
    rewards = []
    try:
        for i in range(n_episodes):
            obs, _ = env.reset(seed=seed + i)
            total_reward = 0
            done = False
            while not done:
                obs, reward, terminated, truncated, _ = env.step(policy(obs))
                total_reward += reward
                # An episode ends on a terminal state or on the time limit.
                done = terminated or truncated
            rewards.append(total_reward)
    finally:
        # Release the environment even if the policy or the simulator raises.
        env.close()
    return rewards

In [None]:
print("CartPole 策略评估 (50 回合)")
print("=" * 60)

# Display name -> policy callable, evaluated in insertion order.
policies = {
    "随机策略": random_policy,
    "角度策略": angle_policy,
    "PD控制": pd_policy,
}

# Keep the raw per-episode returns for each policy.
results = {}
for name, policy in policies.items():
    results[name] = rewards = evaluate_policy("CartPole-v1", policy)
    mean_return, std_return = np.mean(rewards), np.std(rewards)
    print(f"{name:12s}: {mean_return:6.1f} ± {std_return:5.1f}")

---

## 2. MountainCar - 稀疏奖励

### 动力学方程

$$v_{t+1} = v_t + 0.001 \cdot (a_t - 1) - 0.0025 \cdot \cos(3x_t), \quad a_t \in \{0, 1, 2\}$$
$$x_{t+1} = x_t + v_{t+1}$$

In [None]:
env = gym.make("MountainCar-v0")

rule = "=" * 70
print(rule)
print("MountainCar-v0 环境详解")
print(rule)

# Index 0 of the observation is position, index 1 is velocity.
obs_low, obs_high = env.observation_space.low, env.observation_space.high
print(f"\n位置范围: [{obs_low[0]:.2f}, {obs_high[0]:.2f}]")
print(f"速度范围: [{obs_low[1]:.3f}, {obs_high[1]:.3f}]")
print("动作: 0=向左, 1=不动, 2=向右")

env.close()

### 地形和能量可视化

In [None]:
# Two panels: the hill profile (left) and an energy map over phase space (right).
fig, axes = plt.subplots(1, 2, figsize=(16, 6))

# Left panel: terrain height as a function of position.
ax1 = axes[0]
x = np.linspace(-1.2, 0.6, 300)
# Height profile used for drawing; the same expression is reused as the
# potential term in the right panel.
y = np.sin(3 * x) * 0.45 + 0.55

ax1.plot(x, y, 'b-', linewidth=3)
ax1.fill_between(x, 0, y, alpha=0.2, color='green')
# Dashed markers for the labelled start (x = -0.5) and goal (x = 0.5).
ax1.axvline(x=-0.5, color='blue', linestyle='--', label='起点')
ax1.axvline(x=0.5, color='red', linestyle='--', linewidth=2, label='目标')
ax1.set_xlabel('位置')
ax1.set_ylabel('高度')
ax1.set_title('MountainCar 地形')
ax1.legend()

# Right panel: phase portrait (position vs. velocity) coloured by total energy.
ax2 = axes[1]
pos = np.linspace(-1.2, 0.6, 100)
vel = np.linspace(-0.07, 0.07, 100)
P, V = np.meshgrid(pos, vel)
potential = np.sin(3 * P) * 0.45 + 0.55
# The factor 500 rescales the tiny velocities so kinetic and potential terms
# are of comparable magnitude on the plot.
# NOTE(review): 500 looks like an ad-hoc scale; with the velocity update
# v += -0.0025*cos(3x), an energy-consistent factor would be about
# 0.45 * 3 / 0.0025 = 540 — confirm before reading contours as trajectories.
kinetic = 0.5 * V**2 * 500
total_energy = potential + kinetic

contour = ax2.contourf(P, V, total_energy, levels=20, cmap='coolwarm', alpha=0.7)
plt.colorbar(contour, ax=ax2, label='总能量')
ax2.axvline(x=0.5, color='red', linestyle='--', label='目标')
ax2.set_xlabel('位置')
ax2.set_ylabel('速度')
ax2.set_title('相空间与能量等高线')
ax2.legend()

plt.tight_layout()
plt.show()

### MountainCar 策略测试

In [None]:
def random_policy_mc(obs):
    """Ignore the observation and draw action 0, 1 or 2 from NumPy's global RNG."""
    n_actions = 3
    return np.random.randint(0, n_actions)

def momentum_policy(obs):
    """Push in the direction the car is already moving.

    Returns action 2 when the velocity ``obs[1]`` is strictly positive and
    action 0 otherwise (including zero velocity).
    """
    velocity = obs[1]
    if velocity > 0:
        return 2
    return 0

In [None]:
# Single source of truth for the episode count: it feeds the title, the
# rollout loop and the success-rate denominator.
n_episodes = 20

print(f"MountainCar 策略评估 ({n_episodes} 回合)")
print("=" * 60)

for name, policy in [("随机", random_policy_mc), ("动量", momentum_policy)]:
    env = gym.make("MountainCar-v0")
    rewards = []
    successes = 0

    try:
        for i in range(n_episodes):
            # Seed by episode index so both policies see the same start states.
            obs, _ = env.reset(seed=i)
            total_reward = 0
            while True:
                obs, reward, terminated, truncated, _ = env.step(policy(obs))
                total_reward += reward
                # Only `terminated` counts as a success; `truncated` means
                # the episode was cut off without terminating.
                if terminated:
                    successes += 1
                if terminated or truncated:
                    break
            rewards.append(total_reward)
    finally:
        # Release the environment even if a policy or the simulator raises.
        env.close()

    print(f"{name:12s}: 平均奖励={np.mean(rewards):>7.1f}, 成功率={successes/n_episodes*100:>5.0f}%")

---

## 3. Pendulum - 连续控制

### 动力学方程

$$\ddot{\theta} = -\frac{3g}{2l}\sin(\theta + \pi) + \frac{3}{ml^2}u$$

### 奖励函数

$$r = -(\theta^2 + 0.1\dot{\theta}^2 + 0.001u^2)$$

In [None]:
# Creating the environment checks that Pendulum-v1 is available; the text
# below is static and does not read anything from `env`.
env = gym.make("Pendulum-v1")

divider = "=" * 70
summary_lines = [
    divider,
    "Pendulum-v1 环境详解",
    divider,
    "\n观测: [cos(θ), sin(θ), θ̇]",
    "动作: 扭矩 u ∈ [-2, 2] (连续)",
    "奖励: r = -(θ² + 0.1·θ̇² + 0.001·u²)",
]
print("\n".join(summary_lines))

env.close()

### Pendulum 控制策略

In [None]:
def pd_controller(obs, Kp=10.0, Kd=2.0):
    """Proportional-derivative torque that drives the angle toward zero.

    Args:
        obs: Observation ``(cos θ, sin θ, θ̇)``.
        Kp: Gain on the angle recovered via ``arctan2``.
        Kd: Gain on the angular velocity.

    Returns:
        A length-1 array holding the torque, clipped to ``[-2, 2]``.
    """
    cos_angle, sin_angle, angular_rate = obs
    # Recover the signed angle in (-π, π] from its cosine/sine encoding.
    angle = np.arctan2(sin_angle, cos_angle)
    command = -Kp * angle - Kd * angular_rate
    torque_limit = 2.0
    return np.clip(np.array([command]), -torque_limit, torque_limit)

def energy_controller(obs):
    """Swing the pendulum up by energy shaping, then stabilise it with PD.

    The angle convention is θ = 0 at the upright position, which is the
    maximum of the reward ``-(θ² + 0.1·θ̇² + 0.001·u²)``. The energy matches
    the rod dynamics ``θ̈ = 3g/(2l)·sin θ + 3u/(m·l²)``:

        E = (1/6)·m·l²·θ̇² + (1/2)·m·g·l·cos θ,   dE/dt = u·θ̇

    so upright and at rest has ``E_target = (1/2)·m·g·l``, and the law
    ``u = -k·(E - E_target)·θ̇`` moves E monotonically toward the target.

    Args:
        obs: Observation ``(cos θ, sin θ, θ̇)``.

    Returns:
        A length-1 array holding the torque, clipped to ``[-2, 2]``.
    """
    cos_theta, sin_theta, theta_dot = obs
    theta = np.arctan2(sin_theta, cos_theta)
    g, l, m = 10.0, 1.0, 1.0

    if np.abs(theta) < 0.3:
        # Close to upright: switch to a local PD stabiliser.
        torque = -10.0 * theta - 2.0 * theta_dot
    else:
        # Potential energy is highest at θ = 0, hence the +cos θ term.
        E = m * l**2 * theta_dot**2 / 6.0 + 0.5 * m * g * l * cos_theta
        E_target = 0.5 * m * g * l
        # Torque follows θ̇ while energy is too low (pumping) and opposes it
        # when energy is too high (braking). It is zero when θ̇ == 0.
        torque = -3.0 * (E - E_target) * theta_dot

    return np.clip([torque], -2.0, 2.0)

In [None]:
env = gym.make("Pendulum-v1")

controllers = {"PD控制": pd_controller, "能量控制": energy_controller}
all_data = {}

for name, controller in controllers.items():
    # Same seed for every controller so they start from the same state.
    obs, _ = env.reset(seed=42)
    trace = {'theta': [], 'torque': [], 'reward': []}

    for _step in range(200):
        # Log the angle (in degrees) of the state the action is computed from.
        angle_deg = np.degrees(np.arctan2(obs[1], obs[0]))
        action = controller(obs)
        obs, reward, *_ = env.step(action)
        trace['theta'].append(angle_deg)
        trace['torque'].append(action[0])
        trace['reward'].append(reward)

    all_data[name] = trace
    print(f"{name}: 总奖励 = {sum(trace['reward']):.1f}")

env.close()

### 控制效果对比

In [None]:
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
colors = ['#1f77b4', '#ff7f0e']
angle_ax, return_ax = axes

# One curve per controller on each panel, sharing colour and legend label.
for idx, name in enumerate(all_data):
    series = all_data[name]
    line_kwargs = dict(color=colors[idx], label=name, linewidth=2)
    angle_ax.plot(series['theta'], **line_kwargs)
    return_ax.plot(np.cumsum(series['reward']), **line_kwargs)

# Reference line at 0 degrees on the angle panel.
angle_ax.axhline(y=0, color='r', linestyle='--', alpha=0.5)

# Axis decoration shared by both panels; only the y-label and title differ.
panel_text = [
    (angle_ax, '角度 (度)', '摆角变化'),
    (return_ax, '累积奖励', '累积奖励'),
]
for panel, y_label, title in panel_text:
    panel.set_xlabel('步数')
    panel.set_ylabel(y_label)
    panel.set_title(title)
    panel.legend()
    panel.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

---

## 4. 环境对比总结

In [None]:
from gymnasium import spaces

print("\n" + "=" * 90)
print("经典控制环境对比")
print("=" * 90)
print(f"{'环境ID':<25} {'状态维度':<10} {'动作空间':<15} {'推荐算法'}")
print("-" * 90)

# (environment ID, suggested algorithms) in display order.
envs_info = [
    ("CartPole-v1", "DQN, A2C"),
    ("MountainCar-v0", "需要探索"),
    ("Acrobot-v1", "DQN, PPO"),
    ("Pendulum-v1", "DDPG, SAC"),
]

for env_id, algos in envs_info:
    # Best-effort listing: one unavailable environment must not abort the
    # table. `Exception` (not a bare `except`) keeps Ctrl-C and SystemExit
    # working, and the exception type is printed so the cause is visible.
    try:
        env = gym.make(env_id)
        try:
            obs_dim = env.observation_space.shape[0]
            if isinstance(env.action_space, spaces.Discrete):
                act_info = f"Discrete({env.action_space.n})"
            else:
                act_info = f"Box({env.action_space.shape[0]})"
        finally:
            # Close the environment even if inspecting its spaces fails.
            env.close()
    except Exception as exc:
        print(f"{env_id:<25} 未安装 ({type(exc).__name__})")
    else:
        print(f"{env_id:<25} {obs_dim:<10} {act_info:<15} {algos}")