# Gymnasium (原 OpenAI Gym) 强化学习环境快速入门

---

## 核心思想

Gymnasium (原 OpenAI Gym) 提供了强化学习研究的**标准化接口**，定义了智能体与环境交互的统一 API。

## 数学原理

强化学习建模为**马尔可夫决策过程 (MDP)**:

$$MDP = (\mathcal{S}, \mathcal{A}, P, R, \gamma)$$

其中:
- $\mathcal{S}$: 状态空间
- $\mathcal{A}$: 动作空间  
- $P(s'|s,a)$: 状态转移概率
- $R(s,a,s')$: 奖励函数
- $\gamma \in [0,1]$: 折扣因子

智能体目标是最大化**期望累积折扣奖励** $\mathbb{E}[G_t]$，其中回报 $G_t$ 定义为:

$$G_t = \sum_{k=0}^{\infty} \gamma^k R_{t+k+1}$$

---

## 1. 环境安装与导入

In [None]:
# 安装依赖 (如果尚未安装)
# !pip install gymnasium[classic-control] matplotlib numpy -q

In [None]:
import gymnasium as gym
from gymnasium import spaces
import numpy as np
import matplotlib.pyplot as plt

# Base theme and default figure size for every plot in this notebook.
plt.style.use('seaborn-v0_8-whitegrid')
plt.rcParams['figure.figsize'] = (10, 6)

# Plot titles and axis labels in this notebook are written in Chinese.
# Put CJK-capable fonts first so that the first one installed on the
# machine is used; 'DejaVu Sans' stays last as the fallback. This is set
# after style.use() so the style sheet cannot override it.
plt.rcParams['font.sans-serif'] = [
    'SimHei',
    'Microsoft YaHei',
    'PingFang SC',
    'Noto Sans CJK SC',
    'WenQuanYi Micro Hei',
    'Arial Unicode MS',
    'DejaVu Sans',
]
# Many CJK fonts lack the Unicode minus sign; use the ASCII hyphen for
# negative tick labels instead.
plt.rcParams['axes.unicode_minus'] = False

print(f"Gymnasium 版本: {gym.__version__}")

---

## 2. 创建第一个环境: CartPole

CartPole 是经典的控制问题：通过左右移动小车来平衡竖立的杆子。

| 属性 | 描述 |
|------|------|
| **状态** | [小车位置, 小车速度, 杆角度, 杆角速度] |
| **动作** | 0 (向左推) 或 1 (向右推) |
| **奖励** | 每步 +1 |
| **终止** | 杆角度超出 ±12° 或 小车位置超出 ±2.4 |

In [None]:
# Build a CartPole environment and print its basic specification.
env = gym.make("CartPole-v1")
spec = env.spec
rule = "=" * 50

print(rule)
print("环境信息")
print(rule)
print(f"环境 ID: {spec.id}")
print(f"观测空间: {env.observation_space}")
print(f"动作空间: {env.action_space}")
print(f"最大步数: {spec.max_episode_steps}")

# Release the environment's resources once the report is printed.
env.close()

### 观测空间详情

In [None]:
# Print the shape and the per-dimension bounds of the observation space.
env = gym.make("CartPole-v1")
obs_space = env.observation_space

print("观测空间详情:")
print(f"  形状: {obs_space.shape}")
print(f"  下界: {obs_space.low}")
print(f"  上界: {obs_space.high}")

env.close()

---

## 3. 环境交互基础

### 核心 API

```python
observation, info = env.reset()           # 重置环境
observation, reward, terminated, truncated, info = env.step(action)  # 执行动作
```

- `terminated`: 任务完成（成功或失败）
- `truncated`: 回合因时间限制等原因被截断

In [None]:
env = gym.make("CartPole-v1")

# A fixed seed makes the initial observation reproducible.
observation, info = env.reset(seed=42)
cart_pos, cart_vel, pole_angle, pole_ang_vel = observation

print("初始观测:")
print(f"  小车位置: {cart_pos:.4f}")
print(f"  小车速度: {cart_vel:.4f}")
print(f"  杆角度:   {pole_angle:.4f} rad")
print(f"  杆角速度: {pole_ang_vel:.4f}")

env.close()

In [None]:
env = gym.make("CartPole-v1")
observation, _ = env.reset(seed=42)

# Take one step with action 1 (push right) and inspect the transition.
action = 1
transition = env.step(action)
next_obs, reward, terminated, truncated, info = transition

print(f"执行动作: {action} (向右推)")
print(f"获得奖励: {reward}")
print(f"终止: {terminated}, 截断: {truncated}")
print(f"新观测: {next_obs}")

env.close()

---

## 4. 运行完整回合

定义回合执行函数：

In [None]:
def run_episode(env, policy_fn, seed=None, max_steps=None):
    """
    Run one episode and report its return and length.

    The environment is reset, then ``policy_fn`` is queried for an action
    on every step until the environment reports ``terminated`` or
    ``truncated`` (or until ``max_steps`` steps have been taken).

    Parameters
    ----------
    env : gym.Env
        Environment instance exposing ``reset(seed=...)`` and ``step(action)``.
    policy_fn : callable
        Policy function mapping an observation to an action.
    seed : int, optional
        Seed forwarded to ``env.reset``.
    max_steps : int, optional
        Upper bound on the number of steps. ``None`` (the default) keeps
        the original behaviour of running until the environment ends the
        episode; pass a value for environments without a time limit so
        the loop cannot run forever.

    Returns
    -------
    total_reward : float
        Sum of the rewards collected during the episode.
    steps : int
        Number of steps taken.

    Raises
    ------
    ValueError
        If ``max_steps`` is negative.
    """
    if max_steps is not None and max_steps < 0:
        raise ValueError("max_steps must be non-negative")

    obs, _ = env.reset(seed=seed)
    # Start from a float so the return type matches the documentation even
    # when the environment hands back integer rewards.
    total_reward = 0.0
    steps = 0
    done = False

    while not done and (max_steps is None or steps < max_steps):
        action = policy_fn(obs)
        obs, reward, terminated, truncated, _ = env.step(action)
        total_reward += float(reward)
        steps += 1
        done = terminated or truncated

    return total_reward, steps

### 定义不同策略

In [None]:
def random_policy(obs):
    """Return 0 or 1 drawn from NumPy's global RNG; ``obs`` is ignored."""
    n_actions = 2
    return np.random.randint(n_actions)

def angle_policy(obs):
    """Push toward the side the pole leans to.

    Reads the pole angle from ``obs[2]`` and returns 1 when it is
    strictly positive, otherwise 0.
    """
    leaning_positive = obs[2] > 0
    if leaning_positive:
        return 1
    return 0

def pid_policy(obs):
    """Linear feedback policy over the four observation components.

    Computes a weighted sum of pole angle, pole angular velocity, cart
    position and cart velocity (gains 50, 10, 0.5 and 1.0) and returns 1
    when the sum is strictly positive, otherwise 0. Note that no integral
    term is accumulated despite the function's name.
    """
    x, x_dot, theta, theta_dot = obs
    pole_term = 50 * theta + 10 * theta_dot
    u = pole_term + 0.5 * x + 1.0 * x_dot
    if u > 0:
        return 1
    return 0

### 评估不同策略

In [None]:
env = gym.make("CartPole-v1")

# Display name -> policy function, evaluated in this order.
policies = {
    "随机策略": random_policy,
    "角度策略": angle_policy,
    "PID策略": pid_policy
}

n_episodes = 20
results = {}
rule = "=" * 60

print(rule)
print(f"策略评估 ({n_episodes} 回合)")
print(rule)

for name, policy in policies.items():
    # Episode k is reset with seed k, so every policy sees the same
    # sequence of initial states. Only the episode return is kept.
    rewards = [
        run_episode(env, policy, seed=episode_seed)[0]
        for episode_seed in range(n_episodes)
    ]
    results[name] = rewards

    mean_reward, std_reward = np.mean(rewards), np.std(rewards)
    print(f"\n{name}:")
    print(f"  平均奖励: {mean_reward:.1f} ± {std_reward:.1f}")
    print(f"  最小/最大: {np.min(rewards):.0f} / {np.max(rewards):.0f}")

env.close()

### 可视化策略比较

In [None]:
# Compare the evaluated policies: return distribution on the left, mean
# return with standard deviation on the right.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

names = list(results.keys())
data = [results[name] for name in names]
# Cycle through the palette so every policy gets a colour even if more
# than three policies were evaluated.
palette = ['#ff7f0e', '#2ca02c', '#1f77b4']
colors = [palette[i % len(palette)] for i in range(len(names))]

# Box plot
ax1 = axes[0]
bp = ax1.boxplot(data, patch_artist=True)
# Tick labels are set separately rather than through boxplot's `labels=`
# keyword, which newer Matplotlib releases deprecate.
ax1.set_xticklabels(names)
for patch, color in zip(bp['boxes'], colors):
    patch.set_facecolor(color)
    patch.set_alpha(0.7)
ax1.set_ylabel('回合奖励')
ax1.set_title('策略性能分布')
ax1.axhline(y=475, color='r', linestyle='--', label='成功阈值 (475)')
ax1.legend()

# Bar chart
ax2 = axes[1]
means = [np.mean(results[name]) for name in names]
stds = [np.std(results[name]) for name in names]
x = np.arange(len(names))
bars = ax2.bar(x, means, yerr=stds, capsize=5, color=colors, alpha=0.7)
ax2.set_xticks(x)
ax2.set_xticklabels(names)
ax2.set_ylabel('平均奖励')
ax2.set_title('策略平均性能')
ax2.axhline(y=475, color='r', linestyle='--', label='成功阈值')
# Without this call the label on the threshold line is never displayed.
ax2.legend()

plt.tight_layout()
plt.show()

---

## 5. MountainCar - 稀疏奖励环境

**挑战**: 小车引擎不够强，无法直接爬上山顶，需要利用来回摆动积累动量。

In [None]:
# Print the observation bounds and the action space of MountainCar-v0.
env = gym.make("MountainCar-v0")
obs_space = env.observation_space
pos_low, vel_low = obs_space.low
pos_high, vel_high = obs_space.high
rule = "=" * 50

print(rule)
print("MountainCar-v0 环境信息")
print(rule)
print(f"观测空间: {obs_space}")
print(f"  位置范围: [{pos_low:.2f}, {pos_high:.2f}]")
print(f"  速度范围: [{vel_low:.3f}, {vel_high:.3f}]")
print(f"动作空间: {env.action_space}")
print("  0: 向左, 1: 不动, 2: 向右")

env.close()

### MountainCar 地形可视化

In [None]:
def _terrain_height(position):
    """Height of the plotted hill profile at ``position``."""
    return np.sin(3 * position) * 0.45 + 0.55


fig, ax = plt.subplots(figsize=(12, 5))

x = np.linspace(-1.2, 0.6, 200)
y = _terrain_height(x)

# Hill profile with the start and goal positions marked.
ax.plot(x, y, 'b-', linewidth=3, label='地形')
ax.fill_between(x, 0, y, alpha=0.3, color='green')
ax.axvline(x=-0.5, color='red', linestyle='--', alpha=0.7, label='起点')
ax.axvline(x=0.5, color='gold', linestyle='--', linewidth=2, label='目标')

# Draw the car slightly above the surface at the start position.
car_x = -0.5
car_y = _terrain_height(car_x)
ax.plot(car_x, car_y + 0.05, 'ro', markersize=15, label='小车')

ax.set_xlabel('位置')
ax.set_ylabel('高度')
ax.set_title('MountainCar 环境地形')
ax.legend(loc='upper left')
ax.set_xlim(-1.3, 0.7)
ax.set_ylim(0, 1.2)

plt.tight_layout()
plt.show()

### 动量策略测试

In [None]:
def momentum_policy(obs):
    """Accelerate in the direction of travel.

    ``obs`` is unpacked as ``(position, velocity)``. Returns action 2 when
    the velocity is strictly positive, otherwise action 0.
    """
    _, velocity = obs
    if velocity > 0:
        return 2
    return 0

env = gym.make("MountainCar-v0")

print("测试动量策略:")
rewards = []
# Ten episodes; episode k is reset with seed k-1.
for episode_no in range(1, 11):
    reward, steps = run_episode(env, momentum_policy, seed=episode_no - 1)
    rewards.append(reward)
    print(f"  回合 {episode_no}: 奖励={reward:.0f}, 步数={steps}")

mean_reward, std_reward = np.mean(rewards), np.std(rewards)
print(f"\n平均奖励: {mean_reward:.1f} ± {std_reward:.1f}")

env.close()

---

## 6. Pendulum - 连续动作空间

动作是连续的扭矩值 $u \in [-2, 2]$。

In [None]:
# Print the observation space and the torque bounds of Pendulum-v1.
env = gym.make("Pendulum-v1")
act_space = env.action_space
rule = "=" * 50

print(rule)
print("Pendulum-v1 环境信息")
print(rule)
print(f"观测空间: {env.observation_space}")
print("  观测: [cos(θ), sin(θ), θ̇]")
print(f"动作空间: {act_space}")
print(f"  扭矩范围: [{act_space.low[0]:.1f}, {act_space.high[0]:.1f}]")
print("\n这是一个连续动作空间环境!")

env.close()

### PD 控制器演示

In [None]:
def pd_controller(obs):
    """Proportional-derivative torque policy.

    ``obs`` is unpacked as ``(cos_theta, sin_theta, theta_dot)``. The angle
    is recovered with ``arctan2`` and the torque
    ``-Kp * theta - Kd * theta_dot`` (Kp=10, Kd=2) is clipped to
    [-2, 2]. Returns a one-element NumPy array.
    """
    Kp, Kd = 10.0, 2.0
    cos_theta, sin_theta, theta_dot = obs
    theta = np.arctan2(sin_theta, cos_theta)
    torque = -Kp * theta - Kd * theta_dot
    unclipped = [torque]
    return np.clip(unclipped, -2.0, 2.0)

In [None]:
# Roll out the PD controller on Pendulum-v1 and record the trajectory
# (observations, applied torques, rewards) for the plots further below.
env = gym.make("Pendulum-v1")

obs, _ = env.reset(seed=42)
observations = [obs]
actions = []
rewards_list = []

for _ in range(200):
    action = pd_controller(obs)
    obs, reward, terminated, truncated, _ = env.step(action)
    observations.append(obs)
    actions.append(action[0])
    rewards_list.append(reward)
    # Stop as soon as the environment reports the end of the episode, so
    # that a finished episode is never stepped again.
    if terminated or truncated:
        break

print(f"回合总奖励: {sum(rewards_list):.1f}")

env.close()

### 控制过程可视化

In [None]:
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

observations = np.array(observations)
steps = np.arange(len(observations))
# Recover the angle from its (cos, sin) encoding in columns 0 and 1.
theta = np.arctan2(observations[:, 1], observations[:, 0])

# One row per panel:
# (axis, x values, y values, line format, y label, title, draw zero line)
panels = [
    (axes[0, 0], steps, np.degrees(theta), 'b-',
     '角度 (度)', '摆角变化', True),
    (axes[0, 1], steps, observations[:, 2], 'g-',
     '角速度 (rad/s)', '角速度变化', True),
    (axes[1, 0], range(len(actions)), actions, 'r-',
     '扭矩', '控制输入', False),
    (axes[1, 1], range(len(rewards_list)), np.cumsum(rewards_list), 'purple',
     '累积奖励', '累积奖励', False),
]

for panel_ax, xs, ys, line_fmt, y_label, title, zero_line in panels:
    panel_ax.plot(xs, ys, line_fmt, linewidth=2)
    if zero_line:
        panel_ax.axhline(y=0, color='r', linestyle='--', alpha=0.5)
    panel_ax.set_xlabel('步数')
    panel_ax.set_ylabel(y_label)
    panel_ax.set_title(title)
    panel_ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

---

## 7. 空间类型详解

In [None]:
from gymnasium import spaces

rule = "=" * 60
print(rule)
print("Gymnasium 空间类型")
print(rule)

# 1. Discrete: draw five samples from a five-element discrete space.
discrete = spaces.Discrete(5)
print("\n1. Discrete(5): {0, 1, 2, 3, 4}")
draws = [discrete.sample() for _ in range(5)]
print(f"   采样: {draws}")

In [None]:
# 2. Box: a 3-dimensional continuous space bounded by [-1, 1].
box = spaces.Box(low=-1.0, high=1.0, shape=(3,), dtype=np.float32)
print("Box([-1,1]^3): 连续空间")
print(f"  形状: {box.shape}")
box_draw = box.sample()
print(f"  采样: {box_draw}")

In [None]:
# 3. MultiDiscrete: three discrete dimensions with 3, 2 and 4 choices.
multi_discrete = spaces.MultiDiscrete([3, 2, 4])
print("MultiDiscrete([3,2,4]): 多维离散空间")
print("  每维范围: [0,3), [0,2), [0,4)")
multi_draw = multi_discrete.sample()
print(f"  采样: {multi_draw}")

In [None]:
# 4. Dict: a composite space built from named sub-spaces.
component_spaces = {
    "position": spaces.Box(-10, 10, shape=(2,)),
    "velocity": spaces.Box(-1, 1, shape=(2,)),
    "flag": spaces.Discrete(2)
}
dict_space = spaces.Dict(component_spaces)
print("Dict 空间:")
sample = dict_space.sample()
# Print each sampled component under its key.
for key in sample:
    print(f"  {key}: {sample[key]}")

---

## 8. 环境包装器

In [None]:
from gymnasium.wrappers import RecordEpisodeStatistics

# Wrap CartPole so that episode statistics are added to `info`.
env = RecordEpisodeStatistics(gym.make("CartPole-v1"))

print("使用 RecordEpisodeStatistics 包装器:")
print("=" * 50)

obs, _ = env.reset()

# Play random actions until the episode ends.
while True:
    action = env.action_space.sample()
    obs, reward, terminated, truncated, info = env.step(action)
    if terminated or truncated:
        break

# Report the statistics when the final `info` carries an 'episode' entry.
if 'episode' in info:
    episode_stats = info['episode']
    print(f"回合奖励: {episode_stats['r']}")
    print(f"回合长度: {episode_stats['l']}")

env.close()

### 自定义归一化包装器

In [None]:
class NormalizeObservation(gym.ObservationWrapper):
    """Observation wrapper that standardizes observations online.

    A running mean and variance are maintained per observation dimension
    with Welford's algorithm, and every observation passing through the
    wrapper is returned as ``(obs - mean) / sqrt(var + epsilon)``.

    Parameters
    ----------
    env : gym.Env
        Environment to wrap; its ``observation_space.shape`` sizes the
        statistics.
    epsilon : float, optional
        Small constant added to the variance to avoid division by zero.

    Attributes
    ----------
    mean : np.ndarray
        Running mean of the observations seen so far.
    var : np.ndarray
        Running (population) variance of the observations seen so far.
        It stays at its initial value of 1 until two observations have
        been seen, because one sample carries no spread information.
    count : int
        Number of observations processed.
    """

    def __init__(self, env, epsilon=1e-8):
        super().__init__(env)
        self.epsilon = epsilon
        shape = env.observation_space.shape
        self.mean = np.zeros(shape)
        self.var = np.ones(shape)
        self.count = 0
        # Running sum of squared deviations from the mean (Welford's M2).
        # Kept apart from `var` so that `var` is always a real variance.
        self._m2 = np.zeros(shape)

    def observation(self, obs):
        """Update the running statistics with ``obs`` and return it normalized."""
        self.count += 1
        delta = obs - self.mean
        self.mean += delta / self.count
        # Uses the deviation from both the old mean (delta) and the new one.
        self._m2 += delta * (obs - self.mean)
        if self.count > 1:
            self.var = self._m2 / self.count
        std = np.sqrt(self.var + self.epsilon)
        return (obs - self.mean) / std

In [None]:
env = NormalizeObservation(gym.make("CartPole-v1"))

print("归一化前后观测对比:")

obs, _ = env.reset()
print(f"步骤 0 - 归一化观测: {obs}")

# Take 50 random steps, starting a new episode whenever one ends, so the
# wrapper's running statistics have data to work with.
for _step in range(50):
    random_action = env.action_space.sample()
    obs, _, terminated, truncated, _ = env.step(random_action)
    if terminated or truncated:
        obs, _ = env.reset()

print(f"步骤 50 - 归一化观测: {obs}")
print(f"\n估计的均值: {env.mean}")

env.close()

---

## 9. 向量化环境

In [None]:
from gymnasium.vector import SyncVectorEnv

n_envs = 4


def make_env():
    """Factory returning a fresh CartPole-v1 environment."""
    return gym.make("CartPole-v1")


# SyncVectorEnv takes one factory per sub-environment.
env_factories = [make_env] * n_envs
vec_env = SyncVectorEnv(env_factories)

print("向量化环境信息:")
print(f"  环境数量: {vec_env.num_envs}")
print(f"  单环境观测空间: {vec_env.single_observation_space}")
print(f"  批量观测空间: {vec_env.observation_space}")

In [None]:
# Reset all sub-environments at once, then take one batched step.
obs, info = vec_env.reset()
print(f"批量观测形状: {obs.shape}")

actions = vec_env.action_space.sample()
print(f"批量动作: {actions}")

batch_transition = vec_env.step(actions)
obs, rewards, terminateds, truncateds, infos = batch_transition
print(f"批量奖励: {rewards}")

vec_env.close()

---

## 10. 总结

本教程涵盖了 Gymnasium 的核心概念:

1. **环境创建**: `gym.make(env_id)`
2. **基本交互**: `reset()`, `step()`, `close()`
3. **空间类型**: Discrete, Box, MultiDiscrete, Dict
4. **包装器**: 观测/动作/奖励预处理
5. **向量化环境**: 并行采样

### 下一步

- 学习 Q-Learning 和 SARSA 算法
- 探索深度强化学习 (DQN, PPO)
- 尝试更复杂的环境

In [None]:
# List the classic-control environments with their observation shape and
# action type, marking those that cannot be created on this machine.
print("经典控制环境列表:")
print("=" * 50)

# (environment id, short description). The description is kept for
# reference only; it is not printed in the table below.
classic_envs = [
    ("CartPole-v1", "倒立摆平衡"),
    ("MountainCar-v0", "爬山车 (离散)"),
    ("Acrobot-v1", "双摆控制"),
    ("Pendulum-v1", "单摆控制 (连续)"),
]

for env_id, desc in classic_envs:
    # Only environment creation is guarded. Catching Gymnasium's own errors
    # and import failures (instead of a bare `except:`) keeps
    # KeyboardInterrupt and genuine bugs from being reported as
    # "not installed".
    try:
        env = gym.make(env_id)
    except (gym.error.Error, ImportError):
        print(f"  {env_id:30s} | 未安装")
        continue

    obs_dim = env.observation_space.shape
    act_type = "离散" if isinstance(env.action_space, spaces.Discrete) else "连续"
    print(f"  {env_id:30s} | 观测: {str(obs_dim):10s} | 动作: {act_type}")
    env.close()