In [1]:
!pip install gymnasium[mujoco]
!pip install torch

Collecting gymnasium[mujoco]
  Downloading gymnasium-0.29.1-py3-none-any.whl (953 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m953.9/953.9 kB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
Collecting farama-notifications>=0.0.1 (from gymnasium[mujoco])
  Downloading Farama_Notifications-0.0.4-py3-none-any.whl (2.5 kB)
Collecting mujoco>=2.3.3 (from gymnasium[mujoco])
  Downloading mujoco-3.1.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (5.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.5/5.5 MB[0m [31m27.9 MB/s[0m eta [36m0:00:00[0m
Collecting glfw (from mujoco>=2.3.3->gymnasium[mujoco])
  Downloading glfw-2.7.0-py2.py27.py3.py30.py31.py32.py33.py34.py35.py36.py37.py38-none-manylinux2014_x86_64.whl (211 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.8/211.8 kB[0m [31m14.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: glfw, farama-notifications, gymnasium, mujoco
Successfully instal

In [3]:
import gymnasium as gym
import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions import Categorical
import numpy as np

# 하이퍼파라미터 설정
gamma = 0.99
lam = 0.95
clip_param = 0.2
learning_rate = 3e-4
num_steps = 2048
num_epochs = 10
mini_batch_size = 64
ppo_epochs = 10

# 환경 설정
env = gym.make('Ant-v4')
obs_dim = env.observation_space.shape[0]
action_dim = env.action_space.shape[0]

# 네트워크 정의
class ActorCritic(nn.Module):
    def __init__(self, obs_dim, action_dim):
        super(ActorCritic, self).__init__()
        self.actor = nn.Sequential(
            nn.Linear(obs_dim, 64),
            nn.ReLU(),
            nn.Linear(64, 64),
            nn.ReLU(),
            nn.Linear(64, action_dim),
            nn.Tanh()
        )
        self.critic = nn.Sequential(
            nn.Linear(obs_dim, 64),
            nn.ReLU(),
            nn.Linear(64, 64),
            nn.ReLU(),
            nn.Linear(64, 1)
        )

    def forward(self, x):
        value = self.critic(x)
        probs = self.actor(x)
        dist = Categorical(logits=probs)
        return dist, value

model = ActorCritic(obs_dim, action_dim)
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Truncated Generalized Advantage Estimation 함수
def compute_gae(next_value, rewards, masks, values):
    values = values + [next_value]
    gae = 0
    returns = []
    for step in reversed(range(len(rewards))):
        delta = rewards[step] + gamma * values[step + 1] * masks[step] - values[step]
        gae = delta + gamma * lam * masks[step] * gae
        returns.insert(0, gae + values[step])
    return returns

# PPO 업데이트 함수
def ppo_update(policy, optimizer, observations, actions, log_probs, returns, advantages):
    for _ in range(ppo_epochs):
        sampler = np.random.permutation(len(observations))
        for i in range(0, len(observations), mini_batch_size):
            batch_indices = sampler[i:i + mini_batch_size]
            obs_batch = torch.tensor(observations[batch_indices], dtype=torch.float32)
            actions_batch = torch.tensor(actions[batch_indices], dtype=torch.float32)
            log_probs_batch = torch.tensor(log_probs[batch_indices], dtype=torch.float32)
            returns_batch = torch.tensor(returns[batch_indices], dtype=torch.float32)
            advantages_batch = torch.tensor(advantages[batch_indices], dtype=torch.float32)

            dist, value = policy(obs_batch)
            entropy = dist.entropy().mean()
            new_log_probs = dist.log_prob(actions_batch)
            ratio = (new_log_probs - log_probs_batch).exp()

            surr1 = ratio * advantages_batch
            surr2 = torch.clamp(ratio, 1.0 - clip_param, 1.0 + clip_param) * advantages_batch
            actor_loss = -torch.min(surr1, surr2).mean()
            critic_loss = (returns_batch - value).pow(2).mean()

            loss = 0.5 * critic_loss + actor_loss - 0.01 * entropy

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

# 훈련 루프
def train():
    for _ in range(num_epochs):
        obs, _ = env.reset()
        done = False
        observations = []
        actions = []
        log_probs = []
        rewards = []
        values = []
        masks = []

        for _ in range(num_steps):
            dist, value = model(torch.tensor(obs, dtype=torch.float32))
            action = dist.sample()
            next_obs, reward, terminated, truncated, _ = env.step(action.numpy())

            observations.append(obs)
            actions.append(action)
            log_probs.append(dist.log_prob(action))
            rewards.append(reward)
            values.append(value)
            done = int(terminated or truncated)
            masks.append(1 - done)

            obs = next_obs

            if done:
                obs = env.reset()

        next_value = model(torch.tensor(obs, dtype=torch.float32))[1].detach().numpy()
        returns = compute_gae(next_value, rewards, masks, values)
        returns = torch.tensor(returns, dtype=torch.float32)
        advantages = returns - torch.tensor(values, dtype=torch.float32)

        ppo_update(model, optimizer, observations, actions, log_probs, returns, advantages)

if __name__ == "__main__":
    train()

ValueError: Action dimension mismatch. Expected (8,), found ()