In [1]:
!pip install gymnasium[mujoco]
!pip install torch

Collecting gymnasium[mujoco]
  Downloading gymnasium-0.29.1-py3-none-any.whl (953 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m953.9/953.9 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
Collecting farama-notifications>=0.0.1 (from gymnasium[mujoco])
  Downloading Farama_Notifications-0.0.4-py3-none-any.whl (2.5 kB)
Collecting mujoco>=2.3.3 (from gymnasium[mujoco])
  Downloading mujoco-3.1.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (5.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.5/5.5 MB[0m [31m17.7 MB/s[0m eta [36m0:00:00[0m
Collecting glfw (from mujoco>=2.3.3->gymnasium[mujoco])
  Downloading glfw-2.7.0-py2.py27.py3.py30.py31.py32.py33.py34.py35.py36.py37.py38-none-manylinux2014_x86_64.whl (211 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.8/211.8 kB[0m [31m14.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: glfw, farama-notifications, gymnasium, mujoco
Successfully instal

In [2]:
import gymnasium as gym
import numpy as np

import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions import MultivariateNormal

import sys

In [3]:
# 모델 정의
class PolicyNetwork(nn.Module):
    def __init__(self):
        super(PolicyNetwork, self).__init__()
        self.fc1 = nn.Linear(obs_dim, 64)
        self.fc2 = nn.Linear(64, 64)
        self.fc_mean = nn.Linear(64, action_dim)
        self.fc_log_std = nn.Linear(64, action_dim)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        mean = self.fc_mean(x)
        log_std = self.fc_log_std(x)
        return mean, log_std

In [4]:
# 가우시안 분포 생성 함수
def get_action_and_log_prob(state, policy):
    mean, log_std = policy(state)
    std = log_std.exp()
    dist = MultivariateNormal(mean, torch.diag_embed(std))
    action = dist.sample()
    log_prob = dist.log_prob(action)
    return action, log_prob

In [5]:
# PPO 업데이트 함수
def ppo_update(policy, optimizer, states, actions, rewards, old_log_probs, advantages):
    for _ in range(K_epochs):
        mean, log_std = policy(states)
        std = log_std.exp()
        dist = MultivariateNormal(mean, torch.diag_embed(std))
        new_log_probs = dist.log_prob(actions)
        ratio = (new_log_probs - old_log_probs).exp()

        surrogate1 = ratio * advantages
        surrogate2 = torch.clamp(ratio, 1 - epsilon_clip, 1 + epsilon_clip) * advantages

        loss = -torch.min(surrogate1, surrogate2).mean()

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

In [7]:
# 환경 설정
env = gym.make('Ant-v4')
obs_dim = env.observation_space.shape[0]
action_dim = env.action_space.shape[0]

In [8]:
# 하이퍼파라미터 설정
learning_rate = 3e-4
gamma = 0.99
epsilon_clip = 0.2
K_epochs = 10
T_horizon = 2048

In [9]:
# 모델 초기화 및 옵티마이저 설정
policy = PolicyNetwork()
optimizer = optim.Adam(policy.parameters(), lr=learning_rate)

In [None]:
# 메인 학습 루프
for episode in range(1000):
    state, _ = env.reset()
    done = False
    rewards = []
    log_probs = []
    states = []
    actions = []
    episode_reward = 0

    for t in range(T_horizon):
        state = torch.tensor(state, dtype=torch.float32)
        action, log_prob = get_action_and_log_prob(state, policy)

        next_state, reward, terminated, truncated, info = env.step(action.detach().numpy())

        states.append(state)
        actions.append(action)
        rewards.append(reward)
        log_probs.append(log_prob)

        state = next_state
        episode_reward += reward

        if done:
            break

    # Advantage 계산
    discounted_rewards = []
    for t in range(len(rewards)):
        Gt = sum([gamma ** i * rewards[t + i] for i in range(len(rewards) - t)])
        discounted_rewards.append(Gt)

    discounted_rewards = torch.tensor(discounted_rewards)
    states = torch.stack(states)
    actions = torch.stack(actions)
    old_log_probs = torch.stack(log_probs).detach()

    advantages = discounted_rewards - discounted_rewards.mean()
    advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-5)

    # PPO 업데이트
    ppo_update(policy, optimizer, states, actions, discounted_rewards, old_log_probs, advantages)

    if episode % 10 == 0:
        print(f"Episode {episode}: Reward {episode_reward}")

env.close()

Episode 0: Reward -7729.14562595329
Episode 10: Reward -4863.71556950801
Episode 20: Reward -3000.679376661323
Episode 30: Reward -1741.388917285942
Episode 40: Reward -481.0536939356044
Episode 50: Reward 227.70838585125733
Episode 60: Reward 523.2066076521032
Episode 70: Reward 858.425029763921
Episode 80: Reward 1050.723646972466
Episode 90: Reward 1073.9205313406796
Episode 100: Reward 1178.3620112389242
Episode 110: Reward 1183.904337140508
Episode 120: Reward 1235.3981510830054
Episode 130: Reward 1448.1357556664525
Episode 140: Reward 1385.4940454702562
Episode 150: Reward 1354.2113396554357
Episode 160: Reward 1329.6911948224163
Episode 170: Reward 1434.185604585034
Episode 180: Reward 1477.6207080721774
Episode 190: Reward 1435.3814802324823
Episode 200: Reward 1453.7460626010743
