## 1. 학습 진행

In [1]:
import os
from typing import Callable
import gymnasium as gym
from stable_baselines3 import PPO
from stable_baselines3.td3.policies import MlpPolicy
from stable_baselines3.common.env_util import make_vec_env 
import matplotlib.pyplot as plt
#from gsde import gSDE

# Export LD_LIBRARY_PATH as a colon-separated list of the NVIDIA, system,
# CUDA and MuJoCo library directories (any previous value is replaced).
os.environ['LD_LIBRARY_PATH'] = ':'.join((
    '/usr/lib/nvidia',
    '/usr/lib/x86_64-linux-gnu',
    '/usr/local/cuda/lib64',
    '/home/jovyan/.mujoco/mujoco210/bin',
))

In [2]:
import numpy as np
from stable_baselines3.common.callbacks import BaseCallback

class TrainingRecorderCallback(BaseCallback):
    """Callback that accumulates per-step rewards, observations and actions.

    On every ``_on_step`` call the ``"rewards"``, ``"new_obs"`` and
    ``"actions"`` entries of ``self.locals`` are appended to
    ``training_rewards``, ``training_observations`` and ``training_actions``
    respectively.

    :param verbose: verbosity level forwarded to ``BaseCallback``
    """

    def __init__(self, verbose=0):
        super().__init__(verbose)
        # Each list gains one entry per `_on_step` call.
        self.training_rewards, self.training_observations, self.training_actions = [], [], []

    def _on_step(self) -> bool:
        """Append the current step's values to the histories.

        :return: always True, which keeps training running
        """
        step_locals = self.locals
        recorded_pairs = (
            (self.training_rewards, "rewards"),
            (self.training_observations, "new_obs"),
            (self.training_actions, "actions"),
        )
        for history, key in recorded_pairs:
            history.append(step_locals[key])
        return True

In [None]:
# Rollout sizing, reused by the PPO constructor further down in this cell
n_envs = 8
n_steps = 1024

# Environment to train on, wrapped as a vectorized environment with n_envs copies
env_id = "Humanoid-v4"
env = make_vec_env(env_id=env_id, n_envs=n_envs)

def linear_schedule(initial_value: float) -> Callable[[float], float]:
    """
    Build a linearly decaying schedule that starts at ``initial_value``.

    :param initial_value: (float) value the schedule yields when
        progress_remaining is 1
    :return: (function) maps progress_remaining to
        progress_remaining * initial_value
    """
    def scaled_by_progress(progress_remaining: float) -> float:
        """
        Scale the initial value by the fraction of progress still remaining.

        :param progress_remaining: (float) goes from 1 (beginning) down to 0
        :return: (float) the scheduled value
        """
        current_value = progress_remaining * initial_value
        return current_value

    return scaled_by_progress


# PPO agent with a linearly decaying learning rate and state-dependent
# exploration (use_sde). batch_size is a quarter of n_steps * n_envs.
# NOTE(review): n_epochs=2000 is far above the library default (10 per the
# Stable-Baselines3 PPO docs) and makes every update very expensive —
# confirm this value is intentional.
model = PPO(
    "MlpPolicy",
    env,
    learning_rate=linear_schedule(0.001),
    n_steps=n_steps,
    batch_size=n_steps * n_envs // 4,
    n_epochs=2000,
    gamma=0.95,
    gae_lambda=0.9,
    use_sde=True,
    verbose=1,
    tensorboard_log="./PPO_Humanoid_tensorboard/",
)

# Keep a handle on the callback: an instance created inline inside the
# `learn(...)` call would be unreachable afterwards, and the rewards,
# observations and actions it records would be lost.
training_recorder = TrainingRecorderCallback()

model.learn(total_timesteps=500000, callback=training_recorder)
model.save("ppo_humanoid")

Import error. Trying to rebuild mujoco_py.
running build_ext
building 'mujoco_py.cymj' extension
x86_64-linux-gnu-gcc -pthread -Wno-unused-result -Wsign-compare -DNDEBUG -g -fwrapv -O2 -Wall -g -fstack-protector-strong -Wformat -Werror=format-security -g -fwrapv -O2 -g -fstack-protector-strong -Wformat -Werror=format-security -Wdate-time -D_FORTIFY_SOURCE=2 -fPIC -Ixrl/lib/python3.8/site-packages/mujoco_py -I/home/jovyan/xrl_with_trajectories/xrl/lib/python3.8/site-packages/mujoco_py -I/home/jovyan/.mujoco/mujoco210/include -I/home/jovyan/xrl_with_trajectories/xrl/lib/python3.8/site-packages/numpy/core/include -I/home/jovyan/xrl_with_trajectories/xrl/include -I/usr/include/python3.8 -c /home/jovyan/xrl_with_trajectories/xrl/lib/python3.8/site-packages/mujoco_py/cymj.c -o /home/jovyan/xrl_with_trajectories/xrl/lib/python3.8/site-packages/mujoco_py/generated/_pyxbld_2.1.2.14_38_linuxcpuextensionbuilder/temp.linux-x86_64-3.8/home/jovyan/xrl_with_trajectories/xrl/lib/python3.8/site-package

In [None]:
# Drop the in-memory model and reload it from disk, so the rollout below
# exercises the saved artifact.
del model  # Remove old model
model = PPO.load("ppo_humanoid")

# Bounded rollout: an unconditional `while True` never finishes, so a
# "Restart & Run All" would never reach the cells after this one.
n_eval_steps = 1000

obs = env.reset()
for _ in range(n_eval_steps):
    action, _states = model.predict(obs)
    obs, rewards, dones, info = env.step(action)
    # NOTE(review): `env` was created without an explicit render mode; recent
    # Stable-Baselines3 versions may only warn here instead of opening a
    # window — confirm against the installed version.
    env.render("human")

In [None]:
import matplotlib.pyplot as plt

# Draw the reward series on its own axes.
# NOTE(review): at this point `rewards` is whatever the evaluation loop's last
# `env.step(...)` call assigned, not a history collected during training —
# confirm which series this figure is meant to show.
fig, ax = plt.subplots()
ax.plot(rewards)
ax.set_xlabel("Training Step")
ax.set_ylabel("Reward")
ax.set_title("Reward Progress")
plt.show()