In [None]:
# Refresh the apt package index, then install the system and Python dependencies.
!apt-get update
!pip install mujoco
!pip install pyvirtualdisplay
# -y answers the confirmation prompt so the install does not stall or abort when the cell runs unattended.
!sudo apt-get install -y xvfb
!pip install xvfbwrapper
!pip install numpy --upgrade
!pip install gym --upgrade
!pip install tensorboard
# -y skips pip's "Proceed (Y/n)?" prompt, which would otherwise block Run All.
!pip uninstall -y dopamine-rl

0% [Working]            Get:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,626 B]
Get:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease [1,581 B]
Get:3 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  Packages [830 kB]
Get:4 http://security.ubuntu.com/ubuntu jammy-security InRelease [110 kB]
Get:5 http://security.ubuntu.com/ubuntu jammy-security/main amd64 Packages [1,798 kB]
Hit:6 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:7 http://security.ubuntu.com/ubuntu jammy-security/universe amd64 Packages [1,082 kB]
Get:8 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [119 kB]
Hit:9 https://ppa.launchpadcontent.net/c2d4u.team/c2d4u4.0+/ubuntu jammy InRelease
Hit:10 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:11 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Hit:12 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jam

In [None]:
import gym
from gym.wrappers.record_video import RecordVideo

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torch.nn.init as init

import numpy as np
import matplotlib.pyplot as plt
import random
from collections import deque

import glob
import io
import base64
from IPython.display import HTML
from IPython import display as ipythondisplay
from pyvirtualdisplay import Display

# Start a non-visible virtual display so rendering works without a physical screen.
display = Display(size=(1400, 900), visible=0)
display.start()

# 비디오 녹화를 위한 함수 정의
def show_video():
  """Embed a recorded MP4 from the ``video/`` directory in the notebook output.

  Takes the first path returned by ``glob.glob('video/*.mp4')``, base64-encodes
  its bytes and displays them in an HTML ``<video>`` element. Prints
  "Could not find video" when no MP4 file is present.
  """
  mp4list = glob.glob('video/*.mp4')
  if not mp4list:
    print("Could not find video")
    return

  # The file is only read, so open it read-only; the context manager closes
  # the handle instead of leaving it open for the rest of the kernel session.
  with open(mp4list[0], 'rb') as video_file:
    encoded = base64.b64encode(video_file.read())

  ipythondisplay.display(HTML(data='''<video alt="test" autoplay
                loop controls style="height: 400px;">
                <source src="data:video/mp4;base64,{0}" type="video/mp4" />
             </video>'''.format(encoded.decode('ascii'))))

In [None]:
# 신경망 정의
class Actor(nn.Module):
    """Policy network: maps a state vector to an action vector squashed by tanh.

    Args:
        state_size: Number of input features.
        action_size: Number of output features.
        hidden_size: Width of each of the two hidden layers.
    """

    def __init__(self, state_size, action_size, hidden_size=256):
        super().__init__()
        self.neural_net = nn.Sequential(
            nn.Linear(state_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, action_size),
            nn.Tanh()
        )

    def forward(self, x):
        """Return the action(s) for state(s) ``x``.

        Args:
            x: ``numpy.ndarray`` or ``torch.Tensor`` whose last dimension is
                ``state_size``.

        Returns:
            Float tensor with last dimension ``action_size`` and values in [-1, 1].
        """
        if isinstance(x, np.ndarray):
            x = torch.from_numpy(x)
        # Use the device the module's own weights live on rather than a
        # notebook-level `device` global, so the network works on its own and
        # does not depend on another cell having been executed first.
        module_device = next(self.parameters()).device
        return self.neural_net(x.to(module_device).float())

class Critic(nn.Module):
    """Q-network: maps a (state, action) pair to a single scalar value.

    Args:
        state_size: Number of state features.
        action_size: Number of action features.
        hidden_size: Width of each of the two hidden layers.
    """

    def __init__(self, state_size, action_size, hidden_size=256):
        super().__init__()
        self.neural_net = nn.Sequential(
            nn.Linear(state_size + action_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, 1)
        )

    def forward(self, state, action):
        """Return the value estimate for ``state`` and ``action``.

        Args:
            state: ``numpy.ndarray`` or ``torch.Tensor`` of state features.
            action: ``numpy.ndarray`` or ``torch.Tensor`` of action features.

        Returns:
            Float tensor whose last dimension has size 1.
        """
        if isinstance(state, np.ndarray):
            state = torch.from_numpy(state)
        if isinstance(action, np.ndarray):
            action = torch.from_numpy(action)
        # Use the module's own device instead of a notebook-level `device`
        # global, and cast both inputs before stacking so that inputs with
        # different dtypes (e.g. float64 state, float32 action) combine cleanly.
        module_device = next(self.parameters()).device
        state = state.to(module_device).float()
        action = action.to(module_device).float()
        return self.neural_net(torch.hstack((state, action)))

In [None]:
# 리플레이 버퍼 정의
class ReplayBuffer:
    """Bounded store of transitions with uniform random minibatch sampling.

    Transitions are kept in a ``deque`` capped at ``buffer_size`` entries, so
    the oldest entries are dropped once the cap is reached.
    """

    def __init__(self, buffer_size, batch_size):
        self.batch_size = batch_size
        self.memory = deque(maxlen=buffer_size)

    def __len__(self):
        """Number of transitions currently stored."""
        return len(self.memory)

    def add(self, state, action, reward, next_state, done):
        """Append one transition; array-like fields are stored as numpy arrays."""
        transition = (
            np.array(state),
            np.array(action),
            reward,
            np.array(next_state),
            done,
        )
        self.memory.append(transition)

    def sample(self):
        """Draw ``batch_size`` transitions without replacement.

        Returns:
            Tuple ``(states, actions, rewards, next_states, dones)`` of tensors;
            the first four are float tensors and ``dones`` is an int tensor.
        """
        batch = random.sample(self.memory, self.batch_size)
        # Transpose the list of transitions into one numpy array per field.
        state_arr, action_arr, reward_arr, next_state_arr, done_arr = (
            np.array(column) for column in zip(*batch)
        )
        float_fields = [
            torch.from_numpy(field).float()
            for field in (state_arr, action_arr, reward_arr, next_state_arr)
        ]
        done_field = torch.from_numpy(done_arr).int()
        return (*float_fields, done_field)

In [None]:
# DDPG 에이전트 정의
class DDPGAgent:
    """Deep Deterministic Policy Gradient agent.

    Holds an actor and a critic, a target copy of each, their Adam optimizers,
    a replay buffer, and the state of the Gaussian exploration-noise schedule.

    Args:
        state_size: Size of the state vector fed to the networks.
        action_size: Size of the action vector produced by the actor.
        action_low: Lower bound used to clip noisy actions.
        action_high: Upper bound used to clip noisy actions.
        actor_lr: Adam learning rate for the actor.
        critic_lr: Adam learning rate for the critic.
        buffer_size: Maximum number of transitions kept in the replay buffer.
        batch_size: Number of transitions sampled per learning step.
        gamma: Discount factor used in the bootstrapped target.
        tau: Interpolation factor of the soft target-network update.
        noise_scale: Constant multiplier applied to the exploration noise.
        initial_std: Noise standard deviation at episode 0.
        min_std: Floor for the decayed standard deviation.
        decay_rate: Exponential decay rate of the standard deviation per episode.
        device: Optional torch device for the networks. When omitted, CUDA is
            used if available and the CPU otherwise.
    """

    def __init__(self, state_size, action_size, action_low, action_high,
                 actor_lr=1e-4, critic_lr=1e-3, buffer_size=100000, batch_size=400,
                 gamma=0.99, tau=1e-3, noise_scale=0.5,
                 initial_std=1.0, min_std=0.1, decay_rate=0.01, device=None):

        self.state_size = state_size
        self.action_size = action_size
        self.action_low = action_low
        self.action_high = action_high

        # Resolve the device here so the agent does not depend on a
        # notebook-level `device` variable having been defined beforehand.
        if device is None:
            device = "cuda" if torch.cuda.is_available() else "cpu"
        self.device = torch.device(device)

        # Online and target networks; targets start as exact copies.
        self.actor = Actor(state_size, action_size).to(self.device)
        self.critic = Critic(state_size, action_size).to(self.device)
        self.target_actor = Actor(state_size, action_size).to(self.device)
        self.target_critic = Critic(state_size, action_size).to(self.device)

        self.target_actor.load_state_dict(self.actor.state_dict())
        self.target_critic.load_state_dict(self.critic.state_dict())

        # Optimizers and replay buffer
        self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=actor_lr)
        self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=critic_lr)

        self.replay_buffer = ReplayBuffer(buffer_size, batch_size)

        # Learning hyper-parameters
        self.gamma = gamma
        self.tau = tau

        # Exploration-noise schedule
        self.noise_scale = noise_scale
        self.initial_std = initial_std
        self.current_std = initial_std
        self.min_std = min_std
        self.decay_rate = decay_rate

    def update_std(self, episode_num):
        """Set the noise std for ``episode_num``.

        The value is ``initial_std * exp(-decay_rate * episode_num)``, floored
        at ``min_std``. It is always derived from ``initial_std``: decaying the
        previous value by the full ``episode_num`` exponent on every call would
        compound the decay and collapse the noise after a handful of episodes.
        """
        decayed = self.initial_std * np.exp(-self.decay_rate * episode_num)
        self.current_std = float(max(self.min_std, decayed))

    def act(self, state):
        """Return the actor's action for ``state`` plus Gaussian noise, clipped to bounds.

        Args:
            state: Single state as an array-like of length ``state_size``.

        Returns:
            ``numpy.ndarray`` of length ``action_size`` clipped to
            ``[action_low, action_high]``.
        """
        state = torch.as_tensor(
            np.asarray(state), dtype=torch.float32, device=self.device
        ).unsqueeze(0)  # add a batch dimension of 1
        self.actor.eval()
        with torch.no_grad():
            # squeeze(0) drops only the batch dimension, so a one-dimensional
            # action space still yields a 1-D array.
            action = self.actor(state).squeeze(0).cpu().numpy()
        self.actor.train()

        noise = self.noise_scale * (np.random.randn(self.action_size) * self.current_std)
        return np.clip(action + noise, self.action_low, self.action_high)

    def learn(self):
        """Run one critic update, one actor update and a soft target update.

        Returns:
            ``(critic_loss, actor_loss)`` as Python floats, or ``None`` when
            the replay buffer holds fewer than ``batch_size`` transitions.
        """
        if len(self.replay_buffer) < self.replay_buffer.batch_size:
            return None

        states, actions, rewards, next_states, dones = self.replay_buffer.sample()

        states = states.to(self.device)
        actions = actions.to(self.device)
        rewards = rewards.unsqueeze(1).to(self.device)  # (batch,) -> (batch, 1)
        next_states = next_states.to(self.device)
        dones = dones.unsqueeze(1).to(self.device)  # (batch,) -> (batch, 1)

        # The TD target is a constant for the critic loss: build it without
        # autograd so no gradients are pushed into the target networks.
        with torch.no_grad():
            next_actions = self.target_actor(next_states)
            target_q_values = self.target_critic(next_states, next_actions)
            q_targets = rewards + (self.gamma * target_q_values * (1 - dones))

        # Critic update: regress Q(s, a) onto the TD target.
        q_expected = self.critic(states, actions)
        critic_loss = F.mse_loss(q_expected, q_targets)
        critic_log = critic_loss.item()

        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # Actor update: maximise the critic's value of the actor's own actions.
        predicted_actions = self.actor(states)
        actor_loss = - self.critic(states, predicted_actions).mean()
        actor_log = actor_loss.item()

        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # Move the target networks a fraction `tau` towards the online networks.
        self._soft_update(self.target_critic, self.critic, self.tau)
        self._soft_update(self.target_actor, self.actor, self.tau)

        return (critic_log, actor_log)

    @staticmethod
    def _soft_update(target_net, source_net, tau):
        """In place: target <- tau * source + (1 - tau) * target."""
        with torch.no_grad():
            for target_param, param in zip(target_net.parameters(), source_net.parameters()):
                target_param.copy_(tau * param + (1.0 - tau) * target_param)

    def save_checkpoint(self, path):
        """Write networks, optimizers and the current noise std to ``path``."""
        torch.save({
            'actor': self.actor.state_dict(),
            'critic': self.critic.state_dict(),
            'target_actor': self.target_actor.state_dict(),
            'target_critic': self.target_critic.state_dict(),
            'actor_optimizer': self.actor_optimizer.state_dict(),
            'critic_optimizer': self.critic_optimizer.state_dict(),
            # Stored as a plain float so the file contains only tensors and builtins.
            'current_std': float(self.current_std),
        }, path)

    def load_checkpoint(self, path):
        """Restore the state written by ``save_checkpoint``.

        Raises whatever ``torch.load`` raises when ``path`` cannot be read
        (for example ``FileNotFoundError``), and ``KeyError`` when the file
        lacks an expected entry.
        """
        checkpoint = torch.load(path, map_location=self.device)
        self.actor.load_state_dict(checkpoint['actor'])
        self.critic.load_state_dict(checkpoint['critic'])
        self.target_actor.load_state_dict(checkpoint['target_actor'])
        self.target_critic.load_state_dict(checkpoint['target_critic'])
        self.actor_optimizer.load_state_dict(checkpoint['actor_optimizer'])
        self.critic_optimizer.load_state_dict(checkpoint['critic_optimizer'])
        self.current_std = checkpoint.get('current_std', self.current_std)

In [None]:
# Create the environment and the DDPG agent
env_name = 'Ant-v4'
env = RecordVideo(
    gym.make(env_name, render_mode="rgb_array"),
    './video',
    # Record an episode whenever (episode_number + 1) is a multiple of 33.
    episode_trigger=lambda episode_number: (episode_number + 1) % 33 == 0,
)

observation_space, action_space = env.observation_space, env.action_space
state_size = observation_space.shape[0]
action_size = action_space.shape[0]
# The bounds of the first action component are used as scalar bounds for all components.
action_low = float(action_space.low[0])
action_high = float(action_space.high[0])

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(device)

agent_kwargs = dict(
    actor_lr=1e-4, critic_lr=1e-3,
    buffer_size=100000, batch_size=64,
    gamma=0.99, tau=1e-3,
    noise_scale=0.1, initial_std=1.0, min_std=0.1, decay_rate=0.01,
)
agent = DDPGAgent(state_size, action_size, action_low, action_high, **agent_kwargs)

  logger.warn(


In [None]:
# Training loop
num_episodes = 10000
checkpoint_path = './ddpg_checkpoint.pth'
save_interval = 33

global_total_rewards = []
q_losses = []
policy_losses = []
mean_q_losses = []
mean_policy_losses = []

# Resume from a previously saved checkpoint when one can be loaded.
try:
    agent.load_checkpoint(checkpoint_path)
    print("Checkpoint loaded successfully.")
except Exception as e:
    # Best effort: training starts fresh, but the reason is reported rather than hidden.
    print(f"No checkpoint found or failed to load, starting fresh. ({type(e).__name__}: {e})")

try:
    for episode in range(num_episodes):
        state, _ = env.reset()
        total_reward = 0
        done = False
        q_losses = []
        policy_losses = []

        while not done:
            action = agent.act(state)
            next_state, reward, terminated, truncated, info = env.step(action)
            done = terminated or truncated

            # Store only `terminated` as the done flag: a time-limit truncation
            # is not a terminal state, so the critic target must keep
            # bootstrapping from next_state in that case.
            agent.replay_buffer.add(state, action, reward, next_state, terminated)
            state = next_state
            total_reward += reward

            # Learning step (returns None until the buffer holds one full batch)
            log = agent.learn()
            if log is not None:
                q_loss, policy_loss = log
                q_losses.append(q_loss)
                policy_losses.append(policy_loss)

        global_total_rewards.append(total_reward)
        # An episode can finish before any learning step ran; record NaN
        # explicitly instead of taking the mean of an empty array.
        mean_q_losses.append(np.mean(q_losses) if q_losses else np.nan)
        mean_policy_losses.append(np.mean(policy_losses) if policy_losses else np.nan)

        agent.update_std(episode)  # refresh the exploration std once per episode
        print(f"Episode {episode + 1}: Total Reward = {total_reward}")

        # Save a checkpoint every `save_interval` episodes
        if (episode + 1) % save_interval == 0:
            agent.save_checkpoint(checkpoint_path)
            print(f"Checkpoint saved at episode {episode + 1}.")
finally:
    # Close the environment even when training is interrupted.
    env.close()

show_video()

plt.figure(figsize=(10, 5))
plt.plot(mean_q_losses, label='Q-Loss')
plt.plot(mean_policy_losses, label='Policy Loss')
plt.plot(global_total_rewards, label='Total Reward')
plt.xlabel('Episode')
plt.ylabel('Loss')
plt.title('Training Losses & Rewards Over Time')
plt.legend()
plt.show()

Checkpoint loaded successfully.


  if not isinstance(terminated, (bool, np.bool8)):


Episode 1: Total Reward = -40.13842351942996
Episode 2: Total Reward = -220.68893227341096
Episode 3: Total Reward = -56.08848181070197
Episode 4: Total Reward = -31.48094640820867
Episode 5: Total Reward = -75.11191497805379
Episode 6: Total Reward = -62.88342347094091
Episode 7: Total Reward = -97.52615203911378
Episode 8: Total Reward = -128.0798960499395
Episode 9: Total Reward = -26.80444793563153
Episode 10: Total Reward = -81.79983212151006
Episode 11: Total Reward = -47.1381048500408
Episode 12: Total Reward = -851.1588498001141
Episode 13: Total Reward = -95.31579684537184
Episode 14: Total Reward = -461.9516456988403
Episode 15: Total Reward = -373.6428705585123
Episode 16: Total Reward = -133.94393052241722
Episode 17: Total Reward = -1621.1145321696365
Episode 18: Total Reward = -90.04993089700591
Episode 19: Total Reward = -136.02507378571093
Episode 20: Total Reward = -33.883999825961354
Episode 21: Total Reward = -179.23358388323172
Episode 22: Total Reward = -131.119523

KeyboardInterrupt: 

In [None]:
# env_name = 'Ant-v4'
# env = gym.make(env_name, render_mode="rgb_array")
# env = RecordVideo(env, './video', episode_trigger=lambda episode_number: True)
# state = env.reset()

# while True:

#   action = env.action_space.sample()
#   next_state, reward, terminated, truncated, info = env.step(action)

#   if terminated or truncated: break

# env.close()
# show_video()