In [1]:
!pip install "gym [accept-rom-license, atari]"

INFO: pip is looking at multiple versions of gym[accept-rom-license,atari] to determine which version is compatible with other requirements. This could take a while.
Collecting gym[accept-rom-license,atari]
  Downloading gym-0.26.2.tar.gz (721 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m721.7/721.7 kB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0m [36m0:00:01[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting ale-py~=0.8.0 (from gym[accept-rom-license,atari])
  Downloading ale_py-0.8.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.1 kB)
Collecting autorom~=0.4.2 (from autorom[accept-rom-license]~=0.4.2; extra == "accept-rom-license"->gym[accept-rom-license,atari])
  Downloading AutoROM-0.4.2-py3-none-any.whl.metadata (2.8 kB)
Collecting AutoROM.accept-rom-license (from autorom[accept-rom-license]~=0.4

In [2]:


def compute_gae(next_value, rewards, masks, values, gamma=0.999, tau=0.95):
    gae = 0
    returns = []
    get_log_gae = []

    # Duyệt ngược từ cuối episode về đầu episode
    for i in reversed(range(len(rewards))):
        # Tính toán delta
        delta = rewards[i] + gamma * next_value * masks[i] - values[i]

        # Cập nhật GAE
        gae = delta + gamma * tau * masks[i] * gae

        # Cập nhật next_value cho bước tiếp theo
        next_value = values[i]

        # Lưu giá trị returns và GAE
        returns.append(gae + values[i])
        get_log_gae.append(gae)

    # Đảo ngược lại để trả về theo thứ tự thời gian gốc
    returns = returns[::-1]
    get_log_gae = get_log_gae[::-1]


    return returns, get_log_gae

In [3]:
import torch
from torch import nn
import torch as T
import torch.nn.functional as F

import numpy as np
from torch.distributions.categorical import Categorical
from torch.distributions.normal import Normal

def layer_init(layer, std=np.sqrt(2), bias_const=0.0):
    torch.nn.init.orthogonal_(layer.weight, std)
    torch.nn.init.constant_(layer.bias, bias_const)
    return layer

class Agent(nn.Module):
    def __init__(self, envs,device):
        super(Agent, self).__init__()
        self.network = nn.Sequential(
            layer_init(nn.Conv2d(4, 32, 8, stride=4)),
            nn.ReLU(),
            layer_init(nn.Conv2d(32, 64, 4, stride=2)),
            nn.ReLU(),
            layer_init(nn.Conv2d(64, 64, 3, stride=1)),
            nn.ReLU(),
            nn.Flatten(),
            layer_init(nn.Linear(64 * 7 * 7, 512)),
            nn.ReLU(),
        )
        self.actor = layer_init(nn.Linear(512, envs.action_space.n), std=0.01)
        self.critic = layer_init(nn.Linear(512, 1), std=1)
        self.to(device)
    def get_value(self, x):
        return self.critic(self.network(x / 255.0))

    def get_action_and_value(self, x, action=None):
        hidden = self.network(x / 255.0)
        logits = self.actor(hidden)
        probs = Categorical(logits=logits)
        if action is None:
            action = probs.sample()
        return action, probs.log_prob(action), probs.entropy(), self.critic(hidden)

In [4]:
import torch
def ppo_loss(new_dist, actions, old_log_probs, advantages, clip_param):
      new_log_probs = new_dist

      ratio = (new_log_probs - old_log_probs).exp()

      surr1 = ratio * advantages

      surr2 = torch.clamp(ratio, 1.0 - clip_param, 1.0 + clip_param) * advantages

      actor_loss = -torch.min(surr1, surr2)


      return actor_loss.mean()

In [5]:
import torch

def clipped_critic_loss(new_value, old_value, returns, clip_param):
      vf_loss1 = (new_value - returns).pow(2.)

    # 2. MSE/L2 loss on the clipped value and the returns
    # Here we create an "approximation" of the new value (aka the current value) by finding the difference
    # between the "new" and "old" value and adding a clipped amount back to the old value
      vpredclipped = old_value + torch.clamp(new_value - old_value, -clip_param, clip_param)
    # Note that we ONLY backprop through the new value
      vf_loss2 = (vpredclipped - returns).pow(2.)

    # 3. Take the MAX between the two losses
    # This trick has the effect of only updating the current value DIRECTLY if is it WORSE (higher error)
    # than the old value.
    # If the old value was worse then the "approximation" will be worse and we update
    # the new value only a little bit!
      critic_loss = torch.max(vf_loss1, vf_loss2)

    # 4. Return the Expectation over the batch
      return critic_loss.mean()

In [6]:
def xuly_dulieu(env, model, gamma, tau, device):
    obs, _ = env.reset()
    dulieu = {
        "obs": [],
        "action": [],
        "reward": [],
        "logprob": [],
        "value": [],
        "done": []
    }

    for _ in range(512):
        with torch.no_grad():
            obs_array = np.array(obs)
            obs_tensor = torch.tensor(obs_array, dtype=torch.float32).unsqueeze(0).to(device)
            action, logprob, _, value = model.get_action_and_value(obs_tensor)

        next_obs, reward, terminated, truncated, _ = env.step(action)
        done = 0 if terminated or truncated else 1

        dulieu["obs"].append(obs_array)  # Save as np.array to avoid LazyFrames
        dulieu["action"].append(action)
        dulieu["reward"].append(reward)
        dulieu["logprob"].append(logprob.detach())
        dulieu["value"].append(value.detach())
        dulieu["done"].append(done)

        obs = next_obs
        if terminated or truncated:
            obs, _ = env.reset()

    with torch.no_grad():
        next_value = model.get_value(
            torch.tensor(np.array(obs), dtype=torch.float32).unsqueeze(0).to(device)
        )

    # GAE
    returns, advantages = compute_gae(
        next_value,
        dulieu["reward"],
        dulieu["done"],
        dulieu["value"],
        gamma,
        tau
    )

    advantages = torch.tensor(advantages, dtype=torch.float32).to(device)
    advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8)
    advantages = advantages.view(-1)
    # Convert everything to tensors and shuffle
    obs_tensor = torch.tensor(np.array(dulieu["obs"]), dtype=torch.float32).to(device)
    action_tensor = torch.cat(dulieu["action"]).to(device)
    logprob_tensor = torch.cat(dulieu["logprob"]).to(device)
    value_tensor = torch.cat(dulieu["value"]).squeeze(-1).to(device)
    return_tensor = torch.tensor(returns, dtype=torch.float32).to(device)

    idx = torch.randperm(advantages.shape[0])
    obs_tensor = obs_tensor[idx].reshape(4, 128, 4, 84, 84)
    action_tensor = action_tensor[idx].reshape(4, 128)
    logprob_tensor = logprob_tensor[idx].reshape(4, 128)
    value_tensor = value_tensor[idx].reshape(4, 128)
    return_tensor = return_tensor[idx].reshape(4, 128)
    advantage_tensor = advantages[idx].reshape(4, 128)

    rs = torch.tensor(dulieu["reward"]).sum()

    minibatch = {
        "obs": obs_tensor,
        "action": action_tensor,
        "logprob": logprob_tensor,
        "value": value_tensor,
        "returns": return_tensor,
        "advantage": advantage_tensor,
    }

    return minibatch, rs


In [9]:


import torch
import numpy as np
import torch.nn as nn

def ppo_update(data_buffer, ppo_epochs, clip_param, model, optimizer,device):
     for _ in range(ppo_epochs):
        for i in range(len(data_buffer["obs"])):

            obs =data_buffer["obs"][i].to(device)
            logprob =data_buffer["logprob"][i].to(device)
            values = data_buffer["value"][i].to(device)
            returns = data_buffer["returns"][i].to(device)
            action = data_buffer["action"][i]
            # Clone advantage để tránh inplace operation
            advantage = data_buffer["advantage"][i].to(device)


            _, n_logprob, entropy, n_value = model.get_action_and_value(obs, action)

            # Tính toán loss của actor và critic
            actor_loss = ppo_loss(n_logprob, action, logprob, advantage, clip_param)
            critic_loss = clipped_critic_loss(n_value, values, returns, clip_param)

            # Đảm bảo agent_loss là một scalar
            agent_loss = actor_loss + 0.5 * critic_loss - 0.01 * entropy.mean()


            # Debugging: Kiểm tra agent_loss có phải là scalar không
           
            optimizer.zero_grad()

            # Backpropagation
            agent_loss.backward()

            # Cắt gradient norm để tránh cập nhật quá lớn
            nn.utils.clip_grad_norm_(model.parameters(),0.5)

            # Cập nhật optimizer
            optimizer.step()
    #print(actor_loss.item(),critic_loss.item())


In [11]:
import gym
import torch
import numpy as np
import torch.optim as optim
from stable_baselines3.common.atari_wrappers import (  # isort:skip
    ClipRewardEnv,
    EpisodicLifeEnv,
    FireResetEnv,
    MaxAndSkipEnv,
    NoopResetEnv,
)
seed=42
name="ALE/Breakout-v5"
env = gym.make(name)
env = gym.wrappers.RecordEpisodeStatistics(env)
env = NoopResetEnv(env, noop_max=30)
env = MaxAndSkipEnv(env, skip=4)
env = EpisodicLifeEnv(env)
env = gym.wrappers.ResizeObservation(env, (84, 84))
env = gym.wrappers.GrayScaleObservation(env)
env = gym.wrappers.FrameStack(env, 4)
env.seed(seed)
env.action_space.seed(seed)
env.observation_space.seed(seed)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
lr = 2.5e-4
ppo_epochs = 4
clip_param = 0.1
gamma=0.99
tau=0.95
agent=Agent(env,device)
optimizer = optim.Adam(agent.parameters(), lr=lr, eps=1e-5)

def tes(run_name,env):
    observation,_ = env.reset()
    obs = observation
    k = 0
    for _ in range(1000):
        with torch.no_grad():
            obs_s = torch.from_numpy(np.array(obs, dtype=np.float32))[None].to(device)

            action, logprob, _, value = agent.get_action_and_value(obs_s)
            action_s = action
            # TRY NOT TO MODIFY: execute the game and log data.
            next_obs, reward, terminated,ter , info = env.step(action_s)
            k+=reward
            obs=next_obs
            # Fix for multi-env outputs
            if terminated or ter :
                break
    return k



for i in range(1000):
   du_lieu,rs= xuly_dulieu(env,agent,gamma, tau,device)
   ppo_update(du_lieu, ppo_epochs, clip_param,agent,optimizer,device)
   if i % 20 ==0:
       print(i,rs.item())
       k=tes(i,env)
       print(k)
       print()

0 18.0
0.0

20 22.0
0.0

40 31.0
0.0

60 39.0
0.0

80 46.0
0.0

100 43.0
0.0

120 39.0
0.0

140 54.0
0.0

160 36.0
0.0

180 40.0
1.0

200 43.0
0.0

220 40.0
0.0

240 44.0
0.0

260 52.0
0.0

280 44.0
0.0

300 54.0
0.0

320 42.0
1.0

340 47.0
0.0

360 42.0
0.0



KeyboardInterrupt: 

In [12]:
from typing import Tuple, Dict, Optional, Iterable, Callable

import numpy as np
import seaborn as sns

import matplotlib
from matplotlib import animation

from IPython.display import HTML

import gym
from gym import spaces
from gym.error import DependencyNotInstalled


import numpy as np
import matplotlib.pyplot as plt

In [13]:
def display_video(frames):
    # Copied from: https://colab.research.google.com/github/deepmind/dm_control/blob/master/tutorial.ipynb
    orig_backend = matplotlib.get_backend()
    matplotlib.use('Agg')
    fig, ax = plt.subplots(1, 1, figsize=(5, 5))
    matplotlib.use(orig_backend)
    ax.set_axis_off()
    ax.set_aspect('equal')
    ax.set_position([0, 0, 1, 1])
    im = ax.imshow(frames[0])
    def update(frame):
        im.set_data(frame)
        return [im]
    anim = animation.FuncAnimation(fig=fig, func=update, frames=frames,
                                    interval=50, blit=True, repeat=False)
    return HTML(anim.to_html5_video())

In [14]:
env = gym.make("ALE/Breakout",render_mode='rgb_array')
seed=42

env = gym.wrappers.RecordEpisodeStatistics(env)
env = NoopResetEnv(env, noop_max=30)
env = MaxAndSkipEnv(env, skip=4)
env = EpisodicLifeEnv(env)
env = gym.wrappers.ResizeObservation(env, (84, 84))
env = gym.wrappers.GrayScaleObservation(env)
env = gym.wrappers.FrameStack(env, 4)
env.seed(seed)
env.action_space.seed(seed)
env.observation_space.seed(seed)

  logger.warn(
  logger.warn(


[42]

In [18]:
obs,_ = env.reset()
done = False
k=0
frames = []
for i in range(1000):
    with torch.no_grad():
      obs_np = np.array(obs)  
      
      obs_s= torch.tensor([obs_np], dtype=torch.float32).to(device)

      action, logprob, _, value = agent.get_action_and_value(obs_s)
    next_obs, reward, done,ter, info = env.step(action)
    obs=next_obs
    img = env.render()
    frames.append(img)
    if done or ter:
        break
    k += reward

In [None]:
action

In [None]:
print(obs_s[0][0][0])

In [None]:
obs_s[0][0][0]

In [19]:
display_video(frames)

In [None]:
matrix3=dataset["train"]['actions']

In [None]:
matrix3[757][0]

In [None]:
env.reset()
frames = []
tong_phan_thuong=0
for i in range(len(matrix3[757])) :
    frames.append(env.render(mode="rgb_array"))
    action = matrix3[150][i].detach().cpu().numpy()
    state, reward, done, _ = env.step(action)
    tong_phan_thuong+=reward
print(tong_phan_thuong)

In [None]:
display_video(frames)