In [55]:
#!pip install "gym [accept-rom-license, atari]"

In [56]:
!pip install gym[atari,accept-rom-license] --quiet
!pip install ale-py --quiet

In [57]:
import torch
import time
import numpy as np
import gym
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.tensorboard import SummaryWriter
class FireResetEnv(gym.Wrapper):
    """
    Take actions 1 (FIRE) and 2 on reset for environments that are fixed until firing.

    Args:
        env (gym.Env): The environment to wrap. Its action meanings must have
            'FIRE' at index 1 and contain at least three actions.
    """
    def __init__(self, env: gym.Env):
        gym.Wrapper.__init__(self, env)
        assert env.unwrapped.get_action_meanings()[1] == 'FIRE'
        assert len(env.unwrapped.get_action_meanings()) >= 3

    def reset(self, **kwargs) -> tuple:
        """
        Reset the wrapped env, then step with action 1 followed by action 2.

        If either of those steps ends the episode the env is reset again.

        Returns:
            (obs, info) describing the state the environment is actually in
            when this method returns.
        """
        obs, info = self.env.reset(**kwargs)
        for start_action in (1, 2):
            obs, _, terminated, truncated, info = self.env.step(start_action)
            if terminated or truncated:
                # Keep the observation/info of the fresh episode; returning the
                # terminal observation of the aborted one would hand the caller
                # a frame that no longer matches the emulator state.
                obs, info = self.env.reset(**kwargs)
        return obs, info

# This wrapper is an exact copy of the SB3 wrapper
class ClipRewardEnv(gym.RewardWrapper):
    """
    Reward wrapper that keeps only the sign of every reward: +1, 0 or -1.

    Args:
        env (gym.Env): The environment to wrap
    """

    def __init__(self, env: gym.Env):
        super().__init__(env)

    def reward(self, reward: float) -> float:
        """Return the sign of `reward` as computed by `np.sign`."""
        clipped = np.sign(reward)
        return clipped

def AtariWrappers(env):
    """
    Wrap `env` in `gym.wrappers.AtariPreprocessing` and return the wrapped env.

    The option descriptions below follow gym's AtariPreprocessing documentation.
    """
    preprocessing_options = dict(
        noop_max=30,                 # up to 30 no-op actions are taken at reset
        frame_skip=4,                # each agent action spans 4 emulator frames
        screen_size=84,              # observations are resized to 84x84
        terminal_on_life_loss=True,  # losing a life is reported as terminated=True
        grayscale_obs=True,          # observations are grayscale instead of RGB
        scale_obs=True,              # observations are scaled to floats in [0, 1)
    )
    return gym.wrappers.AtariPreprocessing(env, **preprocessing_options)




In [58]:
from collections import deque

def compute_gae(next_value, rewards, masks, values, gamma=0.999, tau=0.95):
    """
    Generalized Advantage Estimation over one rollout, computed back to front.

    Args:
        next_value: value estimate for the state following the last step.
        rewards: per-step rewards, indexable by step.
        masks: per-step multipliers applied to the bootstrap value and to the
            running advantage (0 cuts the recursion at that step, 1 keeps it).
        values: per-step value estimates, indexable by step.
        gamma: discount factor.
        tau: GAE decay factor.

    Returns:
        (returns, advantages): two deques ordered from first to last step,
        where returns[t] == advantages[t] + values[t].
    """
    running_advantage = 0
    bootstrap_value = next_value
    returns_backwards = []
    advantages_backwards = []

    for t in range(len(rewards) - 1, -1, -1):
        # One-step TD error for step t.
        td_error = rewards[t] + gamma * bootstrap_value * masks[t] - values[t]
        # Exponentially decayed sum of TD errors.
        running_advantage = td_error + gamma * tau * masks[t] * running_advantage
        # The value of this step is the bootstrap for the step before it.
        bootstrap_value = values[t]

        # advantage + value gives the return target for the value function.
        returns_backwards.append(running_advantage + values[t])
        advantages_backwards.append(running_advantage)

    returns = deque(reversed(returns_backwards))
    gae_logger = deque(reversed(advantages_backwards))
    return returns, gae_logger

In [59]:
import torch
from torch import nn
import torch as T
import torch.nn.functional as F
import numpy as np
from torch.distributions.categorical import Categorical
from torch.distributions.normal import Normal

def layer_init(layer, std=np.sqrt(2), bias_const=0.0):
    """
    Initialise `layer` in place and return it.

    The weight gets an orthogonal initialisation with gain `std`; every bias
    entry is set to `bias_const`.
    """
    weight, bias = layer.weight, layer.bias
    torch.nn.init.orthogonal_(weight, gain=std)
    torch.nn.init.constant_(bias, val=bias_const)
    return layer

class Agent(nn.Module):
    """
    Actor-critic network: a shared convolutional trunk with a policy head
    (`actor`) and a value head (`critic`).
    """

    def __init__(self, envs):
        """
        Args:
            envs: object whose `action_space.n` gives the number of discrete
                actions (sets the width of the policy head).
        """
        super().__init__()
        # The trunk takes 4 input channels; the 64 * 7 * 7 flatten size is what
        # these three convolutions produce for 84x84 inputs.
        trunk_layers = [
            layer_init(nn.Conv2d(4, 32, 8, stride=4)),
            nn.ReLU(),
            layer_init(nn.Conv2d(32, 64, 4, stride=2)),
            nn.ReLU(),
            layer_init(nn.Conv2d(64, 64, 3, stride=1)),
            nn.ReLU(),
            nn.Flatten(),
            layer_init(nn.Linear(64 * 7 * 7, 512)),
            nn.ReLU(),
        ]
        self.network = nn.Sequential(*trunk_layers)

        n_actions = envs.action_space.n
        self.actor = layer_init(nn.Linear(512, n_actions), std=0.01)
        self.critic = layer_init(nn.Linear(512, 1), std=1)

        # NOTE: `device` is not a parameter; it is looked up as a module-level
        # global, so it must be defined before an Agent is constructed.
        self.to(device)

    def get_value(self, x):
        """Return the critic output for observations `x`."""
        features = self.network(x)
        return self.critic(features)

    def get_action_and_value(self, x, action=None):
        """
        Evaluate the policy and the critic on observations `x`.

        Args:
            x: observations fed to the shared trunk.
            action: actions to score; when None, actions are sampled from the
                categorical policy.

        Returns:
            (action, log-probability of action, policy entropy, critic output)
        """
        features = self.network(x)
        policy = torch.distributions.Categorical(logits=self.actor(features))
        chosen = policy.sample() if action is None else action
        log_prob = policy.log_prob(chosen)
        entropy = policy.entropy()
        value = self.critic(features)
        return chosen, log_prob, entropy, value

In [60]:
import torch

def ppo_loss(new_dist, old_log_probs, advantages, clip_param):
    """
    Clipped-surrogate PPO policy loss (to be minimised).

    Args:
        new_dist: log-probabilities of the taken actions under the current
            policy (despite the name, this is used as log-probs, not a
            distribution object).
        old_log_probs: log-probabilities of the same actions under the policy
            that collected the data.
        advantages: advantage estimate per sample.
        clip_param: the probability ratio is clamped to
            [1 - clip_param, 1 + clip_param].

    Returns:
        Scalar tensor: mean over samples of the larger of the two negated
        surrogate terms.
    """
    prob_ratio = (new_dist - old_log_probs).exp()
    clipped_ratio = prob_ratio.clamp(1 - clip_param, 1 + clip_param)

    unclipped_term = advantages * prob_ratio
    clipped_term = advantages * clipped_ratio

    # Taking the max of the negated terms is the pessimistic (clipped) bound.
    per_sample_loss = torch.max(-unclipped_term, -clipped_term)
    return per_sample_loss.mean()

In [None]:
import torch

def clipped_critic_loss(new_value, old_value, returns, clip_param):
    """
    Clipped value-function loss.

    The new prediction may move at most `clip_param` away from `old_value`;
    the loss is half the mean of the larger of the clipped and unclipped
    squared errors against `returns`.
    """
    value_shift = (new_value - old_value).clamp(-clip_param, clip_param)
    clipped_prediction = old_value + value_shift

    unclipped_error = (new_value - returns).pow(2)
    clipped_error = (clipped_prediction - returns).pow(2)

    worst_error = torch.max(unclipped_error, clipped_error)
    return 0.5 * worst_error.mean()

In [62]:
def xuly_dulieu(env, model, gamma, tau, device,up_len,len_max,tong_up):
    """
    Collect `tong_up` environment steps with `model` and package them for PPO.

    The environment is reset at the start of every call and again whenever an
    episode terminates or is truncated during the rollout.

    Args:
        env: environment following the (obs, info) reset / 5-tuple step API.
        model: policy exposing `get_action_and_value(obs)` and `get_value(obs)`.
        gamma: discount factor forwarded to `compute_gae`.
        tau: GAE decay factor forwarded to `compute_gae`.
        device: torch device the tensors are moved to.
        up_len: unused in this function.
        len_max: unused in this function.
        tong_up: number of environment steps to collect.

    Returns:
        (minibatch, rs): `minibatch` is a dict of flattened tensors with keys
        "obs", "action", "logprob", "value", "returns", "advantage";
        `rs` is the sum of all rewards collected in this rollout (across
        episode boundaries, not a per-episode score).
    """
    obs, _ = env.reset()
    dulieu = {
        "obs": [],
        "action": [],
        "reward": [],
        "logprob": [],
        "value": [],
        "done": []
    }

    for _ in range(tong_up):
        with torch.no_grad():
            obs_array = np.array(obs)
            # unsqueeze(0) adds a leading batch dimension of size 1.
            obs_tensor = torch.tensor(obs_array, dtype=torch.float32).unsqueeze(0).to(device)
            action, logprob, _, value = model.get_action_and_value(obs_tensor)

        next_obs, reward, terminated, truncated, _ = env.step(action)
        # NOTE: despite its name this is a continuation mask:
        # 0 when the episode ended on this step, 1 otherwise.
        done = 0 if terminated or truncated else 1

        dulieu["obs"].append(obs_array)  # Save as np.array to avoid LazyFrames
        dulieu["action"].append(action)
        dulieu["reward"].append(reward)
        dulieu["logprob"].append(logprob)
        dulieu["value"].append(value)
        dulieu["done"].append(done)

        obs = next_obs
        if terminated or truncated:
            obs, _ = env.reset()

    # Value of the observation following the last collected step, used to
    # bootstrap the final advantage.
    with torch.no_grad():
        next_value = model.get_value(
            torch.tensor(np.array(obs), dtype=torch.float32).unsqueeze(0).to(device)
        )
        next_value=next_value.view(-1)

    # GAE (the "done" list is passed as the `masks` argument)
    with torch.no_grad():
        returns, advantages = compute_gae(
        next_value,
        dulieu["reward"],
        dulieu["done"],
        dulieu["value"],
        gamma,
        tau
    )

    # NOTE(review): this builds a tensor from the sequence returned by
    # compute_gae; presumably each element is a one-element tensor — confirm.
    advantages = torch.tensor(advantages, dtype=torch.float32).to(device)
    # Convert everything to tensors (no shuffling happens in this function)
    obs_tensor = torch.tensor(np.array(dulieu["obs"]), dtype=torch.float32).to(device)
    action_tensor = torch.cat(dulieu["action"]).to(device)
    logprob_tensor = torch.cat(dulieu["logprob"]).to(device)
    value_tensor = torch.cat(dulieu["value"]).squeeze(-1).to(device)
    return_tensor = torch.tensor(returns, dtype=torch.float32).to(device)

    # Flatten to one leading sample dimension.
    b_states = obs_tensor.reshape((-1,) + env.observation_space.shape)
    b_actions = action_tensor.reshape((-1,) +  env.action_space.shape)
    b_logprobs= logprob_tensor.reshape(-1)
    b_advantages = advantages.reshape(-1)
    b_returns = return_tensor.reshape(-1)
    b_values = value_tensor.reshape(-1)

    # Total reward over the whole rollout.
    rs = torch.tensor(dulieu["reward"]).sum()

    minibatch = {
        "obs": b_states,
        "action":  b_actions,
        "logprob": b_logprobs,
        "value":  b_values,
        "returns": b_returns,
        "advantage": b_advantages,
    }

    return minibatch, rs


In [63]:
def ppo_update(data_buffer, ppo_epochs, clip_param, model, optimizer, device,
               minibatch_size=256, vf_coef=0.5, ent_coef=0.01, max_grad_norm=0.5):
    """
    Run PPO optimisation epochs over one rollout.

    Args:
        data_buffer: dict of tensors with keys "obs", "action", "logprob",
            "value", "returns" and "advantage", all sharing the same leading
            sample dimension.
        ppo_epochs: number of passes over the rollout.
        clip_param: clipping range used for both the policy ratio and the
            value prediction.
        model: network exposing `get_action_and_value(obs, action)`.
        optimizer: optimizer over `model.parameters()`.
        device: torch device the buffers are moved to.
        minibatch_size: samples per gradient step.
        vf_coef: weight of the critic loss in the total loss.
        ent_coef: weight of the entropy bonus in the total loss.
        max_grad_norm: global gradient-norm clipping threshold.

    Returns:
        Detached scalar tensor holding the mean policy entropy of the last
        minibatch, or NaN if no gradient step was performed (empty buffer or
        `ppo_epochs == 0`).
    """
    obs = data_buffer["obs"].to(device)
    old_logprobs = data_buffer["logprob"].to(device)
    old_values = data_buffer["value"].to(device)
    returns = data_buffer["returns"].to(device)
    # Move and cast the actions once instead of on every minibatch.
    actions = data_buffer["action"].to(device).long()
    advantages = data_buffer["advantage"].to(device)

    batch_size = obs.shape[0]
    # Defined up front so the return value exists even if no minibatch runs.
    last_entropy = torch.tensor(float("nan"), device=device)

    for _ in range(ppo_epochs):
        idx = torch.randperm(batch_size)
        for start in range(0, batch_size, minibatch_size):
            mb_idx = idx[start:start + minibatch_size]

            mb_obs = obs[mb_idx]
            mb_old_logprob = old_logprobs[mb_idx]
            mb_returns = returns[mb_idx]
            mb_old_values = old_values[mb_idx]

            # Per-minibatch advantage normalisation. The std of a single
            # sample is NaN, so a one-element tail minibatch is left as is.
            mb_advantages = advantages[mb_idx]
            if mb_advantages.numel() > 1:
                mb_advantages = (mb_advantages - mb_advantages.mean()) / (mb_advantages.std() + 1e-8)

            # Evaluate the current policy/critic on the stored actions.
            _, new_logprob, entropy, new_value = model.get_action_and_value(mb_obs, actions[mb_idx])

            # Compute the losses.
            actor_loss = ppo_loss(new_logprob, mb_old_logprob, mb_advantages, clip_param)
            critic_loss = clipped_critic_loss(new_value.view(-1), mb_old_values, mb_returns, clip_param)
            entropy_mean = entropy.mean()

            total_loss = actor_loss + vf_coef * critic_loss - ent_coef * entropy_mean

            # Optimisation step with gradient-norm clipping.
            optimizer.zero_grad()
            total_loss.backward()
            nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
            optimizer.step()

            # Detach so the caller does not keep the autograd graph alive.
            last_entropy = entropy_mean.detach()

    return last_entropy


In [64]:
import gym
import torch
from torch.optim.lr_scheduler import CosineAnnealingLR
import numpy as np
import torch.optim as optim


# Hyperparameters
lr = 2.5e-4          # base Adam learning rate
ppo_epochs = 4       # optimisation passes per rollout
clip_param = 0.1     # PPO clipping range
gamma = 0.99         # discount factor
tau = 0.95           # GAE decay factor
seed = 42

# Apply the seed; it was previously assigned but never used, so network
# initialisation, action sampling and minibatch shuffling differed on every run.
np.random.seed(seed)
torch.manual_seed(seed)

name = "BreakoutNoFrameskip-v4"
env = gym.make(name)
env = AtariWrappers(env)                           # Use preconfigured Atari preprocessing wrapper
if 'FIRE' in env.unwrapped.get_action_meanings():  # Automatically 'fire' at the start
    env = FireResetEnv(env)
env = ClipRewardEnv(env)                           # Clip all rewards to {-1, 0, +1}
env = gym.wrappers.FrameStack(env, 4)              # Stack the last 4 observations
# Seed the environment's own RNG; later resets without a seed continue from it.
env.reset(seed=seed)

# `device` must exist before Agent(...) is called: Agent.__init__ reads it as a global.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
agent = Agent(env).to(device)
optimizer = optim.Adam(agent.parameters(), lr=lr, eps=1e-5)


In [None]:
# Disabled checkpoint-resume snippet, kept as a bare string so it does not run.
# NOTE: enabling it is not enough to resume training, because the following
# cell assigns `bat_dau=0` again before the training loop starts.
'''checkpoint = torch.load("checkpoints/agent_update_40.pth")
agent.load_state_dict(checkpoint['model_state_dict'])
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
bat_dau = checkpoint['update'] + 1 '''

In [65]:
# len_max and up_len are only multiplied together for the progress log and
# passed to xuly_dulieu; tong_up is the number of env steps per rollout.
len_max, up_len = 128, 8
tong_up = 1024
bat_dau = 0                      # index of the first update to run
total_timesteps = 10_000_000
num_updates = total_timesteps // tong_up
print(env.action_space)
def tes():
    """
    Play at most 1000 steps of one episode with the global `agent` on the
    global `env`, sampling actions from the policy, and return the summed reward.

    The episode is cut short as soon as the env reports terminated or truncated.
    """
    obs, _ = env.reset()
    total_reward = 0
    for _step in range(1000):
        batch = torch.tensor(np.array(obs), dtype=torch.float32).unsqueeze(0).to(device)
        with torch.no_grad():
            action, _, _, _ = agent.get_action_and_value(batch)

        obs, reward, terminated, truncated, _ = env.step(action)
        total_reward += reward
        if terminated or truncated:
            break
    return total_reward



import os

# torch.save does not create missing directories; without this the very first
# checkpoint (update 0) fails when "checkpoints/" does not exist yet.
os.makedirs("checkpoints", exist_ok=True)

# Updates are 0-based: exactly `num_updates` rollouts are run, so the collected
# steps never exceed total_timesteps.
for update in range(bat_dau, num_updates):

    # Linear learning-rate decay from `lr` at update 0 towards 0. The fraction
    # stays within (0, 1]; the previous formula exceeded 1.0 at update 0.
    fraction = 1.0 - update / num_updates
    lr_current = fraction * lr
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr_current

    du_lieu, rs = xuly_dulieu(env, agent, gamma, tau, device, up_len, len_max, tong_up)

    ep = ppo_update(du_lieu, ppo_epochs, clip_param, agent, optimizer, device)

    if update % 50 == 0:
        torch.save({
            'update': update,
            'model_state_dict': agent.state_dict(),
            'optimizer_state_dict': optimizer.state_dict()
        }, f"checkpoints/agent_update_{update}.pth")

    if update % 20 == 0:
        # NOTE: len_max*up_len matches the real steps per rollout only while
        # it equals tong_up (128 * 8 == 1024).
        print(f"num_up: {update}/{num_updates} -- total_timesteps: {update*(len_max*up_len)}/{total_timesteps}")
        print(f"phần thưởng: {rs.item():.3f} -- entropy: {ep.item():.3f}")
        k = tes()
        print( "phần thưởng thử",k )
        print()

Discrete(4)
num_up: 20/9765 -- total_timesteps: 20480/10000000
phần thưởng: 6.000 -- entropy: 1.372
phần thưởng thử 0.0

num_up: 40/9765 -- total_timesteps: 40960/10000000
phần thưởng: 14.000 -- entropy: 1.253
phần thưởng thử 0.0

num_up: 60/9765 -- total_timesteps: 61440/10000000
phần thưởng: 14.000 -- entropy: 1.251
phần thưởng thử 0.0

num_up: 80/9765 -- total_timesteps: 81920/10000000
phần thưởng: 17.000 -- entropy: 1.252
phần thưởng thử 1.0

num_up: 100/9765 -- total_timesteps: 102400/10000000
phần thưởng: 23.000 -- entropy: 1.174
phần thưởng thử 1.0

num_up: 120/9765 -- total_timesteps: 122880/10000000
phần thưởng: 21.000 -- entropy: 1.160
phần thưởng thử 1.0

num_up: 140/9765 -- total_timesteps: 143360/10000000
phần thưởng: 21.000 -- entropy: 1.156
phần thưởng thử 7.0

num_up: 160/9765 -- total_timesteps: 163840/10000000
phần thưởng: 21.000 -- entropy: 1.020
phần thưởng thử 1.0

num_up: 180/9765 -- total_timesteps: 184320/10000000
phần thưởng: 25.000 -- entropy: 0.962
phần thưởn

KeyboardInterrupt: 

In [66]:
from typing import Tuple, Dict, Optional, Iterable, Callable

import numpy as np
import seaborn as sns

import matplotlib
from matplotlib import animation

from IPython.display import HTML

import gym
from gym import spaces
from gym.error import DependencyNotInstalled


import numpy as np
import matplotlib.pyplot as plt

In [67]:
def display_video(frames, interval=50):
    """
    Render a sequence of image frames as an inline HTML5 video.

    Adapted from:
    https://colab.research.google.com/github/deepmind/dm_control/blob/master/tutorial.ipynb

    Args:
        frames: non-empty sequence of images accepted by `Axes.imshow`.
        interval: delay between frames in milliseconds.

    Returns:
        IPython.display.HTML wrapping the encoded video.
    """
    orig_backend = matplotlib.get_backend()
    # Create the figure under the headless 'Agg' backend, then always restore
    # the caller's backend, even if figure creation fails.
    matplotlib.use('Agg')
    try:
        fig, ax = plt.subplots(1, 1, figsize=(5, 5))
    finally:
        matplotlib.use(orig_backend)

    try:
        ax.set_axis_off()
        ax.set_aspect('equal')
        ax.set_position([0, 0, 1, 1])
        im = ax.imshow(frames[0])

        def update(frame):
            im.set_data(frame)
            return [im]

        anim = animation.FuncAnimation(fig=fig, func=update, frames=frames,
                                       interval=interval, blit=True, repeat=False)
        return HTML(anim.to_html5_video())
    finally:
        # Release the figure so repeated calls do not accumulate open figures.
        plt.close(fig)

In [89]:
# Rebuild the environment with frame rendering enabled. This rebinds the
# global `env`, using the same wrapper stack as the training environment.
env = AtariWrappers(gym.make("BreakoutNoFrameskip-v4", render_mode='rgb_array'))
if 'FIRE' in env.unwrapped.get_action_meanings():
    # Automatically 'fire' at the start of each episode
    env = FireResetEnv(env)
# Clip all rewards to {-1, 0, +1}, then stack the last 4 observations.
env = gym.wrappers.FrameStack(ClipRewardEnv(env), 4)

In [146]:
# Play one episode (at most 1000 steps) with the current agent, sampling
# actions from the policy, and keep one rendered frame per step in `frames`.
obs,_ = env.reset()
k=0  # running sum of the rewards returned by env.step
frames = []
for i in range(1000):
    with torch.no_grad():
        obs_array = np.array(obs)
        # unsqueeze(0) adds a leading batch dimension of size 1.
        obs_tensor = torch.tensor(obs_array, dtype=torch.float32).unsqueeze(0).to(device)

        action, logprob, _, value = agent.get_action_and_value(obs_tensor)
        next_obs, reward, done,ter, info = env.step(action)
        obs=next_obs
        k += reward
        img = env.render()
        frames.append(img)
        # `done` is the terminated flag and `ter` the truncated flag.
        if done or ter:
            break
# The env is closed here, so the env-construction cell must be re-run before
# this cell can be executed again.
env.close()

In [149]:
# Show the frames recorded by the rollout cell as an inline video.
display_video(frames)