<a href="https://colab.research.google.com/github/tuongpsa/OOP/blob/master/lunarlanding.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [63]:
!pip install gymnasium
!pip install "gymnasium[atari, accept-rom-license]"
!apt-get install -y swig
!pip install gymnasium[box2d]

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following additional packages will be installed:
  swig4.0
Suggested packages:
  swig-doc swig-examples swig4.0-examples swig4.0-doc
The following NEW packages will be installed:
  swig swig4.0
0 upgraded, 2 newly installed, 0 to remove and 1 not upgraded.
Need to get 1,116 kB of archives.
After this operation, 5,542 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy/universe amd64 swig4.0 amd64 4.0.2-1ubuntu1 [1,110 kB]
Get:2 http://archive.ubuntu.com/ubuntu jammy/universe amd64 swig all 4.0.2-1ubuntu1 [5,632 B]
Fetched 1,116 kB in 0s (3,570 kB/s)
Selecting previously unselected package swig4.0.
(Reading database ... 117528 files and directories currently installed.)
Preparing to unpack .../swig4.0_4.0.2-1ubuntu1_amd64.deb ...
Unpacking swig4.0 (4.0.2-1ubuntu1) ...
Selecting previously unselected package swig.
Preparing to unpack .../swig_4.0.2-1ubun

In [64]:
import os
import random
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torch.autograd as autograd
from torch.autograd import Variable
from collections import deque, namedtuple

In [65]:
import gymnasium as gym
# Create the Lunar Lander environment (the environment id was upgraded to v3).
env = gym.make('LunarLander-v3')

# Sizes the agent is built from: observation shape, observation length, action count.
state_shape = env.observation_space.shape
state_size = state_shape[0]
number_actions = env.action_space.n

print('State shape: ', state_shape)
print('State size: ', state_size)
print('Number of actions: ', number_actions)

State shape:  (8,)
State size:  8
Number of actions:  4


In [None]:
# Training hyperparameters, read as module-level globals by the agent code below.
learning_rate = 0.0005           # step size given to the Adam optimizer
minibatch_size = 100             # number of transitions sampled per learning update
discount_factor = 0.99           # gamma in the Bellman target
replay_buffer_size = 100_000     # capacity of the replay memory, in transitions
interpolation_parameter = 0.001  # tau used by the soft target-network update

In [66]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class Network(nn.Module):
    """Dueling Q-network: a shared feature trunk feeding separate value and advantage heads.

    Args:
        state_size: Number of features in one state vector.
        action_size: Number of discrete actions; one Q-value is produced per action.
        seed: Passed to ``torch.manual_seed`` before the layers are created so that
            weight initialisation is repeatable. This reseeds the global torch RNG.
    """

    def __init__(self, state_size, action_size, seed=42):
        super(Network, self).__init__()
        self.seed = torch.manual_seed(seed)

        # Part 1: feature extraction shared by both streams.
        self.fc1 = nn.Linear(state_size, 64)
        self.fc2 = nn.Linear(64, 64)

        # Part 2: dueling streams.
        # Value stream V(s): a single scalar per state.
        self.value_stream = nn.Linear(64, 1)

        # Advantage stream A(s, a): one entry per action.
        self.advantage_stream = nn.Linear(64, action_size)

    def forward(self, state):
        """Compute Q-values for ``state``.

        Args:
            state: Float tensor whose last dimension has ``state_size`` entries.
                Both a batch ``(batch, state_size)`` and a single un-batched
                vector ``(state_size,)`` are accepted.

        Returns:
            Tensor with the same leading dimensions as ``state`` and
            ``action_size`` entries in the last dimension.
        """
        # Shared feature layers.
        x = F.relu(self.fc1(state))
        x = F.relu(self.fc2(x))

        # Split into the two streams.
        val = self.value_stream(x)       # V(s)
        adv = self.advantage_stream(x)   # A(s, a)

        # Aggregation layer: Q(s, a) = V(s) + (A(s, a) - mean_a A(s, a)).
        # The mean runs over the last (action) dimension, so un-batched input works too.
        return val + (adv - adv.mean(dim=-1, keepdim=True))

In [None]:
from collections import deque, namedtuple
import random
import numpy as np
import torch

class ReplayMemory(object):
    """Fixed-capacity experience replay buffer.

    Stores transition tuples and returns random minibatches of them, which breaks
    the correlation between consecutive samples.

    Attributes:
        device: torch device that sampled tensors are moved to (CUDA when available).
        capacity: Maximum number of events requested by the caller.
        memory: Bounded ``deque`` of stored events, oldest first.
    """

    def __init__(self, capacity):
        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        self.capacity = capacity
        # A bounded deque drops the oldest event in O(1) when full; deleting index 0
        # of a list would shift every remaining element on each push.
        # A non-positive capacity yields a buffer that never retains anything.
        self.memory = deque(maxlen=max(int(capacity), 0))

    def push(self, event):
        """Store one event ``(state, action, reward, next_state, done)``.

        When the buffer is already full the oldest event is discarded.
        """
        self.memory.append(event)

    def sample(self, batch_size):
        """Draw ``batch_size`` distinct events at random and stack them into tensors.

        Returns:
            Tuple ``(states, next_states, actions, rewards, dones)`` -- note the
            order. Each element is a tensor on ``self.device`` with one row per
            sampled event; ``actions`` is int64, the others are float32.

        Raises:
            ValueError: If ``batch_size`` exceeds the number of stored events
                (raised by ``random.sample``).
        """
        experiences = [e for e in random.sample(self.memory, k=batch_size) if e is not None]

        def column(index):
            # Stack field `index` of every sampled event into a 2-D numpy array.
            return np.vstack([e[index] for e in experiences])

        states = torch.from_numpy(column(0)).float().to(self.device)
        actions = torch.from_numpy(column(1)).long().to(self.device)
        rewards = torch.from_numpy(column(2)).float().to(self.device)
        next_states = torch.from_numpy(column(3)).float().to(self.device)
        # Done flags go through uint8 so boolean values become 0.0 / 1.0.
        dones = torch.from_numpy(column(4).astype(np.uint8)).float().to(self.device)

        return states, next_states, actions, rewards, dones

In [None]:
import numpy as np
import random
import torch
import torch.optim as optim
import torch.nn.functional as F

class Agent():
    """DQN agent with a local/target network pair, replay memory and soft target updates.

    The hyperparameters ``learning_rate``, ``replay_buffer_size``, ``minibatch_size``,
    ``discount_factor`` and ``interpolation_parameter`` are read from module-level
    globals, so they must be defined before the agent is created and used.

    Args:
        state_size: Number of features in one state vector.
        action_size: Number of discrete actions.
        update_every: A learning update is attempted once every this many calls
            to :meth:`step`. Must be at least 1.
    """

    def __init__(self, state_size, action_size, update_every=4):
        if update_every < 1:
            raise ValueError("update_every must be >= 1, got {!r}".format(update_every))

        # Prefer the GPU when one is available.
        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

        self.state_size = state_size
        self.action_size = action_size
        self.update_every = update_every

        # Local network is trained directly; the target network only follows it
        # through soft updates. `Network` is the dueling architecture defined above.
        self.local_qnetwork = Network(state_size, action_size).to(self.device)
        self.target_qnetwork = Network(state_size, action_size).to(self.device)

        # Only the local network's parameters are optimised.
        self.optimizer = optim.Adam(self.local_qnetwork.parameters(), lr=learning_rate)

        # Experience replay buffer.
        self.memory = ReplayMemory(replay_buffer_size)

        # Step counter, kept modulo `update_every`, that schedules learning updates.
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        """Record one transition and, every ``update_every`` calls, learn from a minibatch."""
        self.memory.push((state, action, reward, next_state, done))

        self.t_step = (self.t_step + 1) % self.update_every
        if self.t_step != 0:
            return

        # Learn only once the buffer holds more than one minibatch of transitions.
        if len(self.memory.memory) > minibatch_size:
            experiences = self.memory.sample(minibatch_size)
            self.learn(experiences, discount_factor)

    def act(self, state, epsilon=0.):
        """Choose an action for ``state`` with an epsilon-greedy policy.

        Args:
            state: Numpy array holding a single state (a batch dimension is added here).
            epsilon: Probability of picking a uniformly random action instead of
                the action with the highest predicted Q-value.

        Returns:
            Index of the chosen action (a numpy integer).
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(self.device)

        # Evaluate without tracking gradients, then restore training mode.
        self.local_qnetwork.eval()
        with torch.no_grad():
            action_values = self.local_qnetwork(state)
        self.local_qnetwork.train()

        if random.random() > epsilon:
            return np.argmax(action_values.cpu().numpy())
        return random.choice(np.arange(self.action_size))

    def learn(self, experiences, discount_factor):
        """Run one gradient step on the local network, then soft-update the target network.

        Args:
            experiences: Tuple ``(states, next_states, actions, rewards, dones)``
                of tensors, in the order returned by ``ReplayMemory.sample``.
            discount_factor: Gamma applied to the bootstrapped next-state value.
        """
        states, next_states, actions, rewards, dones = experiences

        # Targets: r + gamma * max_a' Q_target(s', a'), with the bootstrap term
        # zeroed where done == 1. No gradient flows through the target network.
        # (A Double DQN variant would pick the arg-max action with the local network
        # and evaluate that action with the target network instead.)
        with torch.no_grad():
            next_q_targets = self.target_qnetwork(next_states).max(1)[0].unsqueeze(1)
            q_targets = rewards + (discount_factor * next_q_targets * (1 - dones))

        # Predictions: Q_local(s, a) for the actions that were actually taken.
        q_expected = self.local_qnetwork(states).gather(1, actions)

        loss = F.mse_loss(q_expected, q_targets)

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # Move the target network a small step towards the local network.
        self.soft_update(self.local_qnetwork, self.target_qnetwork, interpolation_parameter)

    def soft_update(self, local_model, target_model, interpolation_parameter):
        """Blend the local parameters into the target model in place.

        theta_target = tau * theta_local + (1 - tau) * theta_target,
        where tau is ``interpolation_parameter``.
        """
        tau = interpolation_parameter
        with torch.no_grad():
            for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
                target_param.copy_(tau * local_param + (1.0 - tau) * target_param)

In [None]:
# Build the agent from the environment's observation length and action count.
agent = Agent(state_size=state_size, action_size=number_actions)

In [None]:
# Training schedule and epsilon-greedy exploration settings.
number_episodes = 2000
maximum_number_timesteps_per_episode = 1000
epsilon_starting_value  = 1.0
epsilon_ending_value  = 0.01
epsilon_decay_value  = 0.995
epsilon = epsilon_starting_value
# Rolling window holding the scores of the most recent 100 episodes.
scores_on_100_episodes = deque(maxlen = 100)

for episode in range(1, number_episodes + 1):
  state, _ = env.reset()
  score = 0
  for t in range(maximum_number_timesteps_per_episode):
    action = agent.act(state, epsilon)
    next_state, reward, terminated, truncated, _ = env.step(action)
    # Only a real termination is stored as `done`; a time-limit truncation is not
    # a terminal state, so the learning target keeps bootstrapping through it.
    agent.step(state, action, reward, next_state, terminated)
    state = next_state
    score += reward
    # The episode is over in both cases; the environment must be reset before stepping again.
    if terminated or truncated:
      break
  scores_on_100_episodes.append(score)
  # Multiplicative epsilon decay with a lower bound.
  epsilon = max(epsilon_ending_value, epsilon_decay_value * epsilon)
  average_score = np.mean(scores_on_100_episodes)
  # '\r' rewrites the progress line in place; every 100th episode is kept on its own line.
  print('\rEpisode {}\tAverage Score: {:.2f}'.format(episode, average_score), end = "")
  if episode % 100 == 0:
    print('\rEpisode {}\tAverage Score: {:.2f}'.format(episode, average_score))
  # "Solved" requires a full 100-episode window, so a lucky early episode cannot
  # end training or make the reported count (episode - 100) negative.
  window_is_full = len(scores_on_100_episodes) == scores_on_100_episodes.maxlen
  if window_is_full and average_score >= 200.0:
    print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'.format(episode - 100, average_score))
    torch.save(agent.local_qnetwork.state_dict(), 'checkpoint.pth')
    break

Episode 100	Average Score: -177.49
Episode 200	Average Score: -111.06
Episode 300	Average Score: -32.23
Episode 400	Average Score: -21.83
Episode 500	Average Score: 44.07
Episode 600	Average Score: 111.11
Episode 700	Average Score: 143.82
Episode 800	Average Score: 169.74
Episode 855	Average Score: 201.48
Environment solved in 755 episodes!	Average Score: 201.48


In [None]:
import glob
import io
import base64
import imageio
from IPython.display import HTML, display
import os
import gymnasium as gym

def show_video_of_model(agent, env_name, filename='video.mp4'):
    """Run one episode of ``agent`` in ``env_name`` and save the rendered frames as a video.

    Args:
        agent: Object exposing ``act(state)``; it is called without an epsilon
            argument, so the agent's own default exploration setting applies.
        env_name: Gymnasium environment id, created with ``render_mode='rgb_array'``.
        filename: Path of the video file written through ``imageio.mimsave`` at 30 fps.
    """
    env = gym.make(env_name, render_mode='rgb_array')
    frames = []
    try:
        state, _ = env.reset()
        done = False

        print(f"üöÄ ƒêang ghi h√¨nh m√¥i tr∆∞·ªùng: {env_name}...")

        while not done:
            # Capture the frame for the current state before acting on it.
            frames.append(env.render())

            action = agent.act(state)
            # Unwrap numpy/tensor scalars to a plain Python number; pass plain values through.
            action_step = action.item() if hasattr(action, "item") else action

            state, _reward, terminated, truncated, _ = env.step(action_step)
            done = terminated or truncated
    finally:
        # Release the environment even when acting, stepping or rendering fails.
        env.close()

    imageio.mimsave(filename, frames, fps=30)
    print(f"‚úÖ ƒê√£ l∆∞u video v√†o file: {filename}")

def show_video(filename='video.mp4'):
    """Display the video at ``filename`` inline as a base64-embedded HTML ``<video>`` element.

    Prints a message instead when the file does not exist. Any error raised while
    reading or displaying the file is reported with ``print`` rather than propagated.
    """
    if not os.path.exists(filename):
        print(f"‚ùå Kh√¥ng t√¨m th·∫•y file video: {filename}")
        return

    try:
        # Read-only access is sufficient, and the context manager closes the handle.
        with open(filename, 'rb') as video_file:
            encoded = base64.b64encode(video_file.read())
        # HTML snippet that embeds the video bytes directly in the page.
        display(HTML(data='''<video alt="test" autoplay
                loop controls style="height: 400px;">
                <source src="data:video/mp4;base64,{0}" type="video/mp4" />
             </video>'''.format(encoded.decode('ascii'))))
    except Exception as e:
        print(f"Error displaying video {filename}: {e}")

# --- MAIN ---

# 1. Record an episode first (LunarLander-v3 is the environment id used on current Colab).
show_video_of_model(agent, env_name='LunarLander-v3', filename='ket_qua_cuoi.mp4')

# 2. Play back the video that was just recorded.
show_video('ket_qua_cuoi.mp4')

üöÄ ƒêang ghi h√¨nh m√¥i tr∆∞·ªùng: LunarLander-v3...




‚úÖ ƒê√£ l∆∞u video v√†o file: ket_qua_cuoi.mp4
