In [1]:
import gymnasium as gym
import time

env = gym.make("CartPole-v1", render_mode="human")

observation, info = env.reset()

for step in range(5000):  
    time.sleep(0.02)  

    action = env.action_space.sample()

    observation, reward, terminated, truncated, info = env.step(action)

    print(f"Step {step}: Reward={reward:.2f}, Terminated={terminated}, Truncated={truncated}")

    # Nếu game over, reset lại
    if terminated or truncated:
        print("Resetting environment...\n")
        observation, info = env.reset()

env.close()


Step 0: Reward=1.00, Terminated=False, Truncated=False
Step 1: Reward=1.00, Terminated=False, Truncated=False
Step 2: Reward=1.00, Terminated=False, Truncated=False
Step 3: Reward=1.00, Terminated=False, Truncated=False
Step 4: Reward=1.00, Terminated=False, Truncated=False
Step 5: Reward=1.00, Terminated=False, Truncated=False
Step 6: Reward=1.00, Terminated=False, Truncated=False
Step 7: Reward=1.00, Terminated=False, Truncated=False
Step 8: Reward=1.00, Terminated=False, Truncated=False
Step 9: Reward=1.00, Terminated=False, Truncated=False
Step 10: Reward=1.00, Terminated=False, Truncated=False
Step 11: Reward=1.00, Terminated=False, Truncated=False
Step 12: Reward=1.00, Terminated=False, Truncated=False
Step 13: Reward=1.00, Terminated=False, Truncated=False
Step 14: Reward=1.00, Terminated=False, Truncated=False
Step 15: Reward=1.00, Terminated=False, Truncated=False
Step 16: Reward=1.00, Terminated=False, Truncated=False
Step 17: Reward=1.00, Terminated=True, Truncated=False
Res

In [3]:
import gymnasium as gym
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import random
from collections import deque
import time

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

class QNetwork(nn.Module):
    def __init__(self, state_size, action_size):
        super(QNetwork, self).__init__()
        self.fc1 = nn.Linear(state_size, 64)
        self.fc2 = nn.Linear(64, 64)
        self.fc3 = nn.Linear(64, action_size)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        return self.fc3(x)

class DQNAgent:
    def __init__(self, state_size, action_size):
        self.state_size = state_size
        self.action_size = action_size
        self.memory = deque(maxlen=50000)
        self.gamma = 0.99
        self.epsilon = 1.0
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.995
        self.learning_rate = 0.001
        self.batch_size = 64

        self.model = QNetwork(state_size, action_size).to(device)
        self.target_model = QNetwork(state_size, action_size).to(device)
        self.update_target_model()

        self.optimizer = optim.Adam(self.model.parameters(), lr=self.learning_rate)
        self.criterion = nn.MSELoss()

    def update_target_model(self):
        self.target_model.load_state_dict(self.model.state_dict())

    def act(self, state):
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        state = torch.FloatTensor(state).unsqueeze(0).to(device)
        with torch.no_grad():
            act_values = self.model(state)
        return torch.argmax(act_values).item()

    def remember(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))

    def replay(self):
        if len(self.memory) < self.batch_size:
            return

        minibatch = random.sample(self.memory, self.batch_size)
        states, actions, rewards, next_states, dones = zip(*minibatch)

        states = torch.FloatTensor(states).to(device)
        actions = torch.LongTensor(actions).unsqueeze(1).to(device)
        rewards = torch.FloatTensor(rewards).unsqueeze(1).to(device)
        next_states = torch.FloatTensor(next_states).to(device)
        dones = torch.FloatTensor(dones).unsqueeze(1).to(device)

        current_q = self.model(states).gather(1, actions)
        next_q = self.target_model(next_states).max(1)[0].detach().unsqueeze(1)
        target_q = rewards + (1 - dones) * self.gamma * next_q

        loss = self.criterion(current_q, target_q)

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

env = gym.make("CartPole-v1", render_mode="human")
state_size = env.observation_space.shape[0]
action_size = env.action_space.n

agent = DQNAgent(state_size, action_size)

episodes = 500

for e in range(episodes):
    state, info = env.reset()
    total_reward = 0

    for time_t in range(500):
        time.sleep(0.01)  
        action = agent.act(state)
        next_state, reward, terminated, truncated, info = env.step(action)

        done = terminated or truncated
        agent.remember(state, action, reward, next_state, done)

        state = next_state
        total_reward += reward

        if done:
            agent.update_target_model()
            print(f"Episode {e+1}/{episodes}, Score: {total_reward}, Epsilon: {agent.epsilon:.2f}")
            break

        agent.replay()

env.close()


Episode 1/500, Score: 62.0, Epsilon: 1.00
Episode 2/500, Score: 21.0, Epsilon: 0.91
Episode 3/500, Score: 21.0, Epsilon: 0.82
Episode 4/500, Score: 26.0, Epsilon: 0.73
Episode 5/500, Score: 39.0, Epsilon: 0.60
Episode 6/500, Score: 27.0, Epsilon: 0.53
Episode 7/500, Score: 14.0, Epsilon: 0.49
Episode 8/500, Score: 13.0, Epsilon: 0.46
Episode 9/500, Score: 11.0, Epsilon: 0.44
Episode 10/500, Score: 19.0, Epsilon: 0.40
Episode 11/500, Score: 17.0, Epsilon: 0.37
Episode 12/500, Score: 9.0, Epsilon: 0.36
Episode 13/500, Score: 13.0, Epsilon: 0.34
Episode 14/500, Score: 11.0, Epsilon: 0.32
Episode 15/500, Score: 11.0, Epsilon: 0.30
Episode 16/500, Score: 10.0, Epsilon: 0.29
Episode 17/500, Score: 11.0, Epsilon: 0.28
Episode 18/500, Score: 10.0, Epsilon: 0.26
Episode 19/500, Score: 9.0, Epsilon: 0.25
Episode 20/500, Score: 11.0, Epsilon: 0.24
Episode 21/500, Score: 10.0, Epsilon: 0.23
Episode 22/500, Score: 9.0, Epsilon: 0.22
Episode 23/500, Score: 13.0, Epsilon: 0.21
Episode 24/500, Score: 

In [4]:
import gymnasium as gym
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import random
from collections import deque
import time

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

class QNetwork(nn.Module):
    def __init__(self, state_size, action_size):
        super(QNetwork, self).__init__()
        self.fc1 = nn.Linear(state_size, 64)
        self.fc2 = nn.Linear(64, 64)
        self.fc3 = nn.Linear(64, action_size)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        return self.fc3(x)

# DQN Agent and Double DQN
class DQNAgent:
    def __init__(self, state_size, action_size):
        self.state_size = state_size
        self.action_size = action_size
        self.memory = deque(maxlen=50000)
        self.gamma = 0.99
        self.epsilon = 1.0
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.995
        self.learning_rate = 0.001
        self.batch_size = 64

        self.model = QNetwork(state_size, action_size).to(device)
        self.target_model = QNetwork(state_size, action_size).to(device)
        self.update_target_model()

        self.optimizer = optim.Adam(self.model.parameters(), lr=self.learning_rate)
        self.criterion = nn.MSELoss()

    def update_target_model(self):
        self.target_model.load_state_dict(self.model.state_dict())

    def act(self, state):
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        state = torch.FloatTensor(state).unsqueeze(0).to(device)
        with torch.no_grad():
            act_values = self.model(state)
        return torch.argmax(act_values).item()

    def remember(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))

    def replay(self):
        if len(self.memory) < self.batch_size:
            return

        minibatch = random.sample(self.memory, self.batch_size)
        states, actions, rewards, next_states, dones = zip(*minibatch)

        states = torch.FloatTensor(states).to(device)
        actions = torch.LongTensor(actions).unsqueeze(1).to(device)
        rewards = torch.FloatTensor(rewards).unsqueeze(1).to(device)
        next_states = torch.FloatTensor(next_states).to(device)
        dones = torch.FloatTensor(dones).unsqueeze(1).to(device)

        next_action = torch.argmax(self.model(next_states), dim=1, keepdim=True)

        next_q = self.target_model(next_states).gather(1, next_action)

        target_q = rewards + (1 - dones) * self.gamma * next_q

        current_q = self.model(states).gather(1, actions)

        loss = self.criterion(current_q, target_q)

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

env = gym.make("CartPole-v1", render_mode="human")
state_size = env.observation_space.shape[0]
action_size = env.action_space.n

agent = DQNAgent(state_size, action_size)

episodes = 500

for e in range(episodes):
    state, info = env.reset()
    total_reward = 0

    for time_t in range(500):
        time.sleep(0.01) 
        action = agent.act(state)
        next_state, reward, terminated, truncated, info = env.step(action)

        done = terminated or truncated
        agent.remember(state, action, reward, next_state, done)

        state = next_state
        total_reward += reward

        if done:
            agent.update_target_model()
            print(f"Episode {e+1}/{episodes}, Score: {total_reward}, Epsilon: {agent.epsilon:.2f}")
            break

        agent.replay()

env.close()


Using device: cuda
Episode 1/500, Score: 29.0, Epsilon: 1.00
Episode 2/500, Score: 15.0, Epsilon: 1.00
Episode 3/500, Score: 12.0, Epsilon: 1.00
Episode 4/500, Score: 30.0, Epsilon: 0.90
Episode 5/500, Score: 20.0, Epsilon: 0.81
Episode 6/500, Score: 20.0, Epsilon: 0.74
Episode 7/500, Score: 17.0, Epsilon: 0.68
Episode 8/500, Score: 10.0, Epsilon: 0.65
Episode 9/500, Score: 9.0, Epsilon: 0.63
Episode 10/500, Score: 9.0, Epsilon: 0.60
Episode 11/500, Score: 16.0, Epsilon: 0.56
Episode 12/500, Score: 11.0, Epsilon: 0.53
Episode 13/500, Score: 10.0, Epsilon: 0.51
Episode 14/500, Score: 10.0, Epsilon: 0.49
Episode 15/500, Score: 12.0, Epsilon: 0.46
Episode 16/500, Score: 10.0, Epsilon: 0.44
Episode 17/500, Score: 10.0, Epsilon: 0.42
Episode 18/500, Score: 13.0, Epsilon: 0.40
Episode 19/500, Score: 8.0, Epsilon: 0.38
Episode 20/500, Score: 12.0, Epsilon: 0.36
Episode 21/500, Score: 9.0, Epsilon: 0.35
Episode 22/500, Score: 10.0, Epsilon: 0.33
Episode 23/500, Score: 8.0, Epsilon: 0.32
Episod

In [5]:
import gymnasium as gym
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import random
from collections import deque
import time

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Dueling DQN Network
class DuelingQNetwork(nn.Module):
    def __init__(self, state_size, action_size):
        super(DuelingQNetwork, self).__init__()
        self.fc1 = nn.Linear(state_size, 128)
        self.fc2 = nn.Linear(128, 128)

        # Value Stream
        self.value_fc = nn.Linear(128, 1)
        # Advantage Stream
        self.advantage_fc = nn.Linear(128, action_size)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))

        value = self.value_fc(x)
        advantage = self.advantage_fc(x)

        # Dueling formula
        q_values = value + (advantage - advantage.mean(dim=1, keepdim=True))
        return q_values

# Double + Dueling DQN
class DQNAgent:
    def __init__(self, state_size, action_size):
        self.state_size = state_size
        self.action_size = action_size
        self.memory = deque(maxlen=50000)
        self.gamma = 0.99
        self.epsilon = 1.0
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.995
        self.learning_rate = 0.001
        self.batch_size = 64

        self.model = DuelingQNetwork(state_size, action_size).to(device)
        self.target_model = DuelingQNetwork(state_size, action_size).to(device)
        self.update_target_model()

        self.optimizer = optim.Adam(self.model.parameters(), lr=self.learning_rate)
        self.criterion = nn.MSELoss()

    def update_target_model(self):
        self.target_model.load_state_dict(self.model.state_dict())

    def act(self, state):
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        state = torch.FloatTensor(state).unsqueeze(0).to(device)
        with torch.no_grad():
            act_values = self.model(state)
        return torch.argmax(act_values).item()

    def remember(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))

    def replay(self):
        if len(self.memory) < self.batch_size:
            return

        minibatch = random.sample(self.memory, self.batch_size)
        states, actions, rewards, next_states, dones = zip(*minibatch)

        states = torch.FloatTensor(states).to(device)
        actions = torch.LongTensor(actions).unsqueeze(1).to(device)
        rewards = torch.FloatTensor(rewards).unsqueeze(1).to(device)
        next_states = torch.FloatTensor(next_states).to(device)
        dones = torch.FloatTensor(dones).unsqueeze(1).to(device)

        # Double DQN
        next_action = torch.argmax(self.model(next_states), dim=1, keepdim=True)
        next_q = self.target_model(next_states).gather(1, next_action)

        target_q = rewards + (1 - dones) * self.gamma * next_q

        current_q = self.model(states).gather(1, actions)

        loss = self.criterion(current_q, target_q)

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

# Tạo môi trường
env = gym.make("CartPole-v1", render_mode="human")
state_size = env.observation_space.shape[0]
action_size = env.action_space.n

agent = DQNAgent(state_size, action_size)

episodes = 500

for e in range(episodes):
    state, info = env.reset()
    total_reward = 0

    for time_t in range(500):
        time.sleep(0.01)
        action = agent.act(state)
        next_state, reward, terminated, truncated, info = env.step(action)

        done = terminated or truncated
        agent.remember(state, action, reward, next_state, done)

        state = next_state
        total_reward += reward

        if done:
            agent.update_target_model()
            print(f"Episode {e+1}/{episodes}, Score: {total_reward}, Epsilon: {agent.epsilon:.2f}")
            break

        agent.replay()

env.close()


Using device: cuda
Episode 1/500, Score: 16.0, Epsilon: 1.00
Episode 2/500, Score: 19.0, Epsilon: 1.00
Episode 3/500, Score: 9.0, Epsilon: 1.00
Episode 4/500, Score: 20.0, Epsilon: 1.00
Episode 5/500, Score: 21.0, Epsilon: 0.90
Episode 6/500, Score: 19.0, Epsilon: 0.83
Episode 7/500, Score: 14.0, Epsilon: 0.77
Episode 8/500, Score: 24.0, Epsilon: 0.69
Episode 9/500, Score: 17.0, Epsilon: 0.64
Episode 10/500, Score: 12.0, Epsilon: 0.60
Episode 11/500, Score: 22.0, Epsilon: 0.54
Episode 12/500, Score: 13.0, Epsilon: 0.51
Episode 13/500, Score: 117.0, Epsilon: 0.29
Episode 14/500, Score: 124.0, Epsilon: 0.15
Episode 15/500, Score: 63.0, Epsilon: 0.11
Episode 16/500, Score: 62.0, Epsilon: 0.08
Episode 17/500, Score: 113.0, Epsilon: 0.05
Episode 18/500, Score: 114.0, Epsilon: 0.03
Episode 19/500, Score: 85.0, Epsilon: 0.02
Episode 20/500, Score: 187.0, Epsilon: 0.01
Episode 21/500, Score: 231.0, Epsilon: 0.01
Episode 22/500, Score: 161.0, Epsilon: 0.01
Episode 23/500, Score: 199.0, Epsilon:

In [6]:
import gymnasium as gym
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import random
from collections import deque, namedtuple
import time

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Noisy Linear Layer
class NoisyLinear(nn.Module):
    def __init__(self, in_features, out_features, std_init=0.5):
        super(NoisyLinear, self).__init__()
        self.in_features = in_features
        self.out_features = out_features

        self.weight_mu = nn.Parameter(torch.FloatTensor(out_features, in_features))
        self.weight_sigma = nn.Parameter(torch.FloatTensor(out_features, in_features))
        self.register_buffer('weight_epsilon', torch.FloatTensor(out_features, in_features))

        self.bias_mu = nn.Parameter(torch.FloatTensor(out_features))
        self.bias_sigma = nn.Parameter(torch.FloatTensor(out_features))
        self.register_buffer('bias_epsilon', torch.FloatTensor(out_features))

        self.std_init = std_init
        self.reset_parameters()
        self.reset_noise()

    def reset_parameters(self):
        mu_range = 1 / np.sqrt(self.in_features)
        self.weight_mu.data.uniform_(-mu_range, mu_range)
        self.weight_sigma.data.fill_(self.std_init / np.sqrt(self.in_features))

        self.bias_mu.data.uniform_(-mu_range, mu_range)
        self.bias_sigma.data.fill_(self.std_init / np.sqrt(self.out_features))

    def reset_noise(self):
        self.weight_epsilon.normal_()
        self.bias_epsilon.normal_()

    def forward(self, x):
        if self.training:
            weight = self.weight_mu + self.weight_sigma * self.weight_epsilon
            bias = self.bias_mu + self.bias_sigma * self.bias_epsilon
        else:
            weight = self.weight_mu
            bias = self.bias_mu
        return torch.nn.functional.linear(x, weight, bias)

# Dueling Network + Noisy Layer
class RainbowDQN(nn.Module):
    def __init__(self, state_size, action_size):
        super(RainbowDQN, self).__init__()
        self.fc1 = nn.Linear(state_size, 128)
        self.fc2 = nn.Linear(128, 128)

        self.value_fc = NoisyLinear(128, 1)
        self.advantage_fc = NoisyLinear(128, action_size)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))

        value = self.value_fc(x)
        advantage = self.advantage_fc(x)

        q_values = value + (advantage - advantage.mean(dim=1, keepdim=True))
        return q_values

    def reset_noise(self):
        self.value_fc.reset_noise()
        self.advantage_fc.reset_noise()

# Prioritized Replay Buffer
class PrioritizedReplayBuffer:
    def __init__(self, capacity, alpha=0.6):
        self.capacity = capacity
        self.alpha = alpha
        self.buffer = []
        self.pos = 0
        self.priorities = np.zeros((capacity,), dtype=np.float32)
        self.Transition = namedtuple('Transition', ('state', 'action', 'reward', 'next_state', 'done'))

    def push(self, state, action, reward, next_state, done):
        max_priority = self.priorities.max() if self.buffer else 1.0
        if len(self.buffer) < self.capacity:
            self.buffer.append(None)
        self.buffer[self.pos] = self.Transition(state, action, reward, next_state, done)
        self.priorities[self.pos] = max_priority
        self.pos = (self.pos + 1) % self.capacity

    def sample(self, batch_size, beta=0.4):
        if len(self.buffer) == self.capacity:
            priorities = self.priorities
        else:
            priorities = self.priorities[:self.pos]

        probs = priorities ** self.alpha
        probs /= probs.sum()

        indices = np.random.choice(len(self.buffer), batch_size, p=probs)
        samples = [self.buffer[idx] for idx in indices]

        total = len(self.buffer)
        weights = (total * probs[indices]) ** (-beta)
        weights /= weights.max()
        weights = torch.FloatTensor(weights).unsqueeze(1).to(device)

        batch = self.Transition(*zip(*samples))

        states = torch.FloatTensor(batch.state).to(device)
        actions = torch.LongTensor(batch.action).unsqueeze(1).to(device)
        rewards = torch.FloatTensor(batch.reward).unsqueeze(1).to(device)
        next_states = torch.FloatTensor(batch.next_state).to(device)
        dones = torch.FloatTensor(batch.done).unsqueeze(1).to(device)

        return states, actions, rewards, next_states, dones, indices, weights

    def update_priorities(self, indices, priorities):
        for idx, priority in zip(indices, priorities):
            self.priorities[idx] = priority

# Rainbow Agent
class RainbowAgent:
    def __init__(self, state_size, action_size):
        self.state_size = state_size
        self.action_size = action_size
        self.memory = PrioritizedReplayBuffer(50000)
        self.gamma = 0.99
        self.batch_size = 64
        self.multi_step = 3
        self.beta = 0.4

        self.model = RainbowDQN(state_size, action_size).to(device)
        self.target_model = RainbowDQN(state_size, action_size).to(device)
        self.update_target_model()

        self.optimizer = optim.Adam(self.model.parameters(), lr=0.0005)
        self.criterion = nn.SmoothL1Loss()

    def update_target_model(self):
        self.target_model.load_state_dict(self.model.state_dict())

    def act(self, state):
        state = torch.FloatTensor(state).unsqueeze(0).to(device)
        with torch.no_grad():
            q_values = self.model(state)
        return torch.argmax(q_values).item()

    def remember(self, state, action, reward, next_state, done):
        self.memory.push(state, action, reward, next_state, done)

    def replay(self):
        if len(self.memory.buffer) < self.batch_size:
            return

        states, actions, rewards, next_states, dones, indices, weights = self.memory.sample(self.batch_size, beta=self.beta)

        with torch.no_grad():
            next_action = torch.argmax(self.model(next_states), dim=1, keepdim=True)
            next_q = self.target_model(next_states).gather(1, next_action)

            target_q = rewards + (1 - dones) * self.gamma * next_q

        current_q = self.model(states).gather(1, actions)
        loss = (current_q - target_q).pow(2) * weights
        loss = loss.mean()

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        priorities = (current_q - target_q).abs().detach().cpu().numpy() + 1e-6
        self.memory.update_priorities(indices, priorities)

        self.model.reset_noise()
        self.target_model.reset_noise()

# Khởi tạo môi trường
env = gym.make("CartPole-v1", render_mode="human")
state_size = env.observation_space.shape[0]
action_size = env.action_space.n

agent = RainbowAgent(state_size, action_size)

episodes = 500

for e in range(episodes):
    state, info = env.reset()
    total_reward = 0

    for t in range(500):
        time.sleep(0.01)
        action = agent.act(state)
        next_state, reward, terminated, truncated, info = env.step(action)

        done = terminated or truncated
        agent.remember(state, action, reward, next_state, done)

        state = next_state
        total_reward += reward

        agent.replay()

        if done:
            agent.update_target_model()
            print(f"Episode {e+1}/{episodes}, Score: {total_reward}")
            break

env.close()


Using device: cuda
Episode 1/500, Score: 9.0
Episode 2/500, Score: 8.0
Episode 3/500, Score: 9.0
Episode 4/500, Score: 10.0
Episode 5/500, Score: 9.0
Episode 6/500, Score: 8.0
Episode 7/500, Score: 9.0


  self.priorities[idx] = priority


Episode 8/500, Score: 12.0
Episode 9/500, Score: 20.0
Episode 10/500, Score: 10.0
Episode 11/500, Score: 11.0
Episode 12/500, Score: 10.0
Episode 13/500, Score: 10.0
Episode 14/500, Score: 9.0
Episode 15/500, Score: 8.0
Episode 16/500, Score: 10.0
Episode 17/500, Score: 9.0
Episode 18/500, Score: 8.0
Episode 19/500, Score: 10.0
Episode 20/500, Score: 9.0
Episode 21/500, Score: 10.0
Episode 22/500, Score: 9.0
Episode 23/500, Score: 8.0
Episode 24/500, Score: 10.0
Episode 25/500, Score: 9.0
Episode 26/500, Score: 10.0
Episode 27/500, Score: 10.0
Episode 28/500, Score: 8.0
Episode 29/500, Score: 10.0
Episode 30/500, Score: 10.0
Episode 31/500, Score: 8.0
Episode 32/500, Score: 11.0
Episode 33/500, Score: 9.0
Episode 34/500, Score: 9.0
Episode 35/500, Score: 10.0
Episode 36/500, Score: 12.0
Episode 37/500, Score: 10.0
Episode 38/500, Score: 12.0
Episode 39/500, Score: 12.0
Episode 40/500, Score: 10.0
Episode 41/500, Score: 13.0
Episode 42/500, Score: 10.0
Episode 43/500, Score: 10.0
Episod

In [7]:
# Khởi tạo môi trường
env = gym.make("MountainCar-v0", render_mode="human")
state_size = env.observation_space.shape[0]
action_size = env.action_space.n

agent = RainbowAgent(state_size, action_size)

episodes = 1000  # MountainCar cần nhiều episode hơn

for e in range(episodes):
    state, info = env.reset()
    total_reward = 0

    for t in range(200):  # max steps MountainCar-v0 là 200
        time.sleep(0.01)
        action = agent.act(state)
        next_state, reward, terminated, truncated, info = env.step(action)

        # Optional: reward shaping để dễ học hơn
        # reward = reward + abs(next_state[1]) * 0.1  # cộng thêm vận tốc như bonus

        done = terminated or truncated
        agent.remember(state, action, reward, next_state, done)

        state = next_state
        total_reward += reward

        agent.replay()

        if done:
            agent.update_target_model()
            print(f"Episode {e+1}/{episodes}, Score: {total_reward}")
            break

env.close()


  self.priorities[idx] = priority


Episode 1/1000, Score: -200.0
Episode 2/1000, Score: -200.0
Episode 3/1000, Score: -200.0
Episode 4/1000, Score: -200.0
Episode 5/1000, Score: -200.0
Episode 6/1000, Score: -200.0
Episode 7/1000, Score: -200.0
Episode 8/1000, Score: -200.0
Episode 9/1000, Score: -200.0
Episode 10/1000, Score: -200.0
Episode 11/1000, Score: -200.0
Episode 12/1000, Score: -200.0
Episode 13/1000, Score: -200.0
Episode 14/1000, Score: -200.0
Episode 15/1000, Score: -200.0
Episode 16/1000, Score: -200.0
Episode 17/1000, Score: -200.0
Episode 18/1000, Score: -200.0
Episode 19/1000, Score: -200.0
Episode 20/1000, Score: -200.0
Episode 21/1000, Score: -200.0
Episode 22/1000, Score: -200.0
Episode 23/1000, Score: -200.0
Episode 24/1000, Score: -200.0
Episode 25/1000, Score: -200.0
Episode 26/1000, Score: -200.0
Episode 27/1000, Score: -200.0
Episode 28/1000, Score: -200.0
Episode 29/1000, Score: -200.0
Episode 30/1000, Score: -200.0
Episode 31/1000, Score: -200.0
Episode 32/1000, Score: -200.0
Episode 33/1000, 

In [8]:
import random
import numpy as np
import gymnasium as gym
import torch
import torch.nn as nn
import torch.optim as optim
from collections import deque
import time

# Thiết bị
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Rainbow Network (Dueling DQN + NoisyNet)
class RainbowNetwork(nn.Module):
    def __init__(self, state_size, action_size):
        super(RainbowNetwork, self).__init__()
        self.fc1 = nn.Linear(state_size, 128)
        self.fc2 = nn.Linear(128, 128)

        # Dueling
        self.fc_value = nn.Linear(128, 1)
        self.fc_advantage = nn.Linear(128, action_size)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))

        value = self.fc_value(x)
        advantage = self.fc_advantage(x)
        q = value + (advantage - advantage.mean(dim=1, keepdim=True))
        return q

# Agent
class RainbowAgent:
    def __init__(self, state_size, action_size):
        self.state_size = state_size
        self.action_size = action_size

        self.memory = deque(maxlen=50000)
        self.batch_size = 64
        self.gamma = 0.99
        self.lr = 0.001

        self.policy_net = RainbowNetwork(state_size, action_size).to(device)
        self.target_net = RainbowNetwork(state_size, action_size).to(device)
        self.update_target_model()

        self.optimizer = optim.Adam(self.policy_net.parameters(), lr=self.lr)

    def update_target_model(self):
        self.target_net.load_state_dict(self.policy_net.state_dict())

    def remember(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state, eps=0.1):
        if random.random() <= eps:
            return random.randrange(self.action_size)
        state = torch.FloatTensor(state).unsqueeze(0).to(device)
        with torch.no_grad():
            q_values = self.policy_net(state)
        return torch.argmax(q_values).item()

    def replay(self):
        if len(self.memory) < self.batch_size:
            return

        minibatch = random.sample(self.memory, self.batch_size)
        states, actions, rewards, next_states, dones = zip(*minibatch)

        states = torch.FloatTensor(states).to(device)
        actions = torch.LongTensor(actions).unsqueeze(1).to(device)
        rewards = torch.FloatTensor(rewards).unsqueeze(1).to(device)
        next_states = torch.FloatTensor(next_states).to(device)
        dones = torch.FloatTensor(dones).unsqueeze(1).to(device)

        curr_q = self.policy_net(states).gather(1, actions)
        next_q = self.target_net(next_states).max(1)[0].unsqueeze(1)
        target_q = rewards + (1 - dones) * self.gamma * next_q

        loss = nn.MSELoss()(curr_q, target_q)

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

# Training
env = gym.make("MountainCar-v0", render_mode="human")

state_size = env.observation_space.shape[0]
action_size = env.action_space.n

agent = RainbowAgent(state_size, action_size)

episodes = 500
max_steps = 200

for e in range(episodes):
    state, _ = env.reset()
    total_reward = 0

    for t in range(max_steps):
        time.sleep(0.01)
        action = agent.act(state, eps=0.05)  # exploration nhỏ thôi
        next_state, reward, terminated, truncated, _ = env.step(action)

        # 👉 Reward shaping: tăng phần thưởng nếu cao độ tăng
        position, velocity = next_state
        reward += abs(position - (-0.5))  # càng xa -0.5 càng tốt

        done = terminated or truncated
        agent.remember(state, action, reward, next_state, done)

        state = next_state
        total_reward += reward

        agent.replay()

        if done:
            agent.update_target_model()
            print(f"Episode {e+1}/{episodes}, Total reward: {total_reward:.2f}")
            break

env.close()


Episode 1/500, Total reward: -139.30
Episode 2/500, Total reward: -175.77
Episode 3/500, Total reward: -172.67
Episode 4/500, Total reward: -183.04
Episode 5/500, Total reward: -180.62
Episode 6/500, Total reward: -178.61
Episode 7/500, Total reward: -179.43
Episode 8/500, Total reward: -170.52
Episode 9/500, Total reward: -172.59
Episode 10/500, Total reward: -175.90
Episode 11/500, Total reward: -175.72
Episode 12/500, Total reward: -159.44
Episode 13/500, Total reward: -169.34
Episode 14/500, Total reward: -172.78
Episode 15/500, Total reward: -174.30
Episode 16/500, Total reward: -167.08
Episode 17/500, Total reward: -163.92
Episode 18/500, Total reward: -172.13
Episode 19/500, Total reward: -182.72
Episode 20/500, Total reward: -182.33
Episode 21/500, Total reward: -175.83
Episode 22/500, Total reward: -179.56
Episode 23/500, Total reward: -176.40
Episode 24/500, Total reward: -183.44
Episode 25/500, Total reward: -175.89
Episode 26/500, Total reward: -177.28
Episode 27/500, Total

In [9]:
import random
import numpy as np
import gymnasium as gym
import torch
import torch.nn as nn
import torch.optim as optim
from collections import deque, namedtuple
import time

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Rainbow Network (Dueling DQN)
class RainbowNetwork(nn.Module):
    def __init__(self, state_size, action_size):
        super(RainbowNetwork, self).__init__()
        self.fc1 = nn.Linear(state_size, 128)
        self.fc2 = nn.Linear(128, 128)
        self.fc_value = nn.Linear(128, 1)
        self.fc_advantage = nn.Linear(128, action_size)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        value = self.fc_value(x)
        advantage = self.fc_advantage(x)
        q = value + (advantage - advantage.mean(dim=1, keepdim=True))
        return q

# Prioritized Replay Buffer
class PrioritizedReplayBuffer:
    def __init__(self, capacity, alpha=0.6, n_step=3, gamma=0.99):
        self.capacity = capacity
        self.alpha = alpha
        self.memory = []
        self.pos = 0
        self.priorities = np.zeros((capacity,), dtype=np.float32)

        self.n_step = n_step
        self.gamma = gamma
        self.n_step_buffer = deque(maxlen=n_step)

    def _get_priority(self, td_error):
        return (np.abs(td_error) + 1e-5) ** self.alpha

    def push(self, state, action, reward, next_state, done):
        self.n_step_buffer.append((state, action, reward, next_state, done))
        
        if len(self.n_step_buffer) < self.n_step:
            return

        # N-step return
        state, action = self.n_step_buffer[0][:2]
        reward, next_state, done = self._calc_n_step_return()

        max_priority = self.priorities.max() if self.memory else 1.0

        if len(self.memory) < self.capacity:
            self.memory.append((state, action, reward, next_state, done))
        else:
            self.memory[self.pos] = (state, action, reward, next_state, done)

        self.priorities[self.pos] = max_priority
        self.pos = (self.pos + 1) % self.capacity

    def _calc_n_step_return(self):
        reward, next_state, done = self.n_step_buffer[-1][-3:]
        for transition in reversed(list(self.n_step_buffer)[:-1]):
            r, n_s, d = transition[2:]
            reward = r + self.gamma * reward
            next_state, done = (n_s, d) if d else (next_state, done)
        return reward, next_state, done

    def sample(self, batch_size, beta=0.4):
        if len(self.memory) == self.capacity:
            probs = self.priorities
        else:
            probs = self.priorities[:self.pos]

        probs /= probs.sum()

        indices = np.random.choice(len(self.memory), batch_size, p=probs)
        samples = [self.memory[idx] for idx in indices]

        total = len(self.memory)
        weights = (total * probs[indices]) ** (-beta)
        weights /= weights.max()
        weights = np.array(weights, dtype=np.float32)

        states, actions, rewards, next_states, dones = zip(*samples)
        return (np.array(states), np.array(actions), np.array(rewards),
                np.array(next_states), np.array(dones), indices, weights)

    def update_priorities(self, indices, td_errors):
        for idx, td_error in zip(indices, td_errors):
            self.priorities[idx] = self._get_priority(td_error)

# Agent
class RainbowAgent:
    def __init__(self, state_size, action_size):
        self.state_size = state_size
        self.action_size = action_size

        self.memory = PrioritizedReplayBuffer(50000)
        self.batch_size = 64
        self.gamma = 0.99
        self.lr = 0.001
        self.beta = 0.4

        self.policy_net = RainbowNetwork(state_size, action_size).to(device)
        self.target_net = RainbowNetwork(state_size, action_size).to(device)
        self.update_target_model()

        self.optimizer = optim.Adam(self.policy_net.parameters(), lr=self.lr)

    def update_target_model(self):
        self.target_net.load_state_dict(self.policy_net.state_dict())

    def act(self, state, eps=0.01):
        if random.random() <= eps:
            return random.randrange(self.action_size)
        state = torch.FloatTensor(state).unsqueeze(0).to(device)
        with torch.no_grad():
            q_values = self.policy_net(state)
        return torch.argmax(q_values).item()

    def remember(self, state, action, reward, next_state, done):
        self.memory.push(state, action, reward, next_state, done)

    def replay(self):
        if len(self.memory.memory) < self.batch_size:
            return

        states, actions, rewards, next_states, dones, indices, weights = self.memory.sample(self.batch_size, self.beta)

        states = torch.FloatTensor(states).to(device)
        actions = torch.LongTensor(actions).unsqueeze(1).to(device)
        rewards = torch.FloatTensor(rewards).unsqueeze(1).to(device)
        next_states = torch.FloatTensor(next_states).to(device)
        dones = torch.FloatTensor(dones).unsqueeze(1).to(device)
        weights = torch.FloatTensor(weights).unsqueeze(1).to(device)

        curr_q = self.policy_net(states).gather(1, actions)
        next_q = self.target_net(next_states).max(1)[0].unsqueeze(1)
        target_q = rewards + (1 - dones) * self.gamma * next_q

        td_errors = (curr_q - target_q).detach().cpu().numpy()
        loss = (weights * nn.MSELoss(reduction='none')(curr_q, target_q)).mean()

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        self.memory.update_priorities(indices, td_errors)

# Training
env = gym.make("MountainCar-v0", render_mode="human")
state_size = env.observation_space.shape[0]
action_size = env.action_space.n

agent = RainbowAgent(state_size, action_size)

episodes = 500
max_steps = 200

for e in range(episodes):
    state, _ = env.reset()
    total_reward = 0

    for t in range(max_steps):
        time.sleep(0.01)
        action = agent.act(state)
        next_state, reward, terminated, truncated, _ = env.step(action)

        position, velocity = next_state
        reward += abs(position - (-0.5))  # reward shaping

        done = terminated or truncated
        agent.remember(state, action, reward, next_state, done)

        state = next_state
        total_reward += reward

        agent.replay()

        if done:
            agent.update_target_model()
            print(f"Episode {e+1}/{episodes}, Total reward: {total_reward:.2f}")
            break

env.close()


  self.priorities[idx] = self._get_priority(td_error)


Episode 1/500, Total reward: -157.84
Episode 2/500, Total reward: -147.80
Episode 3/500, Total reward: -186.59
Episode 4/500, Total reward: -188.40
Episode 5/500, Total reward: -194.70
Episode 6/500, Total reward: -194.29
Episode 7/500, Total reward: -190.54
Episode 8/500, Total reward: -195.02
Episode 9/500, Total reward: -195.10
Episode 10/500, Total reward: -194.19
Episode 11/500, Total reward: -192.57
Episode 12/500, Total reward: -154.20
Episode 13/500, Total reward: -180.56
Episode 14/500, Total reward: -179.73
Episode 15/500, Total reward: -181.85
Episode 16/500, Total reward: -174.49
Episode 17/500, Total reward: -171.83
Episode 18/500, Total reward: -176.30
Episode 19/500, Total reward: -172.55
Episode 20/500, Total reward: -177.46
Episode 21/500, Total reward: -152.35
Episode 22/500, Total reward: -175.68
Episode 23/500, Total reward: -168.96
Episode 24/500, Total reward: -176.41
Episode 25/500, Total reward: -99.99
Episode 26/500, Total reward: -166.51
Episode 27/500, Total 