<a href="https://colab.research.google.com/github/velpulakaran/reinforcement-learning/blob/main/RLML_LAB_6.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#2303A51587

V.KARAN

Batch-09


Implementing Deep Q-Networks (DQN) for Atari games or similar environments using functions. Give the Python code, an explanation, and the formulas used for execution.

In [None]:
# PART 1: Imports, Replay Buffer, and Q-Network (Gymnasium compatible)

import random
import numpy as np
from collections import deque, namedtuple
import gymnasium as gym   # ✅ Use Gymnasium instead of Gym
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

# Transition tuple for replay buffer
Transition = namedtuple('Transition', ('state', 'action', 'reward', 'next_state', 'done'))


# Replay Buffer
class ReplayBuffer:
    """Bounded FIFO store of transitions with uniform random sampling.

    The underlying deque is created with ``maxlen=capacity``, so once it is
    full every new push silently discards the oldest transition.
    """

    def __init__(self, capacity):
        self.buffer = deque(maxlen=capacity)

    def push(self, *args):
        """Store one transition; ``args`` are the ``Transition`` fields in order."""
        self.buffer.append(Transition(*args))

    def sample(self, batch_size):
        """Draw ``batch_size`` distinct transitions and return them as tensors.

        Returns:
            A tuple ``(states, actions, rewards, next_states, dones)``.
            ``actions`` is int64; every other tensor is float32. All tensors
            are created on the CPU.

        Raises:
            ValueError: propagated from ``random.sample`` when ``batch_size``
                exceeds the number of stored transitions.
        """
        batch = random.sample(self.buffer, batch_size)

        def stack(field, dtype):
            # Stack the field into ONE ndarray before handing it to torch.
            # Building a tensor straight from a Python list of ndarrays goes
            # element by element, which is slow and raises a UserWarning.
            column = [getattr(transition, field) for transition in batch]
            return torch.from_numpy(np.array(column, dtype=dtype))

        states = stack('state', np.float32)
        actions = stack('action', np.int64)
        rewards = stack('reward', np.float32)
        next_states = stack('next_state', np.float32)
        dones = stack('done', np.float32)
        return states, actions, rewards, next_states, dones

    def __len__(self):
        return len(self.buffer)

# Q-Network (MLP for CartPole)
class QNetwork(nn.Module):
    """Fully connected network that outputs one Q-value per action.

    Architecture: state_dim -> 128 -> 128 -> n_actions, with ReLU after each
    of the two hidden layers and no activation on the output.
    """

    def __init__(self, state_dim, n_actions):
        super().__init__()
        hidden_units = 128
        layers = [
            nn.Linear(state_dim, hidden_units),
            nn.ReLU(),
            nn.Linear(hidden_units, hidden_units),
            nn.ReLU(),
            nn.Linear(hidden_units, n_actions),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Return the Q-values produced by the layer stack for input ``x``."""
        q_values = self.net(x)
        return q_values

  return datetime.utcnow().replace(tzinfo=utc)


In [None]:
# PART 2: DQN Agent

class DQNAgent:
    """DQN agent: an online Q-network, a target Q-network and a replay buffer.

    Action selection is epsilon-greedy with an exponentially decaying epsilon;
    learning minimises the MSE between Q(s, a) and the one-step TD target
    r + gamma * (1 - done) * max_a' Q_target(s', a').
    """

    def __init__(self, state_dim, n_actions, device):
        self.device = device
        self.n_actions = n_actions

        # Online network is trained; the target network starts as an exact copy.
        self.online = QNetwork(state_dim, n_actions).to(device)
        self.target = QNetwork(state_dim, n_actions).to(device)
        self.target.load_state_dict(self.online.state_dict())

        # Experience storage and the optimizer for the online network only.
        self.replay = ReplayBuffer(50000)
        self.optimizer = optim.Adam(self.online.parameters(), lr=1e-3)

        # Learning hyperparameters
        self.gamma = 0.99
        self.batch_size = 64
        self.min_replay_size = 1000
        self.target_update_freq = 1000

        # Exploration schedule: epsilon goes from start towards final as
        # exp(-steps_done / epsilon_decay).
        self.epsilon_start = 1.0
        self.epsilon_final = 0.01
        self.epsilon_decay = 5000
        self.steps_done = 0

    def select_action(self, state):
        """Pick an action epsilon-greedily and advance the step counter.

        Epsilon is computed from the step count *before* it is incremented.
        Returns a Python int action index.
        """
        span = self.epsilon_start - self.epsilon_final
        decay = np.exp(-1.0 * self.steps_done / self.epsilon_decay)
        epsilon = self.epsilon_final + span * decay
        self.steps_done += 1

        # Explore: uniform random action.
        if random.random() < epsilon:
            return random.randrange(self.n_actions)

        # Exploit: greedy action under the online network.
        with torch.no_grad():
            observation = torch.tensor(state, dtype=torch.float32).unsqueeze(0).to(self.device)
            greedy = self.online(observation).argmax(dim=1)
            return int(greedy.item())

    def compute_td_loss(self):
        """Run one gradient step on a sampled minibatch and return the loss.

        Returns:
            The scalar MSE loss of this step as a Python float.
        """
        minibatch = self.replay.sample(self.batch_size)
        states, actions, rewards, next_states, dones = (
            tensor.to(self.device) for tensor in minibatch
        )

        # Q(s, a) for the actions that were actually taken.
        all_q = self.online(states)
        chosen_q = all_q.gather(1, actions.unsqueeze(1)).squeeze(1)

        # Bootstrapped target from the target network; no gradient flows here.
        with torch.no_grad():
            best_next_q = self.target(next_states).max(dim=1).values
            td_target = rewards + self.gamma * (1 - dones) * best_next_q

        loss = F.mse_loss(chosen_q, td_target)

        self.optimizer.zero_grad()
        loss.backward()
        # Cap the global gradient norm at 10 before the parameter update.
        nn.utils.clip_grad_norm_(self.online.parameters(), 10.0)
        self.optimizer.step()
        return loss.item()

In [None]:
def _reset_env(env):
    """Reset ``env`` and return only the observation, for either Gym API."""
    reset_output = env.reset()
    # Newer API returns (observation, info); the older one returns the
    # observation alone.
    return reset_output[0] if isinstance(reset_output, tuple) else reset_output


def train_dqn(env_id="CartPole-v1", num_frames=20000, log_interval=1000):
    """Train a DQNAgent on ``env_id`` for ``num_frames`` environment steps.

    Args:
        env_id: Environment id passed to ``gym.make``.
        num_frames: Total number of environment steps to run.
        log_interval: Print running averages every this many steps.

    Returns:
        ``(agent, all_rewards, losses)``: the trained agent, the list of
        completed-episode returns, and the list of per-update losses.
    """
    env = gym.make(env_id)
    try:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        agent = DQNAgent(
            state_dim=env.observation_space.shape[0],
            n_actions=env.action_space.n,
            device=device
        )

        state = _reset_env(env)
        episode_reward = 0
        all_rewards, losses = [], []

        for frame_idx in range(1, num_frames + 1):
            action = agent.select_action(state)
            step_output = env.step(action)

            # Handle both API versions
            if len(step_output) == 5:  # new API
                next_state, reward, terminated, truncated, _ = step_output
            else:                      # old API: one combined flag only
                next_state, reward, terminated, _ = step_output
                truncated = False
            episode_over = terminated or truncated

            # Store only TRUE termination as the done flag. A time-limit
            # truncation does not make next_state terminal, so the TD target
            # must still bootstrap from it; storing `terminated or truncated`
            # would wrongly zero the bootstrap term for truncated episodes.
            agent.replay.push(state, action, reward, next_state, float(terminated))
            state = next_state
            episode_reward += reward

            # Start learning only once the buffer holds enough transitions.
            if len(agent.replay) > agent.min_replay_size:
                losses.append(agent.compute_td_loss())

            # Update target
            if frame_idx % agent.target_update_freq == 0:
                agent.target.load_state_dict(agent.online.state_dict())

            # End of episode (terminated or truncated): record return, reset.
            if episode_over:
                state = _reset_env(env)
                all_rewards.append(episode_reward)
                episode_reward = 0

            # Logging
            if frame_idx % log_interval == 0:
                avg_reward = np.mean(all_rewards[-10:]) if all_rewards else 0.0
                avg_loss = np.mean(losses[-100:]) if losses else 0.0
                print(f"Frame {frame_idx}, AvgReward(10) {avg_reward:.2f}, AvgLoss(100) {avg_loss:.4f}")
    finally:
        # Release the environment even if training raises.
        env.close()

    return agent, all_rewards, losses

In [None]:
if __name__ == "__main__":
    # Script entry point: train on CartPole-v1 for 20000 steps, logging every
    # 2000 steps, then show the returns of the most recent episodes.
    trained_agent, rewards, losses = train_dqn("CartPole-v1", num_frames=20000, log_interval=2000)

    print("Training complete!")
    print("Last 10 episode rewards:", rewards[-10:])

  states = torch.tensor([b.state for b in batch], dtype=torch.float32)


Frame 2000, AvgReward(10) 13.80, AvgLoss(100) 0.0008
Frame 4000, AvgReward(10) 32.30, AvgLoss(100) 0.0265
Frame 6000, AvgReward(10) 59.30, AvgLoss(100) 0.0297
Frame 8000, AvgReward(10) 114.60, AvgLoss(100) 0.0559
Frame 10000, AvgReward(10) 115.00, AvgLoss(100) 0.1126
Frame 12000, AvgReward(10) 135.90, AvgLoss(100) 0.1212
Frame 14000, AvgReward(10) 123.10, AvgLoss(100) 0.2170
Frame 16000, AvgReward(10) 142.90, AvgLoss(100) 0.3196
Frame 18000, AvgReward(10) 119.40, AvgLoss(100) 0.3120
Frame 20000, AvgReward(10) 117.60, AvgLoss(100) 0.2839
Training complete!
Last 10 episode rewards: [127.0, 168.0, 115.0, 133.0, 145.0, 110.0, 119.0, 128.0, 118.0, 13.0]
