In [1]:
# Enhanced DQN with Optional Double and Dueling Architecture
# Saves best-performing model and records gameplay video

import os
import gymnasium as gym
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import random
from collections import deque
from datetime import datetime
from skimage.transform import resize
from skimage.color import rgb2gray
from torch.utils.tensorboard import SummaryWriter
from gymnasium.wrappers import RecordVideo
from IPython.display import Video
import os

  import distutils as _distutils
2025-05-10 02:46:22.253071: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:

# --- Device ---
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
device

device(type='cuda')

In [5]:
# --- Preprocessing ---
def preprocess_observation(obs):
    gray = rgb2gray(obs)
    resized = resize(gray, (84, 84), mode='constant')
    return np.uint8(resized * 255)

In [6]:
# --- Replay Buffer ---
class ReplayBuffer:
    def __init__(self, size):
        self.buffer = deque(maxlen=size)

    def add(self, experience):
        self.buffer.append(experience)

    def sample(self, batch_size):
        return random.sample(self.buffer, batch_size)

    def __len__(self):
        return len(self.buffer)

In [7]:
# --- DQN Model ---
class DQN(nn.Module):
    def __init__(self, action_space, dueling=False):
        super(DQN, self).__init__()
        self.dueling = dueling
        self.conv1 = nn.Conv2d(4, 32, kernel_size=8, stride=4)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=4, stride=2)
        self.conv3 = nn.Conv2d(64, 64, kernel_size=3, stride=1)
        self.fc1 = nn.Linear(64 * 7 * 7, 512)

        if self.dueling:
            self.value = nn.Linear(512, 1)
            self.advantage = nn.Linear(512, action_space)
        else:
            self.fc2 = nn.Linear(512, action_space)

    def forward(self, x):
        x = torch.relu(self.conv1(x))
        x = torch.relu(self.conv2(x))
        x = torch.relu(self.conv3(x))
        x = x.view(x.size(0), -1)
        x = torch.relu(self.fc1(x))

        if self.dueling:
            value = self.value(x)
            advantage = self.advantage(x)
            return value + (advantage - advantage.mean(dim=1, keepdim=True))
        else:
            return self.fc2(x)

In [8]:
# --- Action Selection ---
def select_action(state, epsilon, action_space, policy_net):
    if random.random() < epsilon:
        return random.randrange(action_space)
    with torch.no_grad():
        state_tensor = torch.FloatTensor(state).unsqueeze(0).to(device)
        q_values = policy_net(state_tensor)
        return q_values.argmax(1).item()

In [9]:
# --- Target Calculation ---
def compute_targets(batch, policy_net, target_net, gamma, double_dqn):
    states = torch.FloatTensor(np.array([b[0] for b in batch])).to(device)
    actions = torch.LongTensor(np.array([b[1] for b in batch])).to(device)
    rewards = torch.FloatTensor(np.array([b[2] for b in batch])).to(device)
    next_states = torch.FloatTensor(np.array([b[3] for b in batch])).to(device)
    dones = torch.FloatTensor(np.array([b[4] for b in batch])).to(device)

    q_values = policy_net(states).gather(1, actions.unsqueeze(1)).squeeze(1)

    with torch.no_grad():
        if double_dqn:
            next_actions = policy_net(next_states).argmax(1)
            next_qs = target_net(next_states).gather(1, next_actions.unsqueeze(1)).squeeze(1)
        else:
            next_qs = target_net(next_states).max(1)[0]

        targets = rewards + (1 - dones) * gamma * next_qs

    return q_values, targets

In [None]:
# --- Training Loop ---
def train_dqn(env_name='ALE/Breakout-v5', episodes=1000,epsilon=1.0,epsilon_min=0.1,epsilon_decay=0.995):
    env = gym.make(env_name, render_mode='rgb_array', obs_type='rgb')
    action_space = env.action_space.n

    policy_net = DQN(action_space, dueling=use_dueling_dqn).to(device)
    target_net = DQN(action_space, dueling=use_dueling_dqn).to(device)
    target_net.load_state_dict(policy_net.state_dict())

    optimizer = optim.Adam(policy_net.parameters(), lr=0.0001)
    criterion = nn.MSELoss()

    replay_buffer = ReplayBuffer(100000)
    batch_size = 32
    gamma = 0.99
    target_update = 10

    writer = SummaryWriter(log_dir=f'runs/{save_tag}_{datetime.now().strftime("%Y%m%d_%H%M%S")}')
    best_avg_reward = -float('inf')
    reward_history = []

    for episode in range(episodes):
        obs, _ = env.reset()
        processed = preprocess_observation(obs)
        state = np.stack([processed] * 4, axis=0)
        total_reward = 0
        done = False

        while not done:
            action = select_action(state, epsilon, action_space, policy_net)
            next_obs, reward, terminated, truncated, _ = env.step(action)
            done = terminated or truncated
            next_processed = preprocess_observation(next_obs)
            next_state = np.append(state[1:], np.expand_dims(next_processed, 0), axis=0)

            replay_buffer.add((state, action, reward, next_state, float(done)))
            state = next_state
            total_reward += reward

            if len(replay_buffer) > batch_size:
                batch = replay_buffer.sample(batch_size)
                q_vals, targets = compute_targets(batch, policy_net, target_net, gamma, use_double_dqn)
                loss = criterion(q_vals, targets.detach())

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

        epsilon = max(epsilon * epsilon_decay, epsilon_min)
        reward_history.append(total_reward)
        avg_reward = np.mean(reward_history[-50:])
        writer.add_scalar('Reward/Total', total_reward, episode)
        writer.add_scalar('Reward/Avg50', avg_reward, episode)

        if episode % target_update == 0:
            target_net.load_state_dict(policy_net.state_dict())

        if avg_reward > best_avg_reward:
            best_avg_reward = avg_reward
            torch.save(policy_net.state_dict(), f'{save_dir}/best_{save_tag}.pth')
            print(f"[Episode {episode}] New best avg reward: {avg_reward:.2f}, model saved.")

        print(f"Episode {episode} | Total Reward: {total_reward} | Avg50: {avg_reward:.2f} | Epsilon: {epsilon:.2f}")

    env.close()
    writer.close()



In [11]:
# --- Evaluation ---
def evaluate_model(model_path, env_name='ALE/Breakout-v5', tag='eval', dueling=False):
    env = gym.make(env_name, render_mode="rgb_array")
    env = RecordVideo(env, video_folder=f"./videos/{tag}", episode_trigger=lambda x: True)
    action_space = env.action_space.n

    model = DQN(action_space, dueling=dueling).to(device)
    model.load_state_dict(torch.load(model_path))
    model.eval()

    obs, _ = env.reset()
    state = np.stack([preprocess_observation(obs)] * 4, axis=0)
    done = False

    while not done:
        with torch.no_grad():
            state_tensor = torch.FloatTensor(state).unsqueeze(0).to(device)
            action = model(state_tensor).argmax(1).item()
        obs, _, terminated, truncated, _ = env.step(action)
        done = terminated or truncated
        processed = preprocess_observation(obs)
        state = np.append(state[1:], np.expand_dims(processed, 0), axis=0)

    env.close()
    print(f"Video saved at ./videos/{tag}")

In [17]:
use_double_dqn = False
use_dueling_dqn = False
save_tag = 'vanilla'  # change to 'vanilla' or 'double' if needed
save_dir = './models'
os.makedirs(save_dir, exist_ok=True)

train_dqn(episodes=1000)

[Episode 0] New best avg reward: 0.00, model saved.
Episode 0 | Total Reward: 0.0 | Avg50: 0.00 | Epsilon: 0.99
[Episode 1] New best avg reward: 0.50, model saved.
Episode 1 | Total Reward: 1.0 | Avg50: 0.50 | Epsilon: 0.99
Episode 2 | Total Reward: 0.0 | Avg50: 0.33 | Epsilon: 0.99
Episode 3 | Total Reward: 0.0 | Avg50: 0.25 | Epsilon: 0.98
Episode 4 | Total Reward: 1.0 | Avg50: 0.40 | Epsilon: 0.98
Episode 5 | Total Reward: 1.0 | Avg50: 0.50 | Epsilon: 0.97
[Episode 6] New best avg reward: 0.71, model saved.
Episode 6 | Total Reward: 2.0 | Avg50: 0.71 | Epsilon: 0.97
[Episode 7] New best avg reward: 1.00, model saved.
Episode 7 | Total Reward: 3.0 | Avg50: 1.00 | Epsilon: 0.96
[Episode 8] New best avg reward: 1.11, model saved.
Episode 8 | Total Reward: 2.0 | Avg50: 1.11 | Epsilon: 0.96
Episode 9 | Total Reward: 0.0 | Avg50: 1.00 | Epsilon: 0.95
Episode 10 | Total Reward: 2.0 | Avg50: 1.09 | Epsilon: 0.95
[Episode 11] New best avg reward: 1.33, model saved.
Episode 11 | Total Reward:

In [18]:
evaluate_model('models/best_vanilla.pth', tag='vanilla', dueling=False)

  logger.warn(
  logger.warn(


MoviePy - Building video /home/sprince0031/Thendral/Code/DQN Project/videos/vanilla/rl-video-episode-0.mp4.
MoviePy - Writing video /home/sprince0031/Thendral/Code/DQN Project/videos/vanilla/rl-video-episode-0.mp4



                                                              

MoviePy - Done !
MoviePy - video ready /home/sprince0031/Thendral/Code/DQN Project/videos/vanilla/rl-video-episode-0.mp4
Video saved at ./videos/vanilla




In [19]:

vids = [f for f in os.listdir('./videos/vanilla') if f.endswith('.mp4')]
Video(f'./videos/vanilla/{vids[-1]}', embed=True)


In [11]:
use_double_dqn = True
use_dueling_dqn = False
save_tag = 'double_dqn'  # change to 'vanilla' or 'double' if needed
save_dir = './models'
os.makedirs(save_dir, exist_ok=True)

train_dqn(episodes=1000)



A.L.E: Arcade Learning Environment (version 0.8.1+53f58b7)
[Powered by Stella]


[Episode 0] New best avg reward: 0.00, model saved.
Episode 0 | Total Reward: 0.0 | Avg50: 0.00 | Epsilon: 0.99
[Episode 1] New best avg reward: 1.50, model saved.
Episode 1 | Total Reward: 3.0 | Avg50: 1.50 | Epsilon: 0.99
Episode 2 | Total Reward: 0.0 | Avg50: 1.00 | Epsilon: 0.99
Episode 3 | Total Reward: 2.0 | Avg50: 1.25 | Epsilon: 0.98
Episode 4 | Total Reward: 0.0 | Avg50: 1.00 | Epsilon: 0.98
Episode 5 | Total Reward: 1.0 | Avg50: 1.00 | Epsilon: 0.97
Episode 6 | Total Reward: 0.0 | Avg50: 0.86 | Epsilon: 0.97
Episode 7 | Total Reward: 2.0 | Avg50: 1.00 | Epsilon: 0.96
Episode 8 | Total Reward: 1.0 | Avg50: 1.00 | Epsilon: 0.96
Episode 9 | Total Reward: 3.0 | Avg50: 1.20 | Epsilon: 0.95
Episode 10 | Total Reward: 0.0 | Avg50: 1.09 | Epsilon: 0.95
Episode 11 | Total Reward: 0.0 | Avg50: 1.00 | Epsilon: 0.94
Episode 12 | Total Reward: 0.0 | Avg50: 0.92 | Epsilon: 0.94
Episode 13 | Total Reward: 3.0 | Avg50: 1.07 | Epsilon: 0.93
Episode 14 | Total Reward: 0.0 | Avg50: 1.00 | Epsil

In [12]:
evaluate_model('models/best_double_dqn.pth', tag='double_dqn', dueling=False)
vids = [f for f in os.listdir('./videos/double_dqn') if f.endswith('.mp4')]
Video(f'./videos/double_dqn/{vids[-1]}', embed=True)

  logger.warn(


MoviePy - Building video /home/sprince0031/Thendral/Code/DQN Project/videos/double_dqn/rl-video-episode-0.mp4.
MoviePy - Writing video /home/sprince0031/Thendral/Code/DQN Project/videos/double_dqn/rl-video-episode-0.mp4



                                                                          

MoviePy - Done !
MoviePy - video ready /home/sprince0031/Thendral/Code/DQN Project/videos/double_dqn/rl-video-episode-0.mp4
Video saved at ./videos/double_dqn




In [13]:
use_double_dqn = False
use_dueling_dqn = True
save_tag = 'dueling_dqn'  # change to 'vanilla' or 'double' if needed
save_dir = './models'
os.makedirs(save_dir, exist_ok=True)

train_dqn(episodes=1000)

[Episode 0] New best avg reward: 1.00, model saved.
Episode 0 | Total Reward: 1.0 | Avg50: 1.00 | Epsilon: 0.99
Episode 1 | Total Reward: 0.0 | Avg50: 0.50 | Epsilon: 0.99
[Episode 2] New best avg reward: 1.67, model saved.
Episode 2 | Total Reward: 4.0 | Avg50: 1.67 | Epsilon: 0.99
[Episode 3] New best avg reward: 1.75, model saved.
Episode 3 | Total Reward: 2.0 | Avg50: 1.75 | Epsilon: 0.98
Episode 4 | Total Reward: 0.0 | Avg50: 1.40 | Epsilon: 0.98
Episode 5 | Total Reward: 0.0 | Avg50: 1.17 | Epsilon: 0.97
Episode 6 | Total Reward: 2.0 | Avg50: 1.29 | Epsilon: 0.97
Episode 7 | Total Reward: 2.0 | Avg50: 1.38 | Epsilon: 0.96
Episode 8 | Total Reward: 1.0 | Avg50: 1.33 | Epsilon: 0.96
Episode 9 | Total Reward: 2.0 | Avg50: 1.40 | Epsilon: 0.95
Episode 10 | Total Reward: 1.0 | Avg50: 1.36 | Epsilon: 0.95
Episode 11 | Total Reward: 1.0 | Avg50: 1.33 | Epsilon: 0.94
Episode 12 | Total Reward: 1.0 | Avg50: 1.31 | Epsilon: 0.94
Episode 13 | Total Reward: 0.0 | Avg50: 1.21 | Epsilon: 0.93


In [17]:
evaluate_model('models/best_dueling_dqn.pth', tag='dueling_dqn', dueling=True)
vids = [f for f in os.listdir('./videos/dueling_dqn') if f.endswith('.mp4')]
Video(f'./videos/dueling_dqn/{vids[-1]}', embed=True)

MoviePy - Building video /home/sprince0031/Thendral/Code/DQN Project/videos/dueling_dqn/rl-video-episode-0.mp4.
MoviePy - Writing video /home/sprince0031/Thendral/Code/DQN Project/videos/dueling_dqn/rl-video-episode-0.mp4



                                                              

MoviePy - Done !
MoviePy - video ready /home/sprince0031/Thendral/Code/DQN Project/videos/dueling_dqn/rl-video-episode-0.mp4
Video saved at ./videos/dueling_dqn




In [None]:
tensorboard --logdir=runs


In [20]:
use_double_dqn = True
use_dueling_dqn = True
save_tag = 'dueling_double_dqn'  # change to 'vanilla' or 'double' if needed
save_dir = './models'
os.makedirs(save_dir, exist_ok=True)

train_dqn(episodes=1000)

[Episode 0] New best avg reward: 2.00, model saved.
Episode 0 | Total Reward: 2.0 | Avg50: 2.00 | Epsilon: 0.99
Episode 1 | Total Reward: 1.0 | Avg50: 1.50 | Epsilon: 0.99
Episode 2 | Total Reward: 3.0 | Avg50: 2.00 | Epsilon: 0.99
[Episode 3] New best avg reward: 2.25, model saved.
Episode 3 | Total Reward: 3.0 | Avg50: 2.25 | Epsilon: 0.98
Episode 4 | Total Reward: 0.0 | Avg50: 1.80 | Epsilon: 0.98
Episode 5 | Total Reward: 2.0 | Avg50: 1.83 | Epsilon: 0.97
Episode 6 | Total Reward: 2.0 | Avg50: 1.86 | Epsilon: 0.97
Episode 7 | Total Reward: 1.0 | Avg50: 1.75 | Epsilon: 0.96
Episode 8 | Total Reward: 2.0 | Avg50: 1.78 | Epsilon: 0.96
Episode 9 | Total Reward: 0.0 | Avg50: 1.60 | Epsilon: 0.95
Episode 10 | Total Reward: 3.0 | Avg50: 1.73 | Epsilon: 0.95
Episode 11 | Total Reward: 3.0 | Avg50: 1.83 | Epsilon: 0.94
Episode 12 | Total Reward: 5.0 | Avg50: 2.08 | Epsilon: 0.94
Episode 13 | Total Reward: 1.0 | Avg50: 2.00 | Epsilon: 0.93
Episode 14 | Total Reward: 0.0 | Avg50: 1.87 | Epsil

In [25]:
evaluate_model('models/best_dueling_double_dqn.pth', tag='dueling_double_dqn', dueling=True)
vids = [f for f in os.listdir('./videos/dueling_double_dqn') if f.endswith('.mp4')]
Video(f'./videos/dueling_double_dqn/{vids[-1]}', embed=True)

MoviePy - Building video /home/sprince0031/Thendral/Code/DQN Project/videos/dueling_double_dqn/rl-video-episode-0.mp4.
MoviePy - Writing video /home/sprince0031/Thendral/Code/DQN Project/videos/dueling_double_dqn/rl-video-episode-0.mp4



                                                                          

MoviePy - Done !
MoviePy - video ready /home/sprince0031/Thendral/Code/DQN Project/videos/dueling_double_dqn/rl-video-episode-0.mp4
Video saved at ./videos/dueling_double_dqn




In [12]:
use_double_dqn = False
use_dueling_dqn = False
save_tag = 'vanilla_dqn_10k'  # change to 'vanilla' or 'double' if needed
save_dir = './models'
os.makedirs(save_dir, exist_ok=True)

train_dqn(episodes=10000)

A.L.E: Arcade Learning Environment (version 0.8.1+53f58b7)
[Powered by Stella]


[Episode 0] New best avg reward: 3.00, model saved.
Episode 0 | Total Reward: 3.0 | Avg50: 3.00 | Epsilon: 0.99
Episode 1 | Total Reward: 0.0 | Avg50: 1.50 | Epsilon: 0.99
Episode 2 | Total Reward: 4.0 | Avg50: 2.33 | Epsilon: 0.99
Episode 3 | Total Reward: 2.0 | Avg50: 2.25 | Epsilon: 0.98
Episode 4 | Total Reward: 3.0 | Avg50: 2.40 | Epsilon: 0.98
Episode 5 | Total Reward: 0.0 | Avg50: 2.00 | Epsilon: 0.97
Episode 6 | Total Reward: 2.0 | Avg50: 2.00 | Epsilon: 0.97
Episode 7 | Total Reward: 0.0 | Avg50: 1.75 | Epsilon: 0.96
Episode 8 | Total Reward: 1.0 | Avg50: 1.67 | Epsilon: 0.96
Episode 9 | Total Reward: 0.0 | Avg50: 1.50 | Epsilon: 0.95
Episode 10 | Total Reward: 2.0 | Avg50: 1.55 | Epsilon: 0.95
Episode 11 | Total Reward: 0.0 | Avg50: 1.42 | Epsilon: 0.94
Episode 12 | Total Reward: 1.0 | Avg50: 1.38 | Epsilon: 0.94
Episode 13 | Total Reward: 1.0 | Avg50: 1.36 | Epsilon: 0.93
Episode 14 | Total Reward: 0.0 | Avg50: 1.27 | Epsilon: 0.93
Episode 15 | Total Reward: 1.0 | Avg50: 1.2

In [None]:
evaluate_model('models/best_vanilla_dqn_10k.pth', tag='vanilla_dqn_10k', dueling=False)
vids = [f for f in os.listdir('./videos/vanilla_dqn_10k') if f.endswith('.mp4')]
Video(f'./videos/vanilla_dqn_10k/{vids[-1]}', embed=True)

MoviePy - Building video /home/sprince0031/Thendral/Code/DQN Project/videos/vanilla_dqn_10k/rl-video-episode-0.mp4.
MoviePy - Writing video /home/sprince0031/Thendral/Code/DQN Project/videos/vanilla_dqn_10k/rl-video-episode-0.mp4



                                                              

MoviePy - Done !
MoviePy - video ready /home/sprince0031/Thendral/Code/DQN Project/videos/vanilla_dqn_10k/rl-video-episode-0.mp4
Video saved at ./videos/vanilla_dqn_10k




In [23]:
use_double_dqn = True
use_dueling_dqn = False
save_tag = 'double_dqn_10k'  
save_dir = './models'
os.makedirs(save_dir, exist_ok=True)

train_dqn(episodes=10000,epsilon_min=0.01,epsilon_decay=0.9995)


[Episode 0] New best avg reward: 0.00, model saved.
Episode 0 | Total Reward: 0.0 | Avg50: 0.00 | Epsilon: 1.00
[Episode 1] New best avg reward: 0.50, model saved.
Episode 1 | Total Reward: 1.0 | Avg50: 0.50 | Epsilon: 1.00
[Episode 2] New best avg reward: 1.00, model saved.
Episode 2 | Total Reward: 2.0 | Avg50: 1.00 | Epsilon: 1.00
Episode 3 | Total Reward: 1.0 | Avg50: 1.00 | Epsilon: 1.00
Episode 4 | Total Reward: 1.0 | Avg50: 1.00 | Epsilon: 1.00
Episode 5 | Total Reward: 0.0 | Avg50: 0.83 | Epsilon: 1.00
Episode 6 | Total Reward: 1.0 | Avg50: 0.86 | Epsilon: 1.00
Episode 7 | Total Reward: 1.0 | Avg50: 0.88 | Epsilon: 1.00
Episode 8 | Total Reward: 1.0 | Avg50: 0.89 | Epsilon: 1.00
Episode 9 | Total Reward: 2.0 | Avg50: 1.00 | Epsilon: 1.00
[Episode 10] New best avg reward: 1.09, model saved.
Episode 10 | Total Reward: 2.0 | Avg50: 1.09 | Epsilon: 0.99
Episode 11 | Total Reward: 0.0 | Avg50: 1.00 | Epsilon: 0.99
Episode 12 | Total Reward: 0.0 | Avg50: 0.92 | Epsilon: 0.99
Episode 