# DQN Atari Paper Implementation

In [1]:
# ! pip install gymnasium[atari,accept-rom-license] torch numpy opencv-python matplotlib

In [2]:
from copy import deepcopy
from pathlib import Path

import gymnasium as gym
import numpy as np
import pandas as pd
import torch
from torchsummary import summary
from tqdm import tqdm

## Hyperparameters
As per the paper, we use certain hyperparameters that were tuned across various Atari games.

In [3]:
# --- Optimisation ---
LEARNING_RATE = 0.0002  # Step size passed to the optimizer
DISCOUNT_FACTOR = 0.99  # The γ discount factor applied to bootstrapped targets
MINI_BATCH_SIZE = 32  # Number of transitions sampled per gradient step
TARGET_UPDATE_FREQ = 1_200  # C: environment steps between target-network syncs

# --- Replay memory ---
REPLAY_MEMORY_SIZE = 150_000  # Capacity of the replay buffer (in transitions)
REPLAY_START_SIZE = 75_000  # Environment steps collected before training starts

# --- Environment ---
FRAME_SKIP = 4  # Number of emulator frames each action is repeated for

# --- Exploration schedule ---
MIN_EPSILON = 0.1  # Final value of epsilon (mostly exploitation)
MAX_EPSILON = 1.0  # Starting value of epsilon (pure exploration)
EPSILON_PHASE = 0.1  # Fraction of MAX_STEPS over which epsilon decays to MIN_EPSILON

# --- Run length / checkpointing ---
MAX_STEPS = 2_500_000  # Total number of environment steps (not episodes)
SAVE_FREQUENCY = 500_000  # Save a checkpoint every 500k steps

# Reproducibility: seed the global NumPy and PyTorch generators.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

# Pick the best available accelerator: CUDA first, then Apple MPS, else CPU.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"

## Gymnasium Environment Setup
Here we set up the gym environment, by selecting the Breakout game.

In [4]:
def make_env(env_id, render_mode=None, frame_skip=4):
    """
    Create an Atari environment wrapped with the preprocessing used for training.

    Args:
        env_id (str): Gymnasium environment id, e.g. "ALE/Breakout-v5".
        render_mode (str | None): Render mode forwarded to `gym.make`.
        frame_skip (int): Number of emulator frames each action is repeated for;
            forwarded to `AtariPreprocessing`.

    Returns:
        The wrapped environment. Observations are stacks of 4 preprocessed frames
        and the environment resets itself automatically when an episode ends.
    """
    # The base env is created with frameskip=1 so that frame skipping is applied
    # only once, by AtariPreprocessing below.
    env = gym.make(env_id, render_mode=render_mode, frameskip=1)
    # Handles resizing, grayscale conversion and frame skipping
    env = gym.wrappers.AtariPreprocessing(env, frame_skip=frame_skip)
    # Record statistics like the precise score and episode length
    env = gym.wrappers.RecordEpisodeStatistics(env)
    # Stack the 4 most recent frames into a single observation
    env = gym.wrappers.FrameStack(env, 4)
    # Reset automatically at the end of an episode; the step that ends an episode
    # then carries "final_observation" / "final_info" in its info dict.
    env = gym.wrappers.AutoResetWrapper(env)
    return env


# Training environment; FRAME_SKIP from the hyperparameter cell controls action repeat.
env = make_env("ALE/Breakout-v5", frame_skip=FRAME_SKIP)

A.L.E: Arcade Learning Environment (version 0.8.1+53f58b7)
[Powered by Stella]


## Target Network Update
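A target network helps stabilize learning. In this notebook the target network is updated every 1,200 steps (`TARGET_UPDATE_FREQ`).
The target network is a periodically refreshed copy of the main network: every C steps its weights are overwritten with the weights of the main network. The bootstrapped targets are computed as the maximum Q-value predicted by this target network for the next state. Keeping the target fixed between updates reduces the feedback loop between predictions and targets and makes training more stable.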

## Deep Q-Network Architecture
We follow the paper's architecture: 3 convolutional layers followed by fully connected layers. We also normalize the input states inside the network by dividing the pixel values by 255.

In [5]:
class DeepQNetwork(torch.nn.Module):
    """
    Convolutional Q-network mapping a stack of 4 frames to one Q-value per action.

    The first fully connected layer expects 7 * 7 * 64 flattened features, which is
    what the three convolutions produce for 84x84 inputs.

    Args:
        n_actions (int): Number of discrete actions (size of the output layer).
    """

    def __init__(self, n_actions):
        super().__init__()
        nn = torch.nn

        # Feature extractor: three conv layers, each followed by a ReLU.
        feature_layers = [
            nn.Conv2d(4, 32, kernel_size=8, stride=4),
            nn.ReLU(),
            nn.Conv2d(32, 64, kernel_size=4, stride=2),
            nn.ReLU(),
            nn.Conv2d(64, 64, kernel_size=3, stride=1),
            nn.ReLU(),
            nn.Flatten(),
        ]
        self.conv = nn.Sequential(*feature_layers)

        # Head: one hidden layer of 512 units, then one output per action.
        head_layers = [
            nn.Linear(7 * 7 * 64, 512),
            nn.ReLU(),
            nn.Linear(512, n_actions),
        ]
        self.linear = nn.Sequential(*head_layers)

    def forward(self, x):
        """Return Q-values for a batch of raw (0-255) frame stacks."""
        # Scale pixel intensities to [0, 1] before the convolutions.
        features = self.conv(x / 255.0)
        q_values = self.linear(features)
        return q_values

In [6]:
def count_trainable_parameters(model):
    """Return the total number of elements across parameters that require gradients."""
    trainable_sizes = [
        param.numel() for param in model.parameters() if param.requires_grad
    ]
    return sum(trainable_sizes)


# Report the size of a freshly built network with one output per environment action.
print(
    f"Trainable parameters: {count_trainable_parameters(DeepQNetwork(env.action_space.n)):,}"
)

Trainable parameters: 1,686,180


## Replay Memory
We implement an experience replay buffer as described in the paper to store past experiences and sample them randomly during training to break the correlation between consecutive frames.

In [7]:
class ReplayBuffer:
    """
    Fixed-capacity circular store of (obs, next_obs, action, reward, done) transitions.

    Once `size` transitions have been appended, each new transition overwrites the
    oldest one.
    """

    def __init__(self, size, obs_shape, action_shape):
        """
        Allocate the storage arrays for the replay buffer.

        Args:
            size : int  Maximum number of transitions kept
            obs_shape : tuple  Shape of a single observation
            action_shape : tuple  Shape of a single action
        """
        self.size = size
        self.obs_shape = obs_shape
        self.action_shape = action_shape

        # Observations and actions are stored as uint8, rewards as float16.
        frames_shape = (size,) + tuple(obs_shape)
        self.t_obs = np.empty(frames_shape, dtype=np.uint8)
        self.t1_obs = np.empty(frames_shape, dtype=np.uint8)
        self.actions = np.empty((size,) + tuple(action_shape), dtype=np.uint8)
        self.rewards = np.empty(size, dtype=np.float16)
        self.dones = np.empty(size, dtype=np.bool_)

        # `idx` is the next slot to write; `current_size` counts the filled slots.
        self.idx = 0
        self.current_size = 0

    def append(self, t_obs, t1_obs, actions, reward, done):
        """
        Store one transition, overwriting the oldest entry when the buffer is full.

        Args:
            t_obs : np.array  Observation before the action
            t1_obs : np.array  Observation after the action
            actions : np.array  Action taken
            reward : float  Reward received
            done : bool  Whether the transition ended the episode
        """
        slot = self.idx
        self.t_obs[slot] = t_obs
        self.t1_obs[slot] = t1_obs
        self.actions[slot] = actions
        self.rewards[slot] = reward
        self.dones[slot] = done

        # Advance the write cursor, wrapping around at capacity.
        self.idx = (slot + 1) % self.size
        if self.current_size < self.size:
            self.current_size += 1

    def get_minibatch(self, batch_size, device="cpu"):
        """
        Draw `batch_size` distinct stored transitions uniformly at random.

        Args:
            batch_size : int  Number of transitions to sample
            device : str  Device the returned tensors are moved to

        Returns:
            Tuple (t_obs, t1_obs, actions, rewards, dones) of float32 tensors.
        """
        # Sampling without replacement, restricted to the slots filled so far.
        ids = np.random.choice(self.current_size, batch_size, replace=False)

        fields = (self.t_obs, self.t1_obs, self.actions, self.rewards, self.dones)
        tensors = []
        for field in fields:
            sampled = torch.as_tensor(field[ids], dtype=torch.float32)
            tensors.append(sampled.to(device))
        return tuple(tensors)

# Training loop

The main constraint is that the training is relatively long and requires a lot of time. The limit is that Colab automatically stops after a certain time. The second limitation is that it is difficult to reproduce the results of the paper because we do not have enough memory to reach a replay buffer of 1 million.

The implementation is as follows:

1. We start by defining two Q models: one is updated more regularly and the other is the target network, which is a copy of the Q model at a given time t. We use the Adam optimizer, which seems to offer better results with my hyperparameters.
2. T corresponds to the total number of timesteps and not the number of timesteps in an episode. We can proceed this way because AutoResetWrapper takes care of resetting the environment when we have 0 lives.
3. We set up an exploratory policy, reducing linearly over 10% of the timesteps. We start with an exploration of 1.0, then linearly decrease to 0.1.
4. Initially, we fill the replay buffer with 75,000 elements before starting the training.
5. As described in the paper, we choose to limit the reward based on its sign.
6. We also perform a checkpoint every 500,000 steps.
7. Finally, every 1,200 steps (`TARGET_UPDATE_FREQ`), we update the target network.


In [None]:
n_actions = env.action_space.n
dqn = DeepQNetwork(n_actions).to(device)
optimizer = torch.optim.Adam(dqn.parameters(), lr=LEARNING_RATE)

# The target network must start as an exact copy of the online network; otherwise the
# first updates (before the first periodic sync) bootstrap from unrelated random weights.
dqn_prime = DeepQNetwork(n_actions).to(device)
dqn_prime.load_state_dict(dqn.state_dict())
dqn_prime.eval()

buffer = ReplayBuffer(REPLAY_MEMORY_SIZE, (4, 84, 84), (1,))
training_history = {"loss": [0], "mean_q_value": [0], "episode_rewards": [0], "steps": [0]}

# One gradient step is taken every UPDATE_EVERY environment steps.
UPDATE_EVERY = 4

# Seed the environment and its action-space sampler so that runs are reproducible.
env.action_space.seed(RANDOM_SEED)
t_observation, _ = env.reset(seed=RANDOM_SEED)
episode_reward = 0

progress_bar = tqdm(range(MAX_STEPS), desc="Training Progress")

# Per-episode accumulators. The loss is only produced on update steps and the Q-values
# only on greedy steps, so each sum has its own counter to yield a true mean.
episode_loss = 0
episode_updates = 0
episode_q_values = 0
episode_greedy_steps = 0

for t in progress_bar:
    # Epsilon with linear decay from MAX_EPSILON to MIN_EPSILON over the first
    # EPSILON_PHASE * MAX_STEPS steps, then constant at MIN_EPSILON
    eps = max(
        MIN_EPSILON,
        MIN_EPSILON
        + (MAX_EPSILON - MIN_EPSILON) * (1 - t / (EPSILON_PHASE * MAX_STEPS)),
    )

    # epsilon-greedy policy
    if np.random.rand() < eps:
        action = env.action_space.sample()
    else:
        with torch.no_grad():
            q_values = dqn(
                torch.tensor(np.array(t_observation), device=device).unsqueeze(0)
            )
        action = torch.argmax(q_values, dim=1).item()
        episode_q_values += q_values.mean().item()
        episode_greedy_steps += 1

    t1_observation, reward, terminated, truncated, info = env.step(action)

    # When an episode ends, AutoResetWrapper returns the first observation of the *next*
    # episode and places the true successor state in info["final_observation"].
    next_observation = info.get("final_observation", t1_observation)

    # Store the transition in the replay buffer. Rewards are clipped to their sign, and
    # only true termination (not truncation) disables bootstrapping.
    buffer.append(
        t_observation, next_observation, np.array([action]), np.sign(reward), terminated
    )
    episode_reward += reward

    # "final_info" appears on the step where the episode ended
    if "final_info" in info:
        training_history["steps"].append(t)
        training_history["episode_rewards"].append(episode_reward)
        # max(..., 1) guards episodes without any greedy step / gradient update
        training_history["mean_q_value"].append(
            episode_q_values / max(episode_greedy_steps, 1)
        )
        training_history["loss"].append(episode_loss / max(episode_updates, 1))
        episode_reward = 0
        progress_bar.set_description(
            f"R: {training_history['episode_rewards'][-1]:.2f}, l: {training_history['loss'][-1]:.2f}, Mean Q: {training_history['mean_q_value'][-1]:.2f}, e: {eps:.2f}"
        )
        episode_loss = 0
        episode_updates = 0
        episode_q_values = 0
        episode_greedy_steps = 0

    # Checkpoint every SAVE_FREQUENCY steps
    if t > 0 and t % SAVE_FREQUENCY == 0:
        torch.save(dqn.state_dict(), f"checkpoint{t}.pt")

    # Training starts only once enough transitions have been collected
    if t > REPLAY_START_SIZE:
        if t % UPDATE_EVERY == 0:
            # Sample a minibatch
            t_obs, t1_obs, actions, rewards, dones = buffer.get_minibatch(
                MINI_BATCH_SIZE, device=device
            )

            # TD target: r + gamma * max_a' Q_target(s', a'), without the bootstrap
            # term for terminal transitions
            with torch.no_grad():
                not_done = ~dones.bool()
                a_prime = dqn_prime(t1_obs).amax(dim=1)
                y_j = rewards + DISCOUNT_FACTOR * a_prime * not_done

            optimizer.zero_grad()

            # Q(s, a) of the online network for the actions that were actually taken
            batch_q_values = dqn(t_obs)
            values = batch_q_values.gather(1, actions.long()).squeeze(1)

            # huber_loss expects (input, target)
            loss = torch.nn.functional.huber_loss(values, y_j)

            loss.backward()
            optimizer.step()
            episode_loss += loss.item()
            episode_updates += 1

        # Periodically copy the online weights into the target network
        if t % TARGET_UPDATE_FREQ == 0:
            dqn_prime.load_state_dict(dqn.state_dict())

    t_observation = t1_observation

# The loop ends at t = MAX_STEPS - 1, so the periodic checkpoint never captures the
# final weights; save them explicitly.
torch.save(dqn.state_dict(), f"checkpoint{MAX_STEPS}.pt")

env.close()

*The trainings were run on Kaggle, so the training logs are not included in this notebook.*

Save the data for later use.

In [10]:
# Convert the dictionary to a DataFrame (one row per finished episode)
df_plot_infos = pd.DataFrame(training_history)

# Save the DataFrame to a CSV file. The output directory is created first so that a
# missing folder cannot discard the history of a long training run.
history_path = Path("../data/original_training_history.csv")
history_path.parent.mkdir(parents=True, exist_ok=True)
df_plot_infos.to_csv(history_path, index=False)

## Result
*The tests were run on Kaggle, so the test logs are not included in this notebook.*

In [10]:
def test_model(model_path, epsilon=0.01, env_id="ALE/Breakout-v5"):
    """
    Play one episode with a saved model and return its total (unclipped) reward.

    Args:
        model_path (str): Path to the model checkpoint to load
        epsilon (float): Probability of taking a random action at each step
        env_id (str): Gymnasium environment id to evaluate on

    Returns:
        Sum of the rewards collected during the episode.
    """
    # Same preprocessing pipeline as used for training
    env = make_env(env_id, render_mode="rgb_array")
    try:
        # load model
        dqn = DeepQNetwork(env.action_space.n).to(device)
        # NOTE: torch.load unpickles the file; only load checkpoints you trust.
        state_dict = torch.load(model_path, map_location=torch.device(device))
        dqn.load_state_dict(state_dict=state_dict)
        dqn.eval()

        t_observation, _ = env.reset()
        done = False
        total_reward = 0
        while not done:
            if np.random.rand() < epsilon:
                action = env.action_space.sample()
            else:
                # No gradients are needed for evaluation
                with torch.no_grad():
                    q_values = dqn(
                        torch.tensor(np.array(t_observation), device=device).unsqueeze(0)
                    )
                action = torch.argmax(q_values, dim=1).item()
            t_observation, reward, terminated, truncated, _info = env.step(action)
            # The environment auto-resets, so stop on truncation as well; otherwise
            # the loop would continue into the next episode and merge the scores.
            done = terminated or truncated
            total_reward += reward
            env.render()

        print(f"Total reward: {total_reward}")
    finally:
        # Release the emulator even if loading the checkpoint or stepping fails
        env.close()

    return total_reward

In [None]:
# Evaluate the checkpoint saved at step 1,500,000 over 50 independent episodes.
rewards = [
    test_model(f"../checkpoints/original/checkpoint1500000.pt")
    for _episode in tqdm(range(50))
]

# Summary statistics of the per-episode scores
print(f"Mean reward: {np.mean(rewards)}")
print(f"Std reward: {np.std(rewards)}")
print(f"Max reward: {np.max(rewards)}")
print(f"Min reward: {np.min(rewards)}")