# google colab boilerplate

In [6]:
# Detect a Google Colab runtime by looking for 'google.colab' in the string
# form of the active IPython shell object. Later cells use this flag to gate
# Colab-only installs and downloads.
is_google_colab:bool = 'google.colab' in str(get_ipython())
# Bare expression so the notebook echoes the detected value.
is_google_colab

False

In [7]:
# Colab-only setup; nothing in this cell runs when is_google_colab is False.
if is_google_colab:
    # Install ViZDoom into the Colab runtime via a shell command.
    !pip install vizdoom
    # Helpers used by the final download cell: shutil zips the TensorBoard
    # "runs" directory and files.download sends artifacts to the browser.
    import shutil
    from google.colab import files


In [8]:
#misc
import numpy as np
import random
from tqdm import tqdm
from collections import deque, namedtuple
from copy import deepcopy
# env
import gymnasium as gym
from vizdoom import gymnasium_wrapper
# learning
import torch
import torch.nn as nn
import torchvision.transforms as T

Tensorboard for viz

In [9]:
from torch.utils.tensorboard import SummaryWriter
# Single TensorBoard writer shared by train() and the training loop.
# Created with default arguments, so the log directory is whatever
# SummaryWriter picks by default (the download cell zips "runs").
writer = SummaryWriter()

# Setup for DQN

check for cuda

In [10]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
# preprocess(), collate() and the networks all move their tensors to this device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

Image preprocessing

In [11]:
# Per-frame image pipeline used by preprocess(): the raw screen array is
# turned into a small single-channel tensor before being stacked.
transformer = T.Compose([
    T.ToPILImage(),  # raw screen array -> PIL image so the following transforms apply
    T.Resize((45,60)),  # downscale to height 45, width 60
    T.Grayscale(),  # collapse colour to one channel
    T.ToTensor()  # PIL image -> channel-first tensor, i.e. [1, 45, 60]
])

def preprocess(obs):
    """Convert one environment observation into a single grayscale frame tensor.

    Args:
        obs: observation dict from the environment; only ``obs["screen"]``
            is read and passed through ``transformer``.

    Returns:
        Tensor of shape [1, H, W] (= [1, 45, 60] with the current
        ``transformer``) on ``device``. The leading dimension is the single
        grayscale channel, NOT a batch dimension; ``stack_frames`` later
        concatenates several of these along dim 0 to build the channel stack.
    """
    # The previous `.squeeze(1)` was dropped: dim 1 is the image height (45),
    # never 1, so the call was a no-op that only suggested a wrong shape.
    return transformer(obs["screen"]).to(device)

def stack_frames(frames):
    """Join the buffered frames, oldest first, into one tensor along dim 0.

    Args:
        frames: iterable (here a deque) of tensors with matching trailing
            dimensions.

    Returns:
        The concatenation of all frames along dimension 0.
    """
    ordered = [frame for frame in frames]
    return torch.cat(ordered, dim=0)

We first need a **Transition** class which represents a `(state, action) -> (state', reward)` datapoint.

Then we need a **Replay Memory** class to store and utilize these transitions.

In [12]:
# One replay-memory datapoint: (state, action) -> (next state, reward, done).
Transition = namedtuple(
    "Transition",
    ("obs", "next_obs", "action", "reward", "done"),
)


def make_transition(obs_frames, next_obs_frames, action, rew, done) -> Transition:
    """Build a Transition whose fields are decoupled from the caller's objects.

    Args:
        obs_frames: tensor holding the stacked frames of the current state.
        next_obs_frames: tensor holding the stacked frames of the next state.
        action: action taken; stored as a plain ``int``.
        rew: reward received; stored as a plain ``float``.
        done: episode-finished flag; stored as a plain ``bool``.

    Returns:
        A Transition holding detached copies of both tensors, so later
        changes to the inputs do not affect the stored sample.
    """
    return Transition(
        obs=obs_frames.detach().clone(),
        next_obs=next_obs_frames.detach().clone(),
        action=int(action),
        reward=float(rew),
        done=bool(done),
    )

In [13]:
class ReplayBuffer(object):
    """Fixed-capacity experience-replay memory.

    Transitions are kept in a bounded deque: once ``buffer_size`` entries are
    stored, each new push evicts the oldest one.
    """

    def __init__(self, buffer_size):
        # maxlen makes the deque drop its oldest element automatically when full
        self.memory = deque(maxlen=buffer_size)

    def __len__(self) -> int:
        """Number of transitions currently stored."""
        stored = self.memory
        return len(stored)

    def push(self, t:Transition):
        """Append one transition, evicting the oldest if the buffer is full."""
        self.memory.append(t)

    def sample(self, batch_size):
        """Return ``batch_size`` distinct stored transitions chosen at random.

        Uses Python's global ``random`` module, so results follow ``random.seed``.
        """
        picked = random.sample(self.memory, batch_size)
        return picked

# DQN Class

In [14]:
class DQN(nn.Module):
    """Convolutional Q-network mapping a 4-frame stack to one Q-value per action.

    Three conv -> ReLU -> 2x2 max-pool stages (kernel sizes 7, 5, 3) are
    followed by a two-layer fully connected head. The head's input size
    (64 * 2 * 4) matches the [B, 4, 45, 60] stacks produced in this notebook.
    """

    @staticmethod
    def _conv_stage(c_in, c_out, kernel):
        """One feature-extraction stage: convolution, ReLU, then 2x2 max-pool."""
        return nn.Sequential(
            nn.Conv2d(c_in, c_out, kernel_size=kernel),
            nn.ReLU(),
            nn.MaxPool2d(2),
        )

    def __init__(self, n_actions) -> None:
        super().__init__()
        stacked_frames = 4  # number of input channels = frames per state

        # Stages are created in the same order as they are applied.
        self.conv1 = self._conv_stage(stacked_frames, 32, 7)
        self.conv2 = self._conv_stage(32, 64, 5)
        self.conv3 = self._conv_stage(64, 64, 3)

        flat_features = 64 * 2 * 4
        self.fc_layers = nn.Sequential(
            nn.Flatten(),
            nn.Linear(flat_features, 128),
            nn.ReLU(),
            nn.Linear(128, n_actions),
        )

    def forward(self, x):
        """Return Q-values of shape [B, n_actions] for a batch of frame stacks."""
        for stage in (self.conv1, self.conv2, self.conv3):
            x = stage(x)
        return self.fc_layers(x)

# Define environment

In [15]:
# Create the ViZDoom "basic" environment used for training.
# NOTE(review): render_mode="human" presumably opens a live game window on
# every step -- confirm this is wanted for a 400k-step run and that it works
# on headless machines such as Colab.
env = gym.make("VizdoomBasic-v0", render_mode="human")
# Bare expression so the notebook displays the number of discrete actions.
env.action_space.n

np.int64(4)

We will add some boilerplate for the training, and some helper functions.

In [16]:
# Seed every RNG this notebook seeds explicitly (Python, NumPy, PyTorch).
seed:int = 42 # shared RNG seed
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)

# HYPERPARAMETERS
total_timesteps:int = 400000 # number of environment steps run by the training loop
lr:float = 0.01 # learning rate passed to the Adam optimizer
buffer_size:int = 10000 # max number of transitions kept in the replay buffer
gamma: float = 0.99 # discount factor
batch_size: int = 128 # transitions sampled from the replay buffer per training step
epsilon_max: float = 1 # starting epsilon value (exploration/exploitation)
epsilon_min:float = 0.1 # ending epsilon value
epsilon_decay_min = 100000 # timestep at which epsilon starts decaying (total_timesteps / 4)
epsilon_decay_max = 200000 # timestep by which epsilon has reached epsilon_min (total_timesteps / 2)
# epsilon_duration:float = 0.5 # time spent before min epsilon is reached
training_start:int = 128 # steps needed before training begins
tnuf: int = 100 # target network update frequency (in environment steps)
qntf: int = 4 # q-network training frequency in environment steps (deepmind dqn baseline)

Epsilon decay lets us start by picking random actions, then gradually shift towards picking the actions that yield high rewards. We first explore a wide array of options, and once we have an idea of what works and what doesn't, we start exploiting that knowledge and dive deeper.

In [17]:
def epsilon_decay(current_timestep: int):
    """Return the exploration rate (epsilon) for the given training timestep.

    The schedule has three phases driven by module-level hyperparameters:
      * before ``epsilon_decay_min``: constant ``epsilon_max``;
      * between ``epsilon_decay_min`` and ``epsilon_decay_max``: linear
        interpolation from ``epsilon_max`` down to ``epsilon_min``;
      * from ``epsilon_decay_max`` on: constant ``epsilon_min``.

    The earlier formula ramped from 1 to 0 regardless of ``epsilon_max`` /
    ``epsilon_min`` and clamped at ``epsilon_min``, so the schedule jumped
    whenever ``epsilon_max != 1`` and reached the floor before the window ended.

    Args:
        current_timestep: global environment step counter.

    Returns:
        Epsilon as a number in [epsilon_min, epsilon_max].
    """
    if current_timestep < epsilon_decay_min:
        return epsilon_max
    # `>=` also covers a zero-length window (start == end) without dividing by zero.
    if current_timestep >= epsilon_decay_max:
        return epsilon_min
    # Fraction of the decay window still ahead: 1.0 at the start, -> 0.0 at the end.
    remaining = (epsilon_decay_max - current_timestep) / (epsilon_decay_max - epsilon_decay_min)
    return epsilon_min + (epsilon_max - epsilon_min) * remaining

In [18]:
def get_action(action_space:gym.Space, obs, q_net:DQN, current_timestep:int):
    """Pick an action with an epsilon-greedy policy.

    Args:
        action_space: environment action space, sampled for random actions.
        obs: stacked-frame state tensor without a batch dimension.
        q_net: network used to score actions when exploiting.
        current_timestep: global step, used to look up the current epsilon.

    Returns:
        The chosen action as a plain ``int``.
    """
    # Draw first, then look up epsilon (same order as before, so the global
    # `random` stream is consumed identically).
    draw = random.random()
    explore_prob = epsilon_decay(current_timestep)

    if draw <= explore_prob:
        # explore: uniformly random action from the environment's action space
        return int(action_space.sample())

    # exploit: action with the highest predicted Q-value, no gradients needed
    with torch.no_grad():
        scores = q_net(obs.unsqueeze(0).to(device))
    return int(scores.argmax().item())

Define q network, optimizer, and target network

In [19]:
# Online Q-network: the model that is trained and that selects actions.
q_net = DQN(env.action_space.n).to(device)
# Adam updates only q_net's parameters; `lr` comes from the hyperparameter cell.
optimizer = torch.optim.Adam(q_net.parameters(), lr)
# Target network: a second DQN with the same architecture. train() uses it to
# compute the bootstrap term of the Bellman target, and the training loop
# copies q_net's weights into it every `tnuf` steps.
target_net = DQN(env.action_space.n).to(device)
# Start both networks from identical weights.
target_net.load_state_dict(q_net.state_dict())

<All keys matched successfully>

# Initialize Experience Replay (ER) buffer
ER is used in DQN to avoid catastrophic forgetting. It lets the model re-train on previous experiences mixed with novel ones, so that earlier learning is not forgotten. Another benefit of ER is that randomly sampling transitions from memory breaks the sequential correlation between consecutive experiences.


In [20]:
# Global replay memory: filled by the training loop, sampled by train().
replay_buffer = ReplayBuffer(buffer_size)

We need to be able to read the memory, draw `batch_size` transitions from it, and combine them into a single batch of tensors.

In [21]:
def collate(batch) -> dict:
    """Turn a list of transitions into a dict of batched tensors on ``device``.

    Shapes, given that a preprocessed frame is [1, 45, 60] and a state is a
    stack of 4 such frames ([4, 45, 60]):
      - "obs", "next_obs": [B, C, H, W] with C = 4 stacked grayscale frames
        and H, W = 45, 60
      - "action", "reward", "done": [B]
    where B is the number of transitions in ``batch`` (128 during training).
    """
    def _stacked(field):
        # tensor-valued field -> one tensor with a new leading batch dimension
        return torch.stack([getattr(item, field) for item in batch]).to(device)

    def _column(field):
        # scalar-valued field -> 1-D tensor of length B
        return torch.tensor([getattr(item, field) for item in batch]).to(device)

    return {
        "obs": _stacked("obs"),
        "next_obs": _stacked("next_obs"),
        "action": _column("action"),
        "reward": _column("reward"),
        "done": _column("done"),
    }

In [22]:
def reset_frames(obs):
    """Start a fresh 4-frame history for a new episode.

    The first observation is preprocessed once and repeated to fill all four
    slots, so the network sees a full stack from the very first step.

    Returns:
        A ``deque`` with ``maxlen=4`` holding the same frame four times.
    """
    first_frame = preprocess(obs)
    return deque([first_frame] * 4, maxlen=4)

Training Loop

In [23]:
def train(global_step):
    """Run one DQN gradient step on a minibatch sampled from the replay buffer.

    Reads the module-level ``replay_buffer``, ``batch_size``, ``q_net``,
    ``target_net``, ``gamma``, ``optimizer`` and ``writer``; updates only
    ``q_net``'s parameters and logs the mean Q-value and the loss.

    Args:
        global_step: current environment step, used as the TensorBoard x-axis.
    """
    # create batch from memory
    batch = collate(replay_buffer.sample(batch_size))

    # Q-values over all actions, then the one for the action actually taken
    q_values = q_net(batch["obs"])
    writer.add_scalar("QValue/mean", q_values.mean(), global_step)
    q_values_for_actions = q_values.gather(1, batch["action"].unsqueeze(1)).squeeze(1)

    # The Bellman target is a constant for this update: compute it without
    # autograd so nothing is backpropagated through target_net and no stale
    # .grad tensors pile up on its parameters.
    with torch.no_grad():
        target_net_max = target_net(batch["next_obs"]).max(dim=1)[0]
        # (1 - done) drops the bootstrap term for transitions that ended an episode
        q_target = batch["reward"] + gamma * target_net_max * (1 - batch["done"].float())

    # mse_loss(input, target): prediction first, fixed target second
    loss = nn.functional.mse_loss(q_values_for_actions, q_target)
    writer.add_scalar("Training Loss", loss, global_step)

    # BACKPROP
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

In [None]:
# Start the first episode and fill the frame history with its first frame.
obs, info = env.reset(seed=seed)
obs_frames = reset_frames(obs)
current_episode_reward = 0

for global_step in tqdm(range(total_timesteps)):
    # Snapshot the current state BEFORE stepping, so the stored `obs` can
    # never be affected by the frame appended for the next state.
    obs_stack = stack_frames(obs_frames)
    # get action epsilon-greedy
    action = get_action(env.action_space, obs_stack, q_net, global_step)
    # step through the env
    next_obs, reward, term, trun, info = env.step(action)
    done = term or trun
    current_episode_reward += reward

    # Build the next state's history on a COPY of the deque. Previously
    # `obs_frames = next_obs_frames` made both names point at one deque, so
    # from the second step on every stored transition had obs == next_obs.
    next_obs_frames = obs_frames.copy()  # shallow copy; keeps maxlen=4
    next_obs_frames.append(preprocess(next_obs))

    # store transition into memory
    t = make_transition(obs_stack,
                        stack_frames(next_obs_frames),
                        action,
                        reward,
                        done)
    replay_buffer.push(t)

    # the next state becomes the current state
    obs_frames = next_obs_frames

    # env reset when finished
    if done:
        obs, info = env.reset()
        # new episode: restart the frame history from its first frame
        obs_frames = reset_frames(obs)
        writer.add_scalar("Episode Reward", current_episode_reward, global_step)
        current_episode_reward = 0

    # TRAINING: every `qntf` steps once `training_start` steps have passed
    if global_step > training_start and global_step % qntf == 0:
        train(global_step)
    # sync the target network with the online network every `tnuf` steps
    if global_step % tnuf == 0:
        target_net.load_state_dict(q_net.state_dict())

    # make sure that all pending events have been written to disk
    writer.flush()

env.close()
writer.close()

In [None]:
# Save the trained online network's weights (state_dict only, not the whole
# module) to the current working directory.
torch.save(q_net.state_dict(), "model_dqn.pth")

# google colab download

In [None]:
# Colab only: pull the training artifacts down to the local machine.
# `files` and `shutil` are imported in the Colab setup cell at the top.
if is_google_colab:
    # trained weights written by the save cell
    files.download("model_dqn.pth")
    # zip the "runs" directory into runs.zip, then download the archive
    shutil.make_archive("runs", "zip", "runs")
    files.download("runs.zip")

### frame stacking demo

In [None]:
# import matplotlib.pyplot as plt

# env = gym.make("VizdoomBasic-v0", render_mode="human")
# obs, info = env.reset(seed=42)

# # get frames
# obs_frames = deque([], maxlen=4)
# # fill the stack first
# for _ in range(4):
#     frame = preprocess(obs)
#     obs_frames.append(frame)
# # copy over for next_obs
# next_obs_frames = deepcopy(obs_frames)

# for _ in range(100):
#     action = env.action_space.sample()
#     next_obs, rew, term, trun, info = env.step(action)
#     next_obs_frames.append(preprocess(next_obs))

#     t = make_transition(stack_frames(obs_frames), stack_frames(next_obs_frames), action, rew, term or trun) # term or trun = done
#     replay_buffer.push(t)

#     obs_frames = next_obs_frames
    
#     if term or trun:
#         obs, info = env.reset()

### testing

In [25]:
env = gym.make("VizdoomBasic-v0", frame_skip=2, render_mode="human")
# Re-create the model architecture and load the saved weights. The model must
# live on the same device as its inputs: preprocess() moves frames to
# `device`, so a CPU-only model would fail as soon as CUDA is available.
model = DQN(env.action_space.n).to(device)
model.load_state_dict(torch.load("models/model_dqn_highLR.pth", map_location=device))
model.eval()  # Set to eval mode if you're not training

try:
    obs, info = env.reset(seed=42)
    obs_frames = reset_frames(obs)
    # pure inference: no autograd graph needed
    with torch.no_grad():
        for _ in range(10000):
            # greedy action from the current frame stack
            q_values = model(stack_frames(obs_frames).unsqueeze(0).to(device))
            action = int(torch.argmax(q_values).item())
            obs, rew, term, trun, info = env.step(action)

            if term or trun:
                obs, info = env.reset()
                # new episode: restart the frame history so frames from the
                # finished episode do not leak into the new one
                obs_frames = reset_frames(obs)
            else:
                obs_frames.append(preprocess(obs))
finally:
    # close the game window even when the cell is interrupted
    env.close()
    

KeyboardInterrupt: 