In [34]:
#misc
import numpy as np
import random
from tqdm import tqdm
import matplotlib.pyplot as plt
from collections import deque
# env
import gymnasium as gym
from vizdoom import gymnasium_wrapper
# learning
import torch
import torch.nn as nn
import torch.autograd as autograd
import torchvision.transforms as T

# Setup for DQN

We first need a **Transition** class which represents a `(state, action) -> (state', reward)` datapoint.

Then we need a **Replay Memory** class to store and utilize these transitions.

In [35]:
class Transition():
    def __init__(self, obs, action, next_obs, reward) -> None:
        self.obs = obs
        self.action = action
        self.next_obs = next_obs
        self.reward = reward

In [36]:
class ReplayBuffer(object):
    def __init__(self, buffer_size):
        self.memory = deque([], buffer_size)

    def __len__(self) -> int:
        return len(self.memory)

    def push(self, transition:Transition):
        """Save a transition"""
        self.memory.append(transition)

    def sample(self, batch_size):
        return random.sample(self.memory, batch_size)

    def __len__(self):
        return len(self.memory)

# DQN Class

In [37]:
class DQN(nn.Module):
    def __init__(self, n_actions) -> None:
        super(DQN, self).__init__()

        self.conv1 = nn.Sequential(
            nn.Conv2d(1, 8, kernel_size=3, stride=2),
            nn.ReLU()
        )

        dummy_input = torch.zeros(1, 1, 50, 75)  # [batch_size, in_channels, height, width]
        output = self.conv1(dummy_input)
        print(output.shape)
        
        conv1_out_shape = output.view(output.size(0), -1).shape # output.size(0) = batch_size

        self.lin1 = nn.Sequential(
            nn.Linear(conv1_out_shape[1], 128),
            nn.ReLU()
        )

        self.lin2 = nn.Sequential(
            nn.Linear(128, n_actions)
        )
    
    def forward(self, x):
        x = self.conv1(x)
        x = x.view(x.size(0), -1) # x.size(0) = batch_size
        x = self.lin1(x)
        x = self.lin2(x)
        return x

# Define environment

In [38]:
env:gym.Env = gym.make("VizdoomBasic-v0")



We will add some boilerplate for the training, and some helper functions.

In [39]:
# Utilize GPU for training if GPU present
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Initialize RNG seed
seed:int = 42 #rng seed
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)

# HYPERPARAMETERS
total_timesteps:int = 500000 # timestep max of an experiment
lr:float = 0.0001
buffer_size:int = 10000 # experience replay buffer size
gamma: float = 0.99 # discount factor
batch_size: int = 128 # batch size for experience replay buffer sampling
epsilon_max: float = 1 # starting epsilon value (exploration/exploitation)
epsilon_min:float = 0.05 # ending epsilon value
epsilon_duration:float = 0.5 # time spent before min epsilon is reached
training_start:int = 10000 # steps needed before training begins
tnur: int = 1 # target network update rate
tnuf: int = 500 # target network update frequency
qntf: int = 10 # qnetwork training frequency

Epsilon decay let's us start by picking random actions, then slowly start picking actions that yield high rewards. We first explore a wide array of options, and once we have an idea of what works and what doesn't, we start exploiting that knowledge and ldive deeper.

In [40]:
def epsilon_decay(current_timestep: int):
    slope = (epsilon_min - epsilon_max) / total_timesteps
    return max(slope * current_timestep + epsilon_max, epsilon_min)

In [41]:
def get_action(action_space:gym.Space, obs, policy_net:DQN, current_timestep:int):
    rng = random.random()
    epsilon = epsilon_decay(current_timestep)
    if rng > epsilon:
        # action with highest q_value
        q_values = policy_net(torch.Tensor(obs).to(device))
        action = torch.argmax(q_values).cpu().numpy()
    else:
        # random action 
        action = action_space.sample()
    return action

Define model, optimizer, and replay buffer.

In [42]:
# Initialize agent & target network
q_net = DQN(env.action_space.n).to(device)
optimizer = torch.optim.Adam(q_net.parameters(), lr)
# Target network is used to evaluate the progress of our DQN.
# It represents the past policy from which we evaluate surplus reward gains.
target_net = DQN(env.action_space.n).to(device)
target_net.load_state_dict(q_net.state_dict())

# Initialize Experience Replay (ER) buffer
# ER is used in DQN to avoid catastrophic forgetting.
# It allows the model to re-train on previous experiences in order to
# mix it with novel experiences and not forget previous training.
# Another benefit of ER is that by randomly sampling data from memory 
# we avoid sequential correlation of experiences.
replay_buffer = ReplayBuffer(buffer_size)

torch.Size([1, 8, 24, 37])
torch.Size([1, 8, 24, 37])


Image preprocessing

In [43]:
transformer = T.Compose([
    T.ToPILImage(),
    T.Resize((50,75)),
    T.Grayscale(),
    T.ToTensor()
])

Training Loop

In [48]:
obs, info = env.reset(seed=42)
for global_step in tqdm(range(total_timesteps)):
    # get action epsilon-greedy
    action = get_action(env.action_space, obs["screen"], q_net, global_step)
    # step through the env
    next_obs, rew, term, trun, info = env.step(action)
    # store transition into memory
    replay_buffer.push(Transition(obs, action, next_obs, rew))
    # update obs
    obs = next_obs
    # env reset when finished
    if term or trun:
        obs, info = env.reset()
    
    # TRAINING
    # if rb.


env.close()

  0%|          | 610/500000 [00:05<1:09:08, 120.39it/s]


KeyboardInterrupt: 

In [None]:
env = gym.make("VizdoomBasic-v0", render_mode="human")



obs, info = env.reset(seed=42)


img = transformer(obs["screen"]).squeeze(0).numpy()
print(img.shape)
plt.imshow(img)
plt.show()

NameError: name 'net' is not defined