# Training a Deep Q Learning Neural Network Agent in Gymnasium's CliffWalking-v0 Discrete Environment using AgileRL


## Imports and Installs

In [None]:
!pip install gym gym[atari] gym[accept-rom-license] agilerl accelerate>=0.21.0

[0m

In [None]:
import os
import gymnasium as gym
import numpy as np

### These imports will be used to implement the NN Agent ##
import torch
from agilerl.algorithms.dqn import DQN
from agilerl.components.replay_buffer import ReplayBuffer
from agilerl.training.train_off_policy import train_off_policy
from agilerl.utils.utils import create_population, make_vect_envs
# import trange
from tqdm.notebook import trange

from tqdm import tqdm
from __future__ import annotations
from collections import defaultdict

## Setting up the Reinforcement Learning Environment in Gym

In this example notebook, we use the 'CliffWalking-v0' environment before tackling the Blackjack environment: CliffWalking has discrete observations and actions, so it is easier to use with AgileRL's DQN network class. gym.vector.make was also needed to convert the observation and action data types to vectors (arrays), which are required for use with the DQN. This was a learning experience that we used to build our own custom DQN, with its own training and evaluation, in a separate notebook for use with the Blackjack environment. Another setback we faced when using the AgileRL library was that it had difficulties training on a GPU device (in Colab notebooks). Since we are not dealing with a computationally heavy environment, GPU support is not required.

In [None]:
# Alternative (kept for reference): build the environment with AgileRL's
# make_vect_envs helper instead of gym.vector.make.
#env = make_vect_envs("CliffWalking-v0", num_envs=1) # uncomment if want to run across envs
# Vectorised CliffWalking environment; the cells below read its
# single_observation_space / single_action_space attributes.
env = gym.vector.make("CliffWalking-v0")

In [None]:
# Derive the network input/output sizes from the vectorised environment.
# A space that exposes ``.n`` is treated as discrete; a space without it
# raises AttributeError and is treated as continuous, described by ``.shape``.
# Only AttributeError is caught so that unrelated failures are not masked.
observation_space = env.single_observation_space
try:
    state_dim = observation_space.n  # Discrete observation space
except AttributeError:
    state_dim = observation_space.shape  # Continuous observation space
    one_hot = False  # Does not require one-hot encoding
    is_discrete_obs = False
else:
    one_hot = True  # Requires one-hot encoding
    is_discrete_obs = True

action_space = env.single_action_space
try:
    action_dim = action_space.n  # Discrete action space
except AttributeError:
    action_dim = action_space.shape[0]  # Continuous action space
    is_discrete_actions = False
else:
    is_discrete_actions = True

In [None]:
# Report the detected space properties, one line each.
space_summary = (
    f"Action dimension: {action_dim}",
    f"Observation dimension: {state_dim}",
    f"Is discrete action space: {is_discrete_actions}",
    f"Is discrete observation space: {is_discrete_obs}",
    f"Is one-hot:  {one_hot}",
)
for summary_line in space_summary:
    print(summary_line)

Action dimension: 4
Observation dimension: 48
Is discrete action space: True
Is discrete observation space: True
Is one-hot:  True


In [None]:
# Select the torch device: CUDA when torch reports it as available, else CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

The Blackjack environment does not implement a .shape attribute on its observation_space. Since the DQN agent expects a Python tuple instead of a gym.spaces.Tuple, we need to create our own state_dim.

## Setting up the Deep Q-Learning Agent

In this block we set the default hyperparameters

In [None]:
# Default hyperparameters for this run, keyed by upper-case name.
INIT_HP = dict(
    BATCH_SIZE=32,  # kept small
    LR=0.01,  # deliberately high learning rate
    GAMMA=0.90,  # lower discount factor, favouring shorter-term rewards
    MEMORY_SIZE=10_000,
    LEARN_STEP=1,
    N_STEP=1,
    PER=False,  # prioritised replay switched off for simplicity
    TAU=0.1,  # faster target updates
    NOISY=False,
    CHANNELS_LAST=False,
    LEARNING_DELAY=500,
    MAX_STEPS=100000,  # long training budget
    EVO_STEPS=100000,
    EVAL_STEPS=1000,
    EVAL_LOOP=1,
)

In this block we set the neural network config. Since we are using the simpler environment with discrete observations and actions, a typical multi-layer perceptron network is sufficient.

In [None]:
# Network definition handed to the agent: an MLP with a single 64-unit hidden layer.
NET_CONFIG = dict(arch="mlp", hidden_size=[64])

Finally, the neural network config is passed to the DQN agent to alter the default network.

In [None]:
# Initialize the DQN agent from the network config and the default hyperparameters.
agent = DQN(
    net_config=NET_CONFIG,
    # The batch size comes from the hyperparameter table, not from the size of
    # the observation space (the two are unrelated quantities).
    batch_size=INIT_HP["BATCH_SIZE"],
    # Passed as a one-element list rather than a bare int.
    state_dim=[state_dim],
    action_dim=action_dim,
    one_hot=one_hot,
    lr=INIT_HP["LR"],
    learn_step=INIT_HP["LEARN_STEP"],
    gamma=INIT_HP["GAMMA"],
    tau=INIT_HP["TAU"],
    device=device)

## Training the DQN Agent in the CliffWalking Environment

### What is the difference in training with a DQN agent vs a Q-Learning agent?

#### Memory and Replay buffer
One difference when training an agent with a memory (replay buffer) is that the simpler agent.update(state, action, reward, next_state, done) function is decomposed into multiple function calls:

1. memory.save_to_memory(state, action, reward, next_state, done, is_vectorised=True)
2. experience = memory.sample(agent.batch_size)
3. agent.learn(experience)

This allows for a higher dimensional input used for training (for example with multiple channels or multiple observations).

#### Training steps

Training with a memory or replay buffer depends on the agent's 'batch_size'. This determines how many 'experiences' / memory samples (or steps in the environment) the memory must hold before training starts. Until the memory holds that many experiences, the agent only collects data (this can be seen as the exploration phase, as no learning takes place); after that, the agent continues taking steps while also learning (the training phase).

In [None]:
# Names of the fields stored for every transition in the replay buffer.
field_names = "state action reward next_state done".split()

In [None]:
# Replay buffer holding the transitions sampled during learning; its capacity
# comes from the hyperparameter table.
memory = ReplayBuffer(
    device=device,
    field_names=field_names,
    memory_size=INIT_HP["MEMORY_SIZE"],
)

In [None]:
# Exploration schedule: starting epsilon, its lower bound, and the decay factor.
eps_start, eps_end, eps_decay = 1.0, 0.01, 0.995


In [None]:
# Off-policy training loop for the (single-member) population ``pop``.
#
# Each outer iteration lets every agent interact with the environment for
# INIT_HP["MAX_STEPS"] vectorised steps, storing shaped rewards in the replay
# buffer and learning once the buffer holds at least one batch. Afterwards the
# agents are evaluated with ``agent.test`` and a summary is printed.
total_steps = 0
pop = [agent]
# One score accumulator per sub-environment; 1 if the env is not vectorised.
num_envs = getattr(env, "num_envs", 1)
print("Training...")
pbar = trange(INIT_HP["MAX_STEPS"], unit="step")

best_fitness = float('-inf')
episodes_without_improvement = 0
max_episodes_without_improvement = 20
# Set when early stopping fires so that the outer ``while`` loop ends as well.
stop_training = False

while np.less([agent.steps[-1] for agent in pop], INIT_HP["MAX_STEPS"]).all():
    pop_episode_scores = []
    for agent in pop:
        state, info = env.reset()
        scores = np.zeros(num_envs)
        completed_episode_scores = []
        steps = 0

        for idx_step in range(INIT_HP['MAX_STEPS']):
            # Recompute epsilon on every step so that it actually decays while
            # the agent is acting (by a factor of eps_decay per 1000 steps).
            epsilon = max(eps_end, eps_start * (eps_decay ** (total_steps / 1000)))

            action = agent.get_action(state, epsilon)
            next_state, reward, terminated, truncated, info = env.step(action)

            # Reward shaping: a small per-step penalty encourages faster
            # completion. Work on a float copy so ``reward`` stays untouched.
            modified_reward = np.asarray(reward, dtype=float) - 0.1

            # Scores track the unshaped environment reward.
            scores += np.array(reward)
            steps += 1
            total_steps += 1

            for idx, (d, t) in enumerate(zip(terminated, truncated)):
                if d or t:
                    if scores[idx] > -15:  # If performed reasonably well
                        # Bonus only for the sub-environment that finished.
                        modified_reward[idx] += 10
                    completed_episode_scores.append(scores[idx])
                    agent.scores.append(scores[idx])
                    scores[idx] = 0

            memory.save_to_memory(
                state,
                action,
                modified_reward,  # Use modified reward
                next_state,
                terminated,
                is_vectorised=True,
            )

            # Learn only once the buffer can supply a full batch.
            if len(memory) >= agent.batch_size:
                experiences = memory.sample(agent.batch_size)
                agent.learn(experiences)

            state = next_state

            if total_steps % 1000 == 0:
                avg_score = np.mean(agent.scores[-100:]) if agent.scores else 0
                print(f"\nStep {total_steps}: Average Score = {avg_score:.2f}, Epsilon = {epsilon:.3f}")

        pbar.update(INIT_HP['EVO_STEPS'] // len(pop))
        agent.steps[-1] += steps
        pop_episode_scores.append(completed_episode_scores)

        # Track the best mean episode score and checkpoint the agent on improvement.
        current_fitness = np.mean(completed_episode_scores) if completed_episode_scores else float('-inf')
        if current_fitness > best_fitness:
            best_fitness = current_fitness
            episodes_without_improvement = 0
            agent.save_checkpoint('best_cliffwalking.pt')
        else:
            episodes_without_improvement += 1

        if episodes_without_improvement >= max_episodes_without_improvement:
            print("\nEarly stopping triggered!")
            stop_training = True
            break

    # Evaluate every agent in the population.
    fitnesses = [
        agent.test(
            env,
            swap_channels=INIT_HP["CHANNELS_LAST"],
            max_steps=INIT_HP['EVAL_STEPS'],
            loop=INIT_HP['EVAL_LOOP'],
        )
        for agent in pop
    ]
    mean_scores = [
        (np.mean(episode_scores) if len(episode_scores) > 0 else "0 completed episodes")
        for episode_scores in pop_episode_scores
    ]

    print(f"\n--- Global steps {total_steps} ---")
    print(f"Steps {[agent.steps[-1] for agent in pop]}")
    print(f"Scores: {mean_scores}")
    print(f'Fitnesses: {["%.2f"%fitness for fitness in fitnesses]}')
    print(f'Best Fitness So Far: {best_fitness:.2f}')

    for agent in pop:
        agent.steps.append(agent.steps[-1])

    # Leave the training loop after the final report when early stopping fired.
    if stop_training:
        break

pbar.close()
env.close()

Training...


  0%|          | 0/100000 [00:00<?, ?step/s]


Step 1000: Average Score = 0.00, Epsilon = 1.000

Step 2000: Average Score = 0.00, Epsilon = 1.000

Step 3000: Average Score = -21557.00, Epsilon = 1.000

Step 4000: Average Score = -21557.00, Epsilon = 1.000

Step 5000: Average Score = -21557.00, Epsilon = 1.000

Step 6000: Average Score = -21557.00, Epsilon = 1.000

Step 7000: Average Score = -21557.00, Epsilon = 1.000

Step 8000: Average Score = -21557.00, Epsilon = 1.000

Step 9000: Average Score = -40291.50, Epsilon = 1.000

Step 10000: Average Score = -40291.50, Epsilon = 1.000

Step 11000: Average Score = -40291.50, Epsilon = 1.000

Step 12000: Average Score = -40291.50, Epsilon = 1.000

Step 13000: Average Score = -40291.50, Epsilon = 1.000

Step 14000: Average Score = -40291.50, Epsilon = 1.000

Step 15000: Average Score = -40291.50, Epsilon = 1.000

Step 16000: Average Score = -40291.50, Epsilon = 1.000

Step 17000: Average Score = -40291.50, Epsilon = 1.000

Step 18000: Average Score = -40291.50, Epsilon = 1.000

Step 19000

In [None]:
# Save the agent as it stands after training; this file is separate from the
# 'best_cliffwalking.pt' checkpoint written inside the training loop.
agent.save_checkpoint('CliffWalking.pt')

## Saving the Trained Weights and Evaluating the Agent

In [None]:
# Single evaluation environment, rendered as RGB arrays so that the episode
# can be recorded to video later on.
test_env = gym.make(
    "CliffWalking-v0",
    render_mode="rgb_array",
)

These wrappers are necessary when the test environment is created with the gym.make method instead of the gym.vector.make method. The gym.make method is required when using gym.wrappers.RecordVideo for testing, as it lets us set the render_mode to rgb_array. However, gym.make forces the environment to use tuples instead of arrays as the data types for observations or actions, which is incompatible with AgileRL's DQN class: it expects arrays, not tuples.

In [None]:
class ArrayObservationEnv(gym.ObservationWrapper):
    """Observation wrapper that returns every observation as a NumPy array.

    Only the ``observation`` hook is implemented; the ``reset`` and ``step``
    methods inherited from ``gym.ObservationWrapper`` pass each observation
    through it, so they do not need to be overridden here.
    """

    def observation(self, observation):
        """Convert a raw environment observation to ``np.ndarray``.

        :param observation: Observation returned by the wrapped environment
        :return: The same observation wrapped with ``np.array``
        :rtype: np.ndarray
        """
        return np.array(observation)


#wrapper for agileRL DQN

class TupletoArrayDQN:
    """Evaluation adapter around an already-trained agent.

    Holds a reference to the trained agent and provides a ``test`` method that
    wraps the given environment in ``ArrayObservationEnv`` before running the
    agent in it, so that observations reach the agent as NumPy arrays.
    """

    def __init__(self, trained_agent):
        """Store the trained agent.

        :param trained_agent: Agent providing ``get_action`` and a ``fitness`` list
        """
        self.dqn_instance = trained_agent

    def test(self, env, swap_channels=False, max_steps=None, loop=1):
        """Returns mean test score of agent in environment with a greedy policy (epsilon=0).

        The mean score is also appended to the wrapped agent's ``fitness`` list.

        :param env: The environment to be tested in
        :type env: Gym-style environment
        :param swap_channels: Swap image channels dimension from last to first [H, W, C] -> [C, H, W], defaults to False
        :type swap_channels: bool, optional
        :param max_steps: Maximum number of testing steps, defaults to None
        :type max_steps: int, optional
        :param loop: Number of testing loops/episodes to complete. The returned score is the mean over these tests. Defaults to 1
        :type loop: int, optional
        :return: Mean score over the ``loop`` test episodes
        """
        # Make the environment hand out NumPy-array observations.
        env = ArrayObservationEnv(env)
        with torch.no_grad():
            rewards = []
            num_envs = env.num_envs if hasattr(env, "num_envs") else 1
            for _ in range(loop):
                state, info = env.reset()
                scores = np.zeros(num_envs)
                completed_episode_scores = np.zeros(num_envs)
                finished = np.zeros(num_envs)
                step = 0
                while not np.all(finished):
                    if swap_channels:
                        state = np.moveaxis(state, [-1], [-3])
                    action_mask = info.get("action_mask", None)
                    action = self.dqn_instance.get_action(state, epsilon=0, action_mask=action_mask)
                    # The env is stepped with the first entry of the agent's
                    # action output, i.e. one action for one environment.
                    state, reward, done, trunc, info = env.step(action[0])
                    step += 1
                    scores += np.array(reward)
                    # done/trunc are single flags here, so they are wrapped in
                    # one-element lists to reuse the per-env bookkeeping.
                    for idx, (d, t) in enumerate(zip([done], [trunc])):
                        if (
                            d or t or (max_steps is not None and step == max_steps)
                        ) and not finished[idx]:
                            completed_episode_scores[idx] = scores[idx]
                            finished[idx] = 1
                rewards.append(np.mean(completed_episode_scores))
        mean_fit = np.mean(rewards)
        self.dqn_instance.fitness.append(mean_fit)
        return mean_fit



In [None]:
# Wrap the trained agent in the evaluation adapter defined above.
wrapped_dqn = TupletoArrayDQN(agent)

In [None]:
# Evaluate the agent while recording video with the RecordVideo wrapper.
# The episode trigger is true only for episode index 0, so a single video
# (of the first evaluation episode) is saved under ./gym_monitor_output.
test_env = gym.wrappers.RecordVideo(
    test_env, "./gym_monitor_output", episode_trigger=lambda x: x == 0)

try:
    wrapped_dqn.test(test_env, swap_channels=INIT_HP["CHANNELS_LAST"], max_steps=INIT_HP['EVAL_STEPS'])
finally:
    # Close the environment even if evaluation raises, so the recording
    # wrapper is never left open.
    test_env.close()

Moviepy - Building video /content/gym_monitor_output/rl-video-episode-0.mp4.
Moviepy - Writing video /content/gym_monitor_output/rl-video-episode-0.mp4




t:   0%|          | 0/14 [00:00<?, ?it/s, now=None][A
                                                   [A

Moviepy - Done !
Moviepy - video ready /content/gym_monitor_output/rl-video-episode-0.mp4
