In [1]:
import gymnasium as gym
import numpy as np
import torch

from deep_q_network import DQNAgent
from double_deep_q_network import DoubleDQNAgent
from double_dueling_deep_q_network import DoubleDuelingDQNAgent
from dueling_deep_q_network import DuelingDQNAgent

In [2]:
agent_config = {
    "environment": "LunarLander-v2",
#     'environment': "MountainCar-v0",
    "max_episode_steps": 300,
    "num_episodes": 200,
    "device": "cuda" if torch.cuda.is_available() else "cpu",
    "gamma": 0.99,
    "learning_rate": 0.001,
    "epsilon": 1,
    "epsilon_decrement": 0.0005,
    "min_epsilon": 0.01,
    "memory_size": 100000,
    "batch_size": 128,
    "hidden_dims": [128, 128],
    "target_update_frequency": 1000,
    "warmup_steps": 1000
}

In [3]:
agent_classes = [DQNAgent, DoubleDQNAgent, DoubleDuelingDQNAgent, DuelingDQNAgent]
agent_count = len(agent_classes)
env = gym.vector.make(
    agent_config.get("environment"),
    num_envs = agent_count,
    max_episode_steps = agent_config.get("max_episode_steps")
)
agents = list(map(lambda agent: agent(
    num_actions = env.single_action_space.n,
    input_dims = env.single_observation_space.shape,
    **agent_config
), agent_classes))

In [4]:
scores = [[] for _ in range(agent_count)]
eps_history = [[] for _ in range(agent_count)]

for i in range(agent_config.get("num_episodes")):
    complete = False
    dones = np.zeros(agent_count, dtype=bool)
    score = np.zeros(agent_count)
    observation = env.reset()[0]
    steps = 0
    
    while steps < agent_config.get("max_episode_steps") and not complete:
        action = []
        for agent, obs in zip(agents, observation):
            action.append(agent.get_action_training(obs))

        new_observation, reward, terminated, truncated, info = env.step(action)
        done = terminated | truncated
        dones |= done
        complete = np.all(dones)
        steps += 1

        for index, agent in enumerate(agents):
            if not dones[index]:
                score[index] += reward[index]
                agent.store_transition(
                    observation[index],
                    action[index],
                    reward[index],
                    new_observation[index],
                    done[index]
                )
                agent.learn()
        
        observation = new_observation
    
    r_score = []
    avg_score = []
    eps = []
    for index, agent in enumerate(agents):
        scores[index].append(score[index])
        eps_history[index].append(agent.epsilon)
        r_score.append(round(score[index], 1))
        avg_score.append(round(np.mean(scores[index][-100:]), 1))
        eps.append(round(agent.epsilon, 3))
    print(
        f"episode {i:03d}",
        f"score {r_score}",
        f"average score {avg_score}",
        f"epsilon {eps}\n",
        sep = '\n'
    )

episode 000
score [1.6, -53.9, -88.1, -33.2]
average score [1.6, -53.9, -88.1, -33.2]
epsilon [1, 1, 1, 1]

episode 001
score [-35.9, -25.3, 10.6, -46.5]
average score [-17.1, -39.6, -38.8, -39.9]
epsilon [1, 1, 1, 1]

episode 002
score [-255.1, -90.5, -10.0, -23.5]
average score [-96.5, -56.5, -29.2, -34.4]
epsilon [1, 1, 1, 1]

episode 003
score [23.9, -10.3, 22.5, -0.5]
average score [-66.4, -45.0, -16.3, -25.9]
epsilon [1, 1, 1, 1]

episode 004
score [-239.5, -33.5, -21.6, -28.1]
average score [-101.0, -42.7, -17.3, -26.4]
epsilon [1, 1, 1, 1]

episode 005
score [-288.0, 10.6, -27.1, 65.5]
average score [-132.2, -33.8, -19.0, -11.1]
epsilon [1, 1, 1, 1]

episode 006
score [30.7, -17.4, 24.0, -45.2]
average score [-108.9, -31.5, -12.8, -15.9]
epsilon [1, 1, 1, 1]

episode 007
score [-49.6, -58.5, -263.0, -3.6]
average score [-101.5, -34.8, -44.1, -14.4]
epsilon [1, 1, 1, 1]

episode 008
score [-36.4, -67.6, -94.5, -174.0]
average score [-94.3, -38.5, -49.7, -32.1]
epsilon [1, 1, 1, 

episode 069
score [30.1, 48.4, 7.6, 116.2]
average score [11.8, -7.4, -43.4, 8.1]
epsilon [0.01, 0.01, 0.01, 0.01]

episode 070
score [1.6, 20.3, -15.2, 56.1]
average score [11.6, -7.0, -43.0, 8.8]
epsilon [0.01, 0.01, 0.01, 0.01]

episode 071
score [40.4, -44.7, 22.1, 47.8]
average score [12.0, -7.5, -42.1, 9.3]
epsilon [0.01, 0.01, 0.01, 0.01]

episode 072
score [-24.9, -13.1, -8.5, -4.7]
average score [11.5, -7.6, -41.7, 9.1]
epsilon [0.01, 0.01, 0.01, 0.01]

episode 073
score [-30.9, 88.2, 4.0, 19.3]
average score [11.0, -6.3, -41.1, 9.3]
epsilon [0.01, 0.01, 0.01, 0.01]

episode 074
score [-30.9, -4.9, -17.8, -10.9]
average score [10.4, -6.3, -40.7, 9.0]
epsilon [0.01, 0.01, 0.01, 0.01]

episode 075
score [72.2, 48.1, -10.0, -8.5]
average score [11.2, -5.6, -40.3, 8.8]
epsilon [0.01, 0.01, 0.01, 0.01]

episode 076
score [6.7, 29.9, -29.0, 46.2]
average score [11.2, -5.1, -40.2, 9.3]
epsilon [0.01, 0.01, 0.01, 0.01]

episode 077
score [-2.0, 23.4, -14.1, 52.8]
average score [11.0, 

episode 140
score [-0.7, -16.8, -42.9, 108.5]
average score [5.7, 13.3, -6.7, 26.8]
epsilon [0.01, 0.01, 0.01, 0.01]

episode 141
score [-13.2, -10.6, 0.1, 6.1]
average score [5.6, 13.5, -7.1, 26.0]
epsilon [0.01, 0.01, 0.01, 0.01]

episode 142
score [-20.7, -31.2, -17.6, 25.7]
average score [5.0, 13.3, -7.6, 26.0]
epsilon [0.01, 0.01, 0.01, 0.01]

episode 143
score [-13.1, 20.6, -9.4, -19.8]
average score [4.1, 13.3, -7.2, 25.8]
epsilon [0.01, 0.01, 0.01, 0.01]

episode 144
score [-28.8, -38.9, -15.3, -32.1]
average score [3.3, 12.8, -8.0, 26.3]
epsilon [0.01, 0.01, 0.01, 0.01]

episode 145
score [-19.4, -43.5, -47.7, 38.8]
average score [2.1, 12.4, -8.2, 26.6]
epsilon [0.01, 0.01, 0.01, 0.01]

episode 146
score [-10.4, -3.6, -25.4, 51.7]
average score [1.2, 12.9, -8.4, 26.7]
epsilon [0.01, 0.01, 0.01, 0.01]

episode 147
score [-11.7, -21.2, -39.6, 25.0]
average score [0.6, 12.7, -8.3, 27.2]
epsilon [0.01, 0.01, 0.01, 0.01]

episode 148
score [-7.7, 43.9, -57.7, 36.1]
average score [0