In [1]:
import gymnasium as gym
import numpy as np
import torch

from deep_q_network import DQNAgent
from double_deep_q_network import DoubleDQNAgent
from double_dueling_deep_q_network import DoubleDuelingDQNAgent
from dueling_deep_q_network import DuelingDQNAgent

In [2]:
agent_config = {
    "environment": "LunarLander-v2",
#     'environment': "MountainCar-v0",
    "max_episode_steps": 300,
    "num_episodes": 200,
    "device": "cuda" if torch.cuda.is_available() else "cpu",
    "gamma": 0.99,
    "learning_rate": 0.001,
    "epsilon": 1,
    "epsilon_decrement": 0.0001,
    "min_epsilon": 0.01,
    "memory_size": 100000,
    "batch_size": 128,
    "hidden_dims": [128, 128],
    "target_update_frequency": 1000,
    "warmup_steps": 1000
}

In [3]:
agent_classes = [DQNAgent, DoubleDQNAgent, DoubleDuelingDQNAgent, DuelingDQNAgent]
agent_count = len(agent_classes)
env = gym.vector.make(
    agent_config.get("environment"),
    num_envs = agent_count,
    max_episode_steps = agent_config.get("max_episode_steps")
)
agents = list(map(lambda agent: agent(
    num_actions = env.single_action_space.n,
    input_dims = env.single_observation_space.shape,
    **agent_config
), agent_classes))

In [4]:
scores = [[] for _ in range(agent_count)]
eps_history = [[] for _ in range(agent_count)]

for i in range(agent_config.get("num_episodes")):
    complete = False
    dones = np.zeros(agent_count, dtype=bool)
    score = np.zeros(agent_count)
    observation = env.reset()[0]
    steps = 0
    
    while steps < agent_config.get("max_episode_steps") and not complete:
        action = []
        for agent, obs in zip(agents, observation):
            action.append(agent.get_action_training(obs))

        new_observation, reward, terminated, truncated, info = env.step(action)
        done = terminated | truncated
        dones |= done
        complete = np.all(dones)
        steps += 1

        for index, agent in enumerate(agents):
            if not dones[index]:
                score[index] += reward[index]
                agent.store_transition(
                    observation[index],
                    action[index],
                    reward[index],
                    new_observation[index],
                    done[index]
                )
                agent.learn()
        
        observation = new_observation
    
    r_score = []
    avg_score = []
    eps = []
    for index, agent in enumerate(agents):
        scores[index].append(score[index])
        eps_history[index].append(agent.epsilon)
        r_score.append(round(score[index], 1))
        avg_score.append(round(np.mean(scores[index][-100:]), 1))
        eps.append(round(agent.epsilon, 3))
    print(
        f"episode {i:03d}",
        f"score {r_score}",
        f"average score {avg_score}",
        f"epsilon {eps}\n",
        sep = '\n'
    )

episode 000 | score [  -9.33916223  -49.44329786   41.59748249 -155.61510371] | average score [-9.3, -49.4, 41.6, -155.6] | epsilon [1, 1, 1, 1]
episode 001 | score [-321.49721621 -222.09334854 -281.79855346 -144.60877861] | average score [-165.4, -135.8, -120.1, -150.1] | epsilon [1, 1, 1, 1]
episode 002 | score [ -44.7926607  -223.7861403     8.31906329   23.6979951 ] | average score [-125.2, -165.1, -77.3, -92.2] | epsilon [1, 1, 1, 1]
episode 003 | score [-180.27450705  -28.85257404   27.4201047   -27.41707385] | average score [-139.0, -131.0, -51.1, -76.0] | epsilon [1, 1, 1, 1]
episode 004 | score [  14.04988922 -185.53981905  -18.41034647    1.62935442] | average score [-108.4, -141.9, -44.6, -60.5] | epsilon [1, 1, 1, 1]
episode 005 | score [ -21.20164989 -125.64463829  -29.07737631  -63.13156694] | average score [-93.8, -139.2, -42.0, -60.9] | epsilon [1, 1, 1, 1]
episode 006 | score [-197.86837392  -22.92922405  -41.25514261 -188.74463075] | average score [-108.7, -122.6, -41

episode 053 | score [ 65.41762707 -52.38841777   9.19661969  17.69053565] | average score [-45.5, -53.9, -15.8, -40.2] | epsilon [0.541, 0.559, 0.571, 0.568]
episode 054 | score [56.03645702 -3.9413944  34.05856129 51.71780352] | average score [-43.7, -53.0, -14.9, -38.5] | epsilon [0.532, 0.551, 0.557, 0.558]
episode 055 | score [ 125.92928537 -163.51397856   28.27961827  -33.98377857] | average score [-40.7, -54.9, -14.1, -38.4] | epsilon [0.502, 0.542, 0.547, 0.548]
episode 056 | score [ 32.09498559  -7.2992567    3.1582444  -11.95931876] | average score [-39.4, -54.1, -13.8, -38.0] | epsilon [0.49, 0.531, 0.538, 0.522]
episode 057 | score [-11.36685005 -54.44302393  22.77462914   1.30837759] | average score [-38.9, -54.1, -13.2, -37.3] | epsilon [0.461, 0.521, 0.53, 0.508]
episode 058 | score [-77.03539571 -74.38299358  63.88185052  95.35778008] | average score [-39.5, -54.4, -11.9, -35.0] | epsilon [0.441, 0.511, 0.518, 0.49]
episode 059 | score [ 20.18260674 -79.15237859  58.7759

episode 106 | score [-27.77095302  -1.88173265   6.01182832 110.6041602 ] | average score [-32.7, -39.5, 6.0, -16.7] | epsilon [0.01, 0.058, 0.01, 0.01]
episode 107 | score [ -7.10948133 -18.40162407  26.82732303  -0.39312821] | average score [-31.2, -35.6, 6.5, -16.2] | epsilon [0.01, 0.049, 0.01, 0.01]
episode 108 | score [-366.34107906    3.68842246   73.18523451   -9.62262809] | average score [-35.2, -31.5, 7.6, -16.1] | epsilon [0.01, 0.04, 0.01, 0.01]
episode 109 | score [  24.39489957 -248.9629411    48.9950418     4.05166629] | average score [-31.9, -33.4, 7.9, -14.0] | epsilon [0.01, 0.03, 0.01, 0.01]
episode 110 | score [-191.47400353   34.85708666   50.97041228 -147.90027103] | average score [-34.1, -30.4, 8.6, -15.3] | epsilon [0.01, 0.022, 0.01, 0.01]
episode 111 | score [ -11.44288602  -16.2757383    84.42480964 -118.49745702] | average score [-33.7, -30.0, 9.2, -15.2] | epsilon [0.01, 0.014, 0.01, 0.01]
episode 112 | score [-235.23073506  -63.18573837  -21.99363027 -229.

episode 160 | score [-389.22201553   -5.1091131   -91.53168758  -78.58105182] | average score [-75.2, -2.9, 16.9, -13.2] | epsilon [0.01, 0.01, 0.01, 0.01]
episode 161 | score [-8.46409584 26.04847356 41.3088482  45.7369138 ] | average score [-77.0, -2.5, 17.1, -14.0] | epsilon [0.01, 0.01, 0.01, 0.01]
episode 162 | score [  11.77766883   24.11606668   18.81690116 -392.78895974] | average score [-77.2, -1.4, 16.5, -18.2] | epsilon [0.01, 0.01, 0.01, 0.01]
episode 163 | score [ 9.49847464 34.51696256 47.89541006 54.3931617 ] | average score [-78.7, -1.1, 16.1, -17.9] | epsilon [0.01, 0.01, 0.01, 0.01]
episode 164 | score [  -7.62991226   89.82662296 -235.76707917 -107.42947553] | average score [-78.7, 0.1, 12.9, -19.4] | epsilon [0.01, 0.01, 0.01, 0.01]
episode 165 | score [ 71.36523506 110.6096457   28.655415    31.81080385] | average score [-78.1, 0.6, 12.5, -19.6] | epsilon [0.01, 0.01, 0.01, 0.01]
episode 166 | score [-344.36940476    9.47637582   22.12506021 -567.26673849] | averag