In [1]:
import gymnasium as gym
import numpy as np
import torch

from deep_q_network import DQNAgent
from double_deep_q_network import DoubleDQNAgent
from double_dueling_deep_q_network import DoubleDuelingDQNAgent
from dueling_deep_q_network import DuelingDQNAgent

In [2]:
agent_config = {
#     "environment": "LunarLander-v2",
    'environment': "MountainCar-v0",
    "max_episode_steps": 300,
    "num_episodes": 200,
    "device": "cuda" if torch.cuda.is_available() else "cpu",
    "gamma": 0.99,
    "learning_rate": 0.001,
    "epsilon": 1,
    "epsilon_decrement": 0.0001,
    "min_epsilon": 0.01,
    "memory_size": 100000,
    "batch_size": 64,
    "hidden_dims": [64, 32],
    "target_update_frequecy": 1000,
}

In [3]:
agent_classes = [DQNAgent, DoubleDQNAgent]
agent_count = len(agent_classes)
env = gym.vector.make(
    agent_config.get("environment"),
    num_envs = agent_count,
    max_episode_steps = agent_config.get("max_episode_steps")
)
agents = list(map(lambda agent: agent(
    num_actions = env.single_action_space.n,
    input_dims = env.single_observation_space.shape,
    **agent_config
), agent_classes))

In [4]:
scores = [[] for _ in range(agent_count)]
eps_history = [[] for _ in range(agent_count)]

for i in range(agent_config.get("num_episodes")):
    complete = False
    dones = np.zeros(agent_count, dtype=bool)
    score = np.zeros(agent_count)
    observation = env.reset()[0]
    steps = 0
    
    while steps < agent_config.get("max_episode_steps") and not complete:
        action = []
        for agent, obs in zip(agents, observation):
            action.append(agent.get_action_training(obs))

        new_observation, reward, terminated, truncated, info = env.step(action)
        done = terminated | truncated
        dones |= done
        complete = np.all(done)
        steps += 1

        for index, agent in enumerate(agents):
            if not dones[index]:
                score[index] += reward[index]
                agent.store_transition(
                    observation[index],
                    action[index],
                    reward[index],
                    new_observation[index],
                    done[index]
                )
                agent.learn()
        
        observation = new_observation
    
    r_score = []
    avg_score = []
    eps = []
    for index, agent in enumerate(agents):
        scores[index].append(score[index])
        eps_history[index].append(agent.epsilon)
        r_score.append(round(score[index], 1))
        avg_score.append(round(np.mean(scores[index][-100:]), 1))
        eps.append(round(agent.epsilon, 3))
    print(
        f"episode {i:03d} |",
        f"score {score} |",
        f"average score {avg_score} |",
        f"epsilon {eps}"
    )

episode 000 | score [-299. -299.] | average score [-299.0, -299.0] | epsilon [0.976, 0.976]
episode 001 | score [-299. -299.] | average score [-299.0, -299.0] | epsilon [0.947, 0.947]
episode 002 | score [-299. -299.] | average score [-299.0, -299.0] | epsilon [0.917, 0.917]
episode 003 | score [-299. -299.] | average score [-299.0, -299.0] | epsilon [0.887, 0.887]
episode 004 | score [-299. -299.] | average score [-299.0, -299.0] | epsilon [0.857, 0.857]
episode 005 | score [-299. -299.] | average score [-299.0, -299.0] | epsilon [0.827, 0.827]
episode 006 | score [-299. -299.] | average score [-299.0, -299.0] | epsilon [0.797, 0.797]
episode 007 | score [-299. -299.] | average score [-299.0, -299.0] | epsilon [0.767, 0.767]
episode 008 | score [-299. -299.] | average score [-299.0, -299.0] | epsilon [0.737, 0.737]
episode 009 | score [-299. -299.] | average score [-299.0, -299.0] | epsilon [0.707, 0.707]
episode 010 | score [-299. -299.] | average score [-299.0, -299.0] | epsilon [0.

episode 091 | score [-299. -299.] | average score [-299.0, -289.5] | epsilon [0.01, 0.01]
episode 092 | score [-299. -228.] | average score [-299.0, -288.9] | epsilon [0.01, 0.01]
episode 093 | score [-299. -208.] | average score [-299.0, -288.0] | epsilon [0.01, 0.01]
episode 094 | score [-299. -169.] | average score [-299.0, -286.8] | epsilon [0.01, 0.01]
episode 095 | score [-299. -146.] | average score [-299.0, -285.3] | epsilon [0.01, 0.01]
episode 096 | score [-299. -189.] | average score [-299.0, -284.3] | epsilon [0.01, 0.01]
episode 097 | score [-299. -183.] | average score [-299.0, -283.3] | epsilon [0.01, 0.01]
episode 098 | score [-299. -192.] | average score [-299.0, -282.3] | epsilon [0.01, 0.01]
episode 099 | score [-299. -142.] | average score [-299.0, -280.9] | epsilon [0.01, 0.01]
episode 100 | score [-299. -144.] | average score [-299.0, -279.4] | epsilon [0.01, 0.01]
episode 101 | score [-299. -210.] | average score [-299.0, -278.5] | epsilon [0.01, 0.01]
episode 10

episode 183 | score [-299. -299.] | average score [-299.0, -191.6] | epsilon [0.01, 0.01]
episode 184 | score [-299. -299.] | average score [-299.0, -191.6] | epsilon [0.01, 0.01]
episode 185 | score [-299. -299.] | average score [-299.0, -192.6] | epsilon [0.01, 0.01]
episode 186 | score [-299. -299.] | average score [-299.0, -192.8] | epsilon [0.01, 0.01]
episode 187 | score [-299. -299.] | average score [-299.0, -193.1] | epsilon [0.01, 0.01]
episode 188 | score [-299. -171.] | average score [-299.0, -192.0] | epsilon [0.01, 0.01]
episode 189 | score [-299. -299.] | average score [-299.0, -192.9] | epsilon [0.01, 0.01]
episode 190 | score [-299. -299.] | average score [-299.0, -194.4] | epsilon [0.01, 0.01]
episode 191 | score [-299. -299.] | average score [-299.0, -194.4] | epsilon [0.01, 0.01]
episode 192 | score [-299. -299.] | average score [-299.0, -195.1] | epsilon [0.01, 0.01]
episode 193 | score [-299. -299.] | average score [-299.0, -196.0] | epsilon [0.01, 0.01]
episode 19