# Simple Spread
**Objectives**: The environment has N agents and N landmarks. Agents must learn to cover all the landmarks while avoiding collisions. All agents are globally rewarded based on how far the closest agents is to each landmark (sum of the minimum distances). The agents are penalised if they collide with other agents (-1 for each collision).

**Actions**:
 - Agent observations: `[self_vel, self_pos, landmark_rel_positions, other_agent_rel_positions, communication]`
 - Agent action space: `[no_action, move_left, move_right, move_down, move_up]`

In [2]:
from pettingzoo.mpe import simple_spread_v3
num_agents = 3
env = simple_spread_v3.env(
    N=num_agents,
    max_cycles=25,
    local_ratio=0.5,
    continuous_actions=False,
    render_mode="rgb_array"
)

In [4]:
env.reset()
env.agents

['agent_0', 'agent_1', 'agent_2']

In [6]:
for agent in env.agents:
    print(agent, env.observation_space(agent))

agent_0 Box(-inf, inf, (18,), float32)
agent_1 Box(-inf, inf, (18,), float32)
agent_2 Box(-inf, inf, (18,), float32)


In [7]:
for agent in env.agents:
    print(agent, env.action_space(agent))

agent_0 Box(0.0, 1.0, (5,), float32)
agent_1 Box(0.0, 1.0, (5,), float32)
agent_2 Box(0.0, 1.0, (5,), float32)


In [13]:
actions_dict = {agent: env.action_space(agent).sample() for agent in env.agents}
actions_dict

{'agent_0': array([0.04795634, 0.5480799 , 0.96403193, 0.15368281, 0.9715407 ],
       dtype=float32),
 'agent_1': array([0.6359698 , 0.8304244 , 0.3023392 , 0.18200669, 0.73282206],
       dtype=float32),
 'agent_2': array([0.6605648 , 0.38431332, 0.78608483, 0.5651855 , 0.51340795],
       dtype=float32)}

In [16]:
obs, rew, term, trunc, info = env.step(actions_dict)
rew

defaultdict(int,
            {'agent_0': -0.9531965289987081,
             'agent_1': -0.9531965289987081,
             'agent_2': -0.9531965289987081})

In [18]:
import os
from typing import Optional, Tuple

import gymnasium as gym
import numpy as np
import torch
from tianshou.data import Collector, VectorReplayBuffer
from tianshou.env import DummyVectorEnv
from tianshou.env.pettingzoo_env import PettingZooEnv
from tianshou.policy import BasePolicy, DQNPolicy, MultiAgentPolicyManager
from tianshou.trainer import offpolicy_trainer
from tianshou.utils.net.common import Net

from pettingzoo.mpe import simple_spread_v3

In [19]:
def _get_env():
    """This functions is needed to provide callables for DummyVectorEnv"""
    return PettingZooEnv(simple_spread_v3.env(N=num_agents, max_cycles=25, local_ratio=0.5, continuous_actions=False,
                                              render_mode="rgb_array"))

In [20]:
def _get_agents(
    agent_0: Optional[BasePolicy] = None,
    agent_1: Optional[BasePolicy] = None,
    agent_2: Optional[BasePolicy] = None,
    optim: Optional[torch.optim.Optimizer] = None
) -> Tuple[BasePolicy, torch.optim.Optimizer, list]:
    env = _get_env()
    observation_space = (
        env.observation_space["observation"]
        if isinstance(env.observation_space, gym.spaces.Dict)
        else env.observation_space
    )
    if agent_0 is None:
        # model
        net = Net(
            state_shape=observation_space.shape or observation_space.n,
            action_shape=env.action_space.shape or env.action_space.n,
            hidden_sizes=[128, 128, 128, 128],
            device="cuda" if torch.cuda.is_available() else "cpu"
        ).to("cuda" if torch.cuda.is_available() else "cpu")
        if optim is None:
            optim = torch.optim.Adam(net.parameters(), lr=1e-4)
        agent_0 = DQNPolicy(
            model=net,
            optim=optim,
            discount_factor=0.9,
            estimation_step=3,
            target_update_freq=320
        )

    if agent_1 is None:
        # model
        net = Net(
            state_shape=observation_space.shape or observation_space.n,
            action_shape=env.action_space.shape or env.action_space.n,
            hidden_sizes=[128, 128, 128, 128],
            device="cuda" if torch.cuda.is_available() else "cpu"
        ).to("cuda" if torch.cuda.is_available() else "cpu")
        if optim is None:
            optim = torch.optim.Adam(net.parameters(), lr=1e-4)
        agent_1 = DQNPolicy(
            model=net,
            optim=optim,
            discount_factor=0.9,
            estimation_step=3,
            target_update_freq=320
        )

    if agent_2 is None:
        # model
        net = Net(
            state_shape=observation_space.shape or observation_space.n,
            action_shape=env.action_space.shape or env.action_space.n,
            hidden_sizes=[128, 128, 128, 128],
            device="cuda" if torch.cuda.is_available() else "cpu"
        ).to("cuda" if torch.cuda.is_available() else "cpu")
        if optim is None:
            optim = torch.optim.Adam(net.parameters(), lr=1e-4)
        agent_2 = DQNPolicy(
            model=net,
            optim=optim,
            discount_factor=0.9,
            estimation_step=3,
            target_update_freq=320
        )
    
    agents = [agent_0, agent_1, agent_2]
    policy = MultiAgentPolicyManager(agents, env)
    return policy, optim, env.agents

In [21]:
# step 1: environment setup
train_envs = DummyVectorEnv([_get_env for _ in range(10)])
test_envs = DummyVectorEnv([_get_env for _ in range(10)])

# seed
seed = 1
np.random.seed(seed)
torch.manual_seed(seed)
train_envs.seed(seed)
test_envs.seed(seed)

# step 2: agent setup
policy, optim, agents = _get_agents()

# step 3: collector setup
train_collector = Collector(
    policy,
    train_envs,
    VectorReplayBuffer(20_000, len(train_envs)),
    exploration_noise=True
)
test_collector = Collector(policy, test_envs, exploration_noise=True)
# policy.set_eps(1)
train_collector.collect(n_step=64 * 10)  # batch size * training_num

{'n/ep': 0,
 'n/st': 640,
 'rews': array([], dtype=float64),
 'lens': array([], dtype=int64),
 'idxs': array([], dtype=int64),
 'rew': 0,
 'len': 0,
 'rew_std': 0,
 'len_std': 0}

In [22]:
# step 4: callback functions setup
def save_best_fn(policy):
    model_save_path = os.path.join("log", "ttt", "dqn", "policy.pth")
    os.makedirs(os.path.join("log", "ttt", "dqn"), exist_ok=True)
    torch.save(policy.policies[agents[1]].state_dict(), model_save_path)

def stop_fn(mean_rewards):
    return mean_rewards >= 0.6

def train_fn(epoch, env_step):
    policy.policies[agents[1]].set_eps(0.1)

def test_fn(epoch, env_step):
    policy.policies[agents[1]].set_eps(0.05)

def reward_metric(rews):
    return rews[:, 1]

In [23]:
# step 5: run the trainer
result = offpolicy_trainer(
    policy=policy,
    train_collector=train_collector,
    test_collector=test_collector,
    max_epoch=50,
    step_per_epoch=1000,
    step_per_collect=50,
    episode_per_test=10,
    batch_size=64,
    train_fn=train_fn,
    test_fn=test_fn,
    stop_fn=stop_fn,
    save_best_fn=save_best_fn,
    update_per_step=0.1,
    test_in_train=False,
    reward_metric=reward_metric
)

Epoch #1: 1001it [00:02, 426.11it/s, agent_0/loss=2.717, agent_1/loss=3.971, agent_2/loss=4.893, env_step=1000, len=75, n/ep=0, n/st=50, rew=-42.51]                          


Epoch #1: test_reward: -54.758826 ± 6.949453, best_reward: -54.758826 ± 6.949453 in #1


Epoch #2: 1001it [00:01, 761.58it/s, agent_0/loss=0.384, agent_1/loss=3.748, agent_2/loss=4.782, env_step=2000, len=75, n/ep=0, n/st=50, rew=-50.09]                          


Epoch #2: test_reward: -49.356117 ± 15.470015, best_reward: -49.356117 ± 15.470015 in #2


Epoch #3: 1001it [00:01, 817.88it/s, agent_0/loss=0.175, agent_1/loss=4.299, agent_2/loss=5.307, env_step=3000, len=75, n/ep=0, n/st=50, rew=-56.99]                          


Epoch #3: test_reward: -44.184034 ± 10.165206, best_reward: -44.184034 ± 10.165206 in #3


Epoch #4: 1001it [00:01, 758.93it/s, agent_0/loss=0.265, agent_1/loss=4.054, agent_2/loss=4.918, env_step=4000, len=75, n/ep=0, n/st=50, rew=-33.50]                          


Epoch #4: test_reward: -45.363291 ± 11.261698, best_reward: -44.184034 ± 10.165206 in #3


Epoch #5: 1001it [00:01, 770.37it/s, agent_0/loss=0.157, agent_1/loss=3.725, agent_2/loss=4.734, env_step=5000, len=75, n/ep=0, n/st=50, rew=-48.04]                          


Epoch #5: test_reward: -35.171375 ± 8.218110, best_reward: -35.171375 ± 8.218110 in #5


Epoch #6: 1001it [00:01, 762.90it/s, agent_0/loss=0.113, agent_1/loss=3.578, agent_2/loss=4.774, env_step=6000, len=75, n/ep=0, n/st=50, rew=-40.39]                          


Epoch #6: test_reward: -46.202707 ± 19.275439, best_reward: -35.171375 ± 8.218110 in #5


Epoch #7: 1001it [00:01, 814.60it/s, agent_0/loss=0.163, agent_1/loss=3.590, agent_2/loss=4.667, env_step=7000, len=75, n/ep=0, n/st=50, rew=-43.27]                          


Epoch #7: test_reward: -41.866087 ± 14.821008, best_reward: -35.171375 ± 8.218110 in #5


Epoch #8: 1001it [00:01, 724.26it/s, agent_0/loss=0.099, agent_1/loss=3.565, agent_2/loss=4.297, env_step=8000, len=75, n/ep=0, n/st=50, rew=-32.92]                          


Epoch #8: test_reward: -31.854395 ± 7.635090, best_reward: -31.854395 ± 7.635090 in #8


Epoch #9: 1001it [00:01, 797.52it/s, agent_0/loss=0.084, agent_1/loss=3.229, agent_2/loss=4.040, env_step=9000, len=75, n/ep=0, n/st=50, rew=-31.50]                          


Epoch #9: test_reward: -29.234326 ± 8.850037, best_reward: -29.234326 ± 8.850037 in #9


Epoch #10: 1001it [00:01, 805.39it/s, agent_0/loss=0.094, agent_1/loss=3.152, agent_2/loss=3.912, env_step=10000, len=75, n/ep=0, n/st=50, rew=-34.81]                          


Epoch #10: test_reward: -32.056833 ± 9.479975, best_reward: -29.234326 ± 8.850037 in #9


Epoch #11: 1001it [00:01, 734.14it/s, agent_0/loss=0.081, agent_1/loss=3.054, agent_2/loss=3.740, env_step=11000, len=75, n/ep=0, n/st=50, rew=-28.79]                          


Epoch #11: test_reward: -26.349857 ± 7.885603, best_reward: -26.349857 ± 7.885603 in #11


Epoch #12: 1001it [00:01, 692.58it/s, agent_0/loss=0.078, agent_1/loss=2.746, agent_2/loss=3.566, env_step=12000, len=75, n/ep=0, n/st=50, rew=-30.48]                          


Epoch #12: test_reward: -26.649252 ± 5.461832, best_reward: -26.349857 ± 7.885603 in #11


Epoch #13: 1001it [00:01, 678.76it/s, agent_0/loss=0.079, agent_1/loss=2.859, agent_2/loss=3.423, env_step=13000, len=75, n/ep=0, n/st=50, rew=-28.04]                          


Epoch #13: test_reward: -28.265462 ± 6.061333, best_reward: -26.349857 ± 7.885603 in #11


Epoch #14: 1001it [00:01, 805.51it/s, agent_0/loss=0.076, agent_1/loss=2.673, agent_2/loss=3.071, env_step=14000, len=75, n/ep=0, n/st=50, rew=-35.01]                          


Epoch #14: test_reward: -30.838810 ± 5.559723, best_reward: -26.349857 ± 7.885603 in #11


Epoch #15: 1001it [00:01, 638.03it/s, agent_0/loss=0.065, agent_1/loss=2.505, agent_2/loss=3.377, env_step=15000, len=75, n/ep=0, n/st=50, rew=-30.03]                          


Epoch #15: test_reward: -35.313719 ± 9.884449, best_reward: -26.349857 ± 7.885603 in #11


Epoch #16: 1001it [00:01, 765.68it/s, agent_0/loss=0.058, agent_1/loss=2.540, agent_2/loss=3.202, env_step=16000, len=75, n/ep=0, n/st=50, rew=-26.74]                           


Epoch #16: test_reward: -29.586622 ± 6.451127, best_reward: -26.349857 ± 7.885603 in #11


Epoch #17: 1001it [00:01, 637.58it/s, agent_0/loss=0.077, agent_1/loss=2.654, agent_2/loss=3.113, env_step=17000, len=75, n/ep=0, n/st=50, rew=-35.11]                          


Epoch #17: test_reward: -25.845640 ± 5.199224, best_reward: -25.845640 ± 5.199224 in #17


Epoch #18: 1001it [00:01, 570.29it/s, agent_0/loss=0.058, agent_1/loss=2.338, agent_2/loss=2.894, env_step=18000, len=75, n/ep=0, n/st=50, rew=-24.92]                          


Epoch #18: test_reward: -34.975870 ± 9.858120, best_reward: -25.845640 ± 5.199224 in #17


Epoch #19: 1001it [00:01, 671.76it/s, agent_0/loss=0.056, agent_1/loss=2.260, agent_2/loss=2.734, env_step=19000, len=75, n/ep=0, n/st=50, rew=-32.68]                          


Epoch #19: test_reward: -26.179120 ± 5.436357, best_reward: -25.845640 ± 5.199224 in #17


Epoch #20: 1001it [00:01, 662.65it/s, agent_0/loss=0.065, agent_1/loss=2.243, agent_2/loss=2.811, env_step=20000, len=75, n/ep=0, n/st=50, rew=-27.55]                          


Epoch #20: test_reward: -26.089033 ± 3.771193, best_reward: -25.845640 ± 5.199224 in #17


Epoch #21: 1001it [00:01, 709.98it/s, agent_0/loss=0.062, agent_1/loss=2.125, agent_2/loss=2.648, env_step=21000, len=75, n/ep=0, n/st=50, rew=-32.14]                          


Epoch #21: test_reward: -28.867360 ± 4.164922, best_reward: -25.845640 ± 5.199224 in #17


Epoch #22: 1001it [00:01, 750.90it/s, agent_0/loss=0.056, agent_1/loss=2.206, agent_2/loss=2.500, env_step=22000, len=75, n/ep=0, n/st=50, rew=-36.01]                          


Epoch #22: test_reward: -39.779976 ± 11.813746, best_reward: -25.845640 ± 5.199224 in #17


Epoch #23: 1001it [00:01, 751.37it/s, agent_0/loss=0.071, agent_1/loss=1.859, agent_2/loss=2.412, env_step=23000, len=75, n/ep=0, n/st=50, rew=-33.76]                          


Epoch #23: test_reward: -23.624669 ± 6.479593, best_reward: -23.624669 ± 6.479593 in #23


Epoch #24: 1001it [00:01, 649.39it/s, agent_0/loss=0.061, agent_1/loss=1.734, agent_2/loss=2.212, env_step=24000, len=75, n/ep=0, n/st=50, rew=-33.83]                          


Epoch #24: test_reward: -53.410803 ± 18.618010, best_reward: -23.624669 ± 6.479593 in #23


Epoch #25: 1001it [00:01, 647.68it/s, agent_0/loss=0.054, agent_1/loss=1.766, agent_2/loss=2.145, env_step=25000, len=75, n/ep=0, n/st=50, rew=-36.80]                          


Epoch #25: test_reward: -28.359638 ± 7.337721, best_reward: -23.624669 ± 6.479593 in #23


Epoch #26: 1001it [00:01, 596.94it/s, agent_0/loss=0.055, agent_1/loss=1.732, agent_2/loss=2.073, env_step=26000, len=75, n/ep=0, n/st=50, rew=-27.33]                          


Epoch #26: test_reward: -28.433332 ± 7.092926, best_reward: -23.624669 ± 6.479593 in #23


Epoch #27: 1001it [00:01, 693.12it/s, agent_0/loss=0.059, agent_1/loss=1.616, agent_2/loss=1.873, env_step=27000, len=75, n/ep=0, n/st=50, rew=-31.82]                          


Epoch #27: test_reward: -24.846154 ± 5.358418, best_reward: -23.624669 ± 6.479593 in #23


Epoch #28: 1001it [00:01, 601.01it/s, agent_0/loss=0.048, agent_1/loss=1.518, agent_2/loss=1.932, env_step=28000, len=75, n/ep=0, n/st=50, rew=-26.34]                          


Epoch #28: test_reward: -27.528777 ± 6.484642, best_reward: -23.624669 ± 6.479593 in #23


Epoch #29: 1001it [00:01, 772.73it/s, agent_0/loss=0.048, agent_1/loss=1.500, agent_2/loss=1.780, env_step=29000, len=75, n/ep=0, n/st=50, rew=-25.69]                          


Epoch #29: test_reward: -25.529261 ± 6.156468, best_reward: -23.624669 ± 6.479593 in #23


Epoch #30: 1001it [00:01, 846.09it/s, agent_0/loss=0.061, agent_1/loss=1.411, agent_2/loss=1.806, env_step=30000, len=75, n/ep=0, n/st=50, rew=-27.87]                          


Epoch #30: test_reward: -26.007010 ± 6.470722, best_reward: -23.624669 ± 6.479593 in #23


Epoch #31: 1001it [00:01, 921.45it/s, agent_0/loss=0.048, agent_1/loss=1.468, agent_2/loss=1.777, env_step=31000, len=75, n/ep=0, n/st=50, rew=-27.54]                           


Epoch #31: test_reward: -30.486789 ± 6.675625, best_reward: -23.624669 ± 6.479593 in #23


Epoch #32: 1001it [00:01, 847.46it/s, agent_0/loss=0.056, agent_1/loss=1.410, agent_2/loss=1.749, env_step=32000, len=75, n/ep=0, n/st=50, rew=-24.98]                          


Epoch #32: test_reward: -25.977754 ± 5.829596, best_reward: -23.624669 ± 6.479593 in #23


Epoch #33: 1001it [00:01, 906.12it/s, agent_0/loss=0.043, agent_1/loss=1.402, agent_2/loss=1.750, env_step=33000, len=75, n/ep=0, n/st=50, rew=-25.18]                           


Epoch #33: test_reward: -25.166128 ± 5.788446, best_reward: -23.624669 ± 6.479593 in #23


Epoch #34: 1001it [00:01, 847.06it/s, agent_0/loss=0.051, agent_1/loss=1.402, agent_2/loss=1.728, env_step=34000, len=75, n/ep=0, n/st=50, rew=-26.19]                           


Epoch #34: test_reward: -27.932991 ± 5.647160, best_reward: -23.624669 ± 6.479593 in #23


Epoch #35: 1001it [00:01, 862.83it/s, agent_0/loss=0.051, agent_1/loss=1.364, agent_2/loss=1.685, env_step=35000, len=75, n/ep=0, n/st=50, rew=-27.99]                          


Epoch #35: test_reward: -22.464294 ± 5.597708, best_reward: -22.464294 ± 5.597708 in #35


Epoch #36: 1001it [00:01, 649.41it/s, agent_0/loss=0.043, agent_1/loss=1.306, agent_2/loss=1.650, env_step=36000, len=75, n/ep=0, n/st=50, rew=-28.33]                          


Epoch #36: test_reward: -25.157637 ± 6.099965, best_reward: -22.464294 ± 5.597708 in #35


Epoch #37: 1001it [00:01, 620.15it/s, agent_0/loss=0.069, agent_1/loss=1.348, agent_2/loss=1.683, env_step=37000, len=75, n/ep=0, n/st=50, rew=-25.90]                          


Epoch #37: test_reward: -25.498684 ± 6.396474, best_reward: -22.464294 ± 5.597708 in #35


Epoch #38: 1001it [00:01, 711.32it/s, agent_0/loss=0.044, agent_1/loss=1.296, agent_2/loss=1.643, env_step=38000, len=75, n/ep=0, n/st=50, rew=-22.94]                          


Epoch #38: test_reward: -24.945632 ± 3.146161, best_reward: -22.464294 ± 5.597708 in #35


Epoch #39: 1001it [00:01, 703.89it/s, agent_0/loss=0.056, agent_1/loss=1.340, agent_2/loss=1.609, env_step=39000, len=75, n/ep=0, n/st=50, rew=-22.92]                          


Epoch #39: test_reward: -28.402356 ± 7.832578, best_reward: -22.464294 ± 5.597708 in #35


Epoch #40: 1001it [00:01, 756.16it/s, agent_0/loss=0.085, agent_1/loss=1.311, agent_2/loss=1.530, env_step=40000, len=75, n/ep=0, n/st=50, rew=-25.14]                           


Epoch #40: test_reward: -29.217290 ± 7.980253, best_reward: -22.464294 ± 5.597708 in #35


Epoch #41: 1001it [00:01, 643.45it/s, agent_0/loss=0.041, agent_1/loss=1.238, agent_2/loss=1.554, env_step=41000, len=75, n/ep=0, n/st=50, rew=-31.02]                          


Epoch #41: test_reward: -24.201059 ± 6.162457, best_reward: -22.464294 ± 5.597708 in #35


Epoch #42: 1001it [00:01, 653.25it/s, agent_0/loss=0.046, agent_1/loss=1.242, agent_2/loss=1.494, env_step=42000, len=75, n/ep=0, n/st=50, rew=-24.81]                          


Epoch #42: test_reward: -24.360615 ± 8.092796, best_reward: -22.464294 ± 5.597708 in #35


Epoch #43: 1001it [00:01, 726.75it/s, agent_0/loss=0.062, agent_1/loss=1.212, agent_2/loss=1.444, env_step=43000, len=75, n/ep=0, n/st=50, rew=-23.09]                          


Epoch #43: test_reward: -24.986697 ± 4.972593, best_reward: -22.464294 ± 5.597708 in #35


Epoch #44: 1001it [00:01, 662.90it/s, agent_0/loss=0.045, agent_1/loss=1.133, agent_2/loss=1.411, env_step=44000, len=75, n/ep=0, n/st=50, rew=-24.42]                          


Epoch #44: test_reward: -22.908279 ± 4.583182, best_reward: -22.464294 ± 5.597708 in #35


Epoch #45: 1001it [00:01, 792.24it/s, agent_0/loss=0.038, agent_1/loss=1.098, agent_2/loss=1.384, env_step=45000, len=75, n/ep=0, n/st=50, rew=-25.54]                          


Epoch #45: test_reward: -23.523949 ± 5.705251, best_reward: -22.464294 ± 5.597708 in #35


Epoch #46: 1001it [00:01, 763.17it/s, agent_0/loss=0.039, agent_1/loss=1.050, agent_2/loss=1.259, env_step=46000, len=75, n/ep=0, n/st=50, rew=-23.37]                          


Epoch #46: test_reward: -28.353237 ± 5.231560, best_reward: -22.464294 ± 5.597708 in #35


Epoch #47: 1001it [00:01, 801.52it/s, agent_0/loss=0.037, agent_1/loss=1.047, agent_2/loss=1.225, env_step=47000, len=75, n/ep=0, n/st=50, rew=-27.74]                          


Epoch #47: test_reward: -25.642611 ± 6.526083, best_reward: -22.464294 ± 5.597708 in #35


Epoch #48: 1001it [00:01, 747.09it/s, agent_0/loss=0.065, agent_1/loss=0.988, agent_2/loss=1.208, env_step=48000, len=75, n/ep=0, n/st=50, rew=-28.05]                          


Epoch #48: test_reward: -25.677173 ± 4.513942, best_reward: -22.464294 ± 5.597708 in #35


Epoch #49: 1001it [00:01, 791.36it/s, agent_0/loss=0.046, agent_1/loss=0.973, agent_2/loss=1.237, env_step=49000, len=75, n/ep=0, n/st=50, rew=-25.48]                           


Epoch #49: test_reward: -26.104573 ± 7.729232, best_reward: -22.464294 ± 5.597708 in #35


Epoch #50: 1001it [00:01, 791.19it/s, agent_0/loss=0.037, agent_1/loss=0.976, agent_2/loss=1.190, env_step=50000, len=75, n/ep=0, n/st=50, rew=-23.07]                          


Epoch #50: test_reward: -27.509545 ± 4.035955, best_reward: -22.464294 ± 5.597708 in #35


In [24]:
# return result, policy.policies[agents[1]]
print(f"\n==========Result==========\n{result}")


{'duration': '81.47s', 'train_time/model': '52.40s', 'test_step': 38250, 'test_episode': 510, 'test_time': '11.27s', 'test_speed': '3393.63 step/s', 'best_reward': -22.464293915369872, 'best_result': '-22.46 ± 5.60', 'train_step': 50000, 'train_episode': 670, 'train_time/collector': '17.80s', 'train_speed': '712.30 step/s'}
