In [None]:
"""This is a minimal example of using Tianshou with MARL to train agents.

Author: Will (https://github.com/WillDudley)

Python version used: 3.8.10

Requirements:
pettingzoo == 1.22.0
git+https://github.com/thu-ml/tianshou
"""

import os
from typing import Optional, Tuple

import gymnasium as gym
import numpy as np
import torch
from tianshou.data import Collector, VectorReplayBuffer
from tianshou.env import DummyVectorEnv
from tianshou.env.pettingzoo_env import PettingZooEnv
from tianshou.policy import BasePolicy, DQNPolicy, MultiAgentPolicyManager, RandomPolicy
from tianshou.trainer import offpolicy_trainer
from tianshou.utils.net.common import Net

from pettingzoo.classic import tictactoe_v3


def _get_agents(
    agent_learn: Optional[BasePolicy] = None,
    agent_opponent: Optional[BasePolicy] = None,
    optim: Optional[torch.optim.Optimizer] = None,
) -> Tuple[BasePolicy, torch.optim.Optimizer, list]:
    env = _get_env()
    observation_space = (
        env.observation_space["observation"]
        if isinstance(env.observation_space, gym.spaces.Dict)
        else env.observation_space
    )
    if agent_learn is None:
        # model
        net = Net(
            state_shape=observation_space.shape or observation_space.n,
            action_shape=env.action_space.shape or env.action_space.n,
            hidden_sizes=[128, 128, 128, 128],
            device="cuda" if torch.cuda.is_available() else "cpu",
        ).to("cuda" if torch.cuda.is_available() else "cpu")
        if optim is None:
            optim = torch.optim.Adam(net.parameters(), lr=1e-4)
        agent_learn = DQNPolicy(
            model=net,
            optim=optim,
            discount_factor=0.9,
            estimation_step=3,
            target_update_freq=320,
        )

    if agent_opponent is None:
        agent_opponent = RandomPolicy()

    agents = [agent_opponent, agent_learn]
    policy = MultiAgentPolicyManager(agents, env)
    return policy, optim, env.agents


def _get_env():
    """This function is needed to provide callables for DummyVectorEnv."""
    return PettingZooEnv(tictactoe_v3.env())


if __name__ == "__main__":
    # ======== Step 1: Environment setup =========
    train_envs = DummyVectorEnv([_get_env for _ in range(10)])
    test_envs = DummyVectorEnv([_get_env for _ in range(10)])

    # seed
    seed = 1
    np.random.seed(seed)
    torch.manual_seed(seed)
    train_envs.seed(seed)
    test_envs.seed(seed)

    # ======== Step 2: Agent setup =========
    policy, optim, agents = _get_agents()

    # ======== Step 3: Collector setup =========
    train_collector = Collector(
        policy,
        train_envs,
        VectorReplayBuffer(20_000, len(train_envs)),
        exploration_noise=True,
    )
    test_collector = Collector(policy, test_envs, exploration_noise=True)
    # policy.set_eps(1)
    train_collector.collect(n_step=64 * 10)  # batch size * training_num

    # ======== Step 4: Callback functions setup =========
    def save_best_fn(policy):
        model_save_path = os.path.join("log", "rps", "dqn", "policy.pth")
        os.makedirs(os.path.join("log", "rps", "dqn"), exist_ok=True)
        torch.save(policy.policies[agents[1]].state_dict(), model_save_path)

    def stop_fn(mean_rewards):
        return mean_rewards >= 0.6

    def train_fn(epoch, env_step):
        policy.policies[agents[1]].set_eps(0.1)

    def test_fn(epoch, env_step):
        policy.policies[agents[1]].set_eps(0.05)

    def reward_metric(rews):
        return rews[:, 1]

    # ======== Step 5: Run the trainer =========
    result = offpolicy_trainer(
        policy=policy,
        train_collector=train_collector,
        test_collector=test_collector,
        max_epoch=50,
        step_per_epoch=1000,
        step_per_collect=50,
        episode_per_test=10,
        batch_size=64,
        train_fn=train_fn,
        test_fn=test_fn,
        stop_fn=stop_fn,
        save_best_fn=save_best_fn,
        update_per_step=0.1,
        test_in_train=False,
        reward_metric=reward_metric,
    )

    # return result, policy.policies[agents[1]]
    print(f"\n==========Result==========\n{result}")
    print("\n(the trained policy can be accessed via policy.policies[agents[1]])")


In [None]:
from environment import Environment

env = Environment.VehicleJobSchedulingEnvACE()
env.reset()
for agent in env.agent_iter(10000):
    env.step(env.action_space(agent).sample())


In [None]:
import jax.numpy as jnp
from jax import grad, jit, vmap
from jax import random


In [None]:
key = random.PRNGKey(0)
x = random.normal(key, (10,))
print(x)


In [None]:
size = 3000
x = random.normal(key, (size, size), dtype=jnp.float32)
%timeit jnp.dot(x, x.T).block_until_ready()  # runs on the GPU


In [None]:
def selu(x, alpha=1.67, lmbda=1.05):
  return lmbda * jnp.where(x > 0, x, alpha * jnp.exp(x) - alpha)

x = random.normal(key, (1000000,))
%timeit selu(x).block_until_ready()


In [None]:
selu_jit = jit(selu)
%timeit selu_jit(x).block_until_ready()


In [2]:
from environment import environment_jax

env = environment_jax.VehicleJobSchedulingEnvACE()


AttributeError: module 'jax.numpy' has no attribute 'random'

In [None]:
from environment import Environment

env = Environment.VehicleJobSchedulingEnvACE()
env.reset()


In [None]:
env.parameters.cluster.machines[1].observe()


In [7]:
import cProfile
from pettingzoo.test import performance_benchmark
from environment import Environment
for i in range(25,40,5):
    for j in range(10,100,20):
        print("duration: ", i, " average_per_slot: ", j)
        para = Environment.VehicleJobSchedulingParameters(duration=i,average_per_slot=j)
        env = Environment.VehicleJobSchedulingEnvACE()
        performance_benchmark(env)


duration:  25  average_per_slot:  10
Starting performance benchmark
12679.175816391342 turns per second
1056.5979846992784 cycles per second
Finished performance benchmark
duration:  25  average_per_slot:  30
Starting performance benchmark
12731.171350055083 turns per second
1060.9309458379237 cycles per second
Finished performance benchmark
duration:  25  average_per_slot:  50
Starting performance benchmark
12692.710303681068 turns per second
1057.725858640089 cycles per second
Finished performance benchmark
duration:  25  average_per_slot:  70
Starting performance benchmark
12447.760684349141 turns per second
1037.3133903624284 cycles per second
Finished performance benchmark
duration:  25  average_per_slot:  90
Starting performance benchmark
12489.932789005727 turns per second
1040.8277324171438 cycles per second
Finished performance benchmark
duration:  30  average_per_slot:  10
Starting performance benchmark
12766.805595751817 turns per second
1063.9004663126514 cycles per second


In [3]:
import cProfile
from tkinter import E
from pettingzoo.test import performance_benchmark, api_test
from environment import Environment

for i in range(10,100,10):
    para = Environment.VehicleJobSchedulingParameters(average_per_slot=i, duration=30)
    env = Environment.VehicleJobSchedulingEnvACE(parameter=para)
    env.reset()
    for agent in env.agent_iter(100000):
        env.step(env.action_space(agent).sample())   
    print("Finish Rate: ",env.finished_job/env.total_job)


Finish Rate:  0.8738916256157635
Finish Rate:  0.837245696400626
Finish Rate:  0.7435723951285521
Finish Rate:  0.6604767879548307
Finish Rate:  0.5661592505854801
Finish Rate:  0.5196389771017884
Finish Rate:  0.4667235494880546
Finish Rate:  0.43742098609355246
Finish Rate:  0.3821138211382114


In [None]:
for agent in env.agent_iter(10):
    action = 1
    env.step(action)
    print(action)


In [None]:
print("env job finished rate", env.finished_job/env.total_job)


In [None]:
from environment import Environment, AllocationMechanism, Machine

env = Environment.VehicleJobSchedulingEnvACE()
for job in env.get_job_next_step():
    job1 = job
    break
cluster = env.parameters.cluster

fp = AllocationMechanism.FirstPrice()
bids = Machine.Bids(cluster,job1)


In [None]:
bids.request_bids()
bids.get_bids()
bids.bids


In [None]:
from tianshou.data import Collector
from tianshou.env import DummyVectorEnv
from tianshou.policy import MultiAgentPolicyManager, RandomPolicy

import tianshou_setup

env = tianshou_setup.get_env()
policies = MultiAgentPolicyManager(
    [
        RandomPolicy(
            observation_space=env.observation_space, action_space=env.action_space
        )
        for _ in range(len(env.agents))
    ],
    env,
)


In [None]:
pol = RandomPolicy(
            observation_space=env.observation_space, action_space=env.action_space
        )


In [None]:
pol.forward()


In [None]:
import time

from tianshou.data import Collector
from tianshou.env import DummyVectorEnv, SubprocVectorEnv
from tianshou.policy import MultiAgentPolicyManager

import tianshou_setup
from static_policy.TruthfulPolicy import TruthfulPolicy

env = tianshou_setup.get_env_continous()


def get_DummyVectorEnv_n(n):
    return DummyVectorEnv([lambda: env for _ in range(n)])


def get_SubprocVectorEnv_n(n):
    return SubprocVectorEnv([lambda: env for _ in range(n)])


def get_policy_manager():
    env = tianshou_setup.get_env_continous()
    policies = MultiAgentPolicyManager(
        [
            TruthfulPolicy(
                observation_space=env.observation_space, action_space=env.action_space
            )
            for _ in range(len(env.agents))
        ],
        env,
    )
    return policies


In [None]:
n = 10
env = get_DummyVectorEnv_n(n)
policies = get_policy_manager()
collector = Collector(policies, env)


In [None]:
# Execute one episode
start = time.time()
result = collector.collect(n_episode=10)
end = time.time()
print(f"Time elapsed: {end-start}")


In [None]:
tp = policies.policies['Machine_0']


In [5]:
def FinishRate(average_per_slot=10,machine_numbers=10):
    from environment import Environment
    para = Environment.VehicleJobSchedulingParameters(average_per_slot=average_per_slot, machine_numbers=machine_numbers)
    env = Environment.VehicleJobSchedulingEnvACE(parameter=para)
    env.reset()
    for agent in env.agent_iter(100000):
        env.step(env.action_space(agent).sample())
    print("Finish Rate: ",env.finished_job/env.total_job)


In [14]:
for average in range(1,40,2):
    print("average: ", average)
    FinishRate(average_per_slot=average,machine_numbers=6)


average:  1
Finish Rate:  0.8653846153846154
average:  3
Finish Rate:  0.9018181818181819
average:  5
Finish Rate:  0.8770161290322581
average:  7
Finish Rate:  0.8705547652916074
average:  9
Finish Rate:  0.84251968503937
average:  11
Finish Rate:  0.8443396226415094
average:  13
Finish Rate:  0.7830609212481426
average:  15
Finish Rate:  0.7724665391969407
average:  17
Finish Rate:  0.7380410022779044
average:  19
Finish Rate:  0.7107085775173149
average:  21
Finish Rate:  0.7013487475915221
average:  23
Finish Rate:  0.6526458616010855
average:  25
Finish Rate:  0.6032372680615871
average:  27
Finish Rate:  0.603460972017673
average:  29
Finish Rate:  0.5652474904811353
average:  31
Finish Rate:  0.5144059566202654
average:  33
Finish Rate:  0.5041398344066237
average:  35
Finish Rate:  0.48461755574372
average:  37
Finish Rate:  0.45884827767551933
average:  39
Finish Rate:  0.43522372528616027


In [16]:
for average in range(2,69,6):
    print("average: ", average)
    FinishRate(average_per_slot=average,machine_numbers=12)


average:  2
Finish Rate:  0.9214659685863874
average:  8
Finish Rate:  0.8761467889908257
average:  14
Finish Rate:  0.8304446119065562
average:  20
Finish Rate:  0.8105749486652978
average:  26
Finish Rate:  0.7671130952380952
average:  32
Finish Rate:  0.7190490892250695
average:  38
Finish Rate:  0.685372340425532
average:  44
Finish Rate:  0.6315431679129844
average:  50
Finish Rate:  0.5862412761714856
average:  56
Finish Rate:  0.5447884227880564
average:  62
Finish Rate:  0.4992874109263658
average:  68
Finish Rate:  0.4751783590963139


In [26]:
for average in range(70,76,1):
    print("average: ", average)
    FinishRate(average_per_slot=average,machine_numbers=24)


average:  70
Finish Rate:  0.6954293037163606
average:  71
Finish Rate:  0.6863938053097345
average:  72
Finish Rate:  0.6639825303671353
average:  73
Finish Rate:  0.6614386154678205
average:  74
Finish Rate:  0.6646570203644159
average:  75
Finish Rate:  0.6658644203770557


In [27]:
FinishRate(average_per_slot=120, machine_numbers=24)


Finish Rate:  0.523389159310288


In [10]:
for machine in [6,12,24]:
    for average in range(10,100,10):
        print("machine: ", machine, " average: ", average)
        FinishRate(average,machine)


machine:  6  average:  10
Finish Rate:  0.8121442125237192
machine:  6  average:  20
Finish Rate:  0.6793587174348698
machine:  6  average:  30
Finish Rate:  0.535857908847185
machine:  6  average:  40
Finish Rate:  0.45218476903870164
machine:  6  average:  50
Finish Rate:  0.36667995217218013
machine:  6  average:  60
Finish Rate:  0.30702179176755445
machine:  6  average:  70
Finish Rate:  0.27015032211882606
machine:  6  average:  80
Finish Rate:  0.2350432710397592
machine:  6  average:  90
Finish Rate:  0.21080903310677482
machine:  12  average:  10
Finish Rate:  0.8653100775193798
machine:  12  average:  20
Finish Rate:  0.8021148036253777
machine:  12  average:  30
Finish Rate:  0.7311827956989247
machine:  12  average:  40
Finish Rate:  0.6471898984897252
machine:  12  average:  50
Finish Rate:  0.5836909871244635
machine:  12  average:  60
Finish Rate:  0.5137344669718771
machine:  12  average:  70
Finish Rate:  0.46066282420749277
machine:  12  average:  80
Finish Rate:  0.4

In [28]:
import rllib_sac
import ray

ray.init()
rllib_sac.train_sac(30,6)


2023-09-30 11:12:30,610	INFO worker.py:1627 -- Started a local Ray instance. View the dashboard at [1m[32m127.0.0.1:8266 [39m[22m


{'extra_python_environs_for_driver': {}, 'extra_python_environs_for_worker': {}, 'num_gpus': 1, 'num_cpus_per_worker': 1, 'num_gpus_per_worker': 0, '_fake_gpus': False, 'num_learner_workers': 0, 'num_gpus_per_learner_worker': 0, 'num_cpus_per_learner_worker': 1, 'local_gpu_idx': 0, 'custom_resources_per_worker': {}, 'placement_strategy': 'PACK', 'eager_tracing': False, 'eager_max_retraces': 20, 'tf_session_args': {'intra_op_parallelism_threads': 2, 'inter_op_parallelism_threads': 2, 'gpu_options': {'allow_growth': True}, 'log_device_placement': False, 'device_count': {'CPU': 1}, 'allow_soft_placement': True}, 'local_tf_session_args': {'intra_op_parallelism_threads': 8, 'inter_op_parallelism_threads': 8}, 'env': 'VJS_6_30', 'env_config': {}, 'observation_space': None, 'action_space': None, 'env_task_fn': None, 'render_env': False, 'clip_rewards': None, 'normalize_actions': True, 'clip_actions': False, 'disable_env_checking': True, 'is_atari': None, 'auto_wrap_old_gym_envs': True, 'num_e

0,1
Current time:,2023-09-30 11:13:32
Running for:,00:01:00.12
Memory:,43.3/251.8 GiB

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
SAC_VJS_6_30_31da1_00000,RUNNING,192.168.3.6:818076,1,42.4586,110755,39178100.0,44564600.0,36943000.0,11075.5


[2m[36m(SAC pid=818076)[0m 2023-09-30 11:12:38,979	INFO algorithm.py:536 -- Current log_level is WARN. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.
[2m[36m(SAC pid=818076)[0m 2023-09-30 11:12:49,208	INFO trainable.py:173 -- Trainable.setup took 10.231 seconds. If your trainable is slow to initialize, consider setting reuse_actors=True to reduce actor creation overheads.


Trial name,agent_timesteps_total,connector_metrics,counters,custom_metrics,date,done,episode_len_mean,episode_media,episode_reward_max,episode_reward_mean,episode_reward_min,episodes_this_iter,episodes_total,hostname,info,iterations_since_restore,node_ip,num_agent_steps_sampled,num_agent_steps_trained,num_env_steps_sampled,num_env_steps_sampled_this_iter,num_env_steps_sampled_throughput_per_sec,num_env_steps_trained,num_env_steps_trained_this_iter,num_env_steps_trained_throughput_per_sec,num_faulty_episodes,num_healthy_workers,num_in_flight_async_reqs,num_remote_worker_restarts,num_steps_trained_this_iter,perf,pid,policy_reward_max,policy_reward_mean,policy_reward_min,sampler_perf,sampler_results,time_since_restore,time_this_iter_s,time_total_s,timers,timestamp,timesteps_total,training_iteration,trial_id
SAC_VJS_6_30_31da1_00000,110755,"{'ObsPreprocessorConnector_ms': 0.005630652109781901, 'StateBufferConnector_ms': 0.0044707457224528, 'ViewRequirementAgentConnector_ms': 0.12716094652811685}","{'num_env_steps_sampled': 110755, 'num_env_steps_trained': 258, 'num_agent_steps_sampled': 110755, 'num_agent_steps_trained': 258, 'last_target_update_ts': 110755, 'num_target_updates': 1}",{},2023-09-30_11-13-31,False,11075.5,{},44564600.0,39178100.0,36943000.0,10,10,lwh-Super-Server,"{'learner': {'Machine_0': {'custom_metrics': {}, 'learner_stats': {'actor_loss': 0.6877908706665039, 'critic_loss': 5141.9453125, 'alpha_loss': 0.0, 'alpha_value': 0.99970007, 'log_alpha_value': -0.00029999993, 'target_entropy': -1.0, 'policy_t': 0.760205090045929, 'mean_q': -0.07945867627859116, 'max_q': 0.14960968494415283, 'min_q': -0.28345489501953125}, 'model': {}, 'num_grad_updates_lifetime': 1.0, 'diff_num_grad_updates_vs_sampler_policy': 0.0, 'td_error': array([4.0665527e+04, 5.4440305e-02, 1.1243813e+00, 4.8830611e-01,  1.2530667e+00, 8.2524086e+04, 1.6063209e+00, 1.3335884e+00,  2.1081969e-02, 1.2101886e+00, 1.6074028e+00, 1.6620512e+00,  2.8746176e-01, 9.6830225e-01, 1.3710349e+00, 1.3568031e+00,  3.3793747e-01, 1.1528366e+00, 3.2543977e+04, 1.5321329e+00,  7.0614624e-01, 4.7325560e-01, 1.6111774e+00, 7.7656716e-02,  9.8603183e-01, 1.4736912e+00, 6.5347090e+04, 1.3899674e+00,  1.4403410e+00, 1.2815191e+00, 1.1749709e-01, 1.5255405e+00,  7.3588169e-01, 1.5035071e+00, 2.0056548e+00, 1.6878421e+00,  1.7585278e+00, 6.7455769e-01, 3.8693756e-01, 7.8697848e-01,  2.9450893e-01, 2.4779024e+00, 4.0819579e-01], dtype=float32), 'mean_td_error': 5142.36865234375}, 'Machine_1': {'custom_metrics': {}, 'learner_stats': {'actor_loss': 8.174642562866211, 'critic_loss': 5674.10498046875, 'alpha_loss': 0.0, 'alpha_value': 1.0003, 'log_alpha_value': 0.00029999996, 'target_entropy': -1.0, 'policy_t': -0.05043136700987816, 'mean_q': 5.080766677856445, 'max_q': 5.831745147705078, 'min_q': 3.950099229812622}, 'model': {}, 'num_grad_updates_lifetime': 1.0, 'diff_num_grad_updates_vs_sampler_policy': 0.0, 'td_error': array([1.7305450e+05, 5.4912758e+00, 1.2189648e+01, 9.5074043e+00,  1.2270411e+01, 1.0431404e+01, 7.0626703e+04, 4.0478263e+00,  5.1994123e+00, 3.8193402e+00, 1.2340724e+01, 6.5207343e+00,  9.0678062e+00, 8.2051783e+00, 4.2511396e+00, 9.2943678e+00,  1.2387680e+01, 3.9857445e+00, 9.8472404e+00, 3.8245883e+00,  1.2343034e+01, 4.1130624e+00, 1.0952567e+01, 1.1791536e+01,  4.2248945e+00, 9.5534048e+00, 1.2315926e+01, 3.4921072e+00,  1.2722763e+01, 1.2999750e+01, 3.5853291e+00, 4.1947675e+00,  5.3616371e+00, 1.0939082e+01, 3.1626801e+00, 3.6698041e+00,  1.2001844e+01, 1.0544850e+01, 3.7296329e+00, 3.9835219e+00,  1.2194521e+01, 3.7314506e+00, 1.1973492e+01], dtype=float32), 'mean_td_error': 5674.591796875}, 'Machine_2': {'custom_metrics': {}, 'learner_stats': {'actor_loss': -0.07856561243534088, 'critic_loss': 2327.988525390625, 'alpha_loss': 0.0, 'alpha_value': 0.99970007, 'log_alpha_value': -0.00029999996, 'target_entropy': -1.0, 'policy_t': 0.23459558188915253, 'mean_q': 0.02755546011030674, 'max_q': 0.7291316986083984, 'min_q': -0.7601333856582642}, 'model': {}, 'num_grad_updates_lifetime': 1.0, 'diff_num_grad_updates_vs_sampler_policy': 0.0, 'td_error': array([5.8758605e-01, 8.0582562e+04, 4.1056852e+00, 1.9486709e+04,  3.5430676e-01, 3.3099914e+00, 1.3380589e+00, 6.8220079e-01,  2.3633614e+00, 9.1002727e-01, 1.6930327e-01, 4.0229198e-01,  8.2213491e-01, 3.5676429e-01, 3.2816434e-01, 6.7999506e-01,  2.5508823e+00, 1.2836016e+00, 3.3226304e+00, 8.2756311e-01,  6.8102640e-01, 2.8707761e-01, 1.7927976e+00, 8.5441250e-01,  5.3936410e-01, 1.8339012e+00, 3.1306183e-01, 6.4521837e-01,  1.5526724e+00, 4.1644615e-01, 1.7772462e+00, 2.1593404e+00,  9.9099046e-01, 2.9086629e-01, 3.8315945e+00, 4.5075154e-01,  3.4123871e+00, 1.3278430e+00, 1.0399585e+00, 6.2660277e-01,  1.4658800e+00, 7.8512627e-01, 2.6628187e-01], dtype=float32), 'mean_td_error': 2328.3955078125}, 'Machine_3': {'custom_metrics': {}, 'learner_stats': {'actor_loss': 0.26429811120033264, 'critic_loss': 2172.265869140625, 'alpha_loss': 0.0, 'alpha_value': 0.99970007, 'log_alpha_value': -0.00029999996, 'target_entropy': -1.0, 'policy_t': 0.35043472051620483, 'mean_q': -0.23442591726779938, 'max_q': -0.04695901274681091, 'min_q': -0.44547539949417114}, 'model': {}, 'num_grad_updates_lifetime': 1.0, 'diff_num_grad_updates_vs_sampler_policy': 0.0, 'td_error': array([1.7923430e+00, 5.7504106e-01, 1.9454247e-01, 6.2672466e-01,  5.2029550e-01, 7.1262741e-01, 1.6206093e-01, 1.3882105e+00,  5.7498455e-01, 6.1877477e-01, 8.4421903e-01, 6.5886045e-01,  7.6197994e-01, 6.5907938e+04, 4.2457312e-01, 1.7565420e+00,  2.9460657e-01, 7.2171390e-01, 7.5608397e-01, 1.3203483e+00,  3.1762588e-01, 5.5810446e-01, 2.7485277e+04, 1.9721593e-01,  6.4488834e-01, 7.1502197e-01, 4.9310675e-01, 4.4309089e-01,  2.9362506e-01, 3.3387545e-01, 4.9404278e-01, 3.1843308e-01,  1.1998708e+00, 2.2215376e+00, 6.3516480e-01, 3.7205288e-01,  1.0699122e+00, 2.4758452e-01, 6.7817312e-01, 5.6747800e-01,  2.1458712e+00, 1.2268114e+00, 7.3482060e-01], dtype=float32), 'mean_td_error': 2172.647216796875}, 'Machine_4': {'custom_metrics': {}, 'learner_stats': {'actor_loss': 15.428377151489258, 'critic_loss': 2081.624755859375, 'alpha_loss': 0.0, 'alpha_value': 1.0003, 'log_alpha_value': 0.0003, 'target_entropy': -1.0, 'policy_t': 0.9788821935653687, 'mean_q': 6.983963489532471, 'max_q': 7.0963521003723145, 'min_q': 6.492913246154785}, 'model': {}, 'num_grad_updates_lifetime': 1.0, 'diff_num_grad_updates_vs_sampler_policy': 0.0, 'td_error': array([1.06187582e+01, 1.04036846e+01, 1.11610632e+01, 1.08321686e+01,  1.05897369e+01, 1.13831577e+01, 1.03485432e+01, 9.34339046e+00,  1.00174732e+01, 3.58909619e+03, 1.06706810e+01, 1.02796345e+01,  1.03148298e+01, 9.64211845e+00, 1.11754723e+01, 1.09429226e+01,  9.17383194e+00, 1.14269009e+01, 1.13760605e+01, 1.00352516e+01,  1.14164133e+01, 1.14908838e+01, 1.11179123e+01, 1.12190189e+01,  1.11555882e+01, 7.04760000e+04, 1.13282681e+01, 1.13002310e+01,  1.12677288e+01, 1.10680885e+01, 9.39256859e+00, 1.06918478e+01,  1.07006235e+01, 1.07546120e+01, 1.04136829e+01, 1.50342031e+04,  1.11747332e+01, 1.14203873e+01, 1.11885738e+01, 1.15197659e+01,  1.13213043e+01, 1.10898743e+01, 1.12960320e+01], dtype=float32), 'mean_td_error': 2082.12451171875}, 'Machine_5': {'custom_metrics': {}, 'learner_stats': {'actor_loss': 1.1621485948562622, 'critic_loss': 6825.7041015625, 'alpha_loss': 0.0, 'alpha_value': 0.99970007, 'log_alpha_value': -0.00029999996, 'target_entropy': -1.0, 'policy_t': -0.027112172916531563, 'mean_q': -1.792523741722107, 'max_q': -1.2957220077514648, 'min_q': -2.3339755535125732}, 'model': {}, 'num_grad_updates_lifetime': 1.0, 'diff_num_grad_updates_vs_sampler_policy': 0.0, 'td_error': array([1.23245016e+05, 1.89375520e+00, 1.45384383e+00, 1.37705553e+00,  1.11858439e+00, 1.69661248e+00, 1.64045048e+00, 1.67600369e+00,  1.85766935e+00, 1.73400164e+00, 1.70464540e+00, 1.89115191e+00,  1.81754947e+00, 1.36852145e+00, 8.21454688e+04, 1.05796814e+00,  1.30691934e+00, 1.21382844e+00, 1.90995109e+00, 1.37077820e+00,  2.02329254e+00, 1.05481291e+00, 1.51228583e+00, 1.64141321e+00,  1.07416439e+00, 1.87606573e+00, 8.80733125e+04, 2.01847625e+00,  1.93169868e+00, 1.66582131e+00, 1.47454548e+00, 1.52608204e+00,  1.14475298e+00, 2.00179100e+00, 2.07290244e+00, 1.32667160e+00,  2.01289749e+00, 1.34144616e+00, 1.17963648e+00, 1.03355098e+00,  1.08432078e+00, 1.45389867e+00, 1.27924371e+00], dtype=float32), 'mean_td_error': 6826.17724609375}}, 'num_env_steps_sampled': 110755, 'num_env_steps_trained': 258, 'num_agent_steps_sampled': 110755, 'num_agent_steps_trained': 258, 'last_target_update_ts': 110755, 'num_target_updates': 1}",1,192.168.3.6,110755,258,110755,110755,2610.32,258,258,6.08065,0,10,0,0,258,"{'cpu_util_percent': 18.28444444444444, 'ram_util_percent': 17.008888888888894, 'gpu_util_percent0': 0.5993333333333333, 'vram_util_percent0': 0.8309172453703706, 'gpu_util_percent1': 0.00044444444444444447, 'vram_util_percent1': 0.00032552083333333337}",818076,"{'Machine_2': 9906780.447603539, 'Machine_5': 9387044.388747254, 'Machine_4': 5783864.870646471, 'Machine_3': 4257275.229611814, 'Machine_0': 6479269.19855386, 'Machine_1': 10473960.10680602}","{'Machine_2': 8781241.356379366, 'Machine_5': 8778715.720527632, 'Machine_4': 3571719.427174209, 'Machine_3': 3603660.816933346, 'Machine_0': 5007179.5825243415, 'Machine_1': 9435590.829809576}","{'Machine_2': 7798972.249257327, 'Machine_5': 7372601.303536683, 'Machine_4': 1725123.1581086365, 'Machine_3': 2638908.0997323096, 'Machine_0': 4304654.594082534, 'Machine_1': 8631553.000595061}","{'mean_raw_obs_processing_ms': 0.3016310477046461, 'mean_inference_ms': 1.250758361285411, 'mean_action_processing_ms': 0.20352159306542755, 'mean_env_wait_ms': 0.12070695063656237, 'mean_env_render_ms': 0.0}","{'episode_reward_max': 44564626.56391185, 'episode_reward_min': 36942991.72150791, 'episode_reward_mean': 39178107.733348474, 'episode_len_mean': 11075.5, 'episode_media': {}, 'episodes_this_iter': 10, 'policy_reward_min': {'Machine_2': 7798972.249257327, 'Machine_5': 7372601.303536683, 'Machine_4': 1725123.1581086365, 'Machine_3': 2638908.0997323096, 'Machine_0': 4304654.594082534, 'Machine_1': 8631553.000595061}, 'policy_reward_max': {'Machine_2': 9906780.447603539, 'Machine_5': 9387044.388747254, 'Machine_4': 5783864.870646471, 'Machine_3': 4257275.229611814, 'Machine_0': 6479269.19855386, 'Machine_1': 10473960.10680602}, 'policy_reward_mean': {'Machine_2': 8781241.356379366, 'Machine_5': 8778715.720527632, 'Machine_4': 3571719.427174209, 'Machine_3': 3603660.816933346, 'Machine_0': 5007179.5825243415, 'Machine_1': 9435590.829809576}, 'custom_metrics': {}, 'hist_stats': {'episode_reward': [40930859.60915342, 39703735.29956111, 37442999.90479669, 37302525.277771175, 38430229.64975363, 41841952.29985553, 37406038.56926617, 44564626.56391185, 36942991.72150791, 37215118.4379073], 'episode_lengths': [11180, 11144, 11037, 10723, 11293, 10861, 11090, 11322, 10971, 11134], 'policy_Machine_2_reward': [9056348.17918519, 9771461.09495578, 7798972.249257327, 8726141.921534631, 8426430.359457463, 9311726.048077293, 7974784.807999194, 9906780.447603539, 8285436.02737236, 8554332.428350898], 'policy_Machine_5_reward': [9224877.104262536, 7372601.303536683, 8413679.63646838, 8468394.90954867, 8586144.418477328, 9387044.388747254, 8980337.414001409, 8991111.905710343, 9157631.267742252, 9205334.856781462], 'policy_Machine_4_reward': [2873377.8017750904, 5441767.677134634, 2507373.8305901196, 3297191.9611710208, 4429675.712143716, 3996167.6708685714, 3050046.116148341, 5783864.870646471, 2612605.4731554906, 1725123.1581086365], 'policy_Machine_3_reward': [4116593.231486857, 3626056.774901122, 4041732.521159202, 3648031.5458087325, 2638908.0997323096, 3294535.898992002, 4040316.5909834504, 4257275.229611814, 2755603.725015521, 3617554.5516424477], 'policy_Machine_0_reward': [5185703.1856377125, 4552182.874254763, 4304654.594082534, 4489938.501571476, 4738112.4010511935, 6479269.19855386, 4729000.639538705, 5922831.132450521, 4658215.6642844975, 5011887.63381815], 'policy_Machine_1_reward': [10473960.10680602, 8939665.574778136, 10376587.073239097, 8672826.438136635, 9610958.658891607, 9373209.094616553, 8631553.000595061, 9702762.977889169, 9473499.563937776, 9100885.809205705]}, 'sampler_perf': {'mean_raw_obs_processing_ms': 0.3016310477046461, 'mean_inference_ms': 1.250758361285411, 'mean_action_processing_ms': 0.20352159306542755, 'mean_env_wait_ms': 0.12070695063656237, 'mean_env_render_ms': 0.0}, 'num_faulty_episodes': 0, 'connector_metrics': {'ObsPreprocessorConnector_ms': 0.005630652109781901, 'StateBufferConnector_ms': 0.0044707457224528, 'ViewRequirementAgentConnector_ms': 0.12716094652811685}}",42.4586,42.4586,42.4586,"{'training_iteration_time_ms': 42429.443, 'sample_time_ms': 21824.654, 'load_time_ms': 47.391, 'load_throughput': 5444.024, 'learn_time_ms': 341.266, 'learn_throughput': 756.009, 'synch_weights_time_ms': 67.445}",1696043611,110755,1,31da1_00000


2023-09-30 11:13:42,053	INFO tune.py:1111 -- Total run time: 70.25 seconds (60.03 seconds for the tuning loop).
Resume experiment with: tune.run(..., resume=True)


[2m[36m(RolloutWorker pid=820220)[0m Traceback (most recent call last):
[2m[36m(RolloutWorker pid=820220)[0m   File "python/ray/_raylet.pyx", line 1438, in ray._raylet.execute_task
[2m[36m(RolloutWorker pid=820220)[0m   File "python/ray/_raylet.pyx", line 1378, in ray._raylet.execute_task.function_executor
[2m[36m(RolloutWorker pid=820220)[0m   File "/home/yuan/ResMan/man/lib/python3.9/site-packages/ray/_private/function_manager.py", line 724, in actor_method_executor
[2m[36m(RolloutWorker pid=820220)[0m     return method(__ray_actor, *args, **kwargs)
[2m[36m(RolloutWorker pid=820220)[0m   File "/home/yuan/ResMan/man/lib/python3.9/site-packages/ray/util/tracing/tracing_helper.py", line 464, in _resume_span
[2m[36m(RolloutWorker pid=820220)[0m     return method(self, *_args, **_kwargs)
[2m[36m(RolloutWorker pid=820220)[0m   File "/home/yuan/ResMan/man/lib/python3.9/site-packages/ray/rllib/utils/actor_manager.py", line 176, in apply
[2m[36m(RolloutWorker pid=820

In [1]:
import numpy as np
con = np.array([1])


In [3]:
con[0]


1

## Random

In [7]:
from environment import Environment
def get_env_continuous(average_per_slot=50, machine_num=12):
    para = Environment.VehicleJobSchedulingParameters(
        average_per_slot=average_per_slot, machine_numbers=machine_num
    )
    para.action_space_continuous = True
    env = Environment.VehicleJobSchedulingEnvACE(parameter=para)
    return env


In [10]:
finished = False
env = get_env_continuous(20,6)
env.reset()
reward_sums = {a: 0.0 for a in env.possible_agents}
for agent in env.agent_iter():
    observation, reward, termination, truncation, info = env.last()
    reward_sums[agent] += reward
    env.step(env.action_space(agent).sample())
    
print("rewards:")
print(reward_sums)
print("total reward:")
print(sum(reward_sums.values()))


rewards:
{'Machine_0': 1443952.3514621258, 'Machine_1': 3046659.2624404957, 'Machine_2': 2705154.313986431, 'Machine_3': 1563793.850536108, 'Machine_4': 3356951.536629907, 'Machine_5': 3638268.5951493443}
total reward:
15754779.910204412


In [3]:
env.finished_job/env.total_job


0.8440643863179075

In [9]:
env = get_env_continuous(20,6)
env.reset()
reward_sums = {a: 0.0 for a in env.possible_agents}
for agent in env.agent_iter():
    observation, reward, termination, truncation, info = env.last()
    reward_sums[agent] += reward
    env.step(1)
    
print("rewards:")
print(reward_sums)
print("total reward:")
print(sum(reward_sums.values()))


rewards:
{'Machine_0': 1675460.25, 'Machine_1': 4127737.0500000077, 'Machine_2': 3547264.9499999946, 'Machine_3': 1604004.75, 'Machine_4': 3371350.050000006, 'Machine_5': 3282498.4499999955}
total reward:
17608315.500000007


In [22]:
env.finished_job/env.total_job


0.5370744860128075

In [15]:
from ray.tune import register_env

import rllib_setup
def create_env(machine,jobs):
    env_name = "VJS"
    alg_name = "SAC"
    register_env(
        env_name,
        lambda config: rllib_setup.get_env_continuous(jobs, machine),
    )
    test_env = rllib_setup.get_env_continuous(jobs, machine)
    return test_env

test_env = create_env(12,60)

test_env.reset()

reward_sums = {a: 0.0 for a in test_env._agent_ids}


In [16]:
from ray.rllib.algorithms.algorithm import Algorithm
import ray

# ray.init()
algo = Algorithm.from_checkpoint("/home/yuan/ray_results/A3C/SAC_VJS_0c6b7_00000_0_2023-09-29_13-28-40/checkpoint_000700")


2023-10-04 15:50:32,128	INFO trainable.py:173 -- Trainable.setup took 11.342 seconds. If your trainable is slow to initialize, consider setting reuse_actors=True to reduce actor creation overheads.


In [24]:
algo.evaluate()


ValueError: Cannot evaluate w/o an evaluation worker set in the Trainer or w/o an env on the local worker!
Try one of the following:
1) Set `evaluation_interval` >= 0 to force creating a separate evaluation worker set.
2) Set `create_env_on_driver=True` to force the local (non-eval) worker to have an environment to evaluate on.

In [23]:
for agent in test_env.env.agent_iter():
    observation, reward, termination, truncation, info = test_env.env.last()
    print(test_env.env.last())
    reward_sums[agent] += reward
    if termination or truncation:
        action = None
    else:
        print(algo.get_policy(agent))
        policy = algo.get_policy(agent)
        batch_obs = {
            "obs": {
                "observation": observation,
                "action_mask": observation,
            }
        }
        batched_action, state_out, info = policy.compute_actions_from_input_dict(
            batch_obs
        )
        single_action = batched_action[0]
        action = single_action

    test_env.step(action)
    i += 1
    # env.render()

print("rewards:")
print(reward_sums)


(array([ 72., 144.,  72., 144.,  72., 144.,  72., 144.,  72., 144.,  72.,
       144.,  72., 144.,  72., 144.,  72., 144.,  72., 144.,  72., 144.,
        72., 144.,  72., 144.,  72., 144.,  72., 144.,  72., 144.,  72.,
       144.,  72., 144.,  72., 144.,  72., 144.,   0.,   0.,   1.,   0.,
         0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.,   0.,   0.,   1.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.,   0.,   0.]), 0, False, False, {})
SACTorchPolicy


RuntimeError: mat1 and mat2 shapes cannot be multiplied (73x2 and 73x256)

In [3]:
from environment import environment_jax
from environment import Environment
from pettingzoo.test import performance_benchmark



env = Environment.VehicleJobSchedulingEnvACE()
env.reset()
performance_benchmark(env)


Starting performance benchmark


11005.012010156304 turns per second
917.0843341796921 cycles per second
Finished performance benchmark
Starting performance benchmark
11391.133549607744 turns per second
949.2611291339787 cycles per second
Finished performance benchmark


In [4]:
import cProfile
env = environment_jax.VehicleJobSchedulingEnvACE()
env.reset()
cProfile.run("performance_benchmark(env)")


Starting performance benchmark
10990.468154029168 turns per second
915.8723461690973 cycles per second
Finished performance benchmark
Starting performance benchmark
7658.223707568317 turns per second
638.1853089640264 cycles per second
Finished performance benchmark
         3769624 function calls (3615914 primitive calls) in 5.004 seconds

   Ordered by: standard name

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
    36119    0.039    0.000    0.372    0.000 <__array_function__ internals>:177(all)
     8178    0.008    0.000    0.133    0.000 <__array_function__ internals>:177(argsort)
       33    0.000    0.000    0.000    0.000 <__array_function__ internals>:177(atleast_1d)
       33    0.000    0.000    0.001    0.000 <__array_function__ internals>:177(broadcast_arrays)
    38420    0.039    0.000    0.200    0.000 <__array_function__ internals>:177(concatenate)
      108    0.000    0.000    0.001    0.000 <__array_function__ internals>:177(copyto)
    

In [2]:
import jax.numpy as np
np.max(np.array([24, 100]))


Array(100, dtype=int32)