In [None]:
"""This is a minimal example of using Tianshou with MARL to train agents.

Author: Will (https://github.com/WillDudley)

Python version used: 3.8.10

Requirements:
pettingzoo == 1.22.0
git+https://github.com/thu-ml/tianshou
"""

import os
from typing import Optional, Tuple

import gymnasium as gym
import numpy as np
import torch
from tianshou.data import Collector, VectorReplayBuffer
from tianshou.env import DummyVectorEnv
from tianshou.env.pettingzoo_env import PettingZooEnv
from tianshou.policy import BasePolicy, DQNPolicy, MultiAgentPolicyManager, RandomPolicy
from tianshou.trainer import offpolicy_trainer
from tianshou.utils.net.common import Net

from pettingzoo.classic import tictactoe_v3


def _get_agents(
    agent_learn: Optional[BasePolicy] = None,
    agent_opponent: Optional[BasePolicy] = None,
    optim: Optional[torch.optim.Optimizer] = None,
) -> Tuple[BasePolicy, torch.optim.Optimizer, list]:
    env = _get_env()
    observation_space = (
        env.observation_space["observation"]
        if isinstance(env.observation_space, gym.spaces.Dict)
        else env.observation_space
    )
    if agent_learn is None:
        # model
        net = Net(
            state_shape=observation_space.shape or observation_space.n,
            action_shape=env.action_space.shape or env.action_space.n,
            hidden_sizes=[128, 128, 128, 128],
            device="cuda" if torch.cuda.is_available() else "cpu",
        ).to("cuda" if torch.cuda.is_available() else "cpu")
        if optim is None:
            optim = torch.optim.Adam(net.parameters(), lr=1e-4)
        agent_learn = DQNPolicy(
            model=net,
            optim=optim,
            discount_factor=0.9,
            estimation_step=3,
            target_update_freq=320,
        )

    if agent_opponent is None:
        agent_opponent = RandomPolicy()

    agents = [agent_opponent, agent_learn]
    policy = MultiAgentPolicyManager(agents, env)
    return policy, optim, env.agents


def _get_env():
    """This function is needed to provide callables for DummyVectorEnv."""
    return PettingZooEnv(tictactoe_v3.env())


if __name__ == "__main__":
    # ======== Step 1: Environment setup =========
    train_envs = DummyVectorEnv([_get_env for _ in range(10)])
    test_envs = DummyVectorEnv([_get_env for _ in range(10)])

    # seed
    seed = 1
    np.random.seed(seed)
    torch.manual_seed(seed)
    train_envs.seed(seed)
    test_envs.seed(seed)

    # ======== Step 2: Agent setup =========
    policy, optim, agents = _get_agents()

    # ======== Step 3: Collector setup =========
    train_collector = Collector(
        policy,
        train_envs,
        VectorReplayBuffer(20_000, len(train_envs)),
        exploration_noise=True,
    )
    test_collector = Collector(policy, test_envs, exploration_noise=True)
    # policy.set_eps(1)
    train_collector.collect(n_step=64 * 10)  # batch size * training_num

    # ======== Step 4: Callback functions setup =========
    def save_best_fn(policy):
        model_save_path = os.path.join("log", "rps", "dqn", "policy.pth")
        os.makedirs(os.path.join("log", "rps", "dqn"), exist_ok=True)
        torch.save(policy.policies[agents[1]].state_dict(), model_save_path)

    def stop_fn(mean_rewards):
        return mean_rewards >= 0.6

    def train_fn(epoch, env_step):
        policy.policies[agents[1]].set_eps(0.1)

    def test_fn(epoch, env_step):
        policy.policies[agents[1]].set_eps(0.05)

    def reward_metric(rews):
        return rews[:, 1]

    # ======== Step 5: Run the trainer =========
    result = offpolicy_trainer(
        policy=policy,
        train_collector=train_collector,
        test_collector=test_collector,
        max_epoch=50,
        step_per_epoch=1000,
        step_per_collect=50,
        episode_per_test=10,
        batch_size=64,
        train_fn=train_fn,
        test_fn=test_fn,
        stop_fn=stop_fn,
        save_best_fn=save_best_fn,
        update_per_step=0.1,
        test_in_train=False,
        reward_metric=reward_metric,
    )

    # return result, policy.policies[agents[1]]
    print(f"\n==========Result==========\n{result}")
    print("\n(the trained policy can be accessed via policy.policies[agents[1]])")

In [None]:
from environment import Environment

env = Environment.VehicleJobSchedulingEnvACE()
env.reset()
for agent in env.agent_iter(10000):
    env.step(env.action_space(agent).sample())

In [None]:
import jax.numpy as jnp
from jax import grad, jit, vmap
from jax import random

In [None]:
key = random.PRNGKey(0)
x = random.normal(key, (10,))
print(x)

In [None]:
size = 3000
x = random.normal(key, (size, size), dtype=jnp.float32)
%timeit jnp.dot(x, x.T).block_until_ready()  # runs on the GPU

In [None]:
def selu(x, alpha=1.67, lmbda=1.05):
  return lmbda * jnp.where(x > 0, x, alpha * jnp.exp(x) - alpha)

x = random.normal(key, (1000000,))
%timeit selu(x).block_until_ready()

In [None]:
selu_jit = jit(selu)
%timeit selu_jit(x).block_until_ready()

In [2]:
from environment import environment_jax

env = environment_jax.VehicleJobSchedulingEnvACE()

AttributeError: module 'jax.numpy' has no attribute 'random'

In [None]:
from environment import Environment

env = Environment.VehicleJobSchedulingEnvACE()
env.reset()


In [None]:
env.parameters.cluster.machines[1].observe()

In [7]:
import cProfile
from pettingzoo.test import performance_benchmark
from environment import Environment
for i in range(25,40,5):
    for j in range(10,100,20):
        print("duration: ", i, " average_per_slot: ", j)
        para = Environment.VehicleJobSchedulingParameters(duration=i,average_per_slot=j)
        env = Environment.VehicleJobSchedulingEnvACE()
        performance_benchmark(env)


duration:  25  average_per_slot:  10
Starting performance benchmark
12679.175816391342 turns per second
1056.5979846992784 cycles per second
Finished performance benchmark
duration:  25  average_per_slot:  30
Starting performance benchmark
12731.171350055083 turns per second
1060.9309458379237 cycles per second
Finished performance benchmark
duration:  25  average_per_slot:  50
Starting performance benchmark
12692.710303681068 turns per second
1057.725858640089 cycles per second
Finished performance benchmark
duration:  25  average_per_slot:  70
Starting performance benchmark
12447.760684349141 turns per second
1037.3133903624284 cycles per second
Finished performance benchmark
duration:  25  average_per_slot:  90
Starting performance benchmark
12489.932789005727 turns per second
1040.8277324171438 cycles per second
Finished performance benchmark
duration:  30  average_per_slot:  10
Starting performance benchmark
12766.805595751817 turns per second
1063.9004663126514 cycles per second


In [3]:
import cProfile
from tkinter import E
from pettingzoo.test import performance_benchmark, api_test
from environment import Environment

for i in range(10,100,10):
    para = Environment.VehicleJobSchedulingParameters(average_per_slot=i, duration=30)
    env = Environment.VehicleJobSchedulingEnvACE(parameter=para)
    env.reset()
    for agent in env.agent_iter(100000):
        env.step(env.action_space(agent).sample())   
    print("Finish Rate: ",env.finished_job/env.total_job)

Finish Rate:  0.8738916256157635
Finish Rate:  0.837245696400626
Finish Rate:  0.7435723951285521
Finish Rate:  0.6604767879548307
Finish Rate:  0.5661592505854801
Finish Rate:  0.5196389771017884
Finish Rate:  0.4667235494880546
Finish Rate:  0.43742098609355246
Finish Rate:  0.3821138211382114


In [None]:
for agent in env.agent_iter(10):
    action = 1
    env.step(action)
    print(action)

In [None]:
print("env job finished rate", env.finished_job/env.total_job)

In [None]:
from environment import Environment, AllocationMechanism, Machine

env = Environment.VehicleJobSchedulingEnvACE()
for job in env.get_job_next_step():
    job1 = job
    break
cluster = env.parameters.cluster

fp = AllocationMechanism.FirstPrice()
bids = Machine.Bids(cluster,job1)

In [None]:
bids.request_bids()
bids.get_bids()
bids.bids

In [None]:
from tianshou.data import Collector
from tianshou.env import DummyVectorEnv
from tianshou.policy import MultiAgentPolicyManager, RandomPolicy

import tianshou_setup

env = tianshou_setup.get_env()
policies = MultiAgentPolicyManager(
    [
        RandomPolicy(
            observation_space=env.observation_space, action_space=env.action_space
        )
        for _ in range(len(env.agents))
    ],
    env,
)

In [None]:
pol = RandomPolicy(
            observation_space=env.observation_space, action_space=env.action_space
        )

In [None]:
pol.forward()

In [None]:
import time

from tianshou.data import Collector
from tianshou.env import DummyVectorEnv, SubprocVectorEnv
from tianshou.policy import MultiAgentPolicyManager

import tianshou_setup
from static_policy.TruthfulPolicy import TruthfulPolicy

env = tianshou_setup.get_env_continous()


def get_DummyVectorEnv_n(n):
    return DummyVectorEnv([lambda: env for _ in range(n)])


def get_SubprocVectorEnv_n(n):
    return SubprocVectorEnv([lambda: env for _ in range(n)])


def get_policy_manager():
    env = tianshou_setup.get_env_continous()
    policies = MultiAgentPolicyManager(
        [
            TruthfulPolicy(
                observation_space=env.observation_space, action_space=env.action_space
            )
            for _ in range(len(env.agents))
        ],
        env,
    )
    return policies


In [None]:
n = 10
env = get_DummyVectorEnv_n(n)
policies = get_policy_manager()
collector = Collector(policies, env)


In [None]:
# Execute one episode
start = time.time()
result = collector.collect(n_episode=10)
end = time.time()
print(f"Time elapsed: {end-start}")

In [None]:
tp = policies.policies['Machine_0']