In [2]:
import argparse
import os
import pprint

import gymnasium as gym
import numpy as np
import torch
from torch.utils.tensorboard import SummaryWriter

from tianshou.data import Collector, VectorReplayBuffer
from tianshou.env import DummyVectorEnv
from tianshou.exploration import GaussianNoise
from tianshou.policy import DDPGPolicy
from tianshou.trainer import offpolicy_trainer
from tianshou.utils import TensorboardLogger
from tianshou.utils.net.common import Net
from tianshou.utils.net.continuous import Actor, Critic


def get_args():
    """Parse the hyper-parameters of the DDPG example from the command line.

    Every option has a default, so calling this without any command-line
    flags yields a fully populated namespace. Arguments that are not
    declared here are ignored rather than rejected (``parse_known_args``).

    Returns:
        argparse.Namespace: the parsed options, with dashes in the flag
        names turned into underscores (e.g. ``--buffer-size`` ->
        ``buffer_size``).
    """
    fallback_device = 'cuda' if torch.cuda.is_available() else 'cpu'

    # (flag, keyword arguments for ``add_argument``), in declaration order.
    option_table = (
        ('--task', dict(type=str, default='Pendulum-v1')),
        ('--reward-threshold', dict(type=float, default=None)),
        ('--seed', dict(type=int, default=0)),
        ('--buffer-size', dict(type=int, default=20000)),
        ('--actor-lr', dict(type=float, default=1e-4)),
        ('--critic-lr', dict(type=float, default=1e-3)),
        ('--gamma', dict(type=float, default=0.99)),
        ('--tau', dict(type=float, default=0.005)),
        ('--exploration-noise', dict(type=float, default=0.1)),
        ('--epoch', dict(type=int, default=5)),
        ('--step-per-epoch', dict(type=int, default=20000)),
        ('--step-per-collect', dict(type=int, default=8)),
        ('--update-per-step', dict(type=float, default=0.125)),
        ('--batch-size', dict(type=int, default=128)),
        ('--hidden-sizes', dict(type=int, nargs='*', default=[128, 128])),
        ('--training-num', dict(type=int, default=8)),
        ('--test-num', dict(type=int, default=100)),
        ('--logdir', dict(type=str, default='log')),
        ('--render', dict(type=float, default=0.)),
        ('--rew-norm', dict(action="store_true", default=False)),
        ('--n-step', dict(type=int, default=3)),
        ('--device', dict(type=str, default=fallback_device)),
    )

    parser = argparse.ArgumentParser()
    for flag, spec in option_table:
        parser.add_argument(flag, **spec)

    # parse_known_args returns (namespace, leftover_argv); only the
    # namespace is of interest here.
    namespace, _leftover = parser.parse_known_args()
    return namespace


def test_ddpg(args=None):
    """Train DDPG on ``args.task`` and assert it reaches the reward threshold.

    The function builds the training and test vector environments, the
    actor/critic networks and the ``DDPGPolicy``, runs ``offpolicy_trainer``
    and finally asserts that the best test reward satisfies ``stop_fn``.
    When executed as ``__main__`` it also prints the training result and
    rolls out one episode with the trained policy.

    Args:
        args: Hyper-parameter namespace as returned by ``get_args()``. When
            omitted, a fresh namespace is created for this call. The
            namespace is updated in place with ``state_shape``,
            ``action_shape``, ``max_action`` and, if it was ``None``,
            ``reward_threshold``.

    Raises:
        AssertionError: if the best test reward is below
            ``args.reward_threshold``.
    """
    # Resolve the default at call time. Using ``get_args()`` as the default
    # value in the signature would evaluate it once, at definition time, and
    # the namespace mutated below would then be shared by every later call.
    if args is None:
        args = get_args()

    env = gym.make(args.task)
    args.state_shape = env.observation_space.shape or env.observation_space.n
    args.action_shape = env.action_space.shape or env.action_space.n
    args.max_action = env.action_space.high[0]
    if args.reward_threshold is None:
        # Known thresholds for the Pendulum tasks; any other task falls back
        # to the threshold declared in the environment spec.
        default_reward_threshold = {"Pendulum-v0": -250, "Pendulum-v1": -250}
        args.reward_threshold = default_reward_threshold.get(
            args.task, env.spec.reward_threshold
        )
    # you can also use tianshou.env.SubprocVectorEnv
    train_envs = DummyVectorEnv(
        [lambda: gym.make(args.task) for _ in range(args.training_num)]
    )
    test_envs = DummyVectorEnv(
        [lambda: gym.make(args.task) for _ in range(args.test_num)]
    )
    # seed
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    train_envs.seed(args.seed)
    test_envs.seed(args.seed)
    # model
    net = Net(args.state_shape, hidden_sizes=args.hidden_sizes, device=args.device)
    actor = Actor(
        net, args.action_shape, max_action=args.max_action, device=args.device
    ).to(args.device)
    actor_optim = torch.optim.Adam(actor.parameters(), lr=args.actor_lr)
    # The critic gets its own backbone; ``concat=True`` together with the
    # action shape makes it take the observation and the action as input.
    net = Net(
        args.state_shape,
        args.action_shape,
        hidden_sizes=args.hidden_sizes,
        concat=True,
        device=args.device
    )
    critic = Critic(net, device=args.device).to(args.device)
    critic_optim = torch.optim.Adam(critic.parameters(), lr=args.critic_lr)
    policy = DDPGPolicy(
        actor,
        actor_optim,
        critic,
        critic_optim,
        tau=args.tau,
        gamma=args.gamma,
        exploration_noise=GaussianNoise(sigma=args.exploration_noise),
        reward_normalization=args.rew_norm,
        estimation_step=args.n_step,
        action_space=env.action_space
    )
    # collector
    train_collector = Collector(
        policy,
        train_envs,
        VectorReplayBuffer(args.buffer_size, len(train_envs)),
        exploration_noise=True
    )
    test_collector = Collector(policy, test_envs)
    # log
    log_path = os.path.join(args.logdir, args.task, 'ddpg')
    writer = SummaryWriter(log_path)
    logger = TensorboardLogger(writer)

    def save_best_fn(policy):
        """Write the policy weights to ``policy.pth`` under ``log_path``."""
        torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth'))

    def stop_fn(mean_rewards):
        """Return True once the mean reward reaches the reward threshold."""
        return mean_rewards >= args.reward_threshold

    # trainer
    result = offpolicy_trainer(
        policy,
        train_collector,
        test_collector,
        args.epoch,
        args.step_per_epoch,
        args.step_per_collect,
        args.test_num,
        args.batch_size,
        update_per_step=args.update_per_step,
        stop_fn=stop_fn,
        save_best_fn=save_best_fn,
        logger=logger
    )
    assert stop_fn(result['best_reward'])

    if __name__ == '__main__':
        pprint.pprint(result)
        # Let's watch its performance!
        env = gym.make(args.task)
        policy.eval()
        collector = Collector(policy, env)
        # Separate name so the training result above is not overwritten.
        watch_result = collector.collect(n_episode=1, render=args.render)
        rews, lens = watch_result["rews"], watch_result["lens"]
        print(f"Final reward: {rews.mean()}, length: {lens.mean()}")



In [3]:
# Run the DDPG training with the default hyper-parameters from get_args().
test_ddpg()

Epoch #1: 20001it [00:34, 571.69it/s, env_step=20000, len=200, loss/actor=129.021, loss/critic=11.332, n/ep=0, n/st=8, rew=-557.07]                            


Epoch #1: test_reward: -697.020395 ± 55.156102, best_reward: -697.020395 ± 55.156102 in #1


Epoch #2:  52%|#####2    | 10400/20000 [00:16<00:15, 612.36it/s, env_step=30400, len=200, n/ep=8, n/st=8, rew=-180.24]                                        

{'best_result': '-149.28 ± 80.51',
 'best_reward': -149.27721713256952,
 'duration': '56.71s',
 'test_episode': 300,
 'test_speed': '9511.39 step/s',
 'test_step': 60000,
 'test_time': '6.31s',
 'train_episode': 152,
 'train_speed': '603.14 step/s',
 'train_step': 30400,
 'train_time/collector': '8.64s',
 'train_time/model': '41.77s'}
Final reward: -128.97530423652887, length: 200.0





In [1]:
import torch
import numpy as np
from torch import nn
import gym
import tianshou as ts
import Environment
from tianshou.env import (
    ContinuousToDiscrete,
    DummyVectorEnv,
    PettingZooEnv,
    ShmemVectorEnv,
    SubprocVectorEnv,
)
from pettingzoo.utils.wrappers import BaseWrapper
# define a multi-agent environment

# import the predefined networks from tianshou
from tianshou.utils.net.common import ActorCritic, Net

In [2]:

def get_env():
    """Build a fresh, wrapped instance of the scheduling environment.

    Returns:
        PettingZooEnv: an ``Environment.VehicleJobSchedulingEnvACE`` wrapped
        first in pettingzoo's ``BaseWrapper`` and then in tianshou's
        ``PettingZooEnv``.
    """
    return PettingZooEnv(BaseWrapper(Environment.VehicleJobSchedulingEnvACE()))
# Reference environment instance; the values read from it below size the
# networks defined in the following cells.
env = get_env()
num_agents = env.num_agents
# Keep only the first entry of the observation-space shape.
obs_shape,*_ = env.observation_space.shape
# `.n` of the action space -- presumably a Discrete space; TODO confirm.
act_shape = env.action_space.n 



In [6]:
# define a policy network and a critic network for each agent
# (for a multi-discrete action space, use output_size=act_shape.sum() here)
policy_nets = [Net(obs_shape, act_shape, hidden_sizes=[64], ) for _ in range(num_agents)]
# Each critic net is sized for (obs_shape + act_shape) * num_agents inputs and
# has a single output (action_shape=1).
# (for a multi-discrete action space, use act_shape.sum() instead of act_shape)
# NOTE(review): this assumes the critic receives the observations and actions
# of all agents at once -- confirm that the policy manager actually feeds it
# that way.
critic_nets = [Net(obs_shape * num_agents + act_shape * num_agents, hidden_sizes=[64], action_shape=1) for _ in range(num_agents)]

# define a policy and a critic for each agent using tianshou and the predefined networks
# NOTE(review): the DDPG example earlier in this file wraps its networks in
# Actor / Critic before handing them to DDPGPolicy; here bare Net instances
# are passed directly -- confirm this is intended.
# NOTE(review): act_shape comes from `action_space.n`, while the DDPG example
# earlier in this file reads `action_space.high` -- confirm DDPG suits this
# action space.
policies = []
for i in range(num_agents):
    # use different optimizers and parameters for policy and critic networks
    optim_p = torch.optim.Adam(policy_nets[i].parameters(), lr=1e-4)
    optim_c = torch.optim.Adam(critic_nets[i].parameters(), lr=1e-3)
    # use DDPGPolicy
    policy = ts.policy.DDPGPolicy(actor=policy_nets[i], actor_optim=optim_p,critic=critic_nets[i], critic_optim=optim_c,gamma=0.95,tau=0.01)
    policies.append(policy)
# Combine the per-agent policies into a single multi-agent policy for `env`.
maddpg = ts.policy.MultiAgentPolicyManager(policies,env)
# (the replay buffer that stores the transitions is defined in the next cell)


In [7]:
# Number of parallel environments used for both training and testing.
NUM_ENVS = 10

# Pass the factory itself so that every worker constructs its own
# environment. The previous `lambda: env` made each factory return the single
# module-level instance that was built ahead of time.
train_env = SubprocVectorEnv([get_env for _ in range(NUM_ENVS)])
test_env = SubprocVectorEnv([get_env for _ in range(NUM_ENVS)])

# One sub-buffer per training environment; derive the count from the vector
# env so the two cannot drift apart.
buffer = ts.data.VectorReplayBuffer(total_size=100000, buffer_num=len(train_env))

# define a collector to interact with the environment and collect data
train_collector = ts.data.Collector(maddpg, train_env, buffer)
test_collector = ts.data.Collector(maddpg, test_env)


In [10]:
# Collect 10 episodes with `random=True` into the collector's buffer
# (presumably to warm up the replay buffer before training -- confirm).
train_collector.collect(n_episode=10, random=True)

{'n/ep': 10,
 'n/st': 4412,
 'rews': array([[  50.13333333, 1038.93333333,  711.26666667,  611.15555556,
          284.53333333,  449.84444444,  449.97777778,  197.31111111,
          132.02222222,   69.13333333],
        [  10.66666667,   53.66666667,  123.06666667,  383.53333333,
          295.26666667,   84.        ,  861.73333333,  256.28888889,
          401.28888889,  273.2       ],
        [ 122.22222222,  498.        ,  670.68888889,  261.68888889,
           88.84444444,  352.73333333,  292.48888889,  141.82222222,
           48.13333333, 1352.13333333],
        [ 310.88888889,  391.15555556,  509.6       ,  338.26666667,
          189.8       ,  988.        ,  259.84444444,   43.24444444,
          512.48888889,    0.        ],
        [  14.31111111, 2008.22222222,  408.95555556,  499.42222222,
          282.82222222,  853.08888889,  136.48888889,    0.        ,
          159.75555556,  224.33333333],
        [  43.55555556,  628.24444444,  666.84444444,  234.4       ,
     

In [4]:



# Run off-policy training of the multi-agent policy.
# `offpolicy_trainer` returns the training result (the DDPG example earlier in
# this file indexes it like a dict), so `trainer` holds that result.
trainer = ts.trainer.offpolicy_trainer(
    # what is trained and where its data comes from
    policy=maddpg,
    train_collector=train_collector,
    test_collector=test_collector,
    buffer=buffer,
    # schedule
    max_epoch=5000,
    step_per_epoch=10000,
    step_per_collect=500,
    update_per_step=0.1,
    batch_size=64,
    # evaluation
    episode_per_test=10,
    test_in_train=False,
)


NameError: name 'ts' is not defined