In [1]:
import numpy as np
from gym import spaces
from ray.rllib.env.multi_agent_env import MultiAgentEnv
import ray
from ray import tune
from ray.rllib.algorithms.ppo import PPO
from ray.tune.registry import register_env
from ray.tune.registry import _global_registry, ENV_CREATOR
# Shut Ray down if it is currently running; this cell itself never starts a Ray instance.
ray.shutdown() 

In [None]:
import numpy as np
from gym import spaces
from ray.rllib.env.multi_agent_env import MultiAgentEnv
import ray
from ray import tune
from ray.rllib.algorithms.ppo import PPO
from ray.tune.registry import register_env
from ray.tune.registry import _global_registry, ENV_CREATOR

# Environment definition
class MAEnvironment(MultiAgentEnv):
    """Event-triggered consensus environment for scalar-position agents.

    Every agent holds one scalar position and is linked to every other agent
    (complete graph).  At each step an agent chooses a binary action: ``1``
    means "trigger" (recompute its control input from the last broadcast
    positions and broadcast its own new position); any other value means
    "keep moving with the previously computed control input".  An episode
    lasts ``num_iterations`` steps and is never truncated early.

    Observation (per agent): the agent's own position followed by its
    neighbors' current positions in neighbor-list order, zero-padded to
    ``max_obs_size`` and stored as ``float32``.

    Rewards:
        * Before the final step, while all neighbor pairs are within
          ``epsilon`` of each other: ``20`` for action ``0``, ``-5`` otherwise.
        * Before the final step, otherwise: ``-20 - 5 * d`` for every agent,
          where ``d`` is the mean pairwise absolute position difference.
        * On the final step: ``5000 - time_to_reach_epsilon -
          total_trigger_count`` for every agent if the agents are within
          ``epsilon``, else ``-5000``.

    Args:
        num_agents: Number of agents; must be at least 1.
        num_iterations: Number of steps after which the episode terminates.
        dt: Integration step used when moving the agents.

    Raises:
        ValueError: If ``num_agents`` is smaller than 1.
    """

    # Start positions used when the environment has exactly this many agents.
    DEFAULT_INITIAL_POSITIONS = (0.55, 0.4, -0.05, -0.1, -0.7)

    def __init__(self, num_agents=5, num_iterations=200, dt=0.1):
        if num_agents < 1:
            raise ValueError("num_agents must be at least 1")
        super().__init__()  # run the parent-class initialisation
        self.num_agents = num_agents
        self.agents = ["agent_" + str(i) for i in range(num_agents)]
        self.agent_name_mapping = dict(zip(self.agents, list(range(num_agents))))
        self._agent_ids = set(self.agents)

        self.num_iterations = num_iterations
        self.dt = dt
        # Two neighbors count as "in consensus" when closer than this.
        self.epsilon = 0.005

        self._reset_state()
        self.max_obs_size = self.compute_max_obs_size()

    def _initial_positions(self):
        """Return one start position per agent as plain Python floats.

        The five-agent setup uses ``DEFAULT_INITIAL_POSITIONS`` unchanged.
        For any other agent count the positions are spread evenly between
        the first and last default value.
        """
        defaults = self.DEFAULT_INITIAL_POSITIONS
        if self.num_agents == len(defaults):
            return list(defaults)
        spread = np.linspace(defaults[0], defaults[-1], self.num_agents)
        return [float(value) for value in spread]

    def _reset_state(self):
        """(Re)create the agent objects and clear all per-episode bookkeeping."""
        self.agent_objs = [
            self.Agent(pos, i) for i, pos in enumerate(self._initial_positions())
        ]
        self.init_neighbors()
        self.current_iteration = 0
        # Step index at which the agents last entered the epsilon band, or
        # None while at least one neighbor pair is outside it.
        self.time_to_reach_epsilon = None
        self.epsilon_violated = True
        self.all_within_epsilon = False
        self.total_trigger_count = 0
        # Number of times the agents have entered the epsilon band.
        self.time_to_reach_epsilon_changes = 0

    def compute_max_obs_size(self):
        """Return the observation length: own position plus the largest neighbor count."""
        max_neighbors = max(len(agent.neighbors) for agent in self.agent_objs)
        return 1 + max_neighbors

    def init_neighbors(self):
        """Link every pair of agents once, producing a complete graph."""
        for i, agent in enumerate(self.agent_objs):
            for other in self.agent_objs[i + 1:]:
                agent.add_neighbor(other)

    def reset(self, *, seed=None, options=None):
        """Start a new episode.

        Args:
            seed: If given, seeds NumPy's global random generator.  The
                environment dynamics themselves are deterministic.
            options: Accepted for API compatibility; not used.

        Returns:
            A tuple ``(observations, infos)`` of per-agent dictionaries.
        """
        if seed is not None:
            np.random.seed(seed)

        self._reset_state()

        observations = {agent: self.get_observation(agent) for agent in self.agents}
        infos = {agent: {} for agent in self.agents}  # empty per-agent info dicts
        return observations, infos

    def get_observation(self, agent):
        """Return the float32 observation vector for the agent named ``agent``."""
        agent_obj = self.agent_objs[self.agent_name_mapping[agent]]
        neighbors_positions = [neighbor.position for neighbor in agent_obj.neighbors]
        obs = np.array([agent_obj.position] + neighbors_positions, dtype=np.float32)

        # Pad short observations with zeros.  The padding must be float32 as
        # well, otherwise the concatenation is promoted to float64 and no
        # longer matches the declared observation space.
        if len(obs) < self.max_obs_size:
            padding = np.zeros(self.max_obs_size - len(obs), dtype=np.float32)
            obs = np.concatenate([obs, padding])

        # Observations are deliberately not clipped.
        return obs

    def compute_average_position_difference(self):
        """Return the mean absolute position difference over all agent pairs.

        Returns ``0.0`` when there are fewer than two agents.
        """
        positions = [agent.position for agent in self.agent_objs]
        differences = [
            abs(first - second)
            for i, first in enumerate(positions)
            for second in positions[i + 1:]
        ]
        if not differences:
            return 0.0
        return float(sum(differences) / len(differences))

    def step(self, action_dict):
        """Advance the environment by one step.

        Args:
            action_dict: Mapping from agent name to action.  Agents missing
                from the mapping are treated as action ``0`` (no trigger).

        Returns:
            A tuple ``(observations, rewards, terminateds, truncateds,
            infos)`` of per-agent dictionaries; ``terminateds`` and
            ``truncateds`` additionally carry the ``"__all__"`` key.
        """
        # Plain Python ints keep the trigger counter and the rewards derived
        # from it from turning into numpy scalars.
        triggers = [int(action_dict.get(agent, 0)) for agent in self.agents]
        self.total_trigger_count += sum(triggers)

        for agent_obj, trigger in zip(self.agent_objs, triggers):
            agent_obj.update_position(self.current_iteration, self.dt, trigger)

        self.all_within_epsilon = all(
            abs(agent_obj.position - neighbor.position) < self.epsilon
            for agent_obj in self.agent_objs
            for neighbor in agent_obj.neighbors
        )

        if self.all_within_epsilon:
            # Record the step only on the transition into the epsilon band.
            if self.epsilon_violated:
                self.time_to_reach_epsilon = self.current_iteration
                self.epsilon_violated = False
                self.time_to_reach_epsilon_changes += 1
        else:
            self.epsilon_violated = True
            self.time_to_reach_epsilon = None

        self.current_iteration += 1
        terminated = self.current_iteration >= self.num_iterations

        if terminated:
            if self.time_to_reach_epsilon is not None:
                global_reward = float(
                    5000 - self.time_to_reach_epsilon - self.total_trigger_count
                )
            else:
                global_reward = -5000.0
            rewards = {agent: global_reward for agent in self.agents}
        elif self.time_to_reach_epsilon is not None:
            # In consensus: staying silent is rewarded, triggering is penalised.
            rewards = {
                agent: 20.0 if trigger == 0 else -5.0
                for agent, trigger in zip(self.agents, triggers)
            }
        else:
            penalty = -20.0 - 5.0 * self.compute_average_position_difference()
            rewards = {agent: penalty for agent in self.agents}

        observations = {agent: self.get_observation(agent) for agent in self.agents}
        terminateds = {agent: terminated for agent in self.agents}
        terminateds["__all__"] = terminated
        truncateds = {agent: False for agent in self.agents}  # never cut short
        truncateds["__all__"] = False
        infos = {agent: {} for agent in self.agents}

        return observations, rewards, terminateds, truncateds, infos

    def render(self, mode='human'):
        """Print the current position of every agent."""
        positions = [agent.position for agent in self.agent_objs]
        print(f"Positions: {positions}")

    # NOTE(review): the two space accessors are methods taking an agent name,
    # and the training script in this file calls them that way.  RLlib's
    # MultiAgentEnv appears to expect attributes instead - confirm against
    # the installed RLlib version before changing this.
    def observation_space(self, agent):
        """Return the unbounded float32 Box of length ``max_obs_size``."""
        return spaces.Box(low=-np.inf, high=np.inf, shape=(self.max_obs_size,), dtype=np.float32)

    def action_space(self, agent):
        """Return the binary action space (0 = stay silent, 1 = trigger)."""
        return spaces.Discrete(2)

    class Agent:
        """A single scalar-position agent with event-triggered control.

        Args:
            initial_position: Start position of the agent.
            index: Index of the agent inside the environment's agent list.
        """

        def __init__(self, initial_position, index):
            self.position = initial_position
            self.index = index
            self.neighbors = []
            # Position most recently broadcast to the neighbors.
            self.last_broadcast_position = self.position
            # History of (step, position) pairs recorded at every trigger.
            self.trigger_points = []
            # Control input held constant between triggers.
            self.u_i = 0

        def add_neighbor(self, neighbor):
            """Create an undirected link between this agent and ``neighbor``.

            Each direction is added at most once; linking an agent to itself
            is ignored.
            """
            if neighbor is self:
                return
            if neighbor not in self.neighbors:
                self.neighbors.append(neighbor)
            if self not in neighbor.neighbors:
                neighbor.neighbors.append(self)

        def update_position(self, t, dt, trigger):
            """Move the agent by one integration step.

            Args:
                t: Current step index; step 0 always counts as a trigger.
                dt: Integration step.
                trigger: ``1`` to recompute the control input and broadcast.
            """
            triggered = trigger == 1 or t == 0
            if triggered:
                # The control input uses broadcast positions only, including
                # this agent's own last broadcast, not its current position.
                self.u_i = -sum(
                    self.last_broadcast_position - neighbor.last_broadcast_position
                    for neighbor in self.neighbors
                )
            self.position += self.u_i * dt
            if triggered:
                self.last_broadcast_position = self.position
                self.trigger_points.append((t, self.position))

# Environment factory
def env_creator(config):
    """Build an ``MAEnvironment`` from an env-config mapping.

    Args:
        config: Mapping that may contain ``"num_agents"``; 5 is used when
            the key is absent.

    Returns:
        A new ``MAEnvironment`` instance.
    """
    agent_count = config.get("num_agents", 5)
    return MAEnvironment(num_agents=agent_count)

# Register the environment under the name "env".  The env_creator factory
# defined above is reused so the creation logic lives in one place only.
register_env("env", env_creator)
print("环境注册成功")


# Policy mapping: every agent uses the same shared policy
def shared_policy_mapping_fn(agent_id, *args, **kwargs):
    """Return the policy id for ``agent_id``.

    The agent id and all additional arguments are ignored; the result is
    always ``"shared_policy"``.
    """
    policy_id = "shared_policy"
    return policy_id

# Start Ray; shut it down first in case an instance is already running.
ray.shutdown()
ray.init()

# Environment settings shared by the training config and the probe env below.
env_config = {
    "num_agents": 5,
}

# Build a single throwaway environment just to read the per-agent spaces,
# instead of constructing one environment per space.
_probe_env = env_creator(env_config)

# Training configuration
config = {
    "env": "env",  # name the environment was registered under
    "env_config": env_config,  # forwarded to the environment factory
    "multiagent": {
        "policies": {
            "shared_policy": (
                None,  # use the default policy class
                _probe_env.observation_space("agent_0"),  # observation space
                _probe_env.action_space("agent_0"),  # action space
                {},
            ),
        },
        "policy_mapping_fn": shared_policy_mapping_fn,  # all agents share one policy
    },
    "framework": "torch",  # "torch" or "tf"
    "num_workers": 1,  # number of rollout workers
    "train_batch_size": 200,
    "sgd_minibatch_size": 64,
    "lr": 0.0003,
    "num_sgd_iter": 10,
}

# Train with PPO.
print("开始训练")
try:
    tune.run(PPO, config=config, stop={"training_iteration": 200})
except Exception as e:
    # Only the summary is printed here; Tune writes each trial's full
    # traceback to the error.txt path shown in its status table.
    print(f"训练出错: {e}")
finally:
    # Always release Ray, including when the run is interrupted.
    ray.shutdown()

环境注册成功


2024-11-17 21:27:45,898	INFO worker.py:1752 -- Started a local Ray instance.
2024-11-17 21:27:46,546	INFO tune.py:613 -- [output] This uses the legacy output and progress reporter, as Jupyter notebooks are not supported by the new engine, yet. For more information, please see https://github.com/ray-project/ray/issues/36949


开始训练


0,1
Current time:,2024-11-17 21:27:55
Running for:,00:00:08.83
Memory:,12.0/16.0 GiB

Trial name,# failures,error file
PPO_env_bbb69_00000,1,/tmp/ray/session_2024-11-17_21-27-44_258649_70807/artifacts/2024-11-17_21-27-46/PPO_2024-11-17_21-27-46/driver_artifacts/PPO_env_bbb69_00000_0_2024-11-17_21-27-46/error.txt

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_env_bbb69_00000,ERROR,127.0.0.1:71216,1,0.503247,200,26943,26943,26943,200


[36m(PPO pid=71216)[0m Install gputil for GPU system monitoring.


Trial name,agent_timesteps_total,connector_metrics,counters,custom_metrics,episode_len_mean,episode_media,episode_reward_max,episode_reward_mean,episode_reward_min,episodes_this_iter,info,num_agent_steps_sampled,num_agent_steps_trained,num_env_steps_sampled,num_env_steps_sampled_this_iter,num_env_steps_sampled_throughput_per_sec,num_env_steps_trained,num_env_steps_trained_this_iter,num_env_steps_trained_throughput_per_sec,num_faulty_episodes,num_healthy_workers,num_in_flight_async_reqs,num_remote_worker_restarts,num_steps_trained_this_iter,perf,policy_reward_max,policy_reward_mean,policy_reward_min,sampler_perf,sampler_results,timers
PPO_env_bbb69_00000,1000,"{'ObsPreprocessorConnector_ms': 0.004267692565917969, 'StateBufferConnector_ms': 0.0036954879760742188, 'ViewRequirementAgentConnector_ms': 0.121307373046875}","{'num_env_steps_sampled': 200, 'num_env_steps_trained': 200, 'num_agent_steps_sampled': 1000, 'num_agent_steps_trained': 1000}",{},200,{},26943,26943,26943,1,"{'learner': {'shared_policy': {'learner_stats': {'allreduce_latency': 0.0, 'grad_gnorm': 0.21767616760917008, 'cur_kl_coeff': 0.19999999999999998, 'cur_lr': 0.0003, 'total_loss': 9.99299510717392, 'policy_loss': -0.010902766877552494, 'vf_loss': 10.0, 'vf_explained_var': 2.995133399963379e-07, 'kl': 0.01948941615937656, 'entropy': 0.6736863292753696, 'entropy_coeff': 0.0}, 'model': {}, 'custom_metrics': {}, 'num_agent_steps_trained': 62.5, 'num_grad_updates_lifetime': 80.5, 'diff_num_grad_updates_vs_sampler_policy': 79.5}}, 'num_env_steps_sampled': 200, 'num_env_steps_trained': 200, 'num_agent_steps_sampled': 1000, 'num_agent_steps_trained': 1000}",1000,1000,200,200,399.517,200,200,399.517,0,1,0,0,200,"{'cpu_util_percent': 0.0, 'ram_util_percent': 74.7}",{'shared_policy': 5528.591453473173},{'shared_policy': 5388.591453473173},{'shared_policy': 5253.591453473173},"{'mean_raw_obs_processing_ms': 0.3542117218473064, 'mean_inference_ms': 0.3190372713762729, 'mean_action_processing_ms': 0.12780303385720324, 'mean_env_wait_ms': 0.036243182509692745, 'mean_env_render_ms': 0.0}","{'episode_reward_max': 26942.957267365862, 'episode_reward_min': 26942.957267365862, 'episode_reward_mean': 26942.957267365862, 'episode_len_mean': 200.0, 'episode_media': {}, 'episodes_this_iter': 1, 'policy_reward_min': {'shared_policy': 5253.591453473173}, 'policy_reward_max': {'shared_policy': 5528.591453473173}, 'policy_reward_mean': {'shared_policy': 5388.591453473173}, 'custom_metrics': {}, 'hist_stats': {'episode_reward': [26942.957267365862], 'episode_lengths': [200], 'policy_shared_policy_reward': [5528.591453473173, 5278.591453473173, 
5253.591453473173, 5503.591453473173, 5378.591453473173]}, 'sampler_perf': {'mean_raw_obs_processing_ms': 0.3542117218473064, 'mean_inference_ms': 0.3190372713762729, 'mean_action_processing_ms': 0.12780303385720324, 'mean_env_wait_ms': 0.036243182509692745, 'mean_env_render_ms': 0.0}, 'num_faulty_episodes': 0, 'connector_metrics': {'ObsPreprocessorConnector_ms': 0.004267692565917969, 'StateBufferConnector_ms': 0.0036954879760742188, 'ViewRequirementAgentConnector_ms': 0.121307373046875}}","{'training_iteration_time_ms': 500.555, 'sample_time_ms': 173.136, 'learn_time_ms': 325.906, 'learn_throughput': 613.674, 'synch_weights_time_ms': 0.967}"


2024-11-17 21:27:55,373	ERROR tune_controller.py:1332 -- Trial task failed for trial PPO_env_bbb69_00000
Traceback (most recent call last):
  File "/Users/cyj/anaconda3/envs/py38/lib/python3.8/site-packages/ray/air/execution/_internal/event_manager.py", line 110, in resolve_future
    result = ray.get(future)
  File "/Users/cyj/anaconda3/envs/py38/lib/python3.8/site-packages/ray/_private/auto_init_hook.py", line 21, in auto_init_wrapper
    return fn(*args, **kwargs)
  File "/Users/cyj/anaconda3/envs/py38/lib/python3.8/site-packages/ray/_private/client_mode_hook.py", line 103, in wrapper
    return func(*args, **kwargs)
  File "/Users/cyj/anaconda3/envs/py38/lib/python3.8/site-packages/ray/_private/worker.py", line 2667, in get
    values, debugger_breakpoint = worker.get_objects(object_refs, timeout=timeout)
  File "/Users/cyj/anaconda3/envs/py38/lib/python3.8/site-packages/ray/_private/worker.py", line 864, in get_objects
    raise value.as_instanceof_cause()
ray.exceptions.RayTaskEr

训练出错: ('Trials did not complete', [PPO_env_bbb69_00000])
