In [1]:
import numpy as np
from gym import spaces
from ray.rllib.env.multi_agent_env import MultiAgentEnv
import ray
from ray import tune
from ray.rllib.algorithms.ppo import PPO
from ray.tune.registry import register_env
from ray.tune.registry import _global_registry, ENV_CREATOR
ray.shutdown() 

In [None]:
import numpy as np
from gym import spaces
from ray.rllib.env.multi_agent_env import MultiAgentEnv
import ray
from ray import tune
from ray.rllib.algorithms.ppo import PPO
from ray.tune.registry import register_env
from ray.tune.registry import _global_registry, ENV_CREATOR
import os
from ray.rllib.algorithms.callbacks import DefaultCallbacks

# 定义环境类
class MAEnvironment(MultiAgentEnv):
    def __init__(self, num_agents=5, num_iterations=200, dt=0.1):
        super().__init__()  # 调用父类初始化
        self.num_agents = num_agents
        self.agents = ["agent_" + str(i) for i in range(num_agents)]
        self.agent_name_mapping = dict(zip(self.agents, list(range(num_agents))))
        self._agent_ids = set(self.agents)  # 添加 _agent_ids 属性

        # 初始化其他属性
        self.num_iterations = num_iterations
        self.dt = dt
        self.current_iteration = 0

        initial_positions = [0.55, 0.4, -0.05, -0.1, -0.7]
        self.agent_objs = [self.Agent(pos, i) for i, pos in enumerate(initial_positions)]
        self.init_neighbors()

        self.epsilon = 0.005
        self.time_to_reach_epsilon = None
        self.epsilon_violated = True
        self.all_within_epsilon = False
        self.total_trigger_count = 0
        self.time_to_reach_epsilon_changes = 0
        self.max_obs_size = self.compute_max_obs_size()
    
    def compute_max_obs_size(self):
        max_neighbors = max(len(agent.neighbors) for agent in self.agent_objs)
        return 1 + max_neighbors
    
    def init_neighbors(self):
        self.agent_objs[0].add_neighbor(self.agent_objs[1])
        self.agent_objs[0].add_neighbor(self.agent_objs[2])
        self.agent_objs[0].add_neighbor(self.agent_objs[3])
        self.agent_objs[0].add_neighbor(self.agent_objs[4])
        self.agent_objs[1].add_neighbor(self.agent_objs[2])
        self.agent_objs[1].add_neighbor(self.agent_objs[3])
        self.agent_objs[1].add_neighbor(self.agent_objs[4])
        self.agent_objs[2].add_neighbor(self.agent_objs[3])
        self.agent_objs[2].add_neighbor(self.agent_objs[4])
        self.agent_objs[3].add_neighbor(self.agent_objs[4])

    def reset(self, *, seed=None, options=None):
        if seed is not None:
            np.random.seed(seed)
        
        initial_positions = [0.55, 0.4, -0.05, -0.1, -0.7]
        self.agent_objs = [self.Agent(pos, i) for i, pos in enumerate(initial_positions)]
        self.init_neighbors()
        self.current_iteration = 0
        self.epsilon_violated = True
        self.all_within_epsilon = False
        self.total_trigger_count = 0
        self.time_to_reach_epsilon_changes = 0
        self.time_to_reach_epsilon = None
        
        observations = {agent: self.get_observation(agent) for agent in self.agents}
        infos = {agent: {} for agent in self.agents}  # 返回额外的 per-agent 信息字典
        return observations, infos

    # 统一的观测空间
    # def get_observation(self, agent):
    #     agent_index = self.agent_name_mapping[agent]
    #     agent_obj = self.agent_objs[agent_index]
    #     neighbors_positions = [neighbor.position for neighbor in agent_obj.neighbors]
    #     obs = np.array([agent_obj.position] + neighbors_positions, dtype=np.float32)
        
    #     # 填充观测到最大观测大小
    #     if len(obs) < self.max_obs_size:
    #         padding = np.zeros(self.max_obs_size - len(obs))
    #         obs = np.concatenate([obs, padding])
        
    #     # 不进行裁剪
    #     return obs

    # # 不同的观测空间
    def get_observation(self, agent):
        agent_index = self.agent_name_mapping[agent]
        agent_obj = self.agent_objs[agent_index]
        neighbors_positions = [neighbor.position for neighbor in agent_obj.neighbors]
        obs = np.array([agent_obj.position] + neighbors_positions, dtype=np.float32)
        return obs

    def compute_average_position_difference(self):
        total_difference = 0
        count = 0
        for i, agent_i in enumerate(self.agent_objs):
            for j, agent_j in enumerate(self.agent_objs):
                if i < j:
                    total_difference += abs(agent_i.position - agent_j.position)
                    count += 1
        if count > 0:
            return total_difference / count
        else:
            return 0
    
    def step(self, action_dict):
        triggers = np.array([action_dict.get(agent, 0) for agent in self.agents])  # 确保访问安全
        trigger_count = np.sum(triggers)
        self.total_trigger_count += trigger_count

        for i, agent in enumerate(self.agent_objs):
            agent.update_position(self.current_iteration, self.dt, triggers[i])

        self.all_within_epsilon = all(all(abs(agent.position - neighbor.position) < self.epsilon for neighbor in agent.neighbors) for agent in self.agent_objs)

        if self.all_within_epsilon:
            if self.epsilon_violated:
                self.time_to_reach_epsilon = self.current_iteration
                self.epsilon_violated = False
                self.time_to_reach_epsilon_changes += 1
        else:
            self.epsilon_violated = True
            self.time_to_reach_epsilon = None
        
        self.current_iteration += 1
        terminated = self.current_iteration >= self.num_iterations

        # 根据时间步调整奖励逻辑
        early_phase = self.current_iteration <= self.num_iterations * 0.25


        rewards = {}
        if not terminated:
            average_position_difference = self.compute_average_position_difference()
            # for agent in self.agents:
            #     if self.all_within_epsilon:
            #         rewards[agent] = 10 if action_dict.get(agent, 0) == 0 else 0  # 动作为0奖励，1惩罚
            #     else:
            #         rewards[agent] = - 10 * np.abs(average_position_difference)

            for agent in self.agents:
                if early_phase:
                    # 在前 25% 的时间步，奖励更加注重位置一致性，减少对触发的惩罚
                    if self.all_within_epsilon:
                        rewards[agent] = 2 if action_dict.get(agent, 0) == 0 else 1
                    else:
                        rewards[agent] = -1 -10 * np.abs(average_position_difference)
                else:
                    # 在后 75% 的时间步，更加注重减少触发次数
                    if self.all_within_epsilon:
                        rewards[agent] = 3 if action_dict.get(agent, 0) == 0 else -0.5
                    else:
                        rewards[agent] = -1.5 - 10 * np.abs(average_position_difference)
        else:
            if self.time_to_reach_epsilon is not None:
                global_reward = 1250 - self.time_to_reach_epsilon - self.total_trigger_count
            else:
                global_reward = -10000
            for agent in self.agents:
                rewards[agent] = global_reward

        observations = {agent: self.get_observation(agent) for agent in self.agents}
        terminateds = {agent: terminated for agent in self.agents}
        terminateds["__all__"] = terminated
        truncateds = {agent: False for agent in self.agents}  # 无需提前结束
        truncateds["__all__"] = False
        infos = {agent: {} for agent in self.agents}

        return observations, rewards, terminateds, truncateds, infos
    
    def render(self, mode='human'):
        positions = [agent.position for agent in self.agent_objs]
        print(f"Positions: {positions}")
    
    # 统一大小的观测空间
    # def observation_space(self, agent):
    #     return spaces.Box(low=-np.inf, high=np.inf, shape=(self.max_obs_size,), dtype=np.float32)
    
    def observation_space(self, agent):
        num_neighbors = len(self.agent_objs[self.agent_name_mapping[agent]].neighbors)
        obs_size = 1 + num_neighbors  # 自身位置 + 邻居数量
        return spaces.Box(low=-np.inf, high=np.inf, shape=(obs_size,), dtype=np.float32)
    
    def action_space(self, agent):
        return spaces.Discrete(2)

    class Agent:
        def __init__(self, initial_position, index):
            self.position = initial_position
            self.index = index
            self.neighbors = []
            self.last_broadcast_position = self.position
            self.trigger_points = []
            self.u_i = 0

        def add_neighbor(self, neighbor):
            if neighbor not in self.neighbors:
                self.neighbors.append(neighbor)
                neighbor.neighbors.append(self)

        def update_position(self, t, dt, trigger):
            if trigger == 1 or t == 0:
                self.u_i = -sum((self.last_broadcast_position - neighbor.last_broadcast_position) for neighbor in self.neighbors)
                self.position += self.u_i * dt
                self.last_broadcast_position = self.position
                self.trigger_points.append((t, self.position))
            else:
                self.position += self.u_i * dt

# 环境创建函数
def env_creator(config):
    return MAEnvironment(num_agents=config.get("num_agents", 5))

# 注册环境
register_env("env", lambda config: MAEnvironment(num_agents=config.get("num_agents", 5)))
print("环境注册成功")


# 定义共享策略的映射函数
def shared_policy_mapping_fn(agent_id, *args, **kwargs):
    return "shared_policy"

# 启动 Ray
ray.shutdown() 
ray.init(local_mode=False)

# 配置
config = {
    "env": "env",  # 使用注册的环境名
    "env_config": {
        "num_agents": 5,  # 传递环境的配置参数
    },
    "multiagent": {
        "policies": {
            "shared_policy": (None,  # 使用默认模型
                              env_creator({"num_agents": 5}).observation_space("agent_0"),  # 观测空间
                              env_creator({"num_agents": 5}).action_space("agent_0"),  # 动作空间
                              {}),
        },
        "policy_mapping_fn": shared_policy_mapping_fn,  # 使用共享策略映射
    },
    "framework": "torch",  # 使用 "torch" 或 "tf"
    "num_workers": 4,  # 使用的工作线程数
    "num_envs_per_worker": 5,
    "train_batch_size": 4000, #每次训练时使用的总样本数
    "sgd_minibatch_size": 256,
    "lr": 0.0003,
    "num_sgd_iter": 20,
}

print("开始训练")

class SaveOnMaxRewardCallback(DefaultCallbacks):
    def __init__(self, reward_threshold=5000, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.reward_threshold = reward_threshold  # 奖励阈值
        self.saved_checkpoint = False  # 确保只保存一次模型

    def on_train_result(self, *, algorithm, result, **kwargs):
        """在每次训练结束时调用，检查最大奖励并保存模型。"""
        max_reward = result["episode_reward_max"]  # 获取当前最大奖励

        if max_reward >= self.reward_threshold and not self.saved_checkpoint:
            checkpoint_dir = algorithm.save()  # 保存模型
            print(f"模型已保存，路径为：{checkpoint_dir}，最大奖励：{max_reward}")
            self.saved_checkpoint = True  # 确保只保存一次

# 更新配置
config.update({
    "callbacks": SaveOnMaxRewardCallback,  # 设置自定义回调
})



# 运行训练并保存模型
analysis = tune.run(
    PPO,
    config=config,
    stop={"training_iteration": 50},
    local_dir="/Users/cyj/Documents/Project/Python/Multi-agent-consensus-algorithm/MARL/ray/tensorboard_logs",
    checkpoint_at_end=True,
    checkpoint_freq=5
)


# 关闭 Ray
ray.shutdown()

环境注册成功


2024-11-23 20:29:59,184	INFO worker.py:1752 -- Started a local Ray instance.
2024-11-23 20:29:59,744	INFO tune.py:613 -- [output] This uses the legacy output and progress reporter, as Jupyter notebooks are not supported by the new engine, yet. For more information, please see https://github.com/ray-project/ray/issues/36949


开始训练


0,1
Current time:,2024-11-23 20:34:01
Running for:,00:04:02.01
Memory:,12.2/16.0 GiB

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_env_a7d11_00000,TERMINATED,127.0.0.1:80966,50,231.699,200000,-6199.74,4957.1,-51819.2,200


[36m(PPO pid=80966)[0m Install gputil for GPU system monitoring.


Trial name,agent_timesteps_total,connector_metrics,counters,custom_metrics,episode_len_mean,episode_media,episode_reward_max,episode_reward_mean,episode_reward_min,episodes_this_iter,info,num_agent_steps_sampled,num_agent_steps_trained,num_env_steps_sampled,num_env_steps_sampled_this_iter,num_env_steps_sampled_throughput_per_sec,num_env_steps_trained,num_env_steps_trained_this_iter,num_env_steps_trained_throughput_per_sec,num_faulty_episodes,num_healthy_workers,num_in_flight_async_reqs,num_remote_worker_restarts,num_steps_trained_this_iter,perf,policy_reward_max,policy_reward_mean,policy_reward_min,sampler_perf,sampler_results,timers
PPO_env_a7d11_00000,1000000,"{'ObsPreprocessorConnector_ms': 0.0045549869537353516, 'StateBufferConnector_ms': 0.011849403381347656, 'ViewRequirementAgentConnector_ms': 0.13973164558410645}","{'num_env_steps_sampled': 200000, 'num_env_steps_trained': 200000, 'num_agent_steps_sampled': 1000000, 'num_agent_steps_trained': 1000000}",{},200,{},4957.1,-6199.74,-51819.2,20,"{'learner': {'shared_policy': {'learner_stats': {'allreduce_latency': 0.0, 'grad_gnorm': 0.37853255993391893, 'cur_kl_coeff': 0.020022583007812504, 'cur_lr': 0.0003, 'total_loss': 9.93772621939454, 'policy_loss': -0.0005587956591809899, 'vf_loss': 9.93808272941203, 'vf_explained_var': -5.590097059177447e-05, 'kl': 0.0101021480373283, 'entropy': 0.6696904824881614, 'entropy_coeff': 0.0}, 'model': {}, 'custom_metrics': {}, 'num_agent_steps_trained': 253.16455696202533, 'num_grad_updates_lifetime': 78210.5, 'diff_num_grad_updates_vs_sampler_policy': 789.5}}, 'num_env_steps_sampled': 200000, 'num_env_steps_trained': 200000, 'num_agent_steps_sampled': 1000000, 'num_agent_steps_trained': 1000000}",1000000,1000000,200000,4000,906.146,200000,4000,906.146,0,4,0,0,4000,"{'cpu_util_percent': 19.066666666666666, 'ram_util_percent': 76.01666666666667}",{'shared_policy': 1018.7193657334625},{'shared_policy': -1239.9480124159136},{'shared_policy': -10363.836856682494},"{'mean_raw_obs_processing_ms': 1.9757700993768967, 'mean_inference_ms': 0.5803132026572598, 'mean_action_processing_ms': 0.7312221542513175, 'mean_env_wait_ms': 0.1695161718845582, 'mean_env_render_ms': 0.0}","{'episode_reward_max': 4957.096828667312, 'episode_reward_min': -51819.18428341247, 'episode_reward_mean': -6199.74006207957, 'episode_len_mean': 200.0, 'episode_media': {}, 'episodes_this_iter': 20, 'policy_reward_min': {'shared_policy': -10363.836856682494}, 'policy_reward_max': {'shared_policy': 1018.7193657334625}, 'policy_reward_mean': {'shared_policy': -1239.9480124159136}, 'custom_metrics': {}, 'hist_stats': {'episode_reward': [4542.384730979764, -50439.18787125811, 3006.0930279059994, 3922.0722114895016, 2799.2643644514937, 4513.89712403498, 4046.0790670295028, 4447.564985040659, 3069.252700076649, 4475.984653251458, 4006.8228630009044, 3479.661533934509, 4485.277374804949, 3975.2913283970743, 3586.9332672620653, 3167.5616902727506, -51819.18428341247, 4304.489470304824, 3451.3393427318533, 4005.3280224775085, 4003.612050559751, 4179.460897047105, 3426.658112255497, 4012.747409247977, 3400.259361295107, 4544.50747937107, 3804.9870031698947, 3265.85341444167, 4233.486777135119, 4119.312451960033, 3798.549592980332, 4053.7106992241497, 3529.545654991619, 4280.082465722215, 4462.298232273359, 3980.0882983700008, 4455.901386576779, 4007.3626803588236, 2703.185107078576, 4078.344924551222, 4414.5648226340545, 4527.012463324431, 3484.144063174046, 3510.5120963580925, 4618.924509978505, 4281.502979562494, 4240.003308719048, 4322.020537149992, 4441.977474088517, 4381.095902587298, 3742.8398937726197, 4320.674124542302, 3938.8588510256777, 4148.203005694026, 4218.39235284581, 4291.833743431275, 4224.4169814290635, 4752.242500362259, 3022.2390563001836, 4413.638702747708, -51455.11323307004, -51667.78758675697, -51816.79101732272, 2479.687922298451, -51733.96258438194, -51681.02438534392, 1740.340404866709, -51392.05798344302, 2638.2059547950817, 2291.4314432424944, 2861.1409294286364, -51595.117115313595, 3011.99204097157, 3010.77015577146, -51259.3855703647, -51690.32660248343, 3485.042254504341, -51643.91175403286, -51774.95366644313, -51697.52607665984, 2549.335437469599, 2389.86106855143, 3395.841538438559, 2473.831164220789, -51621.08328480527, 3475.496859213866, 2777.85924559523, 2863.880937541474, -51575.749510941736, 3574.2948113050365, 2976.627263073421, 4034.2680752616607, 2861.344454145565, -50918.12300648797, 2965.8121499101967, 4957.096828667312, 4754.812390089368, -50570.62959962831, 3709.813722770126, 4178.772746276622], 'episode_lengths': [200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200], 'policy_shared_policy_reward': [927.176946195953, 888.676946195953, 939.676946195953, 908.676946195953, 878.176946195953, -10101.437574251622, -10078.437574251622, -10099.437574251622, -10087.437574251622, -10072.437574251622, 591.1186055811996, 579.6186055811996, 605.1186055811996, 604.1186055811996, 626.1186055811996, 765.9144422979003, 774.4144422979003, 798.4144422979003, 796.4144422979003, 786.9144422979003, 598.3528728902986, 542.3528728902986, 545.8528728902985, 563.3528728902986, 549.3528728902986, 896.979424806996, 920.479424806996, 889.979424806996, 921.479424806996, 884.979424806996, 803.0158134059004, 811.0158134059004, 793.5158134059004, 828.5158134059004, 810.0158134059004, 894.1129970081316, 884.1129970081316, 909.6129970081316, 892.1129970081316, 867.6129970081316, 625.0505400153297, 593.5505400153297, 593.5505400153297, 653.0505400153297, 604.0505400153297, 872.6969306502916, 908.6969306502916, 870.1969306502916, 905.1969306502916, 919.1969306502916, 802.464572600181, 812.964572600181, 781.464572600181, 811.964572600181, 797.964572600181, 696.6323067869015, 696.6323067869015, 700.1323067869015, 682.6323067869015, 703.6323067869015, 927.15547496099, 867.15547496099, 898.15547496099, 913.15547496099, 879.65547496099, 799.9582656794147, 747.4582656794147, 817.4582656794147, 782.4582656794147, 827.9582656794147, 695.1866534524129, 714.6866534524129, 728.1866534524129, 719.6866534524129, 729.1866534524129, 618.1123380545503, 621.6123380545503, 614.6123380545503, 674.1123380545503, 639.1123380545503, -10363.836856682494, -10363.836856682494, -10363.836856682494, -10363.836856682494, -10363.836856682494, 845.4978940609649, 862.9978940609649, 887.4978940609649, 852.4978940609649, 855.9978940609649, 674.0678685463706, 676.5678685463707, 705.5678685463706, 707.0678685463707, 688.0678685463706, 811.5656044955017, 801.0656044955018, 773.0656044955018, 829.0656044955017, 790.5656044955017, 781.32241011195, 808.32241011195, 837.32241011195, 792.82241011195, 783.82241011195, 810.4921794094212, 830.4921794094212, 881.4921794094212, 821.9921794094212, 834.9921794094212, 727.7316224510995, 705.7316224510995, 657.7316224510995, 660.2316224510995, 675.2316224510995, 803.2494818495956, 794.7494818495956, 800.7494818495956, 828.7494818495956, 785.2494818495956, 718.5518722590216, 687.0518722590216, 676.5518722590216, 680.0518722590216, 638.0518722590216, 939.2014958742141, 886.7014958742141, 913.7014958742141, 892.2014958742141, 912.7014958742141, 780.4974006339789, 736.4974006339789, 768.9974006339789, 770.4974006339789, 748.4974006339789, 658.770682888334, 634.270682888334, 651.770682888334, 669.270682888334, 651.770682888334, 822.6973554270239, 838.6973554270239, 809.6973554270239, 897.6973554270239, 864.6973554270239, 798.4624903920068, 834.4624903920068, 833.4624903920068, 826.4624903920068, 826.4624903920068, 734.0099185960664, 802.0099185960664, 752.0099185960664, 771.5099185960664, 739.0099185960664, 838.7421398448299, 848.7421398448299, 795.7421398448299, 764.7421398448299, 805.7421398448299, 711.5091309983238, 708.0091309983238, 701.0091309983238, 708.0091309983238, 701.0091309983238, 859.4164931444429, 853.9164931444429, 824.4164931444429, 869.9164931444429, 872.4164931444429, 881.9596464546717, 921.9596464546717, 893.9596464546717, 857.9596464546717, 906.4596464546717, 786.6176596740002, 821.6176596740002, 798.1176596740002, 770.6176596740002, 803.1176596740002, 867.8802773153559, 908.8802773153559, 915.3802773153559, 911.3802773153559, 852.3802773153559, 806.4725360717647, 853.4725360717647, 756.9725360717647, 787.9725360717647, 802.4725360717647, 538.5370214157151, 542.0370214157151, 521.0370214157151, 542.0370214157151, 559.5370214157151, 781.8689849102444, 835.3689849102444, 843.3689849102444, 829.8689849102444, 787.8689849102444, 860.312964526811, 886.312964526811, 873.312964526811, 884.812964526811, 909.812964526811, 894.1024926648863, 905.6024926648863, 885.1024926648863, 908.1024926648863, 934.1024926648863, 689.0288126348094, 699.5288126348094, 714.5288126348094, 690.0288126348094, 691.0288126348094, 693.0024192716187, 696.5024192716187, 717.5024192716187, 689.5024192716186, 714.0024192716186, 924.2849019957009, 927.7849019957009, 916.2849019957009, 912.7849019957009, 937.7849019957009, 846.6005959124989, 848.6005959124989, 864.1005959124989, 895.6005959124989, 826.6005959124989, 848.7006617438096, 843.2006617438096, 808.2006617438096, 880.2006617438096, 859.7006617438096, 837.3041074299985, 858.3041074299985, 879.3041074299985, 871.3041074299985, 875.8041074299985, 899.1954948177034, 884.1954948177034, 888.6954948177034, 880.6954948177034, 889.1954948177033, 891.5191805174595, 850.5191805174595, 918.5191805174595, 876.5191805174595, 844.0191805174595, 739.467978754524, 735.967978754524, 749.967978754524, 763.967978754524, 753.467978754524, 875.2348249084605, 841.2348249084605, 867.2348249084605, 864.2348249084605, 872.7348249084605, 783.3717702051356, 783.3717702051356, 784.3717702051356, 804.3717702051356, 783.3717702051356, 821.3406011388051, 852.8406011388051, 854.3406011388051, 821.8406011388051, 797.8406011388051, 849.3784705691619, 853.8784705691619, 832.8784705691619, 845.8784705691619, 836.3784705691619, 858.866748686255, 861.366748686255, 868.366748686255, 846.366748686255, 856.866748686255, 847.4833962858127, 838.4833962858127, 838.4833962858127, 874.4833962858127, 825.4833962858127, 944.2485000724519, 920.2485000724519, 957.7485000724519, 980.2485000724519, 949.7485000724519, 609.3478112600371, 584.8478112600371, 598.8478112600371, 612.8478112600371, 616.3478112600371, 855.5277405495416, 867.5277405495416, 879.0277405495416, 910.5277405495416, 901.0277405495416, -10289.522646614008, -10292.022646614008, -10292.022646614008, -10293.022646614008, -10288.522646614008, -10333.557517351393, -10333.557517351393, -10333.557517351393, -10333.557517351393, -10333.557517351393, -10363.358203464544, -10363.358203464544, -10363.358203464544, -10363.358203464544, -10363.358203464544, 498.03758445969044, 480.53758445969044, 480.53758445969044, 508.53758445969044, 512.0375844596904, -10346.792516876389, -10346.792516876389, -10346.792516876389, -10346.792516876389, -10346.792516876389, -10336.704877068783, -10336.704877068783, -10334.204877068783, -10336.704877068783, -10336.704877068783, 349.4680809733418, 342.4680809733418, 345.9680809733418, 349.4680809733418, 352.9680809733418, -10265.111596688603, -10279.111596688603, -10282.611596688603, -10286.111596688603, -10279.111596688603, 531.8411909590168, 517.8411909590168, 528.3411909590168, 531.8411909590168, 528.3411909590168, 462.4862886484992, 448.4862886484992, 451.9862886484992, 462.4862886484992, 465.9862886484992, 563.8281858857273, 560.3281858857273, 588.3281858857273, 570.8281858857273, 577.8281858857273, -10319.02342306272, -10319.02342306272, -10312.02342306272, -10319.02342306272, -10326.02342306272, 627.5984081943143, 596.0984081943143, 592.5984081943143, 582.0984081943143, 613.5984081943143, 622.4540311542918, 604.9540311542918, 583.9540311542918, 569.9540311542918, 629.4540311542918, -10260.97711407294, -10239.97711407294, -10253.97711407294, -10250.47711407294, -10253.97711407294, -10338.065320496686, -10338.065320496686, -10338.065320496686, -10338.065320496686, -10338.065320496686, 705.4084509008683, 673.9084509008683, 698.4084509008683, 705.4084509008683, 701.9084509008683, -10327.382350806573, -10327.382350806573, -10330.882350806573, -10327.382350806573, -10330.882350806573, -10352.890733288627, -10352.890733288627, -10356.390733288627, -10356.390733288627, -10356.390733288627, -10339.505215331967, -10339.505215331967, -10339.505215331967, -10339.505215331967, -10339.505215331967, 526.3670874939199, 522.8670874939199, 509.8670874939199, 503.8670874939198, 486.3670874939199, 461.8722137102867, 479.3722137102867, 482.8722137102867, 458.3722137102867, 507.3722137102867, 668.3683076877116, 701.3683076877116, 671.8683076877116, 675.3683076877116, 678.8683076877116, 487.7662328441579, 494.7662328441579, 505.2662328441579, 484.2662328441579, 501.7662328441579, -10329.116656961052, -10318.616656961052, -10322.116656961052, -10322.116656961052, -10329.116656961052, 710.4993718427731, 699.9993718427733, 671.9993718427733, 682.4993718427733, 710.4993718427733, 552.671849119046, 563.1718491190461, 556.1718491190461, 547.6718491190461, 558.1718491190461, 591.6761875082947, 539.1761875082947, 584.6761875082947, 581.1761875082947, 567.1761875082947, -10320.749902188349, -10317.249902188349, -10310.249902188349, -10313.749902188349, -10313.749902188349, 716.9589622610073, 695.9589622610073, 755.4589622610073, 692.4589622610073, 713.4589622610073, 580.125452614684, 634.625452614684, 604.125452614684, 574.125452614684, 583.625452614684, 818.1536150523324, 817.1536150523324, 785.6536150523324, 818.1536150523324, 795.1536150523324, 576.4688908291134, 576.4688908291134, 569.4688908291134, 565.9688908291134, 572.9688908291134, -10201.724601297594, -10183.224601297594, -10176.224601297594, -10177.224601297594, -10179.724601297594, 607.8624299820391, 590.3624299820391, 583.3624299820391, 586.8624299820391, 597.3624299820391, 1018.7193657334625, 974.2193657334625, 998.7193657334625, 992.2193657334625, 973.2193657334625, 936.3624780178735, 943.3624780178735, 954.8624780178735, 980.3624780178735, 939.8624780178735, -10122.525919925662, -10108.525919925662, -10115.525919925662, -10105.025919925662, -10119.025919925662, 721.6627445540253, 763.6627445540253, 749.6627445540253, 721.6627445540253, 753.1627445540253, 827.3545492553242, 869.3545492553242, 837.8545492553242, 830.8545492553242, 813.3545492553242]}, 'sampler_perf': {'mean_raw_obs_processing_ms': 1.9757700993768967, 'mean_inference_ms': 0.5803132026572598, 'mean_action_processing_ms': 0.7312221542513175, 'mean_env_wait_ms': 0.1695161718845582, 'mean_env_render_ms': 0.0}, 'num_faulty_episodes': 0, 'connector_metrics': {'ObsPreprocessorConnector_ms': 0.0045549869537353516, 'StateBufferConnector_ms': 0.011849403381347656, 'ViewRequirementAgentConnector_ms': 0.13973164558410645}}","{'training_iteration_time_ms': 4606.776, 'sample_time_ms': 732.205, 'learn_time_ms': 3871.824, 'learn_throughput': 1033.105, 'synch_weights_time_ms': 2.112}"


[36m(PPO pid=80966)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/Users/cyj/ray_results/PPO_2024-11-23_20-29-59/PPO_env_a7d11_00000_0_2024-11-23_20-29-59/checkpoint_000000)
[36m(PPO pid=80966)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/Users/cyj/ray_results/PPO_2024-11-23_20-29-59/PPO_env_a7d11_00000_0_2024-11-23_20-29-59/checkpoint_000001)
[36m(PPO pid=80966)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/Users/cyj/ray_results/PPO_2024-11-23_20-29-59/PPO_env_a7d11_00000_0_2024-11-23_20-29-59/checkpoint_000002)
[36m(PPO pid=80966)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/Users/cyj/ray_results/PPO_2024-11-23_20-29-59/PPO_env_a7d11_00000_0_2024-11-23_20-29-59/checkpoint_000003)
[36m(PPO pid=80966)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/Users/cyj/ray_results/PPO_2024-11-23_20-29-59/PPO_env_a7d11_00000_0_2024-11-23_20-29-59/chec

[36m(PPO pid=80966)[0m 模型已保存，路径为：TrainingResult(checkpoint=Checkpoint(filesystem=local, path=/Users/cyj/ray_results/PPO_2024-11-23_20-29-59/PPO_env_a7d11_00000_0_2024-11-23_20-29-59/checkpoint_000006), metrics={'custom_metrics': {}, 'episode_media': {}, 'info': {'learner': {'shared_policy': {'learner_stats': {'allreduce_latency': 0.0, 'grad_gnorm': 0.278447392757369, 'cur_kl_coeff': 0.0158203125, 'cur_lr': 0.0003, 'total_loss': 9.972784683975993, 'policy_loss': -0.004088834314194473, 'vf_loss': 9.976735165752942, 'vf_explained_var': 0.013945311496529398, 'kl': 0.008744058961412372, 'entropy': 0.6814882958237128, 'entropy_coeff': 0.0}, 'model': {}, 'custom_metrics': {}, 'num_agent_steps_trained': 253.16455696202533, 'num_grad_updates_lifetime': 46610.5, 'diff_num_grad_updates_vs_sampler_policy': 789.5}}, 'num_env_steps_sampled': 120000, 'num_env_steps_trained': 120000, 'num_agent_steps_sampled': 600000, 'num_agent_steps_trained': 600000}, 'sampler_results': {'episode_reward_max': 4862

[36m(PPO pid=80966)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/Users/cyj/ray_results/PPO_2024-11-23_20-29-59/PPO_env_a7d11_00000_0_2024-11-23_20-29-59/checkpoint_000007)
[36m(PPO pid=80966)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/Users/cyj/ray_results/PPO_2024-11-23_20-29-59/PPO_env_a7d11_00000_0_2024-11-23_20-29-59/checkpoint_000008)
[36m(PPO pid=80966)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/Users/cyj/ray_results/PPO_2024-11-23_20-29-59/PPO_env_a7d11_00000_0_2024-11-23_20-29-59/checkpoint_000009)
You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of saving checkpoints.
You can suppress this error by setting the environment variable TUNE_WARN_EXCESSIVE_EXPERIMENT_CHECKPOINT_SYNC_THRESHOLD_S to a smaller value than the current threshold (5.0).
2024-11-23 20:34:01,755	INFO tune.py:1016 -- Wrote the latest version of all result files an