In [8]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from collections import deque
import random
import matplotlib.pyplot as plt
from env.custom_environment import CustomEnvironment

class Actor(nn.Module):
    def __init__(self, input_dim, output_dim, hidden_dim=64):
        super(Actor, self).__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        self.fc3 = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        return torch.tanh(self.fc3(x))

class Critic(nn.Module):
    def __init__(self, input_dim, hidden_dim=64):
        super(Critic, self).__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        self.fc3 = nn.Linear(hidden_dim, 1)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        return self.fc3(x)

class DDPGAgent:
    def __init__(self, obs_dim, action_dim, lr_actor=1e-3, lr_critic=1e-3, gamma=0.99, tau=0.01):
        self.actor = Actor(obs_dim, action_dim)
        self.critic = Critic(obs_dim + action_dim)
        self.target_actor = Actor(obs_dim, action_dim)
        self.target_critic = Critic(obs_dim + action_dim)

        self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=lr_actor)
        self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=lr_critic)
        
        self.gamma = gamma
        self.tau = tau
        
        self.update_targets(tau=1.0)

    def update_targets(self, tau=None):
        if tau is None:
            tau = self.tau
        for target_param, param in zip(self.target_actor.parameters(), self.actor.parameters()):
            target_param.data.copy_(tau * param.data + (1 - tau) * target_param.data)
        for target_param, param in zip(self.target_critic.parameters(), self.critic.parameters()):
            target_param.data.copy_(tau * param.data + (1 - tau) * target_param.data)

    def select_action(self, state):
        state = torch.FloatTensor(state).unsqueeze(0)
        action_prob = self.actor(state).squeeze(0).detach().numpy()
        # 选择动作0或1
        action = 1 if action_prob >= 0 else 0
        print(action)
        return action


    def train(self, memory, batch_size):
        if len(memory) < batch_size:
            return

        batch = random.sample(memory, batch_size)
        states, actions, rewards, next_states, dones = zip(*batch)

        states = torch.FloatTensor(states)
        actions = torch.FloatTensor(actions)
        rewards = torch.FloatTensor(rewards)
        next_states = torch.FloatTensor(next_states)
        dones = torch.FloatTensor(dones)

        next_actions = self.target_actor(next_states)
        target_q_values = self.target_critic(torch.cat([next_states, next_actions], 1))
        q_targets = rewards + self.gamma * target_q_values * (1 - dones)

        q_values = self.critic(torch.cat([states, actions], 1))
        critic_loss = torch.mean((q_values - q_targets.detach()) ** 2)

        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        actor_loss = -self.critic(torch.cat([states, self.actor(states)], 1)).mean()

        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        self.update_targets()

class ReplayBuffer:
    def __init__(self, capacity):
        self.buffer = deque(maxlen=capacity)

    def push(self, state, action, reward, next_state, done):
        self.buffer.append((state, action, reward, next_state, done))

    def sample(self, batch_size):
        return random.sample(self.buffer, batch_size)

    def __len__(self):
        return len(self.buffer)

def train_maddpg(env, num_episodes=1000, batch_size=64, memory_capacity=100000, max_steps=200):
    memory = ReplayBuffer(memory_capacity)
    agents = [DDPGAgent(env.observation_space(agent).shape[0], env.action_space(agent).n) for agent in env.agents]

    total_rewards = []

    for episode in range(num_episodes):
        observations = env.reset()
        episode_rewards = {agent: 0 for agent in env.agents}

        for step in range(max_steps):
            actions = {agent: agents[i].select_action(observations[agent]) for i, agent in enumerate(env.agents)}
            print(actions)

            next_observations, rewards, dones, infos = env.step(actions)
            for i, agent in enumerate(env.agents):
                memory.push(observations[agent], actions[agent], rewards[agent], next_observations[agent], dones[agent])

            for agent in agents:
                agent.train(memory, batch_size)

            observations = next_observations

            for agent in env.agents:
                episode_rewards[agent] += rewards[agent]

            if all(dones.values()):
                break

        total_episode_reward = sum(episode_rewards.values())
        total_rewards.append(total_episode_reward)

        if (episode + 1) % 10 == 0:
            print(f'Episode {episode + 1}, Total Reward: {total_episode_reward}')

    plt.plot(total_rewards)
    plt.xlabel('Episode')
    plt.ylabel('Total Reward')
    plt.title('Total Reward vs. Episode')
    plt.show()

    return agents

# 使用自定义环境训练MADDPG
env = CustomEnvironment()
trained_agents = train_maddpg(env)

ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()



详细阅读我的代码，帮我写一个能训练我环境的多智能体强化学习算法，具体是什么算法你自己决定。你一定要写一个适配我环境的算法，并且要能运行，每十步输出当前的总回报值，并在最后绘制出奖励的变化曲线。
关于我的环境我有几点要说明：我的环境的总体意思是有五个智能体，他们能选取的动作只有0或1，0为不触发1为触发，最终要在200步内达成实现一致性并且尽可能少触发。算法就是要训练出这两百步内每一步中五个智能体的动作选择，来让奖励最大化。要注意每个智能体能选择的动作只有0或1，然后每一步的actions为五个智能体动作选择的拼接矩阵，类似于[1，0，0，0，1],要以这种格式与环境交互。其他的一些点你也要与我的环境相匹配。