In [1]:
import numpy as np
from maddpg import MADDPG
from buffer import MultiAgentReplayBuffer
from pettingzoo.mpe import simple_adversary_v3
import warnings
import time

In [2]:
scenario = 'simple_adversary'
env = simple_adversary_v3.parallel_env(continuous_actions=True,render_mode='human')
obs = env.reset()

In [3]:
env.num_agents, env.agents

(3, ['adversary_0', 'agent_0', 'agent_1'])

In [4]:
n_agents = env.num_agents

In [5]:
actor_dims = []
for agent_name in env.agents:
    print(f'{agent_name} - {env.observation_spaces[agent_name]} - {env.observation_spaces[agent_name].shape}')
    actor_dims.append(env.observation_spaces[agent_name].shape[0])
actor_dims

adversary_0 - Box(-inf, inf, (8,), float32) - (8,)
agent_0 - Box(-inf, inf, (10,), float32) - (10,)
agent_1 - Box(-inf, inf, (10,), float32) - (10,)




[8, 10, 10]

In [6]:
critic_dims = sum(actor_dims)
critic_dims

28

In [7]:
for agent_name in env.agents:
    print(f'{agent_name} - {env.action_spaces[agent_name]} - {env.action_spaces[agent_name].shape}')
# all agent has 5 action
n_actions = 5

adversary_0 - Box(0.0, 1.0, (5,), float32) - (5,)
agent_0 - Box(0.0, 1.0, (5,), float32) - (5,)
agent_1 - Box(0.0, 1.0, (5,), float32) - (5,)




In [8]:
env.reset()[0]

{'adversary_0': array([-1.2346485 ,  1.4577886 , -0.5728462 ,  1.1731899 ,  0.22436805,
        -0.01101489, -0.54150075,  0.96164197], dtype=float32),
 'agent_0': array([-1.4590166 ,  1.4688034 , -1.4590166 ,  1.4688034 , -0.79721427,
         1.1842048 , -0.22436805,  0.01101489, -0.7658688 ,  0.97265685],
       dtype=float32),
 'agent_1': array([-0.6931477 ,  0.4961466 , -0.6931477 ,  0.4961466 , -0.03134545,
         0.21154793,  0.54150075, -0.96164197,  0.7658688 , -0.97265685],
       dtype=float32)}

In [9]:
obs = env.reset()[0]

In [10]:
actions = {
    'adversary_0' : [0.2, 0.3, 0.5, 0, 0],
    'agent_0' : [0.5, 0, 0, 0, 0.5],
    'agent_1' : [1, 0, 0, 0, 0],
}

next_state, reward, termination, truncation, info = env.step(actions)

  gym.logger.warn("Casting input x to numpy array.")


In [11]:
next_state

{'adversary_0': array([-0.08454347, -0.06841106,  1.4396316 ,  1.3236113 ,  0.19912532,
         0.99912477,  1.4778169 ,  0.6007171 ], dtype=float32),
 'agent_0': array([ 1.2405063 ,  0.32448646, -0.2836688 , -1.0675359 ,  1.2405063 ,
         0.32448646, -0.19912532, -0.99912477,  1.2786916 , -0.39840764],
       dtype=float32),
 'agent_1': array([-0.03818534,  0.7228941 , -1.5623604 , -0.6691282 , -0.03818534,
         0.7228941 , -1.4778169 , -0.6007171 , -1.2786916 ,  0.39840764],
       dtype=float32)}

In [12]:
reward

defaultdict(int,
            {'adversary_0': -1.9556292612005246,
             'agent_0': 1.2317273332334362,
             'agent_1': 1.2317273332334362})

In [13]:
termination

{'adversary_0': False, 'agent_0': False, 'agent_1': False}

In [14]:
truncation

{'adversary_0': False, 'agent_0': False, 'agent_1': False}

In [15]:
info

{'adversary_0': {}, 'agent_0': {}, 'agent_1': {}}

In [16]:
t = np.concatenate([i for i in next_state.values()])
t, t.shape

(array([-0.08454347, -0.06841106,  1.4396316 ,  1.3236113 ,  0.19912532,
         0.99912477,  1.4778169 ,  0.6007171 ,  1.2405063 ,  0.32448646,
        -0.2836688 , -1.0675359 ,  1.2405063 ,  0.32448646, -0.19912532,
        -0.99912477,  1.2786916 , -0.39840764, -0.03818534,  0.7228941 ,
        -1.5623604 , -0.6691282 , -0.03818534,  0.7228941 , -1.4778169 ,
        -0.6007171 , -1.2786916 ,  0.39840764], dtype=float32),
 (28,))

In [17]:
env.possible_agents # return list alive agent

['adversary_0', 'agent_0', 'agent_1']

In [18]:
memory = MultiAgentReplayBuffer(1000000, critic_dims, actor_dims,
                                n_actions, n_agents, batch_size=1024,
                                agent_names=env.agents)
# critic_dims = 28
# actor_dims = [8, 10, 10]
# n_actions = 5
# n_agents = 3
# agents_names = ['adversary_0', 'agent_0', 'agent_1'])

In [19]:
# memory.store_transition(obs, state, actions, reward, obs_, state_, done)

# obs : list, keys = agent_names, values = agent_states
# state : np.array with shape = (28, ) = 8 + 10 + 10
# actions: list, keys = agent_names, values = probs action for each agent [0.5, 0.5, 0, 0, 0]
# reward: list, keys = agent_names, values = number for each agent
# obs_ same obs
# state_ same state
# done = [True, False, True]

In [20]:
# def sample_buffer(self):

# actor_states: list with len = n_agents, each elemt with shape = (batch, state_len = 8 or 10)
# states: np array shape (batch, 28)
# actions: same actor_states, each has shape = (batch, 5), overall (3, batch, 5)
# rewards: np array shape (batch, 3) <- 3 agent
# actor_new_states: same actor_states
# states_: same states
# terminal: saem rewards

In [21]:
# score = sum(reward) each action

In [22]:
# maddpg_agents = MADDPG(actor_dims, critic_dims, n_agents, n_actions, #!
#                         fc1=64, fc2=64,
#                         alpha=0.01, beta=0.01, scenario=scenario,
#                         chkpt_dir='tmp/maddpg/')

# actor_dims: list = [8, 10, 10]
# critic_dims: scalar = 28
# n_agents: scalar = 3
# n_actions: scalar = 5
# alpha: lr for adam optimizer of Actor
# beta: lr fo adam optimizer of Critic
# scenario: string = 'simple_adversary'
# chkpt_dir: checkpoint folder 
    # - in constructor: checkpoint folder = chkpt_dir + scenario

In [23]:
# class Agent:
#     def __init__(self, actor_dims, critic_dims, n_actions, n_agents,  chkpt_dir,agent_name,
#                     alpha=0.01, beta=0.01, fc1=64,
#                     fc2=64, gamma=0.95, tau=0.01 ,
#                     ):

# agent_name: string, ex: 'adversary_0'
# alpha: for optimiz actor    
# beta: for optimize critic
# gamma:
# tau: % main network use for update target
# chkpt_dir =  chkpt_dir + scenario 

In [None]:
# class CriticNetwork(nn.Module):
#     def __init__(self, beta, input_dims, fc1_dims, fc2_dims, 
#                     n_agents, n_actions, name, chkpt_dir):

# n_agents: scalar = 3
# n_actions: scalar = 5
# input_dims: critics_dims = 28
# name: string = agent_name + critic, ex: 'agent_0_critic'
# beta: optimize critic network
# chkpt_dir =  chkpt_dir + scenario, same in Agent
    # in constructor: chkpt_file = chk_pt + name

# neural net with input = 28 + 5 * 3, output is scalar for evaluate

# def forward(self, state, action):
# state: (batch, 28), (batch, 15)

In [25]:
# class ActorNetwork(nn.Module):
#     def __init__(self, alpha, input_dims, fc1_dims, fc2_dims, 
#                  n_actions, name, chkpt_dir):
#         super(ActorNetwork, self).__init__()

# alpha: for optimize network
# input_dims: scalar one of [8, 10, 10]
# n_actions: scalar = 5
# name: string =agent_name + actor, ex: 'agent_1_actor' or 'agent_1_target_actor' for target
# chkpt_dir =  chkpt_dir + scenario, same in Agent
    # in constructor: chkpt_file = chk_pt + name

# def forward(self, state):
# state: torch tensor (batch, dims tương ứng với agent 8 hoặc 10)
# pi: ma trận xác suất [batch, dài 5, mỗi hàng là 1 phân phối xác suất]

In [26]:
import torch as T

In [27]:
raw_obs = env.reset()[0]

input_for_agent = raw_obs['adversary_0'] # array np
input_for_agent

array([-1.2629526 ,  0.66632277, -0.17945327,  1.1819197 ,  0.03000507,
        0.52336186, -0.00786891,  0.7526242 ], dtype=float32)

In [28]:
T.Tensor([input_for_agent])

  T.Tensor([input_for_agent])


tensor([[-1.2630,  0.6663, -0.1795,  1.1819,  0.0300,  0.5234, -0.0079,  0.7526]])

In [29]:
a = T.Tensor([input_for_agent]).detach()
a

tensor([[-1.2630,  0.6663, -0.1795,  1.1819,  0.0300,  0.5234, -0.0079,  0.7526]])

In [30]:
print(1-a)
print((1-a).min())

tensor([[ 2.2630,  0.3337,  1.1795, -0.1819,  0.9700,  0.4766,  1.0079,  0.2474]])
tensor(-0.1819)


In [31]:
# output trong 1 agent sau khi choose action tương tự như trên
print(T.Tensor([input_for_agent]).detach())
print(T.Tensor([input_for_agent]).detach().cpu().numpy())

tensor([[-1.2630,  0.6663, -0.1795,  1.1819,  0.0300,  0.5234, -0.0079,  0.7526]])
[[-1.2629526   0.66632277 -0.17945327  1.1819197   0.03000507  0.52336186
  -0.00786891  0.7526242 ]]


In [32]:
a = np.array([[1, 2, 3], [0, 0, -1]])
T.tensor(a).shape

torch.Size([2, 3])

In [33]:
a = [[1, 1, 1, 1, 1], [2, 2, 2, 2, 2], [3, 3, 3, 3, 3]]
T.tensor([a, a, a, a]).shape

torch.Size([4, 3, 5])

In [36]:
a = T.tensor([[1, 2], [3, 4]])
b = T.tensor([[5, 6], [7, 8]])

a, b

(tensor([[1, 2],
         [3, 4]]),
 tensor([[5, 6],
         [7, 8]]))

In [38]:
T.cat([a, b])

tensor([[1, 2],
        [3, 4],
        [5, 6],
        [7, 8]])

In [39]:
T.cat([a, b], dim=0)

tensor([[1, 2],
        [3, 4],
        [5, 6],
        [7, 8]])

In [41]:
T.cat([a, b], dim=1)

tensor([[1, 2, 5, 6],
        [3, 4, 7, 8]])

In [43]:
T.cat([a, b], dim=-1)

tensor([[1, 2, 5, 6],
        [3, 4, 7, 8]])

In [47]:
a = T.tensor([[1], [2], [3]])
a, a.shape

(tensor([[1],
         [2],
         [3]]),
 torch.Size([3, 1]))

In [46]:
b = T.tensor([[False], [True], [False]])
b, b.shape

(tensor([[False],
         [ True],
         [False]]),
 torch.Size([3, 1]))

In [48]:
a[b[:,0]]=0
a

tensor([[1],
        [0],
        [3]])

In [53]:
a=T.tensor([[1], [2], [3]]).flatten()
a

tensor([1, 2, 3])

In [None]:
a[b[:,0]]=0
a

tensor([1, 0, 3])

: 