Collaboartion of snake agents in a MARL environment 

In [1]:
n_agents = 2

shared_rewards = [] # it can use the same rewards, but changed build

In [14]:
import math
import random
import time
import torch
import torch.optim as optim

from mlagents_envs.environment import UnityEnvironment
from mlagents_envs.base_env import ActionTuple

import numpy as np
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt
from models import DQN, DuelingDQN, ReplayMemory, optimize_model

def select_action(state_in, policy_net, spec):
    global steps_done
    sample = random.random()
    eps_threshold = EPS_END + (EPS_START - EPS_END) * math.exp(-1. * steps_done / EPS_DECAY)
    steps_done += 1
    if sample > eps_threshold:
        with torch.no_grad():
            action_out = policy_net(state_in).max(1).indices.view(1, 1)
            return action_out
    else:
        return torch.tensor(spec.action_spec.random_action(1).discrete, device=device, dtype=torch.long)


In [15]:

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

BATCH_SIZE = 128
GAMMA = 0.99
EPS_START = 0.9
EPS_END = 0.1
EPS_DECAY = 5_000
TAU = 0.005
LR = 1e-4

SAVE_WEIGHTS = True
LOAD_WEIGHTS = False
steps_done = 0
STEPS = 250
DOUBLE = True
DUELING = False
GRAPHICS = True

if torch.cuda.is_available():
    num_episodes = 500
else:
    num_episodes = 50


In [18]:
env = UnityEnvironment(file_name="marl_builds/snake", seed=1, side_channels=[], no_graphics=not GRAPHICS)
env.reset()


In [19]:
list(env.behavior_specs)

['Snake1?team=0', 'Snake2?team=0']

In [20]:

# fix so it has more behaviours
behaviour_names = list(env.behavior_specs)
specs = [env.behavior_specs[name] for name in behaviour_names]

n_actions = specs[0].action_spec.discrete_branches[0]
state, _ = env.get_steps(behaviour_names[0])
state = state.obs[0]
n_observations = specs[0].observation_specs[0].shape[0]

if DUELING:
    policy_nets = [DuelingDQN(n_observations, n_actions).to(device) if DUELING else DQN(n_observations, n_actions).to(device) for _ in range(n_agents)]
    target_nets = [DuelingDQN(n_observations, n_actions).to(device) if DUELING else DQN(n_observations, n_actions).to(device) for _ in range(n_agents)]
else:
    policy_nets = [DQN(n_observations, n_actions).to(device) for _ in range(n_agents)]
    target_nets = [DQN(n_observations, n_actions).to(device) for _ in range(n_agents)]

optimizers = [optim.AdamW(policy_net.parameters(), lr=LR, amsgrad=True) for policy_net in policy_nets]
memories = [ReplayMemory(1000) for _ in range(n_agents)]

if LOAD_WEIGHTS:
    for i, policy_net in enumerate(policy_nets):
        policy_net.load_state_dict(torch.load(f'marl_weights/policy_net_{i}.pth'))
    print("Loaded MARL weights from file")

for policy_net, target_net in zip(policy_nets, target_nets):
    target_net.load_state_dict(policy_net.state_dict())

rewards = [[] for _ in range(n_agents)]
losses = [[] for _ in range(n_agents)]
lengths = [[] for _ in range(n_agents)]

print(f"Initialized {n_agents} DQNs with {n_observations} observations and {n_actions} actions each")

Initialized 2 DQNs with 29 observations and 4 actions each


In [12]:
timer_start = time.perf_counter()
pbar = tqdm(range(num_episodes))
for i_episode in pbar:
    if i_episode % 100 == 0 and i_episode != 0:
        for i, policy_net in enumerate(policy_nets):
            torch.save(policy_net.state_dict(), f'weights/policy_net_{i}.pth')
        print(f"Episode {i_episode}, avg reward: {np.mean(rewards[-100:]):.2f}, "
              f"epsilon: {EPS_END + (EPS_START - EPS_END) * math.exp(-1. * steps_done / EPS_DECAY):.2f}")
        print("Checkpoint: Saved weights to file")

    step_rewards = [0 for _ in range(n_agents)]
    step_losses = [[] for _ in range(n_agents)]
    episode_length = [0 for _ in range(n_agents)]

    env.reset()
    decision_steps1, terminal_steps1 = env.get_steps(behaviour_names[0])
    decision_steps2, terminal_steps2 = env.get_steps(behaviour_names[1])
    states = [decision_steps1.obs[0], decision_steps2.obs[0]]
    states = [torch.tensor(states, dtype=torch.float32, device=device), torch.tensor(states, dtype=torch.float32, device=device)]
    for t in range(STEPS):
        actions = [select_action(state, policy_net) for policy_net in policy_nets]

        action_tuple = ActionTuple()
        for action in actions:
            action_tuple.add_discrete(action.cpu().numpy())
        env.set_actions(behaviour_name, action_tuple)
        env.step()
        
        decision_steps, terminal_steps = env.get_steps(behaviour_name)
        observation = decision_steps.obs[0]
        reward = np.zeros(state.shape[0])
        if len(decision_steps.reward) > 0:
            reward += decision_steps.reward
        if len(terminal_steps.reward) > 0:
            reward += terminal_steps.reward
        done = len(decision_steps) == 0
        terminated = len(terminal_steps) > 0

        reward = torch.tensor(reward, device=device)
        for i in range(n_agents):
            step_rewards[i] += reward.item()

        if done or terminated:
            next_state = None
        else:
            next_state = torch.tensor(observation, dtype=torch.float32, device=device)

        for i in range(n_agents):
            memories[i].push(state, actions[i], next_state, reward)
            loss = optimize_model(memories[i], policy_nets[i], target_nets[i], optimizers[i], device, double=DOUBLE, BATCH_SIZE=BATCH_SIZE, GAMMA=GAMMA)
            if loss is not None:
                step_losses[i].append(loss)

            target_net_state_dict = target_nets[i].state_dict()
            policy_net_state_dict = policy_nets[i].state_dict()
            for key in policy_net_state_dict:
                target_net_state_dict[key] = policy_net_state_dict[key] * TAU + target_net_state_dict[key] * (1 - TAU)
            target_nets[i].load_state_dict(target_net_state_dict)

        state = next_state

        if terminated or done:
            for i in range(n_agents):
                episode_length[i] = t
            break

    for i in range(n_agents):
        if len(step_losses[i]) == 0:
            step_losses[i].append(0)
        losses[i].append(np.mean(step_losses[i]))
        lengths[i].append(np.mean(episode_length[i]))
        rewards[i].append(step_rewards[i])

    pbar.set_description(f"E {i_episode} done after {t + 1} t, with r: {sum(step_rewards):.2f} and l: {np.mean([np.mean(loss) for loss in step_losses]):.2f}")

env.close()
print(f"Finished training in {(time.perf_counter() - timer_start)/60 :.3} minutes")

  0%|          | 0/500 [00:00<?, ?it/s]


TypeError: select_action() takes 1 positional argument but 2 were given