In [None]:
from gym.spaces import Dict, Discrete, MultiDiscrete, Tuple, Box
import numpy as np

from ray.rllib.env.multi_agent_env import MultiAgentEnv, ENV_STATE

# Environment

In [None]:
class FoodGame(MultiAgentEnv):
    """Multi-agent grid world in which agents walk around a board to eat food.

    The full state is one flat integer array that every agent receives as its
    observation: ``[a0_row, a0_col, a1_row, a1_col, ..., f0_row+1, f0_col+1, ...]``.
    Agent coordinates are 0-indexed. Food coordinates are stored shifted by +1
    so that ``(0, 0)`` can mark a food item that has already been eaten.

    Rewards: every agent receives -1 per step. Eating a food item gives +10 to
    every agent when ``mode == "coop"``, otherwise only to the agent that ate
    it. An episode ends once all food is eaten or ``time_limit`` steps passed.
    """

    def __init__(self, env_config):
        """Build the game from ``env_config``.

        Expected keys: ``agents`` (list of agent ids), ``food_loc``
        (``{name: (row, col)}``, 0-indexed), ``agent_loc``
        (``{id: (row, col)}``, 0-indexed), ``board_size`` (``(rows, cols)``),
        ``mode`` (``"coop"`` for shared rewards, anything else for individual
        rewards) and ``trainer`` (``"maddpg"`` selects a float32 Box
        observation space and probability-vector actions).
        """
        # Game Settings
        self.agents = env_config['agents'] # [0,1,...]
        self.agent_names = ['agent'+str(i) for i in range(len(self.agents))]
        self.food_loc = env_config['food_loc'] # {'food1': (row,col), ... } in 0 index
        self.agent_loc = env_config['agent_loc'] # {0: (row,col), 1: (row,col), ...} in 0 index
        self.board_size = env_config['board_size'] # (rows,cols)
        self.time_limit = 20
        self.cur_timestep = 0
        self.mode = env_config['mode']
        self.trainer = env_config['trainer']

        # Environment Settings
        n_rows, n_cols = self.board_size[0], self.board_size[1]
        n_agents = len(self.agents)
        n_food = len(self.food_loc)
        self.action_space = Discrete(4) # L,R,U,D
        if self.trainer == 'maddpg':
            # Food coordinates are stored 1-indexed, hence the larger upper bound.
            self.observation_space = Box(
                low=np.array([0, 0]*n_agents + [0, 0]*n_food),
                high=np.array([n_rows-1, n_cols-1]*n_agents + [n_rows, n_cols]*n_food),
                dtype=np.float32)
        else:
            # agent1_loc, agent2_loc, food1_loc, ...
            self.observation_space = MultiDiscrete(
                [n_rows, n_cols]*n_agents + [1+n_rows, 1+n_cols]*n_food)
        self.state = None

    def _observations(self):
        """Return ``{agent_name: copy of the current state}`` for every agent."""
        if self.trainer == 'maddpg':
            # The maddpg observation space is declared float32, so the integer
            # state is cast to match it (astype always returns a new array).
            return {agent: self.state.astype(np.float32) for agent in self.agent_names}
        return {agent: self.state.copy() for agent in self.agent_names}

    def _move(self, row, col, action):
        """Return the ``[row, col]`` reached by ``action``, clipped to the board.

        Raises:
            ValueError: if ``action`` is not one of 0 (L), 1 (R), 2 (U), 3 (D).
        """
        if action == 0:
            # L
            return [row, max(0, col-1)]
        if action == 1:
            # R
            return [row, min(self.board_size[1]-1, col+1)]
        if action == 2:
            # U
            return [max(0, row-1), col]
        if action == 3:
            # D
            return [min(self.board_size[0]-1, row+1), col]
        raise ValueError('ActionError')

    def reset(self):
        """Restore the initial positions, reset the step counter and return the observations."""
        self.cur_timestep = 0
        coords = []
        for key in self.agent_loc:
            coords.extend(self.agent_loc[key])
        for key in self.food_loc:
            # +1 shift keeps (0,0) free as the "eaten" marker
            coords.extend((self.food_loc[key][0]+1, self.food_loc[key][1]+1))

        self.state = np.array(coords)
        return self._observations()

    def render(self):
        """Print the board and return it as an object array of cell labels.

        Each cell holds the concatenated indices of the agents standing on it,
        followed by ``'F'`` for every uneaten food item located there.
        """
        board = np.full(self.board_size,'',dtype=object)
        s = self.state.reshape(-1,2)
        agents_loc = s[:len(self.agents),:]
        foods_loc = s[len(self.agents):,:]
        for i in range(len(agents_loc)):
            board[agents_loc[i][0],agents_loc[i][1]] += str(i)
        for j in range(len(foods_loc)):
            if tuple(foods_loc[j]) != (0,0):
                # undo the +1 storage shift to get the board cell
                board[foods_loc[j][0]-1,foods_loc[j][1]-1] += 'F'
        print(board)
        print()
        return board

    def step(self, action_dict):
        """Apply one action per agent and advance the game by one timestep.

        Args:
            action_dict: ``{"agent0": action, "agent1": action, ...}``. Actions
                are 0 (L), 1 (R), 2 (U) or 3 (D); with the maddpg trainer each
                value is instead used as the probability vector from which the
                move is sampled.

        Returns:
            ``(obs, rewards, dones, infos)`` dicts keyed by agent name;
            ``dones`` additionally carries the ``'__all__'`` flag.

        Raises:
            ValueError: if an action is outside 0..3.
        """
        self.cur_timestep += 1
        if self.trainer == 'maddpg':
            action_dict = {
                    k: np.random.choice([0, 1, 2, 3], p=v) for k, v in action_dict.items()
                }
        n_agents = len(self.agent_names)
        s = self.state.reshape(-1,2)
        rewards = {k: -1 for k in self.agent_names} # penalty per step
        food_loc_lt = s[n_agents:,:]

        # Agents move one after another, so an earlier agent can eat a food
        # item before a later agent reaches it within the same step.
        for i in range(n_agents):
            new_pos = self._move(s[i][0], s[i][1], action_dict[self.agent_names[i]])
            s[i] = np.array(new_pos)
            for j in range(len(food_loc_lt)):
                if new_pos[0] == food_loc_lt[j][0]-1 and new_pos[1] == food_loc_lt[j][1]-1:
                    s[j+n_agents] = np.array([0,0]) # mark as eaten
                    if self.mode == "coop":
                        for k in self.agent_names:
                            rewards[k] += 10 # cooperative global reward
                    else:
                        rewards[self.agent_names[i]] += 10 # competitive individual reward

        # Finished when no uneaten food remains; `>=` keeps the episode
        # terminated if step() is called again after the time limit.
        all_eaten = not np.any(food_loc_lt)
        finished = all_eaten or self.cur_timestep >= self.time_limit
        dones = {agent: finished for agent in self.agent_names}
        dones['__all__'] = finished

        infos = {agent: dict() for agent in self.agent_names}

        # Store the new state explicitly instead of relying on `s` being a view.
        self.state = s.flatten()
        return self._observations(), rewards, dones, infos

In [None]:
def simulate_environment():
    """Play one random-action episode of a two-agent competitive FoodGame.

    The board is printed after the reset and after every step; the loop runs
    for at most 100 steps and stops as soon as every entry of ``dones`` is
    true, at which point the cumulative rewards are printed.
    """
    env_config = {
        'agents': [0,1],
        'food_loc': {'food1': (3,3), 'food2': (5,5)},
        'agent_loc': {0: (4,4), 1: (6,7)},
        'board_size': (7,10),
        'mode': "comp",
        'trainer': 'ppo',
    }

    fg = FoodGame(env_config)
    fg.reset()
    fg.render()

    MOVE_MAPPING = {0: 'L', 1: 'R', 2: 'U', 3: 'D'}
    cumu_rewards = [0,0]
    for _ in range(100):
        # Draw agent0's move first, then agent1's.
        a0_action = np.random.randint(4)
        a1_action = np.random.randint(4)
        print('Actions: agent0 = {}, agent1 = {}'.format(MOVE_MAPPING[a0_action],MOVE_MAPPING[a1_action]))

        _, rewards, dones, _ = fg.step(action_dict={'agent0': a0_action, 'agent1': a1_action})
        fg.render()

        for idx, agent_name in enumerate(('agent0', 'agent1')):
            cumu_rewards[idx] += rewards[agent_name]

        if not all(dones.values()):
            continue
        print("Episode Ended")
        print("Rewards:",cumu_rewards)
        break

from ray.tune.registry import register_env

# Register the FoodGame class with Tune under the name "foodenv".
register_env("foodenv", FoodGame)  

In [None]:
# Smoke-test the environment with one random-action episode.
simulate_environment()

# Model (not used)

In [None]:
from ray.rllib.utils.framework import try_import_tf, try_import_torch
from ray.rllib.models.torch.torch_modelv2 import TorchModelV2
from ray.rllib.models.torch.fcnet import FullyConnectedNetwork as TorchFC

# Obtain torch and its nn module through RLlib's import helper.
torch, nn = try_import_torch()

class TorchCustomModel(TorchModelV2, nn.Module):
    """Custom PyTorch model that simply wraps RLlib's fully connected network."""

    def __init__(self, obs_space, action_space, num_outputs, model_config, name):
        """Initialise both base classes and build the wrapped fc-net from the same arguments."""
        ctor_args = (obs_space, action_space, num_outputs, model_config, name)
        TorchModelV2.__init__(self, *ctor_args)
        nn.Module.__init__(self)

        self.torch_sub_model = TorchFC(*ctor_args)

    def forward(self, input_dict, state, seq_lens):
        """Cast the observations to float and return the wrapped net's output with an empty state list."""
        # The cast is written back into the caller's input_dict.
        input_dict["obs"] = input_dict["obs"].float()
        outputs = self.torch_sub_model(input_dict, state, seq_lens)
        fc_out = outputs[0]
        return fc_out, []

    def value_function(self):
        """Return the wrapped net's value estimates flattened to one dimension."""
        values = self.torch_sub_model.value_function()
        return torch.reshape(values, [-1])
    
from ray.rllib.models import ModelCatalog
# Register the wrapper model under the name "my_model" so a model config can
# refer to it through the "custom_model" key.
ModelCatalog.register_custom_model(
        "my_model", TorchCustomModel
    )


# Environment Config

In [None]:
# Both settings share the same board, agents and food; only `mode` differs.
# Each config is written out in full so that the two dicts share no objects.

# Coop Environment: a food reward is given to every agent.
coop_env_config = {
    'agents': [0,1],
    'food_loc': {'food1': (3,3), 'food2': (5,5)},
    'agent_loc': {0: (4,4), 1: (6,7)},
    'board_size': (7,10),
    'mode': 'coop',
    'trainer': 'ppo',
}

# Competitive Environment: a food reward goes only to the agent that ate it.
comp_env_config = {
    'agents': [0,1],
    'food_loc': {'food1': (3,3), 'food2': (5,5)},
    'agent_loc': {0: (4,4), 1: (6,7)},
    'board_size': (7,10),
    'mode': 'comp',
    'trainer': 'ppo',
}

In [None]:
import ray
import ray.rllib.agents.ppo as ppo
from ray.tune.logger import pretty_print
from ray.rllib.policy.policy import PolicySpec
from ray import tune
from datetime import datetime
import os

# PPO Trainer (Coop)

In [None]:
# Local env instance, used here to read the spaces and the number of agents.
e = FoodGame(coop_env_config)

# For custom NN models, per-policy overrides such as
# {"model": {"custom_model": "my_model"}, "gamma": 0.99}
# can be added to the `config` dict of each PolicySpec below.

# One policy per agent: agent<i> is always mapped to policy<i>.
policies = {"policy"+str(i):  PolicySpec(
                        observation_space= e.observation_space,
                        action_space= e.action_space,
                        config={"agent_id": i}) for i in range(len(e.agents))}

policy_mapping = {'agent'+str(i): "policy"+str(i) for i in range(len(e.agents))}

def p(agent_id):
    """Debug helper: print ``agent_id`` and return the PolicySpec mapped to it."""
    print(agent_id)
    # agent name -> policy id -> PolicySpec
    return policies[policy_mapping[agent_id]]

config = ppo.DEFAULT_CONFIG.copy()
config["num_gpus"] = 0
config["multiagent"] = {"policies": policies,  "policy_mapping_fn": lambda agent_id: policy_mapping[agent_id]}
config["env"] = "foodenv"
config["num_workers"] = 1
config["timesteps_per_iteration"] = 1000
config["env_config"] = coop_env_config

# Timestamped output directory (day_month_year_hour:minute:second).
now = datetime.now()
dt_string = now.strftime("%d_%m_%Y_%H:%M:%S")
os.mkdir('./'+dt_string)
log_dir = './'+dt_string+'/'

stop_criteria = {"training_iteration": 20}

analysis = ray.tune.run(
    ppo.PPOTrainer,
    config=config,
    local_dir=log_dir,
    stop=stop_criteria,
    checkpoint_freq=1,
    checkpoint_at_end=True)

# Results Analysis (Coop)

In [None]:
# `checkpoints` is a list of [path, metric value] pairs, one per checkpoint
# of the trial with the highest episode_reward_mean.
best_trial = analysis.get_best_trial("episode_reward_mean",mode="max")
checkpoints = analysis.get_trial_checkpoints_paths(
    trial=best_trial,
    metric="episode_reward_mean")

# Ascending sort on the metric value leaves the best checkpoint in last place.
checkpoints_by_reward = sorted(checkpoints, key=lambda entry: entry[1])
best_checkpoint = checkpoints_by_reward[-1]

In [None]:
# Rebuild a PPO trainer with the training config and load the best checkpoint
# (element 0 of the pair is the checkpoint path).
best_checkpoint_path = best_checkpoint[0]
trainer = ppo.PPOTrainer(env="foodenv", config=config)
trainer.restore(best_checkpoint_path)

In [None]:
# get weights: trainer.get_weights()

# Roll out a single episode with the restored policies, rendering every step.
agent_ids = ['agent'+str(idx) for idx in range(len(e.agents))]
episode_reward = dict.fromkeys(agent_ids, 0)
obs = e.reset()
print("Initial state at Time 0:")
e.render()
time = 1
done = False
while not done:
    print("Time:",time)
    time += 1
    # Start from policy0's actions for all agents, then overwrite every other
    # agent's entry with the action computed by that agent's own policy.
    action = trainer.compute_actions(obs,policy_id='policy0')
    for idx in range(1,len(agent_ids)):
        agent_id = agent_ids[idx]
        action[agent_id] = trainer.compute_actions(obs,policy_id='policy'+str(idx))[agent_id]
    obs, reward, dones, info = e.step(action)
    done = dones['__all__']
    e.render()
    for agent_id in agent_ids:
        episode_reward[agent_id] += reward[agent_id]
print("Episode Ended")
print("Episode Rewards",episode_reward)

# PPO Trainer (Comp)

In [None]:
# Local env instance, used here to read the spaces and the number of agents.
e = FoodGame(comp_env_config)

# For custom NN models, per-policy overrides such as
# {"model": {"custom_model": "my_model"}, "gamma": 0.99}
# can be added to the `config` dict of each PolicySpec below.

# One policy per agent: agent<i> is always mapped to policy<i>.
policies = {"policy"+str(i):  PolicySpec(
                        observation_space= e.observation_space,
                        action_space= e.action_space,
                        config={"agent_id": i}) for i in range(len(e.agents))}

policy_mapping = {'agent'+str(i): "policy"+str(i) for i in range(len(e.agents))}

def p(agent_id):
    """Debug helper: print ``agent_id`` and return the PolicySpec mapped to it."""
    print(agent_id)
    # agent name -> policy id -> PolicySpec
    return policies[policy_mapping[agent_id]]

config = ppo.DEFAULT_CONFIG.copy()
config["num_gpus"] = 0
config["multiagent"] = {"policies": policies,  "policy_mapping_fn": lambda agent_id: policy_mapping[agent_id]}
config["env"] = "foodenv"
config["timesteps_per_iteration"] = 1000
config["num_workers"] = 1
config["env_config"] = comp_env_config

# Timestamped output directory (day_month_year_hour:minute:second).
now = datetime.now()
dt_string = now.strftime("%d_%m_%Y_%H:%M:%S")
os.mkdir('./'+dt_string)
log_dir = './'+dt_string+'/'

stop_criteria = {"training_iteration": 20}

analysis = ray.tune.run(
    ppo.PPOTrainer,
    config=config,
    local_dir=log_dir,
    stop=stop_criteria,
    checkpoint_freq=1,
    checkpoint_at_end=True)

# Results Analysis (Comp)

In [None]:
# `checkpoints` is a list of [path, metric value] pairs, one per checkpoint
# of the trial with the highest episode_reward_mean.
best_trial = analysis.get_best_trial("episode_reward_mean",mode="max")
checkpoints = analysis.get_trial_checkpoints_paths(
    trial=best_trial,
    metric="episode_reward_mean")

# Ascending sort on the metric value leaves the best checkpoint in last place.
checkpoints_by_reward = sorted(checkpoints, key=lambda entry: entry[1])
best_checkpoint = checkpoints_by_reward[-1]
last_checkpoint = checkpoints[-1]

# This cell restores the last entry of the list, not the best-scoring one.
trainer = ppo.PPOTrainer(env="foodenv", config=config)
trainer.restore(last_checkpoint[0])

# get weights: trainer.get_weights()

# Roll out a single episode with the restored policies, rendering every step.
agent_ids = ['agent'+str(idx) for idx in range(len(e.agents))]
episode_reward = dict.fromkeys(agent_ids, 0)
obs = e.reset()
print("Initial state at Time 0:")
e.render()
time = 1
done = False
while not done:
    print("Time:",time)
    time += 1
    # Start from policy0's actions for all agents, then overwrite every other
    # agent's entry with the action computed by that agent's own policy.
    action = trainer.compute_actions(obs,policy_id='policy0')
    for idx in range(1,len(agent_ids)):
        agent_id = agent_ids[idx]
        action[agent_id] = trainer.compute_actions(obs,policy_id='policy'+str(idx))[agent_id]
    obs, reward, dones, info = e.step(action)
    done = dones['__all__']
    e.render()
    for agent_id in agent_ids:
        episode_reward[agent_id] += reward[agent_id]
print("Episode Ended")
print("Episode Rewards",episode_reward)

# DQN Trainer (Coop)

In [None]:
# Switch both environment configs (coop and competitive) to the DQN trainer.
for env_cfg in (coop_env_config, comp_env_config):
    env_cfg['trainer'] = 'dqn'

In [None]:
import ray.rllib.agents.dqn as dqn

# Local env instance, used here to read the spaces and the number of agents.
e = FoodGame(coop_env_config)

# For custom NN models, per-policy overrides such as
# {"model": {"custom_model": "my_model"}, "gamma": 0.99}
# can be added to the `config` dict of each PolicySpec below.

# One policy per agent: agent<i> is always mapped to policy<i>.
policies = {"policy"+str(i):  PolicySpec(
                        observation_space= e.observation_space,
                        action_space= e.action_space,
                        config={"agent_id": i}) for i in range(len(e.agents))}

policy_mapping = {'agent'+str(i): "policy"+str(i) for i in range(len(e.agents))}

def p(agent_id):
    """Debug helper: print ``agent_id`` and return the PolicySpec mapped to it."""
    print(agent_id)
    # agent name -> policy id -> PolicySpec
    return policies[policy_mapping[agent_id]]

config = dqn.DEFAULT_CONFIG.copy()
config["num_gpus"] = 0
config["batch_mode"] = "complete_episodes"
config["multiagent"] = {"policies": policies,  "policy_mapping_fn": lambda agent_id: policy_mapping[agent_id]}
config["env"] = "foodenv"
config["num_workers"] = 1
config["env_config"] = coop_env_config

# Buffer params
config["buffer_size"] = 20000 # in batches
config["prioritized_replay"] = True

# Exploration params
config["exploration_config"] = {"type": "EpsilonGreedy", "epsilon_timesteps": 25000, "final_epsilon": 0.00}

# Learning params
config["lr"] = 1e-2
config["target_network_update_freq"] = 100

# Learning duration
config["timesteps_per_iteration"] = 100
config["learning_starts"] = 1000
config["train_batch_size"] = 32
config["rollout_fragment_length"] = 4 # no. of samples to rollout in each sample of the batch to add to buffer each timestep

# eval
# config["evaluation_interval"] = 1
# config["evaluation_duration"] = 1

# Timestamped output directory (day_month_year_hour:minute:second).
now = datetime.now()
dt_string = now.strftime("%d_%m_%Y_%H:%M:%S")
os.mkdir('./'+dt_string)
log_dir = './'+dt_string+'/'

stop_criteria = {"timesteps_total": 30000}

analysis = ray.tune.run(
    dqn.DQNTrainer,
    config=config,
    local_dir=log_dir,
    stop=stop_criteria,
    checkpoint_freq=100,
    checkpoint_at_end=True)

# Results Analysis (Coop + DQN)

In [None]:
# `checkpoints` is a list of [path, metric value] pairs, one per checkpoint
# of the trial with the highest episode_reward_mean.
best_trial = analysis.get_best_trial("episode_reward_mean",mode="max")
checkpoints = analysis.get_trial_checkpoints_paths(
    trial=best_trial,
    metric="episode_reward_mean")

# Ascending sort on the metric value leaves the best checkpoint in last place.
checkpoints_by_reward = sorted(checkpoints, key=lambda entry: entry[1])
best_checkpoint = checkpoints_by_reward[-1]

trainer = dqn.DQNTrainer(env="foodenv", config=config)
trainer.restore(best_checkpoint[0])

# get weights: trainer.get_weights()

# Roll out a single episode with the restored policies, rendering every step.
agent_ids = ['agent'+str(idx) for idx in range(len(e.agents))]
episode_reward = dict.fromkeys(agent_ids, 0)
obs = e.reset()
print("Initial state at Time 0:")
e.render()
time = 1
done = False
while not done:
    print("Time:",time)
    time += 1
    # Start from policy0's actions for all agents, then overwrite every other
    # agent's entry with the action computed by that agent's own policy.
    action = trainer.compute_actions(obs,policy_id='policy0')
    for idx in range(1,len(agent_ids)):
        agent_id = agent_ids[idx]
        action[agent_id] = trainer.compute_actions(obs,policy_id='policy'+str(idx))[agent_id]
    obs, reward, dones, info = e.step(action)
    done = dones['__all__']
    e.render()
    for agent_id in agent_ids:
        episode_reward[agent_id] += reward[agent_id]
print("Episode Ended")
print("Episode Rewards",episode_reward)

# DQN Trainer (Comp)

In [None]:
import ray.rllib.agents.dqn as dqn
# Local env instance, used here to read the spaces and the number of agents.
e = FoodGame(comp_env_config)

# For custom NN models, per-policy overrides such as
# {"model": {"custom_model": "my_model"}, "gamma": 0.99}
# can be added to the `config` dict of each PolicySpec below.

# One policy per agent: agent<i> is always mapped to policy<i>.
policies = {"policy"+str(i):  PolicySpec(
                        observation_space= e.observation_space,
                        action_space= e.action_space,
                        config={"agent_id": i}) for i in range(len(e.agents))}

policy_mapping = {'agent'+str(i): "policy"+str(i) for i in range(len(e.agents))}

def p(agent_id):
    """Debug helper: print ``agent_id`` and return the PolicySpec mapped to it."""
    print(agent_id)
    # agent name -> policy id -> PolicySpec
    return policies[policy_mapping[agent_id]]

config = dqn.DEFAULT_CONFIG.copy()
config["num_gpus"] = 0
config["batch_mode"] = "complete_episodes"
config["multiagent"] = {"policies": policies,  "policy_mapping_fn": lambda agent_id: policy_mapping[agent_id]}
config["env"] = "foodenv"
config["num_workers"] = 1
config["env_config"] = comp_env_config

# Buffer params
config["buffer_size"] = 20000 # in batches
config["prioritized_replay"] = False

# Exploration params
config["exploration_config"] = {"type": "EpsilonGreedy", "epsilon_timesteps": 25000, "final_epsilon": 0.00}

# Learning params
config["lr"] = 1e-2
config["target_network_update_freq"] = 100

# Learning duration
config["timesteps_per_iteration"] = 100
config["learning_starts"] = 1000
config["train_batch_size"] = 32
config["rollout_fragment_length"] = 4 # no. of samples to rollout in each sample of the batch to add to buffer each timestep

# eval
# config["evaluation_interval"] = 1
# config["evaluation_duration"] = 1

# Timestamped output directory (day_month_year_hour:minute:second).
now = datetime.now()
dt_string = now.strftime("%d_%m_%Y_%H:%M:%S")
os.mkdir('./'+dt_string)
log_dir = './'+dt_string+'/'

stop_criteria = {"timesteps_total": 30000}

analysis = ray.tune.run(
    dqn.DQNTrainer,
    config=config,
    local_dir=log_dir,
    stop=stop_criteria,
    checkpoint_freq=100,
    checkpoint_at_end=True)

# Results Analysis (Comp + DQN)

In [None]:
# `checkpoints` is a list of [path, metric value] pairs, one per checkpoint
# of the trial with the highest episode_reward_mean.
best_trial = analysis.get_best_trial("episode_reward_mean",mode="max")
checkpoints = analysis.get_trial_checkpoints_paths(
    trial=best_trial,
    metric="episode_reward_mean")

# Ascending sort on the metric value leaves the best checkpoint in last place.
checkpoints_by_reward = sorted(checkpoints, key=lambda entry: entry[1])
best_checkpoint = checkpoints_by_reward[-1]
last_checkpoint = checkpoints[-1]

# This cell restores the last entry of the list, not the best-scoring one.
trainer = dqn.DQNTrainer(env="foodenv", config=config)
trainer.restore(last_checkpoint[0])

# get weights: trainer.get_weights()

# Roll out a single episode with the restored policies, rendering every step.
agent_ids = ['agent'+str(idx) for idx in range(len(e.agents))]
episode_reward = dict.fromkeys(agent_ids, 0)
obs = e.reset()
print("Initial state at Time 0:")
e.render()
time = 1
done = False
while not done:
    print("Time:",time)
    time += 1
    # Start from policy0's actions for all agents, then overwrite every other
    # agent's entry with the action computed by that agent's own policy.
    action = trainer.compute_actions(obs,policy_id='policy0')
    for idx in range(1,len(agent_ids)):
        agent_id = agent_ids[idx]
        action[agent_id] = trainer.compute_actions(obs,policy_id='policy'+str(idx))[agent_id]
    obs, reward, dones, info = e.step(action)
    done = dones['__all__']
    e.render()
    for agent_id in agent_ids:
        episode_reward[agent_id] += reward[agent_id]
print("Episode Ended")
print("Episode Rewards",episode_reward)

# MADDPG Trainer (Coop)

In [None]:
# Switch both environment configs to the MADDPG variant of the environment
# (float Box observations, probability-vector actions).
coop_env_config['trainer'] = 'maddpg'
comp_env_config['trainer'] = 'maddpg'
# Local env instance, used here to read the spaces and the number of agents.
e = FoodGame(coop_env_config)

# For custom NN models, per-policy overrides such as
# {"model": {"custom_model": "my_model"}, "gamma": 0.99}
# can be added to the `config` dict of each PolicySpec below.

# One policy per agent: agent<i> is always mapped to policy<i>.
policies = {"policy"+str(i):  PolicySpec(
                        observation_space= e.observation_space,
                        action_space= e.action_space,
                        config={"agent_id": i}) for i in range(len(e.agents))}

policy_mapping = {'agent'+str(i): "policy"+str(i) for i in range(len(e.agents))}

def p(agent_id):
    """Debug helper: print ``agent_id`` and return the PolicySpec mapped to it."""
    print(agent_id)
    # agent name -> policy id -> PolicySpec
    return policies[policy_mapping[agent_id]]

config = dict()
config["num_gpus"] = 0
config["multiagent"] = {"policies": policies,  "policy_mapping_fn": lambda agent_id: policy_mapping[agent_id]}
config["env"] = "foodenv"
config["num_workers"] = 1
config["env_config"] = coop_env_config
config["batch_mode"] = "complete_episodes"

# model
config["actor_hiddens"] = [256, 256]
config["actor_hidden_activation"] = "tanh"
config["critic_hiddens"] = [256, 256]
config["critic_hidden_activation"] = "tanh"
config["critic_lr"] = 1e-2 # critic learning rate
config["actor_lr"] = 1e-2 # actor learning rate

# Buffer params
config["buffer_size"] = 20000 # in batches

# Exploration params
config["exploration_config"] = {"type": "EpsilonGreedy", "epsilon_timesteps": 25000, "final_epsilon": 0.00}

# Learning params
config["lr"] = 1e-2
config["target_network_update_freq"] = 100

# Learning duration
config["timesteps_per_iteration"] = 100
config["learning_starts"] = 1000 # in terms of samples
config["train_batch_size"] = 32
config["rollout_fragment_length"] = 4 # no. of samples to rollout in each sample of the batch to add to buffer each timestep

# Timestamped output directory (day_month_year_hour:minute:second).
now = datetime.now()
dt_string = now.strftime("%d_%m_%Y_%H:%M:%S")
os.mkdir('./'+dt_string)
log_dir = './'+dt_string+'/'

stop_criteria = {"timesteps_total": 30000}

analysis = ray.tune.run(
    "contrib/MADDPG",
    config=config,
    local_dir=log_dir,
    stop=stop_criteria,
    checkpoint_freq=100,
    checkpoint_at_end=True)

# Results Analysis (Coop + MADDPG)

In [None]:
import ray.rllib.contrib.maddpg as maddpg

# `checkpoints` is a list of [path, metric value] pairs, one per checkpoint
# of the trial with the highest episode_reward_mean.
best_trial = analysis.get_best_trial("episode_reward_mean",mode="max")
checkpoints = analysis.get_trial_checkpoints_paths(
    trial=best_trial,
    metric="episode_reward_mean")

# Ascending sort on the metric value leaves the best checkpoint in last place.
checkpoints_by_reward = sorted(checkpoints, key=lambda entry: entry[1])
best_checkpoint = checkpoints_by_reward[-1]
last_checkpoint = checkpoints[-1]

# This cell restores the last entry of the list, not the best-scoring one.
trainer = maddpg.MADDPGTrainer(env="foodenv", config=config)
trainer.restore(last_checkpoint[0])

# get weights: trainer.get_weights()

# Roll out a single episode with the restored policies, rendering every step.
agent_ids = ['agent'+str(idx) for idx in range(len(e.agents))]
episode_reward = dict.fromkeys(agent_ids, 0)
obs = e.reset()
print("Initial state at Time 0:")
e.render()
time = 1
done = False
while not done:
    print("Time:",time)
    time += 1
    # Start from policy0's actions for all agents, then overwrite every other
    # agent's entry with the action computed by that agent's own policy.
    action = trainer.compute_actions(obs,policy_id='policy0')
    for idx in range(1,len(agent_ids)):
        agent_id = agent_ids[idx]
        action[agent_id] = trainer.compute_actions(obs,policy_id='policy'+str(idx))[agent_id]
    obs, reward, dones, info = e.step(action)
    done = dones['__all__']
    e.render()
    for agent_id in agent_ids:
        episode_reward[agent_id] += reward[agent_id]
print("Episode Ended")
print("Episode Rewards",episode_reward)

# MADDPG Trainer (Comp)

In [None]:
# Switch both environment configs to the MADDPG variant of the environment
# (float Box observations, probability-vector actions).
coop_env_config['trainer'] = 'maddpg'
comp_env_config['trainer'] = 'maddpg'

# Local env instance, used here to read the spaces and the number of agents.
e = FoodGame(comp_env_config)

# For custom NN models, per-policy overrides such as
# {"model": {"custom_model": "my_model"}, "gamma": 0.99}
# can be added to the `config` dict of each PolicySpec below.

# One policy per agent: agent<i> is always mapped to policy<i>.
policies = {"policy"+str(i):  PolicySpec(
                        observation_space= e.observation_space,
                        action_space= e.action_space,
                        config={"agent_id": i}) for i in range(len(e.agents))}

policy_mapping = {'agent'+str(i): "policy"+str(i) for i in range(len(e.agents))}

def p(agent_id):
    """Debug helper: print ``agent_id`` and return the PolicySpec mapped to it."""
    print(agent_id)
    # agent name -> policy id -> PolicySpec
    return policies[policy_mapping[agent_id]]

config = dict()
config["num_gpus"] = 0
config["multiagent"] = {"policies": policies,  "policy_mapping_fn": lambda agent_id: policy_mapping[agent_id]}
config["env"] = "foodenv"
config["num_workers"] = 1
# The competitive run must train on the competitive env config, matching `e`.
config["env_config"] = comp_env_config
config["batch_mode"] = "complete_episodes"

# model
config["actor_hiddens"] = [256, 256]
config["actor_hidden_activation"] = "tanh"
config["critic_hiddens"] = [256, 256]
config["critic_hidden_activation"] = "tanh"
config["critic_lr"] = 1e-2 # critic learning rate
config["actor_lr"] = 1e-2 # actor learning rate

# Buffer params
config["buffer_size"] = 20000 # in batches

# Exploration params
config["exploration_config"] = {"type": "EpsilonGreedy", "epsilon_timesteps": 25000, "final_epsilon": 0.00}

# Learning params
config["lr"] = 1e-2
config["target_network_update_freq"] = 100

# Learning duration
config["timesteps_per_iteration"] = 100
config["learning_starts"] = 1000 # in terms of samples
config["train_batch_size"] = 32
config["rollout_fragment_length"] = 4 # no. of samples to rollout in each sample of the batch to add to buffer each timestep

# Timestamped output directory (day_month_year_hour:minute:second).
now = datetime.now()
dt_string = now.strftime("%d_%m_%Y_%H:%M:%S")
os.mkdir('./'+dt_string)
log_dir = './'+dt_string+'/'

stop_criteria = {"timesteps_total": 30000}

analysis = ray.tune.run(
    "contrib/MADDPG",
    config=config,
    local_dir=log_dir,
    stop=stop_criteria,
#     checkpoint_freq=100,
    checkpoint_at_end=True)

# Results Analysis (Comp + MADDPG)

In [None]:
import ray.rllib.contrib.maddpg as maddpg

# `checkpoints` is a list of [path, metric value] pairs, one per checkpoint
# of the trial with the highest episode_reward_mean.
best_trial = analysis.get_best_trial("episode_reward_mean",mode="max")
checkpoints = analysis.get_trial_checkpoints_paths(
    trial=best_trial,
    metric="episode_reward_mean")

# Ascending sort on the metric value leaves the best checkpoint in last place.
checkpoints_by_reward = sorted(checkpoints, key=lambda entry: entry[1])
best_checkpoint = checkpoints_by_reward[-1]
last_checkpoint = checkpoints[-1]

# This cell restores the last entry of the list, not the best-scoring one.
trainer = maddpg.MADDPGTrainer(env="foodenv", config=config)
trainer.restore(last_checkpoint[0])

# get weights: trainer.get_weights()

# Roll out a single episode with the restored policies, rendering every step.
agent_ids = ['agent'+str(idx) for idx in range(len(e.agents))]
episode_reward = dict.fromkeys(agent_ids, 0)
obs = e.reset()
print("Initial state at Time 0:")
e.render()
time = 1
done = False
while not done:
    print("Time:",time)
    time += 1
    # Start from policy0's actions for all agents, then overwrite every other
    # agent's entry with the action computed by that agent's own policy.
    action = trainer.compute_actions(obs,policy_id='policy0')
    for idx in range(1,len(agent_ids)):
        agent_id = agent_ids[idx]
        action[agent_id] = trainer.compute_actions(obs,policy_id='policy'+str(idx))[agent_id]
    obs, reward, dones, info = e.step(action)
    done = dones['__all__']
    e.render()
    for agent_id in agent_ids:
        episode_reward[agent_id] += reward[agent_id]
print("Episode Ended")
print("Episode Rewards",episode_reward)

# QMIX Trainer (Coop)

In [None]:
# Switch both environment configs (coop and competitive) to the QMIX trainer.
for env_cfg in (coop_env_config, comp_env_config):
    env_cfg['trainer'] = 'qmix'

In [None]:
# Local env instance, used to read the per-agent spaces and the agent names.
e = FoodGame(coop_env_config)
n_agents = len(e.agents)
# The grouped env exposes one Tuple entry per agent.
tuple_obs_space = Tuple([e.observation_space] * n_agents)
tuple_act_space = Tuple([e.action_space] * n_agents)

# All agents are placed in a single group named "agents".
register_env("grouped_foodenv", lambda env_cfg: FoodGame(env_cfg).with_agent_groups(
            groups={"agents": e.agent_names},
            obs_space=tuple_obs_space,
            act_space=tuple_act_space,
        ))

config = {
    "num_gpus": 0,
    "env": "grouped_foodenv",
    "num_workers": 1,
    "env_config": coop_env_config,

    # Learning params
    "lr": 1e-3,
    "optim_alpha": 0.99,
    "optim_eps": 0.00001,
    "grad_norm_clipping": 10,
    "target_network_update_freq": 100,

    # Buffer params
    "buffer_size": 20000, # in batches

    # Exploration params
    "exploration_config": {"type": "EpsilonGreedy", "epsilon_timesteps": 25000, "final_epsilon": 0.00},

    # Learning duration
    "timesteps_per_iteration": 100,
    "learning_starts": 1000,
    "train_batch_size": 32,
    "rollout_fragment_length": 4, # no. of samples to rollout in each sample of the batch to add to buffer each timestep
    "model": {
        "lstm_cell_size": 256,
        "max_seq_len": 20,
    },

    "mixing_embed_dim": 256,
    # eval
    # "evaluation_interval": 1,
    # "evaluation_duration": 1,
}

# Timestamped output directory (day_month_year_hour:minute:second).
now = datetime.now()
dt_string = now.strftime("%d_%m_%Y_%H:%M:%S")
os.mkdir('./'+dt_string)
log_dir = './'+dt_string+'/'

stop_criteria = {"timesteps_total": 30000}
analysis = ray.tune.run(
    "QMIX",
    config=config,
    local_dir=log_dir,
    stop=stop_criteria,
    checkpoint_freq=100,
    checkpoint_at_end=True)

# Results Analysis (QMIX + Coop)

In [None]:
import ray.rllib.agents.qmix as qmix

# `checkpoints` is a list of [path, metric value] pairs, one per checkpoint
# of the trial with the highest episode_reward_mean.
best_trial = analysis.get_best_trial("episode_reward_mean",mode="max")
checkpoints = analysis.get_trial_checkpoints_paths(
    trial=best_trial,
    metric="episode_reward_mean")

# Ascending sort on the metric value leaves the best checkpoint in last place.
checkpoints_by_reward = sorted(checkpoints, key=lambda entry: entry[1])
best_checkpoint = checkpoints_by_reward[-1]
last_checkpoint = checkpoints[-1]

# This cell restores the last entry of the list, not the best-scoring one.
trainer = qmix.QMixTrainer(env="grouped_foodenv", config=config)
trainer.restore(last_checkpoint[0])

# get weights: trainer.get_weights()

# run until episode ends

# Grouped copy of the coop env: every agent sits in the single "agents" group.
coop_env = FoodGame(coop_env_config)
grouped_e = coop_env.with_agent_groups(
            groups={"agents": list(coop_env.agent_names)},
            obs_space=tuple_obs_space,
            act_space=tuple_act_space,
        )

# The rollout below is disabled (left commented out as in the original cell).
# episode_reward = {'agent'+str(i):0 for i in range(len(grouped_e.agents))}
# done = False
# obs = grouped_e.reset()
# print("Initial state at Time 0:")
# grouped_e.render()
# time = 1
# while not done:
#     print("Time:",time)
#     time += 1
#     action = trainer.compute_actions(obs,policy_id='default_policy')
#     for i in range(1,len(grouped_e.agents)):
#         a = trainer.compute_actions(obs,policy_id='default_policy')
#         action['agent'+str(i)] = a['agent'+str(i)]
#     obs, reward, dones, info = grouped_e.step(action)
#     done = dones['__all__']
#     grouped_e.render()
#     for i in range(len(grouped_e.agents)):
#         episode_reward['agent'+str(i)] += reward['agent'+str(i)]
# print("Episode Ended")
# print("Episode Rewards",episode_reward)

# QMIX Trainer (Comp)

In [None]:
# Local env instance, used to read the per-agent spaces and the agent names.
e = FoodGame(comp_env_config)
n_agents = len(e.agents)
tuple_obs_space = Tuple([e.observation_space] * n_agents)
tuple_act_space = Tuple([e.action_space] * n_agents)

# Every agent gets its own group ('group0', 'group1', ...).
# NOTE(review): each group holds a single agent, while the Tuple spaces above
# contain one entry per agent of the whole env - confirm these sizes are meant
# to differ. This also re-registers the name "grouped_foodenv".
register_env("grouped_foodenv", lambda env_cfg: FoodGame(env_cfg).with_agent_groups(
            groups={'group'+str(i):[e.agent_names[i]] for i in range(len(e.agents))},
            obs_space=tuple_obs_space,
            act_space=tuple_act_space,
        ))

config = {
    "num_gpus": 0,
    "env": "grouped_foodenv",
    "num_workers": 1,
    "env_config": comp_env_config,
    "lr": 1e-2,
    "learning_starts": 1000,
    "target_network_update_freq": 500,
    "rollout_fragment_length": 4,
    "train_batch_size": 32,
    "timesteps_per_iteration": 1000,
    # alternative tried earlier: {"epsilon_timesteps": 10000, }
    "exploration_config": {"epsilon_timesteps": 15000, "final_epsilon": 0.0},
}


# Timestamped output directory (day_month_year_hour:minute:second).
now = datetime.now()
dt_string = now.strftime("%d_%m_%Y_%H:%M:%S")
os.mkdir('./'+dt_string)
log_dir = './'+dt_string+'/'

stop_criteria = {"training_iteration": 20}

analysis = ray.tune.run(
    "QMIX",
    config=config,
    local_dir=log_dir,
    stop=stop_criteria,
    checkpoint_freq=1,
    checkpoint_at_end=True)

In [None]:
# Display a group -> agent-name mapping for inspection. Here each value is the
# bare agent name (a string), not a one-element list.
{'group'+str(i):e.agent_names[i] for i in range(len(e.agents))}