In [128]:
from gym.spaces import Dict, Discrete, MultiDiscrete, Tuple, Box
import numpy as np

from ray.rllib.env.multi_agent_env import MultiAgentEnv, ENV_STATE

# Environment

In [129]:
class FoodGame(MultiAgentEnv):
    def __init__(self, env_config):
        # Game Settings
        self.agents = env_config['agents'] # [0,1,...]
        self.agent_names = ['agent'+str(i) for i in range(len(self.agents))]
        self.food_loc = env_config['food_loc'] # {'food1': (row,col), ... } in 0 index
        self.agent_loc = env_config['agent_loc'] # {0: (row,col), 1: (row,col), ...} in 0 index
        self.board_size = env_config['board_size'] # (rows,cols)
        self.time_limit = 20
        self.cur_timestep = 0
        self.mode = env_config['mode']
        self.trainer = env_config['trainer']

        # Environment Settings
        self.action_space = Discrete(4) # L,R,U,D
        if self.trainer == 'maddpg':
            self.observation_space = Box(low=np.array([0, 0]*len(self.agents)+[0,0]*len(self.food_loc.items())), high=np.array([self.board_size[0]-1, self.board_size[1]-1]*len(self.agents)+[self.board_size[0], self.board_size[1]]*len(self.food_loc.items())), dtype=np.float32)
        else:
            self.observation_space = MultiDiscrete([self.board_size[0], self.board_size[1]]*len(self.agents)+[1+self.board_size[0], 1+self.board_size[1]]*len(self.food_loc.items())) # agent1_loc, agent2_loc, food1_loc, ...
        self.state = None

    def reset(self):
        s = ()
        self.cur_timestep = 0
        for key in self.agent_loc:
            s = s + self.agent_loc[key]

        for key in self.food_loc:
            s = s + (self.food_loc[key][0]+1,self.food_loc[key][1]+1)

        self.state = np.array(s)
#         if self.trainer == "qmix":
#             return {agent: {"obs": self.state.copy(), ENV_STATE: self.state.copy()} for agent in self.agent_names}
        return {agent: self.state.copy() for agent in self.agent_names}

    def render(self):
        board = np.full(self.board_size,'',dtype=object)
        s = self.state.reshape(-1,2)
        agents_loc = s[:len(self.agents),:]
        foods_loc = s[len(self.agents):,:]
        for i in range(len(agents_loc)):
            board[agents_loc[i][0],agents_loc[i][1]] += str(i)
        for j in range(len(foods_loc)):
            if tuple(foods_loc[j]) != (0,0):
                board[foods_loc[j][0]-1,foods_loc[j][1]-1] += 'F'
        print(board)
        print()
        return board

    def step(self, action_dict):
        self.cur_timestep += 1
        '''
        actions = {"agent1": ..., "agent2": ..., ...}
        '''
        if self.trainer == 'maddpg':
            action_dict = {
                    k: np.random.choice([0, 1, 2, 3], p=v) for k, v in action_dict.items()
                }
        s = self.state.reshape(-1,2)
        rewards = {}
        for k in self.agent_names:
            rewards[k] = -1 # penalty per step
        food_loc_lt = s[len(self.agent_names):,:]
        for i in range(len(self.agent_names)):
            action = action_dict[self.agent_names[i]]
            if action == 0:
                # L
                new_pos = [s[i][0], max(0,s[i][1]-1)]
            elif action == 1:
                # R
                new_pos = [s[i][0], min(self.board_size[1]-1,s[i][1]+1)]
            elif action == 2:
                # U
                new_pos = [max(0,s[i][0]-1), s[i][1]]
            elif action == 3:
                # D
                new_pos = [min(self.board_size[0]-1,s[i][0]+1), s[i][1]]
            else:
                raise ValueError('ActionError')

            s[i] = np.array(new_pos)
            for j in range(len(food_loc_lt)):
                if new_pos[0] == food_loc_lt[j][0]-1 and new_pos[1] == food_loc_lt[j][1]-1:
                    s[j+len(self.agent_names)] = np.array([0,0])
                    if self.mode == "coop":
                        for k in self.agent_names:
                            rewards[k] += 10 # cooperative global reward
                    else:
                        rewards[self.agent_names[i]] += 10 # competitive individual reward
        new_obs_single = s.flatten()
        done = True
        for j in range(len(food_loc_lt)):
            if tuple(food_loc_lt[j]) != (0,0):
                done = False
                break
        if done or self.time_limit == self.cur_timestep:
            dones = {agent: True for agent in self.agent_names}
            dones['__all__'] = True
        else:
            dones = {agent: False for agent in self.agent_names}
            dones['__all__'] = False
            
        infos = {agent: dict() for agent in self.agent_names}

#         if self.trainer == 'qmix':
#             new_obs = {agent: {"obs": new_obs_single.copy(), ENV_STATE: new_obs_single.copy()} for agent in self.agent_names}
#         else:
        new_obs = {agent: new_obs_single.copy() for agent in self.agent_names}

        return new_obs, rewards, dones, infos

In [130]:
def simulate_environment():
    env_config = dict()
    env_config['agents'] = [0,1]
    env_config['food_loc'] = {'food1': (3,3), 'food2': (5,5)}
    env_config['agent_loc'] = {0: (4,4), 1: (6,7)}
    env_config['board_size'] = (7,10)
    env_config['mode'] = "comp"
    env_config['trainer'] = 'ppo'

    fg = FoodGame(env_config)
    fg.reset()
    fg.render()
    MOVE_MAPPING = {0: 'L', 1: 'R', 2: 'U', 3: 'D'}
    cumu_rewards = [0,0]
    for i in range(100):
        a0_action = np.random.randint(4)
        a1_action = np.random.randint(4)
        print('Actions: agent0 = {}, agent1 = {}'.format(MOVE_MAPPING[a0_action],MOVE_MAPPING[a1_action]))
        a, rewards, dones, infos = fg.step(action_dict={'agent0': a0_action, 'agent1': a1_action})
        fg.render()
        cumu_rewards[0] += rewards['agent0']
        cumu_rewards[1] += rewards['agent1']
        if (np.array(list(dones.values())) == True).all():
            print("Episode Ended")
            print("Rewards:",cumu_rewards)
            break

from ray.tune.registry import register_env

register_env("foodenv", FoodGame)  

In [14]:
simulate_environment()

[['' '' '' '' '' '' '' '' '' '']
 ['' '' '' '' '' '' '' '' '' '']
 ['' '' '' '' '' '' '' '' '' '']
 ['' '' '' 'F' '' '' '' '' '' '']
 ['' '' '' '' '0' '' '' '' '' '']
 ['' '' '' '' '' 'F' '' '' '' '']
 ['' '' '' '' '' '' '' '1' '' '']]

Actions: agent0 = D, agent1 = D
[['' '' '' '' '' '' '' '' '' '']
 ['' '' '' '' '' '' '' '' '' '']
 ['' '' '' '' '' '' '' '' '' '']
 ['' '' '' 'F' '' '' '' '' '' '']
 ['' '' '' '' '' '' '' '' '' '']
 ['' '' '' '' '0' 'F' '' '' '' '']
 ['' '' '' '' '' '' '' '1' '' '']]

Actions: agent0 = R, agent1 = L
[['' '' '' '' '' '' '' '' '' '']
 ['' '' '' '' '' '' '' '' '' '']
 ['' '' '' '' '' '' '' '' '' '']
 ['' '' '' 'F' '' '' '' '' '' '']
 ['' '' '' '' '' '' '' '' '' '']
 ['' '' '' '' '' '0' '' '' '' '']
 ['' '' '' '' '' '' '1' '' '' '']]

Actions: agent0 = U, agent1 = R
[['' '' '' '' '' '' '' '' '' '']
 ['' '' '' '' '' '' '' '' '' '']
 ['' '' '' '' '' '' '' '' '' '']
 ['' '' '' 'F' '' '' '' '' '' '']
 ['' '' '' '' '' '0' '' '' '' '']
 ['' '' '' '' '' '' '' '' '

# Model (not used)

In [50]:
from ray.rllib.utils.framework import try_import_tf, try_import_torch
from ray.rllib.models.torch.torch_modelv2 import TorchModelV2
from ray.rllib.models.torch.fcnet import FullyConnectedNetwork as TorchFC

torch, nn = try_import_torch()

class TorchCustomModel(TorchModelV2, nn.Module):
    """Example of a PyTorch custom model that just delegates to a fc-net."""

    def __init__(self, obs_space, action_space, num_outputs, model_config, name):
        TorchModelV2.__init__(
            self, obs_space, action_space, num_outputs, model_config, name
        )
        nn.Module.__init__(self)

        self.torch_sub_model = TorchFC(
            obs_space, action_space, num_outputs, model_config, name
        )

    def forward(self, input_dict, state, seq_lens):
        input_dict["obs"] = input_dict["obs"].float()
        fc_out, _ = self.torch_sub_model(input_dict, state, seq_lens)
        return fc_out, []

    def value_function(self):
        return torch.reshape(self.torch_sub_model.value_function(), [-1])
    
from ray.rllib.models import ModelCatalog
ModelCatalog.register_custom_model(
        "my_model", TorchCustomModel
    )


# Environment Config

In [131]:
# Coop Environment
coop_env_config = dict()
coop_env_config['agents'] = [0,1]
coop_env_config['food_loc'] = {'food1': (3,3), 'food2': (5,5)}
coop_env_config['agent_loc'] = {0: (4,4), 1: (6,7)}
coop_env_config['board_size'] = (7,10)
coop_env_config['mode'] = 'coop'
coop_env_config['trainer'] = 'ppo'

# Competitive Environment
comp_env_config = dict()
comp_env_config['agents'] = [0,1]
comp_env_config['food_loc'] = {'food1': (3,3), 'food2': (5,5)}
comp_env_config['agent_loc'] = {0: (4,4), 1: (6,7)}
comp_env_config['board_size'] = (7,10)
comp_env_config['mode'] = 'comp'
comp_env_config['trainer'] = 'ppo'

In [132]:
import ray
import ray.rllib.agents.ppo as ppo
from ray.tune.logger import pretty_print
from ray.rllib.policy.policy import PolicySpec
from ray import tune
from datetime import datetime
import os

# PPO Trainer (Coop)

In [162]:
e = FoodGame(coop_env_config)

# For custom NN models
# def gen_policy(i):
#     config = {
#         "model": {
#             "custom_model": "my_model",
#         },
#         "gamma": 0.99,
#     }
#     return (None, e.observation_space, e.action_space, config)

policies = {"policy"+str(i):  PolicySpec(
                        observation_space= e.observation_space,
                        action_space= e.action_space,
                        config={"agent_id": i}) for i in range(len(e.agents))}

policy_mapping = {'agent'+str(i): "policy"+str(i) for i in range(len(e.agents))}

def p(agent_id):
    print(agent_id)
    return policies[policy_keys[agent_id]]

config = ppo.DEFAULT_CONFIG.copy()
config["num_gpus"] = 0
config["multiagent"] = {"policies": policies,  "policy_mapping_fn": lambda agent_id: policy_mapping[agent_id]}
config["env"] = "foodenv"
config["num_workers"] = 1
config["timesteps_per_iteration"] = 1000
config["env_config"] = coop_env_config

now = datetime.now()
dt_string = now.strftime("%d_%m_%Y_%H:%M:%S")
os.mkdir('./'+dt_string)
log_dir = './'+dt_string+'/'

stop_criteria = {"training_iteration": 20}

analysis = ray.tune.run(
    ppo.PPOTrainer,
    config=config,
    local_dir=log_dir,
    stop=stop_criteria,
    checkpoint_freq=1,
    checkpoint_at_end=True)

Trial name,status,loc
PPOTrainer_foodenv_d9b3e_00000,PENDING,


[2m[36m(PPOTrainer pid=4233)[0m 2022-03-06 02:54:56,052	INFO trainer.py:2055 -- Your framework setting is 'tf', meaning you are using static-graph mode. Set framework='tf2' to enable eager execution with tf2.x. You may also then want to set eager_tracing=True in order to reach similar execution speed as with static-graph mode.
[2m[36m(PPOTrainer pid=4233)[0m 2022-03-06 02:54:56,053	INFO ppo.py:250 -- In multi-agent mode, policies will be optimized sequentially by the multi-GPU optimizer. Consider setting simple_optimizer=True if this doesn't work for you.
[2m[36m(PPOTrainer pid=4233)[0m 2022-03-06 02:54:56,053	INFO trainer.py:792 -- Current log_level is WARN. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.


Trial name,status,loc
PPOTrainer_foodenv_d9b3e_00000,RUNNING,172.17.0.2:4233




Trial name,status,loc
PPOTrainer_foodenv_d9b3e_00000,RUNNING,172.17.0.2:4233


Trial name,status,loc
PPOTrainer_foodenv_d9b3e_00000,RUNNING,172.17.0.2:4233




Trial name,status,loc
PPOTrainer_foodenv_d9b3e_00000,RUNNING,172.17.0.2:4233


Result for PPOTrainer_foodenv_d9b3e_00000:
  agent_timesteps_total: 8000
  custom_metrics: {}
  date: 2022-03-06_02-55-14
  done: false
  episode_len_mean: 18.339449541284402
  episode_media: {}
  episode_reward_max: 34.0
  episode_reward_mean: -17.871559633027523
  episode_reward_min: -40.0
  episodes_this_iter: 218
  episodes_total: 218
  experiment_id: 4c60d222bcb243f3a24b5f056b18be0a
  hostname: 310c4782fa76
  info:
    learner:
      policy0:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.20000000298023224
          cur_lr: 4.999999873689376e-05
          entropy: 1.375950813293457
          entropy_coeff: 0.0
          kl: 0.010521992109715939
          model: {}
          policy_loss: -0.008709507994353771
          total_loss: 50.425697326660156
          vf_explained_var: 0.0059417420998215675
          vf_loss: 50.43230438232422
      policy1:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.20000000298023224
          c



Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_foodenv_d9b3e_00000,RUNNING,172.17.0.2:4233,1,12.8168,4000,-17.8716,34,-40,18.3394


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_foodenv_d9b3e_00000,RUNNING,172.17.0.2:4233,1,12.8168,4000,-17.8716,34,-40,18.3394


Result for PPOTrainer_foodenv_d9b3e_00000:
  agent_timesteps_total: 16000
  custom_metrics: {}
  date: 2022-03-06_02-55-27
  done: false
  episode_len_mean: 16.389344262295083
  episode_media: {}
  episode_reward_max: 34.0
  episode_reward_mean: -6.713114754098361
  episode_reward_min: -40.0
  episodes_this_iter: 244
  episodes_total: 462
  experiment_id: 4c60d222bcb243f3a24b5f056b18be0a
  hostname: 310c4782fa76
  info:
    learner:
      policy0:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.20000000298023224
          cur_lr: 4.999999873689376e-05
          entropy: 1.3278099298477173
          entropy_coeff: 0.0
          kl: 0.018406812101602554
          model: {}
          policy_loss: -0.027785705402493477
          total_loss: 59.033878326416016
          vf_explained_var: 0.022440912202000618
          vf_loss: 59.0579833984375
      policy1:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.20000000298023224
          cu

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_foodenv_d9b3e_00000,RUNNING,172.17.0.2:4233,2,25.2552,8000,-6.71311,34,-40,16.3893


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_foodenv_d9b3e_00000,RUNNING,172.17.0.2:4233,2,25.2552,8000,-6.71311,34,-40,16.3893


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_foodenv_d9b3e_00000,RUNNING,172.17.0.2:4233,2,25.2552,8000,-6.71311,34,-40,16.3893


Result for PPOTrainer_foodenv_d9b3e_00000:
  agent_timesteps_total: 24000
  custom_metrics: {}
  date: 2022-03-06_02-55-39
  done: false
  episode_len_mean: 13.293333333333333
  episode_media: {}
  episode_reward_max: 34.0
  episode_reward_mean: 6.1466666666666665
  episode_reward_min: -40.0
  episodes_this_iter: 300
  episodes_total: 762
  experiment_id: 4c60d222bcb243f3a24b5f056b18be0a
  hostname: 310c4782fa76
  info:
    learner:
      policy0:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.20000000298023224
          cur_lr: 4.999999873689376e-05
          entropy: 1.2455263137817383
          entropy_coeff: 0.0
          kl: 0.02329617738723755
          model: {}
          policy_loss: -0.04341830313205719
          total_loss: 60.27446365356445
          vf_explained_var: 0.10624027997255325
          vf_loss: 60.3132209777832
      policy1:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.20000000298023224
          cur_lr

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_foodenv_d9b3e_00000,RUNNING,172.17.0.2:4233,3,37.6475,12000,6.14667,34,-40,13.2933


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_foodenv_d9b3e_00000,RUNNING,172.17.0.2:4233,3,37.6475,12000,6.14667,34,-40,13.2933


Result for PPOTrainer_foodenv_d9b3e_00000:
  agent_timesteps_total: 32000
  custom_metrics: {}
  date: 2022-03-06_02-55-53
  done: false
  episode_len_mean: 9.424882629107982
  episode_media: {}
  episode_reward_max: 34.0
  episode_reward_mean: 18.708920187793428
  episode_reward_min: -40.0
  episodes_this_iter: 426
  episodes_total: 1188
  experiment_id: 4c60d222bcb243f3a24b5f056b18be0a
  hostname: 310c4782fa76
  info:
    learner:
      policy0:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.30000001192092896
          cur_lr: 4.999999873689376e-05
          entropy: 1.1457754373550415
          entropy_coeff: 0.0
          kl: 0.02284691482782364
          model: {}
          policy_loss: -0.034737031906843185
          total_loss: 53.443328857421875
          vf_explained_var: 0.22624389827251434
          vf_loss: 53.47121047973633
      policy1:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.20000000298023224
          cur

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_foodenv_d9b3e_00000,RUNNING,172.17.0.2:4233,4,50.9676,16000,18.7089,34,-40,9.42488


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_foodenv_d9b3e_00000,RUNNING,172.17.0.2:4233,4,50.9676,16000,18.7089,34,-40,9.42488


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_foodenv_d9b3e_00000,RUNNING,172.17.0.2:4233,4,50.9676,16000,18.7089,34,-40,9.42488


Result for PPOTrainer_foodenv_d9b3e_00000:
  agent_timesteps_total: 40000
  custom_metrics: {}
  date: 2022-03-06_02-56-07
  done: false
  episode_len_mean: 6.712605042016807
  episode_media: {}
  episode_reward_max: 34.0
  episode_reward_mean: 25.768067226890757
  episode_reward_min: -20.0
  episodes_this_iter: 595
  episodes_total: 1783
  experiment_id: 4c60d222bcb243f3a24b5f056b18be0a
  hostname: 310c4782fa76
  info:
    learner:
      policy0:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.44999998807907104
          cur_lr: 4.999999873689376e-05
          entropy: 1.0313035249710083
          entropy_coeff: 0.0
          kl: 0.01652257889509201
          model: {}
          policy_loss: -0.03275007754564285
          total_loss: 33.45328903198242
          vf_explained_var: 0.43037667870521545
          vf_loss: 33.47860336303711
      policy1:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.30000001192092896
          cur_l

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_foodenv_d9b3e_00000,RUNNING,172.17.0.2:4233,5,64.6123,20000,25.7681,34,-20,6.71261


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_foodenv_d9b3e_00000,RUNNING,172.17.0.2:4233,5,64.6123,20000,25.7681,34,-20,6.71261


Result for PPOTrainer_foodenv_d9b3e_00000:
  agent_timesteps_total: 48000
  custom_metrics: {}
  date: 2022-03-06_02-56-20
  done: false
  episode_len_mean: 4.838164251207729
  episode_media: {}
  episode_reward_max: 34.0
  episode_reward_mean: 30.22705314009662
  episode_reward_min: -20.0
  episodes_this_iter: 828
  episodes_total: 2611
  experiment_id: 4c60d222bcb243f3a24b5f056b18be0a
  hostname: 310c4782fa76
  info:
    learner:
      policy0:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.44999998807907104
          cur_lr: 4.999999873689376e-05
          entropy: 0.9060773849487305
          entropy_coeff: 0.0
          kl: 0.014765262603759766
          model: {}
          policy_loss: -0.0358523391187191
          total_loss: 9.402002334594727
          vf_explained_var: 0.7081252336502075
          vf_loss: 9.4312105178833
      policy1:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.44999998807907104
          cur_lr: 4

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_foodenv_d9b3e_00000,RUNNING,172.17.0.2:4233,6,77.6759,24000,30.2271,34,-20,4.83816


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_foodenv_d9b3e_00000,RUNNING,172.17.0.2:4233,6,77.6759,24000,30.2271,34,-20,4.83816


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_foodenv_d9b3e_00000,RUNNING,172.17.0.2:4233,6,77.6759,24000,30.2271,34,-20,4.83816


Result for PPOTrainer_foodenv_d9b3e_00000:
  agent_timesteps_total: 56000
  custom_metrics: {}
  date: 2022-03-06_02-56-33
  done: false
  episode_len_mean: 4.088957055214724
  episode_media: {}
  episode_reward_max: 34.0
  episode_reward_mean: 31.801635991820042
  episode_reward_min: -20.0
  episodes_this_iter: 978
  episodes_total: 3589
  experiment_id: 4c60d222bcb243f3a24b5f056b18be0a
  hostname: 310c4782fa76
  info:
    learner:
      policy0:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.44999998807907104
          cur_lr: 4.999999873689376e-05
          entropy: 0.8028068542480469
          entropy_coeff: 0.0
          kl: 0.010326562449336052
          model: {}
          policy_loss: -0.02910958230495453
          total_loss: 3.455625295639038
          vf_explained_var: 0.8659370541572571
          vf_loss: 3.4800877571105957
      policy1:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.44999998807907104
          cur_

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_foodenv_d9b3e_00000,RUNNING,172.17.0.2:4233,7,90.9391,28000,31.8016,34,-20,4.08896


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_foodenv_d9b3e_00000,RUNNING,172.17.0.2:4233,7,90.9391,28000,31.8016,34,-20,4.08896


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_foodenv_d9b3e_00000,RUNNING,172.17.0.2:4233,7,90.9391,28000,31.8016,34,-20,4.08896


Result for PPOTrainer_foodenv_d9b3e_00000:
  agent_timesteps_total: 64000
  custom_metrics: {}
  date: 2022-03-06_02-56-47
  done: false
  episode_len_mean: 3.5897666068222622
  episode_media: {}
  episode_reward_max: 34.0
  episode_reward_mean: 32.820466786355475
  episode_reward_min: 14.0
  episodes_this_iter: 1114
  episodes_total: 4703
  experiment_id: 4c60d222bcb243f3a24b5f056b18be0a
  hostname: 310c4782fa76
  info:
    learner:
      policy0:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.44999998807907104
          cur_lr: 4.999999873689376e-05
          entropy: 0.7289505004882812
          entropy_coeff: 0.0
          kl: 0.008693879470229149
          model: {}
          policy_loss: -0.02640712633728981
          total_loss: 1.359155297279358
          vf_explained_var: 0.9307165145874023
          vf_loss: 1.3816500902175903
      policy1:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.44999998807907104
          cur

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_foodenv_d9b3e_00000,RUNNING,172.17.0.2:4233,8,104.4,32000,32.8205,34,14,3.58977


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_foodenv_d9b3e_00000,RUNNING,172.17.0.2:4233,8,104.4,32000,32.8205,34,14,3.58977


Result for PPOTrainer_foodenv_d9b3e_00000:
  agent_timesteps_total: 72000
  custom_metrics: {}
  date: 2022-03-06_02-57-00
  done: false
  episode_len_mean: 3.372681281618887
  episode_media: {}
  episode_reward_max: 34.0
  episode_reward_mean: 33.23777403035413
  episode_reward_min: -20.0
  episodes_this_iter: 1186
  episodes_total: 5889
  experiment_id: 4c60d222bcb243f3a24b5f056b18be0a
  hostname: 310c4782fa76
  info:
    learner:
      policy0:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.44999998807907104
          cur_lr: 4.999999873689376e-05
          entropy: 0.6801328659057617
          entropy_coeff: 0.0
          kl: 0.004350955132395029
          model: {}
          policy_loss: -0.020742267370224
          total_loss: 1.36017906665802
          vf_explained_var: 0.9568761587142944
          vf_loss: 1.3789633512496948
      policy1:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.44999998807907104
          cur_lr:

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_foodenv_d9b3e_00000,RUNNING,172.17.0.2:4233,9,117.978,36000,33.2378,34,-20,3.37268


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_foodenv_d9b3e_00000,RUNNING,172.17.0.2:4233,9,117.978,36000,33.2378,34,-20,3.37268


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_foodenv_d9b3e_00000,RUNNING,172.17.0.2:4233,9,117.978,36000,33.2378,34,-20,3.37268


Result for PPOTrainer_foodenv_d9b3e_00000:
  agent_timesteps_total: 80000
  custom_metrics: {}
  date: 2022-03-06_02-57-14
  done: false
  episode_len_mean: 3.229217110573043
  episode_media: {}
  episode_reward_max: 34.0
  episode_reward_mean: 33.541565778853915
  episode_reward_min: 16.0
  episodes_this_iter: 1239
  episodes_total: 7128
  experiment_id: 4c60d222bcb243f3a24b5f056b18be0a
  hostname: 310c4782fa76
  info:
    learner:
      policy0:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.22499999403953552
          cur_lr: 4.999999873689376e-05
          entropy: 0.6585890650749207
          entropy_coeff: 0.0
          kl: 0.008488683961331844
          model: {}
          policy_loss: -0.019339345395565033
          total_loss: 0.4175325036048889
          vf_explained_var: 0.9751597046852112
          vf_loss: 0.4349619150161743
      policy1:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.22499999403953552
          cu

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_foodenv_d9b3e_00000,RUNNING,172.17.0.2:4233,10,131.609,40000,33.5416,34,16,3.22922


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_foodenv_d9b3e_00000,RUNNING,172.17.0.2:4233,10,131.609,40000,33.5416,34,16,3.22922


Result for PPOTrainer_foodenv_d9b3e_00000:
  agent_timesteps_total: 88000
  custom_metrics: {}
  date: 2022-03-06_02-57-28
  done: false
  episode_len_mean: 3.134012539184953
  episode_media: {}
  episode_reward_max: 34.0
  episode_reward_mean: 33.73197492163009
  episode_reward_min: 26.0
  episodes_this_iter: 1276
  episodes_total: 8404
  experiment_id: 4c60d222bcb243f3a24b5f056b18be0a
  hostname: 310c4782fa76
  info:
    learner:
      policy0:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.22499999403953552
          cur_lr: 4.999999873689376e-05
          entropy: 0.6185867190361023
          entropy_coeff: 0.0
          kl: 0.00634278729557991
          model: {}
          policy_loss: -0.01196957565844059
          total_loss: 0.14816413819789886
          vf_explained_var: 0.9905228018760681
          vf_loss: 0.15870656073093414
      policy1:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.22499999403953552
          cur

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_foodenv_d9b3e_00000,RUNNING,172.17.0.2:4233,11,145.478,44000,33.732,34,26,3.13401


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_foodenv_d9b3e_00000,RUNNING,172.17.0.2:4233,11,145.478,44000,33.732,34,26,3.13401


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_foodenv_d9b3e_00000,RUNNING,172.17.0.2:4233,11,145.478,44000,33.732,34,26,3.13401


Result for PPOTrainer_foodenv_d9b3e_00000:
  agent_timesteps_total: 96000
  custom_metrics: {}
  date: 2022-03-06_02-57-42
  done: false
  episode_len_mean: 3.0903474903474906
  episode_media: {}
  episode_reward_max: 34.0
  episode_reward_mean: 33.81930501930502
  episode_reward_min: 22.0
  episodes_this_iter: 1295
  episodes_total: 9699
  experiment_id: 4c60d222bcb243f3a24b5f056b18be0a
  hostname: 310c4782fa76
  info:
    learner:
      policy0:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.22499999403953552
          cur_lr: 4.999999873689376e-05
          entropy: 0.5999578237533569
          entropy_coeff: 0.0
          kl: 0.006947714369744062
          model: {}
          policy_loss: -0.011476180516183376
          total_loss: 0.18044686317443848
          vf_explained_var: 0.9890069961547852
          vf_loss: 0.19035981595516205
      policy1:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.11249999701976776
          

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_foodenv_d9b3e_00000,RUNNING,172.17.0.2:4233,12,159.155,48000,33.8193,34,22,3.09035


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_foodenv_d9b3e_00000,RUNNING,172.17.0.2:4233,12,159.155,48000,33.8193,34,22,3.09035


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_foodenv_d9b3e_00000,RUNNING,172.17.0.2:4233,12,159.155,48000,33.8193,34,22,3.09035


Result for PPOTrainer_foodenv_d9b3e_00000:
  agent_timesteps_total: 104000
  custom_metrics: {}
  date: 2022-03-06_02-57-55
  done: false
  episode_len_mean: 3.0379939209726445
  episode_media: {}
  episode_reward_max: 34.0
  episode_reward_mean: 33.92401215805471
  episode_reward_min: 30.0
  episodes_this_iter: 1316
  episodes_total: 11015
  experiment_id: 4c60d222bcb243f3a24b5f056b18be0a
  hostname: 310c4782fa76
  info:
    learner:
      policy0:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.22499999403953552
          cur_lr: 4.999999873689376e-05
          entropy: 0.5749653577804565
          entropy_coeff: 0.0
          kl: 0.010239525698125362
          model: {}
          policy_loss: -0.006517163943499327
          total_loss: 0.05973077565431595
          vf_explained_var: 0.9960660338401794
          vf_loss: 0.06394404917955399
      policy1:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.05624999850988388
        

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_foodenv_d9b3e_00000,RUNNING,172.17.0.2:4233,13,172.555,52000,33.924,34,30,3.03799


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_foodenv_d9b3e_00000,RUNNING,172.17.0.2:4233,13,172.555,52000,33.924,34,30,3.03799


Result for PPOTrainer_foodenv_d9b3e_00000:
  agent_timesteps_total: 112000
  custom_metrics: {}
  date: 2022-03-06_02-58-09
  done: false
  episode_len_mean: 3.028009084027252
  episode_media: {}
  episode_reward_max: 34.0
  episode_reward_mean: 33.94398183194549
  episode_reward_min: 24.0
  episodes_this_iter: 1321
  episodes_total: 12336
  experiment_id: 4c60d222bcb243f3a24b5f056b18be0a
  hostname: 310c4782fa76
  info:
    learner:
      policy0:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.22499999403953552
          cur_lr: 4.999999873689376e-05
          entropy: 0.5809305310249329
          entropy_coeff: 0.0
          kl: 0.004271918907761574
          model: {}
          policy_loss: -0.012118248268961906
          total_loss: 0.044982463121414185
          vf_explained_var: 0.9967002272605896
          vf_loss: 0.05613952875137329
      policy1:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.02812499925494194
        

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_foodenv_d9b3e_00000,RUNNING,172.17.0.2:4233,14,186.239,56000,33.944,34,24,3.02801


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_foodenv_d9b3e_00000,RUNNING,172.17.0.2:4233,14,186.239,56000,33.944,34,24,3.02801


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_foodenv_d9b3e_00000,RUNNING,172.17.0.2:4233,14,186.239,56000,33.944,34,24,3.02801


Result for PPOTrainer_foodenv_d9b3e_00000:
  agent_timesteps_total: 120000
  custom_metrics: {}
  date: 2022-03-06_02-58-22
  done: false
  episode_len_mean: 3.018099547511312
  episode_media: {}
  episode_reward_max: 34.0
  episode_reward_mean: 33.963800904977376
  episode_reward_min: 24.0
  episodes_this_iter: 1326
  episodes_total: 13662
  experiment_id: 4c60d222bcb243f3a24b5f056b18be0a
  hostname: 310c4782fa76
  info:
    learner:
      policy0:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.11249999701976776
          cur_lr: 4.999999873689376e-05
          entropy: 0.555726945400238
          entropy_coeff: 0.0
          kl: 0.008117114193737507
          model: {}
          policy_loss: -0.008461705408990383
          total_loss: 0.038939666002988815
          vf_explained_var: 0.9972318410873413
          vf_loss: 0.0464881993830204
      policy1:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.01406249962747097
         

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_foodenv_d9b3e_00000,RUNNING,172.17.0.2:4233,15,199.676,60000,33.9638,34,24,3.0181


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_foodenv_d9b3e_00000,RUNNING,172.17.0.2:4233,15,199.676,60000,33.9638,34,24,3.0181


Result for PPOTrainer_foodenv_d9b3e_00000:
  agent_timesteps_total: 128000
  custom_metrics: {}
  date: 2022-03-06_02-58-36
  done: false
  episode_len_mean: 3.0112951807228914
  episode_media: {}
  episode_reward_max: 34.0
  episode_reward_mean: 33.977409638554214
  episode_reward_min: 30.0
  episodes_this_iter: 1328
  episodes_total: 14990
  experiment_id: 4c60d222bcb243f3a24b5f056b18be0a
  hostname: 310c4782fa76
  info:
    learner:
      policy0:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.11249999701976776
          cur_lr: 4.999999873689376e-05
          entropy: 0.5589627027511597
          entropy_coeff: 0.0
          kl: 0.008648366667330265
          model: {}
          policy_loss: -0.008758383803069592
          total_loss: 0.00825829803943634
          vf_explained_var: 0.9990207552909851
          vf_loss: 0.01604374125599861
      policy1:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.01406249962747097
       

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_foodenv_d9b3e_00000,RUNNING,172.17.0.2:4233,16,213.113,64000,33.9774,34,30,3.0113


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_foodenv_d9b3e_00000,RUNNING,172.17.0.2:4233,16,213.113,64000,33.9774,34,30,3.0113


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_foodenv_d9b3e_00000,RUNNING,172.17.0.2:4233,16,213.113,64000,33.9774,34,30,3.0113


Result for PPOTrainer_foodenv_d9b3e_00000:
  agent_timesteps_total: 136000
  custom_metrics: {}
  date: 2022-03-06_02-58-49
  done: false
  episode_len_mean: 3.0060105184072126
  episode_media: {}
  episode_reward_max: 34.0
  episode_reward_mean: 33.987978963185576
  episode_reward_min: 30.0
  episodes_this_iter: 1331
  episodes_total: 16321
  experiment_id: 4c60d222bcb243f3a24b5f056b18be0a
  hostname: 310c4782fa76
  info:
    learner:
      policy0:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.11249999701976776
          cur_lr: 4.999999873689376e-05
          entropy: 0.5367640852928162
          entropy_coeff: 0.0
          kl: 0.013071063905954361
          model: {}
          policy_loss: -0.004756246693432331
          total_loss: 0.007613020949065685
          vf_explained_var: 0.9993414282798767
          vf_loss: 0.01089877262711525
      policy1:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.01406249962747097
      

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_foodenv_d9b3e_00000,RUNNING,172.17.0.2:4233,17,226.41,68000,33.988,34,30,3.00601


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_foodenv_d9b3e_00000,RUNNING,172.17.0.2:4233,17,226.41,68000,33.988,34,30,3.00601


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_foodenv_d9b3e_00000,RUNNING,172.17.0.2:4233,17,226.41,68000,33.988,34,30,3.00601


Result for PPOTrainer_foodenv_d9b3e_00000:
  agent_timesteps_total: 144000
  custom_metrics: {}
  date: 2022-03-06_02-59-03
  done: false
  episode_len_mean: 3.003003003003003
  episode_media: {}
  episode_reward_max: 34.0
  episode_reward_mean: 33.993993993993996
  episode_reward_min: 30.0
  episodes_this_iter: 1332
  episodes_total: 17653
  experiment_id: 4c60d222bcb243f3a24b5f056b18be0a
  hostname: 310c4782fa76
  info:
    learner:
      policy0:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.11249999701976776
          cur_lr: 4.999999873689376e-05
          entropy: 0.5394428968429565
          entropy_coeff: 0.0
          kl: 0.008600348606705666
          model: {}
          policy_loss: -0.004174966365098953
          total_loss: 0.0011222268221899867
          vf_explained_var: 0.9997391700744629
          vf_loss: 0.004329653922468424
      policy1:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.007031249813735485
    

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_foodenv_d9b3e_00000,RUNNING,172.17.0.2:4233,18,239.765,72000,33.994,34,30,3.003


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_foodenv_d9b3e_00000,RUNNING,172.17.0.2:4233,18,239.765,72000,33.994,34,30,3.003


Result for PPOTrainer_foodenv_d9b3e_00000:
  agent_timesteps_total: 152000
  custom_metrics: {}
  date: 2022-03-06_02-59-16
  done: false
  episode_len_mean: 3.0082768999247556
  episode_media: {}
  episode_reward_max: 34.0
  episode_reward_mean: 33.98344620015049
  episode_reward_min: 30.0
  episodes_this_iter: 1329
  episodes_total: 18982
  experiment_id: 4c60d222bcb243f3a24b5f056b18be0a
  hostname: 310c4782fa76
  info:
    learner:
      policy0:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.11249999701976776
          cur_lr: 4.999999873689376e-05
          entropy: 0.5575036406517029
          entropy_coeff: 0.0
          kl: 0.009043997153639793
          model: {}
          policy_loss: -0.0032459888607263565
          total_loss: 0.010768668726086617
          vf_explained_var: 0.9992141723632812
          vf_loss: 0.012997210025787354
      policy1:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.010546875186264515
    

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_foodenv_d9b3e_00000,RUNNING,172.17.0.2:4233,19,253.339,76000,33.9834,34,30,3.00828


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_foodenv_d9b3e_00000,RUNNING,172.17.0.2:4233,19,253.339,76000,33.9834,34,30,3.00828


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_foodenv_d9b3e_00000,RUNNING,172.17.0.2:4233,19,253.339,76000,33.9834,34,30,3.00828


Result for PPOTrainer_foodenv_d9b3e_00000:
  agent_timesteps_total: 160000
  custom_metrics: {}
  date: 2022-03-06_02-59-30
  done: true
  episode_len_mean: 3.0015003750937734
  episode_media: {}
  episode_reward_max: 34.0
  episode_reward_mean: 33.996999249812454
  episode_reward_min: 30.0
  episodes_this_iter: 1333
  episodes_total: 20315
  experiment_id: 4c60d222bcb243f3a24b5f056b18be0a
  hostname: 310c4782fa76
  info:
    learner:
      policy0:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.11249999701976776
          cur_lr: 4.999999873689376e-05
          entropy: 0.5216752290725708
          entropy_coeff: 0.0
          kl: 0.00932738184928894
          model: {}
          policy_loss: -0.0026260078884661198
          total_loss: 0.000766995653975755
          vf_explained_var: 0.9998573660850525
          vf_loss: 0.0023436725605279207
      policy1:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.010546875186264515
    

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_foodenv_d9b3e_00000,TERMINATED,172.17.0.2:4233,20,266.672,80000,33.997,34,30,3.0015


[2m[36m(RolloutWorker pid=4232)[0m 2022-03-06 02:59:30,711	ERROR worker.py:432 -- SystemExit was raised from the worker.
[2m[36m(RolloutWorker pid=4232)[0m Traceback (most recent call last):
[2m[36m(RolloutWorker pid=4232)[0m   File "python/ray/_raylet.pyx", line 636, in ray._raylet.execute_task
[2m[36m(RolloutWorker pid=4232)[0m   File "python/ray/_raylet.pyx", line 640, in ray._raylet.execute_task
[2m[36m(RolloutWorker pid=4232)[0m   File "python/ray/_raylet.pyx", line 589, in ray._raylet.execute_task.function_executor
[2m[36m(RolloutWorker pid=4232)[0m   File "/home/ray/anaconda3/lib/python3.7/site-packages/ray/_private/function_manager.py", line 639, in actor_method_executor
[2m[36m(RolloutWorker pid=4232)[0m     return method(__ray_actor, *args, **kwargs)
[2m[36m(RolloutWorker pid=4232)[0m   File "/home/ray/anaconda3/lib/python3.7/site-packages/ray/util/tracing/tracing_helper.py", line 451, in _resume_span
[2m[36m(RolloutWorker pid=4232)[0m     return me

# Results Analysis (Coop)

In [163]:
# list of lists: one list per checkpoint; each checkpoint list contains
# 1st the path, 2nd the metric value
checkpoints = analysis.get_trial_checkpoints_paths(
    trial=analysis.get_best_trial("episode_reward_mean",mode="max"),
    metric="episode_reward_mean")

best_checkpoint = sorted(checkpoints, key=lambda x: x[1])[-1]

In [165]:
trainer = ppo.PPOTrainer(config=config, env="foodenv")
trainer.restore(best_checkpoint[0])

2022-03-06 03:00:41,427	INFO trainable.py:473 -- Restored on 172.17.0.2 from checkpoint: /home/ray/cs4246/06_03_2022_02:54:52/PPOTrainer_2022-03-06_02-54-52/PPOTrainer_foodenv_d9b3e_00000_0_2022-03-06_02-54-52/checkpoint_000020/checkpoint-20
2022-03-06 03:00:41,427	INFO trainable.py:480 -- Current state after restoring: {'_iteration': 20, '_timesteps_total': 80000, '_time_total': 266.6717360019684, '_episodes_total': 20315}


In [171]:
# get weights: trainer.get_weights()

# run until episode ends
episode_reward = {'agent'+str(i):0 for i in range(len(e.agents))}
done = False
obs = e.reset()
print("Initial state at Time 0:")
e.render()
time = 1
while not done:
    print("Time:",time)
    time += 1
    action = trainer.compute_actions(obs,policy_id='policy0')
    for i in range(1,len(e.agents)):
        a = trainer.compute_actions(obs,policy_id='policy'+str(i))
        action['agent'+str(i)] = a['agent'+str(i)]
    obs, reward, dones, info = e.step(action)
    done = dones['__all__']
    e.render()
    for i in range(len(e.agents)):
        episode_reward['agent'+str(i)] += reward['agent'+str(i)]
print("Episode Ended")
print("Episode Rewards",episode_reward)

Initial state at Time 0:
[['' '' '' '' '' '' '' '' '' '']
 ['' '' '' '' '' '' '' '' '' '']
 ['' '' '' '' '' '' '' '' '' '']
 ['' '' '' 'F' '' '' '' '' '' '']
 ['' '' '' '' '0' '' '' '' '' '']
 ['' '' '' '' '' 'F' '' '' '' '']
 ['' '' '' '' '' '' '' '1' '' '']]

Time: 1
[['' '' '' '' '' '' '' '' '' '']
 ['' '' '' '' '' '' '' '' '' '']
 ['' '' '' '' '' '' '' '' '' '']
 ['' '' '' 'F' '0' '' '' '' '' '']
 ['' '' '' '' '' '' '' '' '' '']
 ['' '' '' '' '' 'F' '' '' '' '']
 ['' '' '' '' '' '' '1' '' '' '']]

Time: 2
[['' '' '' '' '' '' '' '' '' '']
 ['' '' '' '' '' '' '' '' '' '']
 ['' '' '' '' '' '' '' '' '' '']
 ['' '' '' '0' '' '' '' '' '' '']
 ['' '' '' '' '' '' '' '' '' '']
 ['' '' '' '' '' 'F' '' '' '' '']
 ['' '' '' '' '' '1' '' '' '' '']]

Time: 3
[['' '' '' '' '' '' '' '' '' '']
 ['' '' '' '' '' '' '' '' '' '']
 ['' '' '' '' '' '' '' '' '' '']
 ['' '' '' '' '' '' '' '' '' '']
 ['' '' '' '0' '' '' '' '' '' '']
 ['' '' '' '' '' '1' '' '' '' '']
 ['' '' '' '' '' '' '' '' '' '']]

Episod

# PPO Trainer (Comp)

In [123]:
e = FoodGame(comp_env_config)

# For custom NN models
# def gen_policy(i):
#     config = {
#         "model": {
#             "custom_model": "my_model",
#         },
#         "gamma": 0.99,
#     }
#     return (None, e.observation_space, e.action_space, config)

policies = {"policy"+str(i):  PolicySpec(
                        observation_space= e.observation_space,
                        action_space= e.action_space,
                        config={"agent_id": i}) for i in range(len(e.agents))}

policy_mapping = {'agent'+str(i): "policy"+str(i) for i in range(len(e.agents))}

def p(agent_id):
    print(agent_id)
    return policies[policy_keys[agent_id]]

config = ppo.DEFAULT_CONFIG.copy()
config["num_gpus"] = 0
config["multiagent"] = {"policies": policies,  "policy_mapping_fn": lambda agent_id: policy_mapping[agent_id]}
config["env"] = "foodenv"
config["timesteps_per_iteration"] = 1000
config["num_workers"] = 1
config["env_config"] = comp_env_config

now = datetime.now()
dt_string = now.strftime("%d_%m_%Y_%H:%M:%S")
os.mkdir('./'+dt_string)
log_dir = './'+dt_string+'/'

stop_criteria = {"training_iteration": 20}

analysis = ray.tune.run(
    ppo.PPOTrainer,
    config=config,
    local_dir=log_dir,
    stop=stop_criteria,
    checkpoint_freq=1,
    checkpoint_at_end=True)

Trial name,status,loc
PPOTrainer_foodenv_f2dcd_00000,PENDING,


[2m[36m(PPOTrainer pid=3770)[0m 2022-03-06 02:34:09,852	INFO trainer.py:2055 -- Your framework setting is 'tf', meaning you are using static-graph mode. Set framework='tf2' to enable eager execution with tf2.x. You may also then want to set eager_tracing=True in order to reach similar execution speed as with static-graph mode.
[2m[36m(PPOTrainer pid=3770)[0m 2022-03-06 02:34:09,853	INFO ppo.py:250 -- In multi-agent mode, policies will be optimized sequentially by the multi-GPU optimizer. Consider setting simple_optimizer=True if this doesn't work for you.
[2m[36m(PPOTrainer pid=3770)[0m 2022-03-06 02:34:09,853	INFO trainer.py:792 -- Current log_level is WARN. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.


Trial name,status,loc
PPOTrainer_foodenv_f2dcd_00000,RUNNING,172.17.0.2:3770




Trial name,status,loc
PPOTrainer_foodenv_f2dcd_00000,RUNNING,172.17.0.2:3770


Trial name,status,loc
PPOTrainer_foodenv_f2dcd_00000,RUNNING,172.17.0.2:3770




Trial name,status,loc
PPOTrainer_foodenv_f2dcd_00000,RUNNING,172.17.0.2:3770


Result for PPOTrainer_foodenv_f2dcd_00000:
  agent_timesteps_total: 8000
  custom_metrics: {}
  date: 2022-03-06_02-34-28
  done: false
  episode_len_mean: 18.205479452054796
  episode_media: {}
  episode_reward_max: 14.0
  episode_reward_mean: -26.319634703196346
  episode_reward_min: -40.0
  episodes_this_iter: 219
  episodes_total: 219
  experiment_id: 5e56b41f05ce4fff83faa7e71123084e
  hostname: 310c4782fa76
  info:
    learner:
      policy0:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.20000000298023224
          cur_lr: 4.999999873689376e-05
          entropy: 1.37324059009552
          entropy_coeff: 0.0
          kl: 0.013065727427601814
          model: {}
          policy_loss: -0.023408152163028717
          total_loss: 45.62505340576172
          vf_explained_var: 0.008458759635686874
          vf_loss: 45.645851135253906
      policy1:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.20000000298023224
          cur



Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_foodenv_f2dcd_00000,RUNNING,172.17.0.2:3770,1,12.7584,4000,-26.3196,14,-40,18.2055


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_foodenv_f2dcd_00000,RUNNING,172.17.0.2:3770,1,12.7584,4000,-26.3196,14,-40,18.2055


Result for PPOTrainer_foodenv_f2dcd_00000:
  agent_timesteps_total: 16000
  custom_metrics: {}
  date: 2022-03-06_02-34-41
  done: false
  episode_len_mean: 16.48971193415638
  episode_media: {}
  episode_reward_max: 14.0
  episode_reward_mean: -19.234567901234566
  episode_reward_min: -40.0
  episodes_this_iter: 243
  episodes_total: 462
  experiment_id: 5e56b41f05ce4fff83faa7e71123084e
  hostname: 310c4782fa76
  info:
    learner:
      policy0:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.20000000298023224
          cur_lr: 4.999999873689376e-05
          entropy: 1.3291960954666138
          entropy_coeff: 0.0
          kl: 0.019748281687498093
          model: {}
          policy_loss: -0.034526389092206955
          total_loss: 42.509395599365234
          vf_explained_var: 0.01757637783885002
          vf_loss: 42.53997039794922
      policy1:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.20000000298023224
          cu

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_foodenv_f2dcd_00000,RUNNING,172.17.0.2:3770,2,25.151,8000,-19.2346,14,-40,16.4897


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_foodenv_f2dcd_00000,RUNNING,172.17.0.2:3770,2,25.151,8000,-19.2346,14,-40,16.4897


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_foodenv_f2dcd_00000,RUNNING,172.17.0.2:3770,2,25.151,8000,-19.2346,14,-40,16.4897


Result for PPOTrainer_foodenv_f2dcd_00000:
  agent_timesteps_total: 24000
  custom_metrics: {}
  date: 2022-03-06_02-34-53
  done: false
  episode_len_mean: 13.34
  episode_media: {}
  episode_reward_max: 14.0
  episode_reward_mean: -9.913333333333334
  episode_reward_min: -40.0
  episodes_this_iter: 300
  episodes_total: 762
  experiment_id: 5e56b41f05ce4fff83faa7e71123084e
  hostname: 310c4782fa76
  info:
    learner:
      policy0:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.20000000298023224
          cur_lr: 4.999999873689376e-05
          entropy: 1.2406936883926392
          entropy_coeff: 0.0
          kl: 0.02633821591734886
          model: {}
          policy_loss: -0.041184499859809875
          total_loss: 53.8184700012207
          vf_explained_var: 0.052820127457380295
          vf_loss: 53.85438537597656
      policy1:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.30000001192092896
          cur_lr: 4.9999998

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_foodenv_f2dcd_00000,RUNNING,172.17.0.2:3770,3,37.5633,12000,-9.91333,14,-40,13.34


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_foodenv_f2dcd_00000,RUNNING,172.17.0.2:3770,3,37.5633,12000,-9.91333,14,-40,13.34


Result for PPOTrainer_foodenv_f2dcd_00000:
  agent_timesteps_total: 32000
  custom_metrics: {}
  date: 2022-03-06_02-35-06
  done: false
  episode_len_mean: 9.60576923076923
  episode_media: {}
  episode_reward_max: 14.0
  episode_reward_mean: -0.1971153846153846
  episode_reward_min: -30.0
  episodes_this_iter: 416
  episodes_total: 1178
  experiment_id: 5e56b41f05ce4fff83faa7e71123084e
  hostname: 310c4782fa76
  info:
    learner:
      policy0:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.30000001192092896
          cur_lr: 4.999999873689376e-05
          entropy: 1.1180161237716675
          entropy_coeff: 0.0
          kl: 0.026903493329882622
          model: {}
          policy_loss: -0.05394747480750084
          total_loss: 47.69227600097656
          vf_explained_var: 0.0785447433590889
          vf_loss: 47.7381591796875
      policy1:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.44999998807907104
          cur_lr

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_foodenv_f2dcd_00000,RUNNING,172.17.0.2:3770,4,49.9591,16000,-0.197115,14,-30,9.60577


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_foodenv_f2dcd_00000,RUNNING,172.17.0.2:3770,4,49.9591,16000,-0.197115,14,-30,9.60577


Result for PPOTrainer_foodenv_f2dcd_00000:
  agent_timesteps_total: 40000
  custom_metrics: {}
  date: 2022-03-06_02-35-18
  done: false
  episode_len_mean: 6.443729903536977
  episode_media: {}
  episode_reward_max: 14.0
  episode_reward_mean: 6.983922829581994
  episode_reward_min: -40.0
  episodes_this_iter: 622
  episodes_total: 1800
  experiment_id: 5e56b41f05ce4fff83faa7e71123084e
  hostname: 310c4782fa76
  info:
    learner:
      policy0:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.44999998807907104
          cur_lr: 4.999999873689376e-05
          entropy: 0.9872897267341614
          entropy_coeff: 0.0
          kl: 0.02665587328374386
          model: {}
          policy_loss: -0.05086745321750641
          total_loss: 26.47137451171875
          vf_explained_var: 0.22563855350017548
          vf_loss: 26.510244369506836
      policy1:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.675000011920929
          cur_lr:

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_foodenv_f2dcd_00000,RUNNING,172.17.0.2:3770,5,62.5497,20000,6.98392,14,-40,6.44373


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_foodenv_f2dcd_00000,RUNNING,172.17.0.2:3770,5,62.5497,20000,6.98392,14,-40,6.44373


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_foodenv_f2dcd_00000,RUNNING,172.17.0.2:3770,5,62.5497,20000,6.98392,14,-40,6.44373


Result for PPOTrainer_foodenv_f2dcd_00000:
  agent_timesteps_total: 48000
  custom_metrics: {}
  date: 2022-03-06_02-35-31
  done: false
  episode_len_mean: 5.383580080753701
  episode_media: {}
  episode_reward_max: 14.0
  episode_reward_mean: 9.179004037685061
  episode_reward_min: -30.0
  episodes_this_iter: 743
  episodes_total: 2543
  experiment_id: 5e56b41f05ce4fff83faa7e71123084e
  hostname: 310c4782fa76
  info:
    learner:
      policy0:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.675000011920929
          cur_lr: 4.999999873689376e-05
          entropy: 0.8998331427574158
          entropy_coeff: 0.0
          kl: 0.017797553911805153
          model: {}
          policy_loss: -0.058445826172828674
          total_loss: 18.84800148010254
          vf_explained_var: 0.35562750697135925
          vf_loss: 18.894433975219727
      policy1:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.675000011920929
          cur_lr:

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_foodenv_f2dcd_00000,RUNNING,172.17.0.2:3770,6,75.1759,24000,9.179,14,-30,5.38358


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_foodenv_f2dcd_00000,RUNNING,172.17.0.2:3770,6,75.1759,24000,9.179,14,-30,5.38358


Result for PPOTrainer_foodenv_f2dcd_00000:
  agent_timesteps_total: 56000
  custom_metrics: {}
  date: 2022-03-06_02-35-44
  done: false
  episode_len_mean: 4.950433705080545
  episode_media: {}
  episode_reward_max: 14.0
  episode_reward_mean: 10.074349442379182
  episode_reward_min: -30.0
  episodes_this_iter: 807
  episodes_total: 3350
  experiment_id: 5e56b41f05ce4fff83faa7e71123084e
  hostname: 310c4782fa76
  info:
    learner:
      policy0:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.675000011920929
          cur_lr: 4.999999873689376e-05
          entropy: 0.825964629650116
          entropy_coeff: 0.0
          kl: 0.018055064603686333
          model: {}
          policy_loss: -0.06609361618757248
          total_loss: 15.506969451904297
          vf_explained_var: 0.44298866391181946
          vf_loss: 15.560876846313477
      policy1:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.675000011920929
          cur_lr:

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_foodenv_f2dcd_00000,RUNNING,172.17.0.2:3770,7,87.9285,28000,10.0743,14,-30,4.95043


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_foodenv_f2dcd_00000,RUNNING,172.17.0.2:3770,7,87.9285,28000,10.0743,14,-30,4.95043


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_foodenv_f2dcd_00000,RUNNING,172.17.0.2:3770,7,87.9285,28000,10.0743,14,-30,4.95043


Result for PPOTrainer_foodenv_f2dcd_00000:
  agent_timesteps_total: 64000
  custom_metrics: {}
  date: 2022-03-06_02-35-56
  done: false
  episode_len_mean: 5.128205128205129
  episode_media: {}
  episode_reward_max: 14.0
  episode_reward_mean: 9.743589743589743
  episode_reward_min: -4.0
  episodes_this_iter: 780
  episodes_total: 4130
  experiment_id: 5e56b41f05ce4fff83faa7e71123084e
  hostname: 310c4782fa76
  info:
    learner:
      policy0:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.675000011920929
          cur_lr: 4.999999873689376e-05
          entropy: 0.7236355543136597
          entropy_coeff: 0.0
          kl: 0.016552314162254333
          model: {}
          policy_loss: -0.05815799534320831
          total_loss: 12.897686004638672
          vf_explained_var: 0.4687221646308899
          vf_loss: 12.944671630859375
      policy1:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.675000011920929
          cur_lr: 4

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_foodenv_f2dcd_00000,RUNNING,172.17.0.2:3770,8,100.481,32000,9.74359,14,-4,5.12821


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_foodenv_f2dcd_00000,RUNNING,172.17.0.2:3770,8,100.481,32000,9.74359,14,-4,5.12821


Result for PPOTrainer_foodenv_f2dcd_00000:
  agent_timesteps_total: 72000
  custom_metrics: {}
  date: 2022-03-06_02-36-09
  done: false
  episode_len_mean: 5.444897959183673
  episode_media: {}
  episode_reward_max: 14.0
  episode_reward_mean: 9.110204081632654
  episode_reward_min: 0.0
  episodes_this_iter: 735
  episodes_total: 4865
  experiment_id: 5e56b41f05ce4fff83faa7e71123084e
  hostname: 310c4782fa76
  info:
    learner:
      policy0:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.675000011920929
          cur_lr: 4.999999873689376e-05
          entropy: 0.6315877437591553
          entropy_coeff: 0.0
          kl: 0.01062021404504776
          model: {}
          policy_loss: -0.043713513761758804
          total_loss: 9.505982398986816
          vf_explained_var: 0.5462391972541809
          vf_loss: 9.54252815246582
      policy1:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.675000011920929
          cur_lr: 4.999

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_foodenv_f2dcd_00000,RUNNING,172.17.0.2:3770,9,113.253,36000,9.1102,14,0,5.4449


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_foodenv_f2dcd_00000,RUNNING,172.17.0.2:3770,9,113.253,36000,9.1102,14,0,5.4449


Result for PPOTrainer_foodenv_f2dcd_00000:
  agent_timesteps_total: 80000
  custom_metrics: {}
  date: 2022-03-06_02-36-22
  done: false
  episode_len_mean: 5.662889518413598
  episode_media: {}
  episode_reward_max: 14.0
  episode_reward_mean: 8.674220963172804
  episode_reward_min: -4.0
  episodes_this_iter: 706
  episodes_total: 5571
  experiment_id: 5e56b41f05ce4fff83faa7e71123084e
  hostname: 310c4782fa76
  info:
    learner:
      policy0:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.675000011920929
          cur_lr: 4.999999873689376e-05
          entropy: 0.5683839321136475
          entropy_coeff: 0.0
          kl: 0.006266110111027956
          model: {}
          policy_loss: -0.038955360651016235
          total_loss: 8.66318130493164
          vf_explained_var: 0.6192721724510193
          vf_loss: 8.697906494140625
      policy1:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.3375000059604645
          cur_lr: 4.

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_foodenv_f2dcd_00000,RUNNING,172.17.0.2:3770,10,125.796,40000,8.67422,14,-4,5.66289


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_foodenv_f2dcd_00000,RUNNING,172.17.0.2:3770,10,125.796,40000,8.67422,14,-4,5.66289


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_foodenv_f2dcd_00000,RUNNING,172.17.0.2:3770,10,125.796,40000,8.67422,14,-4,5.66289


Result for PPOTrainer_foodenv_f2dcd_00000:
  agent_timesteps_total: 88000
  custom_metrics: {}
  date: 2022-03-06_02-36-35
  done: false
  episode_len_mean: 5.821220930232558
  episode_media: {}
  episode_reward_max: 14.0
  episode_reward_mean: 8.357558139534884
  episode_reward_min: -2.0
  episodes_this_iter: 688
  episodes_total: 6259
  experiment_id: 5e56b41f05ce4fff83faa7e71123084e
  hostname: 310c4782fa76
  info:
    learner:
      policy0:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.675000011920929
          cur_lr: 4.999999873689376e-05
          entropy: 0.5158219337463379
          entropy_coeff: 0.0
          kl: 0.004861412104219198
          model: {}
          policy_loss: -0.030947940424084663
          total_loss: 7.382653713226318
          vf_explained_var: 0.6732740998268127
          vf_loss: 7.41032075881958
      policy1:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.3375000059604645
          cur_lr: 4.

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_foodenv_f2dcd_00000,RUNNING,172.17.0.2:3770,11,138.429,44000,8.35756,14,-2,5.82122


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_foodenv_f2dcd_00000,RUNNING,172.17.0.2:3770,11,138.429,44000,8.35756,14,-2,5.82122


Result for PPOTrainer_foodenv_f2dcd_00000:
  agent_timesteps_total: 96000
  custom_metrics: {}
  date: 2022-03-06_02-36-47
  done: false
  episode_len_mean: 5.809593023255814
  episode_media: {}
  episode_reward_max: 14.0
  episode_reward_mean: 8.380813953488373
  episode_reward_min: 2.0
  episodes_this_iter: 688
  episodes_total: 6947
  experiment_id: 5e56b41f05ce4fff83faa7e71123084e
  hostname: 310c4782fa76
  info:
    learner:
      policy0:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.3375000059604645
          cur_lr: 4.999999873689376e-05
          entropy: 0.4715823531150818
          entropy_coeff: 0.0
          kl: 0.006364115048199892
          model: {}
          policy_loss: -0.04019234701991081
          total_loss: 4.656186103820801
          vf_explained_var: 0.7781650424003601
          vf_loss: 4.694231033325195
      policy1:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.3375000059604645
          cur_lr: 4.

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_foodenv_f2dcd_00000,RUNNING,172.17.0.2:3770,12,151.157,48000,8.38081,14,2,5.80959


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_foodenv_f2dcd_00000,RUNNING,172.17.0.2:3770,12,151.157,48000,8.38081,14,2,5.80959


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_foodenv_f2dcd_00000,RUNNING,172.17.0.2:3770,12,151.157,48000,8.38081,14,2,5.80959


Result for PPOTrainer_foodenv_f2dcd_00000:
  agent_timesteps_total: 104000
  custom_metrics: {}
  date: 2022-03-06_02-37-00
  done: false
  episode_len_mean: 5.918639053254438
  episode_media: {}
  episode_reward_max: 14.0
  episode_reward_mean: 8.162721893491124
  episode_reward_min: 2.0
  episodes_this_iter: 676
  episodes_total: 7623
  experiment_id: 5e56b41f05ce4fff83faa7e71123084e
  hostname: 310c4782fa76
  info:
    learner:
      policy0:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.3375000059604645
          cur_lr: 4.999999873689376e-05
          entropy: 0.426715224981308
          entropy_coeff: 0.0
          kl: 0.008138692937791348
          model: {}
          policy_loss: -0.02577308379113674
          total_loss: 3.28844952583313
          vf_explained_var: 0.8221600651741028
          vf_loss: 3.3114757537841797
      policy1:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.16875000298023224
          cur_lr: 4

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_foodenv_f2dcd_00000,RUNNING,172.17.0.2:3770,13,163.789,52000,8.16272,14,2,5.91864


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_foodenv_f2dcd_00000,RUNNING,172.17.0.2:3770,13,163.789,52000,8.16272,14,2,5.91864


Result for PPOTrainer_foodenv_f2dcd_00000:
  agent_timesteps_total: 112000
  custom_metrics: {}
  date: 2022-03-06_02-37-13
  done: false
  episode_len_mean: 5.950892857142857
  episode_media: {}
  episode_reward_max: 14.0
  episode_reward_mean: 8.098214285714286
  episode_reward_min: 2.0
  episodes_this_iter: 672
  episodes_total: 8295
  experiment_id: 5e56b41f05ce4fff83faa7e71123084e
  hostname: 310c4782fa76
  info:
    learner:
      policy0:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.3375000059604645
          cur_lr: 4.999999873689376e-05
          entropy: 0.3962229788303375
          entropy_coeff: 0.0
          kl: 0.004285112023353577
          model: {}
          policy_loss: -0.017291121184825897
          total_loss: 2.334390640258789
          vf_explained_var: 0.8761123418807983
          vf_loss: 2.3502357006073
      policy1:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.16875000298023224
          cur_lr: 4

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_foodenv_f2dcd_00000,RUNNING,172.17.0.2:3770,14,176.493,56000,8.09821,14,2,5.95089


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_foodenv_f2dcd_00000,RUNNING,172.17.0.2:3770,14,176.493,56000,8.09821,14,2,5.95089


Result for PPOTrainer_foodenv_f2dcd_00000:
  agent_timesteps_total: 120000
  custom_metrics: {}
  date: 2022-03-06_02-37-25
  done: false
  episode_len_mean: 5.986526946107785
  episode_media: {}
  episode_reward_max: 14.0
  episode_reward_mean: 8.02694610778443
  episode_reward_min: 4.0
  episodes_this_iter: 668
  episodes_total: 8963
  experiment_id: 5e56b41f05ce4fff83faa7e71123084e
  hostname: 310c4782fa76
  info:
    learner:
      policy0:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.16875000298023224
          cur_lr: 4.999999873689376e-05
          entropy: 0.373714417219162
          entropy_coeff: 0.0
          kl: 0.009819568134844303
          model: {}
          policy_loss: -0.01877244934439659
          total_loss: 1.1385146379470825
          vf_explained_var: 0.9281362891197205
          vf_loss: 1.155630111694336
      policy1:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.16875000298023224
          cur_lr: 

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_foodenv_f2dcd_00000,RUNNING,172.17.0.2:3770,15,189.109,60000,8.02695,14,4,5.98653


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_foodenv_f2dcd_00000,RUNNING,172.17.0.2:3770,15,189.109,60000,8.02695,14,4,5.98653


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_foodenv_f2dcd_00000,RUNNING,172.17.0.2:3770,15,189.109,60000,8.02695,14,4,5.98653


Result for PPOTrainer_foodenv_f2dcd_00000:
  agent_timesteps_total: 128000
  custom_metrics: {}
  date: 2022-03-06_02-37-38
  done: false
  episode_len_mean: 6.004504504504505
  episode_media: {}
  episode_reward_max: 14.0
  episode_reward_mean: 7.990990990990991
  episode_reward_min: 4.0
  episodes_this_iter: 666
  episodes_total: 9629
  experiment_id: 5e56b41f05ce4fff83faa7e71123084e
  hostname: 310c4782fa76
  info:
    learner:
      policy0:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.16875000298023224
          cur_lr: 4.999999873689376e-05
          entropy: 0.3213249146938324
          entropy_coeff: 0.0
          kl: 0.006361422594636679
          model: {}
          policy_loss: -0.012260351330041885
          total_loss: 1.2037289142608643
          vf_explained_var: 0.9278494715690613
          vf_loss: 1.2149157524108887
      policy1:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.16875000298023224
          cur_

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_foodenv_f2dcd_00000,RUNNING,172.17.0.2:3770,16,201.792,64000,7.99099,14,4,6.0045


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_foodenv_f2dcd_00000,RUNNING,172.17.0.2:3770,16,201.792,64000,7.99099,14,4,6.0045


Result for PPOTrainer_foodenv_f2dcd_00000:
  agent_timesteps_total: 136000
  custom_metrics: {}
  date: 2022-03-06_02-37-51
  done: false
  episode_len_mean: 5.994011976047904
  episode_media: {}
  episode_reward_max: 14.0
  episode_reward_mean: 8.011976047904191
  episode_reward_min: 4.0
  episodes_this_iter: 668
  episodes_total: 10297
  experiment_id: 5e56b41f05ce4fff83faa7e71123084e
  hostname: 310c4782fa76
  info:
    learner:
      policy0:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.16875000298023224
          cur_lr: 4.999999873689376e-05
          entropy: 0.27376291155815125
          entropy_coeff: 0.0
          kl: 0.00534240435808897
          model: {}
          policy_loss: -0.010850643739104271
          total_loss: 0.4971477687358856
          vf_explained_var: 0.96705162525177
          vf_loss: 0.5070968866348267
      policy1:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.16875000298023224
          cur_l

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_foodenv_f2dcd_00000,RUNNING,172.17.0.2:3770,17,214.475,68000,8.01198,14,4,5.99401


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_foodenv_f2dcd_00000,RUNNING,172.17.0.2:3770,17,214.475,68000,8.01198,14,4,5.99401


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_foodenv_f2dcd_00000,RUNNING,172.17.0.2:3770,17,214.475,68000,8.01198,14,4,5.99401


Result for PPOTrainer_foodenv_f2dcd_00000:
  agent_timesteps_total: 144000
  custom_metrics: {}
  date: 2022-03-06_02-38-04
  done: false
  episode_len_mean: 6.016541353383459
  episode_media: {}
  episode_reward_max: 8.0
  episode_reward_mean: 7.966917293233083
  episode_reward_min: 4.0
  episodes_this_iter: 665
  episodes_total: 10962
  experiment_id: 5e56b41f05ce4fff83faa7e71123084e
  hostname: 310c4782fa76
  info:
    learner:
      policy0:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.16875000298023224
          cur_lr: 4.999999873689376e-05
          entropy: 0.26339468359947205
          entropy_coeff: 0.0
          kl: 0.003279311116784811
          model: {}
          policy_loss: -0.009250523522496223
          total_loss: 1.0983420610427856
          vf_explained_var: 0.9371823668479919
          vf_loss: 1.1070390939712524
      policy1:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.08437500149011612
          cur

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_foodenv_f2dcd_00000,RUNNING,172.17.0.2:3770,18,227.117,72000,7.96692,8,4,6.01654


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_foodenv_f2dcd_00000,RUNNING,172.17.0.2:3770,18,227.117,72000,7.96692,8,4,6.01654


Result for PPOTrainer_foodenv_f2dcd_00000:
  agent_timesteps_total: 152000
  custom_metrics: {}
  date: 2022-03-06_02-38-16
  done: false
  episode_len_mean: 6.007518796992481
  episode_media: {}
  episode_reward_max: 8.0
  episode_reward_mean: 7.984962406015038
  episode_reward_min: 6.0
  episodes_this_iter: 665
  episodes_total: 11627
  experiment_id: 5e56b41f05ce4fff83faa7e71123084e
  hostname: 310c4782fa76
  info:
    learner:
      policy0:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.08437500149011612
          cur_lr: 4.999999873689376e-05
          entropy: 0.22511355578899384
          entropy_coeff: 0.0
          kl: 0.00565635971724987
          model: {}
          policy_loss: -0.008726169355213642
          total_loss: 0.6411762833595276
          vf_explained_var: 0.9607021808624268
          vf_loss: 0.649425208568573
      policy1:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.08437500149011612
          cur_l

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_foodenv_f2dcd_00000,RUNNING,172.17.0.2:3770,19,239.793,76000,7.98496,8,6,6.00752


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_foodenv_f2dcd_00000,RUNNING,172.17.0.2:3770,19,239.793,76000,7.98496,8,6,6.00752


Result for PPOTrainer_foodenv_f2dcd_00000:
  agent_timesteps_total: 160000
  custom_metrics: {}
  date: 2022-03-06_02-38-29
  done: true
  episode_len_mean: 5.994011976047904
  episode_media: {}
  episode_reward_max: 14.0
  episode_reward_mean: 8.011976047904191
  episode_reward_min: 6.0
  episodes_this_iter: 668
  episodes_total: 12295
  experiment_id: 5e56b41f05ce4fff83faa7e71123084e
  hostname: 310c4782fa76
  info:
    learner:
      policy0:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.08437500149011612
          cur_lr: 4.999999873689376e-05
          entropy: 0.21561482548713684
          entropy_coeff: 0.0
          kl: 0.004171359818428755
          model: {}
          policy_loss: -0.00849121529608965
          total_loss: 0.2004651129245758
          vf_explained_var: 0.9855367541313171
          vf_loss: 0.2086043655872345
      policy1:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.08437500149011612
          cur_

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_foodenv_f2dcd_00000,TERMINATED,172.17.0.2:3770,20,252.385,80000,8.01198,14,6,5.99401


[2m[36m(RolloutWorker pid=3769)[0m 2022-03-06 02:38:29,769	ERROR worker.py:432 -- SystemExit was raised from the worker.
[2m[36m(RolloutWorker pid=3769)[0m Traceback (most recent call last):
[2m[36m(RolloutWorker pid=3769)[0m   File "python/ray/_raylet.pyx", line 636, in ray._raylet.execute_task
[2m[36m(RolloutWorker pid=3769)[0m   File "python/ray/_raylet.pyx", line 640, in ray._raylet.execute_task
[2m[36m(RolloutWorker pid=3769)[0m   File "python/ray/_raylet.pyx", line 589, in ray._raylet.execute_task.function_executor
[2m[36m(RolloutWorker pid=3769)[0m   File "/home/ray/anaconda3/lib/python3.7/site-packages/ray/_private/function_manager.py", line 639, in actor_method_executor
[2m[36m(RolloutWorker pid=3769)[0m     return method(__ray_actor, *args, **kwargs)
[2m[36m(RolloutWorker pid=3769)[0m   File "/home/ray/anaconda3/lib/python3.7/site-packages/ray/util/tracing/tracing_helper.py", line 451, in _resume_span
[2m[36m(RolloutWorker pid=3769)[0m     return me

# Results Analysis (Comp)

In [124]:
# list of lists: one list per checkpoint; each checkpoint list contains
# 1st the path, 2nd the metric value
checkpoints = analysis.get_trial_checkpoints_paths(
    trial=analysis.get_best_trial("episode_reward_mean",mode="max"),
    metric="episode_reward_mean")

best_checkpoint = sorted(checkpoints, key=lambda x: x[1])[-1]
last_checkpoint = checkpoints[-1]

trainer = ppo.PPOTrainer(config=config, env="foodenv")
trainer.restore(last_checkpoint[0])

# get weights: trainer.get_weights()

# run until episode ends
episode_reward = {'agent'+str(i):0 for i in range(len(e.agents))}
done = False
obs = e.reset()
print("Initial state at Time 0:")
e.render()
time = 1
while not done:
    print("Time:",time)
    time += 1
    action = trainer.compute_actions(obs,policy_id='policy0')
    for i in range(1,len(e.agents)):
        a = trainer.compute_actions(obs,policy_id='policy'+str(i))
        action['agent'+str(i)] = a['agent'+str(i)]
    obs, reward, dones, info = e.step(action)
    done = dones['__all__']
    e.render()
    for i in range(len(e.agents)):
        episode_reward['agent'+str(i)] += reward['agent'+str(i)]
print("Episode Ended")
print("Episode Rewards",episode_reward)

2022-03-06 02:38:47,583	INFO trainable.py:473 -- Restored on 172.17.0.2 from checkpoint: /home/ray/cs4246/06_03_2022_02:34:05/PPOTrainer_2022-03-06_02-34-05/PPOTrainer_foodenv_f2dcd_00000_0_2022-03-06_02-34-06/checkpoint_000020/checkpoint-20
2022-03-06 02:38:47,584	INFO trainable.py:480 -- Current state after restoring: {'_iteration': 20, '_timesteps_total': 80000, '_time_total': 252.38456273078918, '_episodes_total': 12295}


Initial state at Time 0:
[['' '' '' '' '' '' '' '' '' '']
 ['' '' '' '' '' '' '' '' '' '']
 ['' '' '' '' '' '' '' '' '' '']
 ['' '' '' 'F' '' '' '' '' '' '']
 ['' '' '' '' '0' '' '' '' '' '']
 ['' '' '' '' '' 'F' '' '' '' '']
 ['' '' '' '' '' '' '' '1' '' '']]

Time: 1
[['' '' '' '' '' '' '' '' '' '']
 ['' '' '' '' '' '' '' '' '' '']
 ['' '' '' '' '' '' '' '' '' '']
 ['' '' '' 'F' '' '' '' '' '' '']
 ['' '' '' '' '' '0' '' '' '' '']
 ['' '' '' '' '' 'F' '' '' '' '']
 ['' '' '' '' '' '' '1' '' '' '']]

Time: 2
[['' '' '' '' '' '' '' '' '' '']
 ['' '' '' '' '' '' '' '' '' '']
 ['' '' '' '' '' '' '' '' '' '']
 ['' '' '' 'F' '' '' '' '' '' '']
 ['' '' '' '' '' '' '' '' '' '']
 ['' '' '' '' '' '0' '' '' '' '']
 ['' '' '' '' '' '1' '' '' '' '']]

Time: 3
[['' '' '' '' '' '' '' '' '' '']
 ['' '' '' '' '' '' '' '' '' '']
 ['' '' '' '' '' '' '' '' '' '']
 ['' '' '' 'F' '' '' '' '' '' '']
 ['' '' '' '' '' '0' '' '' '' '']
 ['' '' '' '' '' '' '' '' '' '']
 ['' '' '' '' '1' '' '' '' '' '']]

Time:

# DQN Trainer (Coop)

In [69]:
# Coop Environment
coop_env_config['trainer'] = 'dqn'

# Competitive Environment
comp_env_config['trainer'] = 'dqn'

In [113]:
import ray.rllib.agents.dqn as dqn

e = FoodGame(coop_env_config)

# For custom NN models
# def gen_policy(i):
#     config = {
#         "model": {
#             "custom_model": "my_model",
#         },
#         "gamma": 0.99,
#     }
#     return (None, e.observation_space, e.action_space, config)

policies = {"policy"+str(i):  PolicySpec(
                        observation_space= e.observation_space,
                        action_space= e.action_space,
                        config={"agent_id": i}) for i in range(len(e.agents))}

policy_mapping = {'agent'+str(i): "policy"+str(i) for i in range(len(e.agents))}

def p(agent_id):
    print(agent_id)
    return policies[policy_keys[agent_id]]

config = dqn.DEFAULT_CONFIG.copy()
config["num_gpus"] = 0
config["batch_mode"] = "complete_episodes"
config["multiagent"] = {"policies": policies,  "policy_mapping_fn": lambda agent_id: policy_mapping[agent_id]}
config["env"] = "foodenv"
config["num_workers"] = 1
config["env_config"] = coop_env_config

# Buffer params
config["buffer_size"] = 20000 # in batches
config["prioritized_replay"] = True

# Exploration params
config["exploration_config"] = {"type": "EpsilonGreedy", "epsilon_timesteps": 25000, "final_epsilon": 0.00}

# Learning params
config["lr"] = 1e-2
config["target_network_update_freq"] = 100

# Learning duration
config["timesteps_per_iteration"] = 100
config["learning_starts"] = 1000
config["train_batch_size"] = 32
config["rollout_fragment_length"] = 4 # no. of samples to rollout in each sample of the batch to add to buffer each timestep

# eval
# config["evaluation_interval"] = 1
# config["evaluation_duration"] = 1

now = datetime.now()
dt_string = now.strftime("%d_%m_%Y_%H:%M:%S")
os.mkdir('./'+dt_string)
log_dir = './'+dt_string+'/'

stop_criteria = {"timesteps_total": 30000}

analysis = ray.tune.run(
    dqn.DQNTrainer,
    config=config,
    local_dir=log_dir,
    stop=stop_criteria,
    checkpoint_freq=100,
    checkpoint_at_end=True)

Trial name,status,loc
DQNTrainer_foodenv_76743_00000,PENDING,


[2m[36m(DQNTrainer pid=19093)[0m 2022-04-01 04:11:26,065	INFO trainer.py:2055 -- Your framework setting is 'tf', meaning you are using static-graph mode. Set framework='tf2' to enable eager execution with tf2.x. You may also then want to set eager_tracing=True in order to reach similar execution speed as with static-graph mode.
[2m[36m(DQNTrainer pid=19093)[0m 2022-04-01 04:11:26,067	INFO simple_q.py:154 -- In multi-agent mode, policies will be optimized sequentially by the multi-GPU optimizer. Consider setting `simple_optimizer=True` if this doesn't work for you.
[2m[36m(DQNTrainer pid=19093)[0m 2022-04-01 04:11:26,067	INFO trainer.py:792 -- Current log_level is WARN. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.


Trial name,status,loc
DQNTrainer_foodenv_76743_00000,RUNNING,172.17.0.2:19093




Trial name,status,loc
DQNTrainer_foodenv_76743_00000,RUNNING,172.17.0.2:19093


Result for DQNTrainer_foodenv_76743_00000:
  agent_timesteps_total: 2028
  custom_metrics: {}
  date: 2022-04-01_04-11-36
  done: false
  episode_len_mean: 17.789473684210527
  episode_media: {}
  episode_reward_max: 32.0
  episode_reward_mean: -12.771929824561404
  episode_reward_min: -40.0
  episodes_this_iter: 57
  episodes_total: 57
  experiment_id: 6af836e00d10403b9eacf3717e17bd18
  hostname: 1d29a0c222c3
  info:
    last_target_update_ts: 1014
    learner:
      policy0:
        custom_metrics: {}
        learner_stats:
          cur_lr: 0.009999999776482582
          max_q: 0.32941052317619324
          mean_q: -0.3010759949684143
          mean_td_error: -0.3353292942047119
          min_q: -0.9126817584037781
          model: {}
        td_error:
        - 0.7900476455688477
        - -9.476641654968262
        - 0.18028044700622559
        - 1.2315980195999146
        - 0.38541746139526367
        - 1.1500840187072754
        - -9.660634994506836
        - 0.5378077030181885


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
DQNTrainer_foodenv_76743_00000,RUNNING,172.17.0.2:19093,3,5.19381,1670,-14.5652,32,-40,18.1522


Result for DQNTrainer_foodenv_76743_00000:
  agent_timesteps_total: 5208
  custom_metrics: {}
  date: 2022-04-01_04-11-42
  done: false
  episode_len_mean: 18.34
  episode_media: {}
  episode_reward_max: 30.0
  episode_reward_mean: -15.88
  episode_reward_min: -40.0
  episodes_this_iter: 16
  episodes_total: 143
  experiment_id: 6af836e00d10403b9eacf3717e17bd18
  hostname: 1d29a0c222c3
  info:
    last_target_update_ts: 2524
    learner:
      policy0:
        custom_metrics: {}
        learner_stats:
          cur_lr: 0.009999999776482582
          max_q: -8.597418785095215
          mean_q: -10.731786727905273
          mean_td_error: -3.298344612121582
          min_q: -12.186394691467285
          model: {}
        td_error:
        - 0.596226692199707
        - -7.484169006347656
        - 0.13162803649902344
        - -10.007979393005371
        - -9.990279197692871
        - -11.101592063903809
        - -10.084178924560547
        - 0.29924488067626953
        - -9.368812561035

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
DQNTrainer_foodenv_76743_00000,RUNNING,172.17.0.2:19093,8,10.7422,3235,-15.64,30,-40,18.62


Result for DQNTrainer_foodenv_76743_00000:
  agent_timesteps_total: 8456
  custom_metrics: {}
  date: 2022-04-01_04-11-48
  done: false
  episode_len_mean: 18.44
  episode_media: {}
  episode_reward_max: 28.0
  episode_reward_mean: -14.68
  episode_reward_min: -40.0
  episodes_this_iter: 20
  episodes_total: 232
  experiment_id: 6af836e00d10403b9eacf3717e17bd18
  hostname: 1d29a0c222c3
  info:
    last_target_update_ts: 4228
    learner:
      policy0:
        custom_metrics: {}
        learner_stats:
          cur_lr: 0.009999999776482582
          max_q: -8.534445762634277
          mean_q: -14.113641738891602
          mean_td_error: -3.7987375259399414
          min_q: -17.65914535522461
          model: {}
        td_error:
        - -13.362943649291992
        - -3.289889335632324
        - 0.3116006851196289
        - 0.6304130554199219
        - -25.761676788330078
        - 0.23036861419677734
        - -0.06643867492675781
        - -3.408297538757324
        - -10.6047649383

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
DQNTrainer_foodenv_76743_00000,RUNNING,172.17.0.2:19093,13,16.715,4764,-11.74,30,-40,17.57


Result for DQNTrainer_foodenv_76743_00000:
  agent_timesteps_total: 11464
  custom_metrics: {}
  date: 2022-04-01_04-11-54
  done: false
  episode_len_mean: 16.75
  episode_media: {}
  episode_reward_max: 32.0
  episode_reward_mean: -9.1
  episode_reward_min: -40.0
  episodes_this_iter: 17
  episodes_total: 321
  experiment_id: 6af836e00d10403b9eacf3717e17bd18
  hostname: 1d29a0c222c3
  info:
    last_target_update_ts: 5672
    learner:
      policy0:
        custom_metrics: {}
        learner_stats:
          cur_lr: 0.009999999776482582
          max_q: -10.030786514282227
          mean_q: -16.82680320739746
          mean_td_error: -4.2961015701293945
          min_q: -21.93215560913086
          model: {}
        td_error:
        - -2.808797836303711
        - 0.4657630920410156
        - -19.979211807250977
        - -0.42719078063964844
        - 0.8417816162109375
        - -8.312045097351074
        - -27.975433349609375
        - -1.5690927505493164
        - 0.6160526275634

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
DQNTrainer_foodenv_76743_00000,RUNNING,172.17.0.2:19093,17,21.5637,6043,-10.06,34,-40,16.83


Result for DQNTrainer_foodenv_76743_00000:
  agent_timesteps_total: 14004
  custom_metrics: {}
  date: 2022-04-01_04-12-00
  done: false
  episode_len_mean: 16.94
  episode_media: {}
  episode_reward_max: 34.0
  episode_reward_mean: -9.28
  episode_reward_min: -40.0
  episodes_this_iter: 19
  episodes_total: 398
  experiment_id: 6af836e00d10403b9eacf3717e17bd18
  hostname: 1d29a0c222c3
  info:
    last_target_update_ts: 7002
    learner:
      policy0:
        custom_metrics: {}
        learner_stats:
          cur_lr: 0.009999999776482582
          max_q: 5.031968116760254
          mean_q: -11.484315872192383
          mean_td_error: -0.9186485409736633
          min_q: -21.814868927001953
          model: {}
        td_error:
        - -14.9752836227417
        - -2.2466869354248047
        - 2.3300304412841797
        - -8.78941535949707
        - 2.412179946899414
        - -1.2233190536499023
        - -0.7419300079345703
        - 1.7512664794921875
        - 0.07807254791259766

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
DQNTrainer_foodenv_76743_00000,RUNNING,172.17.0.2:19093,21,26.5001,7330,-6.96,34,-40,16.38


Result for DQNTrainer_foodenv_76743_00000:
  agent_timesteps_total: 16614
  custom_metrics: {}
  date: 2022-04-01_04-12-05
  done: false
  episode_len_mean: 15.05
  episode_media: {}
  episode_reward_max: 32.0
  episode_reward_mean: -1.1
  episode_reward_min: -40.0
  episodes_this_iter: 21
  episodes_total: 486
  experiment_id: 6af836e00d10403b9eacf3717e17bd18
  hostname: 1d29a0c222c3
  info:
    last_target_update_ts: 8307
    learner:
      policy0:
        custom_metrics: {}
        learner_stats:
          cur_lr: 0.009999999776482582
          max_q: -0.1849309206008911
          mean_q: -5.428021430969238
          mean_td_error: -2.6723365783691406
          min_q: -15.555228233337402
          model: {}
        td_error:
        - -22.049835205078125
        - 3.0053348541259766
        - 0.9420056343078613
        - -0.37282657623291016
        - -1.3367996215820312
        - -1.6943583488464355
        - -4.551888942718506
        - -1.0888638496398926
        - -0.0242977142

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
DQNTrainer_foodenv_76743_00000,RUNNING,172.17.0.2:19093,24,30.3932,8307,-1.1,32,-40,15.05


Result for DQNTrainer_foodenv_76743_00000:
  agent_timesteps_total: 19208
  custom_metrics: {}
  date: 2022-04-01_04-12-10
  done: false
  episode_len_mean: 14.44
  episode_media: {}
  episode_reward_max: 32.0
  episode_reward_mean: 3.12
  episode_reward_min: -40.0
  episodes_this_iter: 21
  episodes_total: 575
  experiment_id: 6af836e00d10403b9eacf3717e17bd18
  hostname: 1d29a0c222c3
  info:
    last_target_update_ts: 9604
    learner:
      policy0:
        custom_metrics: {}
        learner_stats:
          cur_lr: 0.009999999776482582
          max_q: 11.453048706054688
          mean_q: 3.5956525802612305
          mean_td_error: -0.40196797251701355
          min_q: -3.4768013954162598
          model: {}
        td_error:
        - -7.625107765197754
        - 3.077608108520508
        - -0.5496244430541992
        - -1.9940552711486816
        - -1.5091133117675781
        - 1.680534839630127
        - -10.226167678833008
        - -0.04870271682739258
        - 1.9217624664306

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
DQNTrainer_foodenv_76743_00000,RUNNING,172.17.0.2:19093,28,35.502,9604,3.12,32,-40,14.44


Result for DQNTrainer_foodenv_76743_00000:
  agent_timesteps_total: 21806
  custom_metrics: {}
  date: 2022-04-01_04-12-16
  done: false
  episode_len_mean: 12.73
  episode_media: {}
  episode_reward_max: 34.0
  episode_reward_mean: 8.74
  episode_reward_min: -40.0
  episodes_this_iter: 26
  episodes_total: 679
  experiment_id: 6af836e00d10403b9eacf3717e17bd18
  hostname: 1d29a0c222c3
  info:
    last_target_update_ts: 10903
    learner:
      policy0:
        custom_metrics: {}
        learner_stats:
          cur_lr: 0.009999999776482582
          max_q: 9.965612411499023
          mean_q: 4.0150299072265625
          mean_td_error: -0.12075597792863846
          min_q: -3.588446617126465
          model: {}
        td_error:
        - -8.883025169372559
        - 1.131911277770996
        - -0.13797855377197266
        - -6.992007732391357
        - -0.5037169456481934
        - 1.161865234375
        - 1.8453998565673828
        - -0.28517913818359375
        - -0.18423223495483398

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
DQNTrainer_foodenv_76743_00000,RUNNING,172.17.0.2:19093,32,40.8987,10903,8.74,34,-40,12.73


Result for DQNTrainer_foodenv_76743_00000:
  agent_timesteps_total: 24360
  custom_metrics: {}
  date: 2022-04-01_04-12-21
  done: false
  episode_len_mean: 14.73
  episode_media: {}
  episode_reward_max: 30.0
  episode_reward_mean: 4.54
  episode_reward_min: -40.0
  episodes_this_iter: 21
  episodes_total: 764
  experiment_id: 6af836e00d10403b9eacf3717e17bd18
  hostname: 1d29a0c222c3
  info:
    last_target_update_ts: 12101
    learner:
      policy0:
        custom_metrics: {}
        learner_stats:
          cur_lr: 0.009999999776482582
          max_q: 8.095492362976074
          mean_q: 1.6845781803131104
          mean_td_error: -0.03299658000469208
          min_q: -5.0462517738342285
          model: {}
        td_error:
        - 0.914867639541626
        - -0.17677640914916992
        - 0.6906156539916992
        - -0.681574821472168
        - -0.596646785736084
        - -0.2894020080566406
        - -0.11177109181880951
        - -1.0229387283325195
        - 0.033337116241

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
DQNTrainer_foodenv_76743_00000,RUNNING,172.17.0.2:19093,37,46.8312,12392,3.82,28,-40,14.89


Result for DQNTrainer_foodenv_76743_00000:
  agent_timesteps_total: 26878
  custom_metrics: {}
  date: 2022-04-01_04-12-27
  done: false
  episode_len_mean: 10.47
  episode_media: {}
  episode_reward_max: 34.0
  episode_reward_mean: 16.46
  episode_reward_min: -20.0
  episodes_this_iter: 23
  episodes_total: 879
  experiment_id: 6af836e00d10403b9eacf3717e17bd18
  hostname: 1d29a0c222c3
  info:
    last_target_update_ts: 13391
    learner:
      policy0:
        custom_metrics: {}
        learner_stats:
          cur_lr: 0.009999999776482582
          max_q: 11.516716003417969
          mean_q: 5.139641761779785
          mean_td_error: -1.1111465692520142
          min_q: -3.193758726119995
          model: {}
        td_error:
        - -1.1378860473632812
        - -1.6024723052978516
        - -1.4521920680999756
        - -3.2711868286132812
        - -7.698770523071289
        - -0.4961634874343872
        - 0.020639419555664062
        - 0.7644863128662109
        - -0.0701441764

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
DQNTrainer_foodenv_76743_00000,RUNNING,172.17.0.2:19093,41,51.554,13439,16.46,34,-20,10.47


Result for DQNTrainer_foodenv_76743_00000:
  agent_timesteps_total: 29446
  custom_metrics: {}
  date: 2022-04-01_04-12-33
  done: false
  episode_len_mean: 11.43
  episode_media: {}
  episode_reward_max: 34.0
  episode_reward_mean: 13.14
  episode_reward_min: -20.0
  episodes_this_iter: 26
  episodes_total: 990
  experiment_id: 6af836e00d10403b9eacf3717e17bd18
  hostname: 1d29a0c222c3
  info:
    last_target_update_ts: 14703
    learner:
      policy0:
        custom_metrics: {}
        learner_stats:
          cur_lr: 0.009999999776482582
          max_q: 11.280315399169922
          mean_q: 2.7298262119293213
          mean_td_error: -0.5881629586219788
          min_q: -6.844762802124023
          model: {}
        td_error:
        - 1.67216157913208
        - -0.279803991317749
        - -1.7829484939575195
        - -0.9018895626068115
        - -0.3642911911010742
        - -2.2780637741088867
        - -0.6521854400634766
        - -0.9120477437973022
        - 0.2535606622695

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
DQNTrainer_foodenv_76743_00000,RUNNING,172.17.0.2:19093,45,57.0299,14723,13.14,34,-20,11.43


Result for DQNTrainer_foodenv_76743_00000:
  agent_timesteps_total: 31580
  custom_metrics: {}
  date: 2022-04-01_04-12-38
  done: false
  episode_len_mean: 10.44
  episode_media: {}
  episode_reward_max: 34.0
  episode_reward_mean: 15.92
  episode_reward_min: -40.0
  episodes_this_iter: 26
  episodes_total: 1093
  experiment_id: 6af836e00d10403b9eacf3717e17bd18
  hostname: 1d29a0c222c3
  info:
    last_target_update_ts: 15690
    learner:
      policy0:
        custom_metrics: {}
        learner_stats:
          cur_lr: 0.009999999776482582
          max_q: 14.233137130737305
          mean_q: 5.675693511962891
          mean_td_error: -0.13954231142997742
          min_q: -2.9593420028686523
          model: {}
        td_error:
        - -0.9370412826538086
        - -1.369354248046875
        - -2.0208911895751953
        - -5.445400238037109
        - 2.4252710342407227
        - 0.33203768730163574
        - -6.579209327697754
        - -3.9847660064697266
        - 1.10856342315

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
DQNTrainer_foodenv_76743_00000,RUNNING,172.17.0.2:19093,49,61.9507,15790,15.92,34,-40,10.44


Result for DQNTrainer_foodenv_76743_00000:
  agent_timesteps_total: 33332
  custom_metrics: {}
  date: 2022-04-01_04-12-43
  done: false
  episode_len_mean: 8.17
  episode_media: {}
  episode_reward_max: 34.0
  episode_reward_mean: 21.06
  episode_reward_min: -20.0
  episodes_this_iter: 14
  episodes_total: 1202
  experiment_id: 6af836e00d10403b9eacf3717e17bd18
  hostname: 1d29a0c222c3
  info:
    last_target_update_ts: 16666
    learner:
      policy0:
        custom_metrics: {}
        learner_stats:
          cur_lr: 0.009999999776482582
          max_q: 14.372072219848633
          mean_q: 7.382848262786865
          mean_td_error: -1.0041651725769043
          min_q: -4.405240058898926
          model: {}
        td_error:
        - -2.8905792236328125
        - 1.0245399475097656
        - -8.555474281311035
        - 0.4991464614868164
        - -4.516040802001953
        - -2.6707096099853516
        - -1.6971111297607422
        - 0.7747335433959961
        - 0.710945129394531

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
DQNTrainer_foodenv_76743_00000,RUNNING,172.17.0.2:19093,53,67.2969,16666,21.06,34,-20,8.17


Result for DQNTrainer_foodenv_76743_00000:
  agent_timesteps_total: 35468
  custom_metrics: {}
  date: 2022-04-01_04-12-50
  done: false
  episode_len_mean: 9.1
  episode_media: {}
  episode_reward_max: 34.0
  episode_reward_mean: 21.0
  episode_reward_min: -40.0
  episodes_this_iter: 21
  episodes_total: 1314
  experiment_id: 6af836e00d10403b9eacf3717e17bd18
  hostname: 1d29a0c222c3
  info:
    last_target_update_ts: 17734
    learner:
      policy0:
        custom_metrics: {}
        learner_stats:
          cur_lr: 0.009999999776482582
          max_q: 13.083006858825684
          mean_q: 5.959346771240234
          mean_td_error: -0.4045462906360626
          min_q: -3.13037371635437
          model: {}
        td_error:
        - 0.13582134246826172
        - -1.2202162742614746
        - -3.606572389602661
        - 3.6320719718933105
        - -1.4628114700317383
        - -1.2327601909637451
        - -1.946859359741211
        - -0.3401479721069336
        - -1.411271095275879

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
DQNTrainer_foodenv_76743_00000,RUNNING,172.17.0.2:19093,58,73.245,17734,21,34,-40,9.1


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
DQNTrainer_foodenv_76743_00000,RUNNING,172.17.0.2:19093,61,77.1945,18352,22.28,34,-40,8.66


Result for DQNTrainer_foodenv_76743_00000:
  agent_timesteps_total: 37116
  custom_metrics: {}
  date: 2022-04-01_04-12-55
  done: false
  episode_len_mean: 8.36
  episode_media: {}
  episode_reward_max: 34.0
  episode_reward_mean: 23.28
  episode_reward_min: 0.0
  episodes_this_iter: 26
  episodes_total: 1413
  experiment_id: 6af836e00d10403b9eacf3717e17bd18
  hostname: 1d29a0c222c3
  info:
    last_target_update_ts: 18466
    learner:
      policy0:
        custom_metrics: {}
        learner_stats:
          cur_lr: 0.009999999776482582
          max_q: 14.388236045837402
          mean_q: 7.176314353942871
          mean_td_error: 0.49485981464385986
          min_q: -1.8163902759552002
          model: {}
        td_error:
        - 0.8039312362670898
        - 1.875650405883789
        - 0.10837554931640625
        - -0.21106243133544922
        - 0.07729148864746094
        - 0.20067119598388672
        - -0.5483837127685547
        - 0.5539035797119141
        - -0.0378484725952

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
DQNTrainer_foodenv_76743_00000,RUNNING,172.17.0.2:19093,65,82.6907,19178,25.32,34,8,7.34


Result for DQNTrainer_foodenv_76743_00000:
  agent_timesteps_total: 38764
  custom_metrics: {}
  date: 2022-04-01_04-13-01
  done: false
  episode_len_mean: 6.74
  episode_media: {}
  episode_reward_max: 34.0
  episode_reward_mean: 26.52
  episode_reward_min: 8.0
  episodes_this_iter: 33
  episodes_total: 1532
  experiment_id: 6af836e00d10403b9eacf3717e17bd18
  hostname: 1d29a0c222c3
  info:
    last_target_update_ts: 19311
    learner:
      policy0:
        custom_metrics: {}
        learner_stats:
          cur_lr: 0.009999999776482582
          max_q: 16.284128189086914
          mean_q: 7.975105285644531
          mean_td_error: -0.1541382074356079
          min_q: 1.140122413635254
          model: {}
        td_error:
        - 0.07731819152832031
        - -0.5054183006286621
        - -0.02882838249206543
        - 0.10416030883789062
        - 0.5951786041259766
        - 0.9249563217163086
        - 1.0743083953857422
        - -0.01121377944946289
        - 0.47177886962890

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
DQNTrainer_foodenv_76743_00000,RUNNING,172.17.0.2:19093,69,88.3004,19995,30.78,34,10,4.61


Result for DQNTrainer_foodenv_76743_00000:
  agent_timesteps_total: 40408
  custom_metrics: {}
  date: 2022-04-01_04-13-07
  done: false
  episode_len_mean: 5.24
  episode_media: {}
  episode_reward_max: 34.0
  episode_reward_mean: 29.52
  episode_reward_min: 12.0
  episodes_this_iter: 34
  episodes_total: 1696
  experiment_id: 6af836e00d10403b9eacf3717e17bd18
  hostname: 1d29a0c222c3
  info:
    last_target_update_ts: 20149
    learner:
      policy0:
        custom_metrics: {}
        learner_stats:
          cur_lr: 0.009999999776482582
          max_q: 16.447742462158203
          mean_q: 7.835210800170898
          mean_td_error: 0.36630475521087646
          min_q: 1.4538906812667847
          model: {}
        td_error:
        - 0.39189577102661133
        - -0.2056121826171875
        - 0.25525569915771484
        - 0.15714263916015625
        - 0.7758417129516602
        - -1.252023696899414
        - 0.0024394989013671875
        - -0.6430091857910156
        - -0.2124767303

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
DQNTrainer_foodenv_76743_00000,RUNNING,172.17.0.2:19093,73,93.738,20825,27.58,34,12,6.21


Result for DQNTrainer_foodenv_76743_00000:
  agent_timesteps_total: 42058
  custom_metrics: {}
  date: 2022-04-01_04-13-12
  done: false
  episode_len_mean: 5.98
  episode_media: {}
  episode_reward_max: 34.0
  episode_reward_mean: 28.04
  episode_reward_min: 6.0
  episodes_this_iter: 32
  episodes_total: 1828
  experiment_id: 6af836e00d10403b9eacf3717e17bd18
  hostname: 1d29a0c222c3
  info:
    last_target_update_ts: 21003
    learner:
      policy0:
        custom_metrics: {}
        learner_stats:
          cur_lr: 0.009999999776482582
          max_q: 15.605137825012207
          mean_q: 7.503571033477783
          mean_td_error: -0.1668790578842163
          min_q: -1.6695919036865234
          model: {}
        td_error:
        - -0.042363643646240234
        - 2.4024460315704346
        - -2.0495529174804688
        - -2.300631523132324
        - 0.2927737236022949
        - 0.20853519439697266
        - -0.7211768627166748
        - 0.5228853225708008
        - 0.0068781375885

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
DQNTrainer_foodenv_76743_00000,RUNNING,172.17.0.2:19093,77,99.065,21648,30.6,34,16,4.7


Result for DQNTrainer_foodenv_76743_00000:
  agent_timesteps_total: 43720
  custom_metrics: {}
  date: 2022-04-01_04-13-18
  done: false
  episode_len_mean: 3.74
  episode_media: {}
  episode_reward_max: 34.0
  episode_reward_mean: 32.52
  episode_reward_min: 18.0
  episodes_this_iter: 57
  episodes_total: 2013
  experiment_id: 6af836e00d10403b9eacf3717e17bd18
  hostname: 1d29a0c222c3
  info:
    last_target_update_ts: 21833
    learner:
      policy0:
        custom_metrics: {}
        learner_stats:
          cur_lr: 0.009999999776482582
          max_q: 13.746150970458984
          mean_q: 6.81682825088501
          mean_td_error: -0.2084493339061737
          min_q: -2.269033670425415
          model: {}
        td_error:
        - 0.43277931213378906
        - -0.36921072006225586
        - 0.553380012512207
        - 1.095445156097412
        - -1.3824377059936523
        - -0.2669558525085449
        - 7.668928146362305
        - -0.049345970153808594
        - -0.02456808090209

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
DQNTrainer_foodenv_76743_00000,RUNNING,172.17.0.2:19093,81,104.755,22475,31.7,34,-20,4.05


Result for DQNTrainer_foodenv_76743_00000:
  agent_timesteps_total: 45368
  custom_metrics: {}
  date: 2022-04-01_04-13-24
  done: false
  episode_len_mean: 3.83
  episode_media: {}
  episode_reward_max: 34.0
  episode_reward_mean: 32.14
  episode_reward_min: -20.0
  episodes_this_iter: 56
  episodes_total: 2212
  experiment_id: 6af836e00d10403b9eacf3717e17bd18
  hostname: 1d29a0c222c3
  info:
    last_target_update_ts: 22684
    learner:
      policy0:
        custom_metrics: {}
        learner_stats:
          cur_lr: 0.009999999776482582
          max_q: 17.05537223815918
          mean_q: 8.997303009033203
          mean_td_error: 0.1611904501914978
          min_q: -2.415780782699585
          model: {}
        td_error:
        - 0.08132076263427734
        - 0.4536857604980469
        - 0.48641324043273926
        - -0.6228733062744141
        - -0.3916759490966797
        - -0.34851741790771484
        - 0.2766728401184082
        - 1.1870386600494385
        - -1.5412979125976

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
DQNTrainer_foodenv_76743_00000,RUNNING,172.17.0.2:19093,85,110.603,23299,33.1,34,24,3.45


Result for DQNTrainer_foodenv_76743_00000:
  agent_timesteps_total: 47012
  custom_metrics: {}
  date: 2022-04-01_04-13-30
  done: false
  episode_len_mean: 3.45
  episode_media: {}
  episode_reward_max: 34.0
  episode_reward_mean: 33.1
  episode_reward_min: 24.0
  episodes_this_iter: 59
  episodes_total: 2442
  experiment_id: 6af836e00d10403b9eacf3717e17bd18
  hostname: 1d29a0c222c3
  info:
    last_target_update_ts: 23506
    learner:
      policy0:
        custom_metrics: {}
        learner_stats:
          cur_lr: 0.009999999776482582
          max_q: 18.013113021850586
          mean_q: 7.6880950927734375
          mean_td_error: -0.020867682993412018
          min_q: 0.47741472721099854
          model: {}
        td_error:
        - -0.809389591217041
        - -0.7824668884277344
        - -2.5116405487060547
        - -0.23265838623046875
        - 4.311681747436523
        - 0.32550549507141113
        - 4.4841742515563965
        - 0.30559349060058594
        - 0.38438224792

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
DQNTrainer_foodenv_76743_00000,RUNNING,172.17.0.2:19093,88,115.125,23922,33.6,34,24,3.2


Result for DQNTrainer_foodenv_76743_00000:
  agent_timesteps_total: 48664
  custom_metrics: {}
  date: 2022-04-01_04-13-36
  done: false
  episode_len_mean: 3.4
  episode_media: {}
  episode_reward_max: 34.0
  episode_reward_mean: 33.2
  episode_reward_min: 26.0
  episodes_this_iter: 61
  episodes_total: 2691
  experiment_id: 6af836e00d10403b9eacf3717e17bd18
  hostname: 1d29a0c222c3
  info:
    last_target_update_ts: 24332
    learner:
      policy0:
        custom_metrics: {}
        learner_stats:
          cur_lr: 0.009999999776482582
          max_q: 18.087295532226562
          mean_q: 9.441871643066406
          mean_td_error: 0.00913470983505249
          min_q: 0.8215368986129761
          model: {}
        td_error:
        - 1.4573628902435303
        - 0.5746574401855469
        - 0.5746574401855469
        - -0.27515125274658203
        - -0.07477188110351562
        - 2.080082416534424
        - 0.0012340545654296875
        - -1.974686622619629
        - 0.545966625213623

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
DQNTrainer_foodenv_76743_00000,RUNNING,172.17.0.2:19093,92,121.083,24743,33,34,24,3.5


Result for DQNTrainer_foodenv_76743_00000:
  agent_timesteps_total: 50300
  custom_metrics: {}
  date: 2022-04-01_04-13-42
  done: false
  episode_len_mean: 3.01
  episode_media: {}
  episode_reward_max: 34.0
  episode_reward_mean: 33.98
  episode_reward_min: 32.0
  episodes_this_iter: 68
  episodes_total: 2944
  experiment_id: 6af836e00d10403b9eacf3717e17bd18
  hostname: 1d29a0c222c3
  info:
    last_target_update_ts: 25054
    learner:
      policy0:
        custom_metrics: {}
        learner_stats:
          cur_lr: 0.009999999776482582
          max_q: 17.47691535949707
          mean_q: 8.407682418823242
          mean_td_error: 0.06286823749542236
          min_q: -2.134502410888672
          model: {}
        td_error:
        - 0.056301116943359375
        - 2.25924015045166
        - -0.11838626861572266
        - -0.2757568359375
        - -0.284912109375
        - -0.11732673645019531
        - 0.29251527786254883
        - 0.10895252227783203
        - -0.04439735412597656


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
DQNTrainer_foodenv_76743_00000,RUNNING,172.17.0.2:19093,95,125.551,25354,34,34,34,3


Result for DQNTrainer_foodenv_76743_00000:
  agent_timesteps_total: 51932
  custom_metrics: {}
  date: 2022-04-01_04-13-48
  done: false
  episode_len_mean: 3.06
  episode_media: {}
  episode_reward_max: 34.0
  episode_reward_mean: 33.88
  episode_reward_min: 32.0
  episodes_this_iter: 66
  episodes_total: 3214
  experiment_id: 6af836e00d10403b9eacf3717e17bd18
  hostname: 1d29a0c222c3
  info:
    last_target_update_ts: 25870
    learner:
      policy0:
        custom_metrics: {}
        learner_stats:
          cur_lr: 0.009999999776482582
          max_q: 17.18553352355957
          mean_q: 9.20468521118164
          mean_td_error: -0.378124475479126
          min_q: 2.0992326736450195
          model: {}
        td_error:
        - 0.14053058624267578
        - -0.43143653869628906
        - -0.1893024444580078
        - -0.5519256591796875
        - -2.0222153663635254
        - -0.33851099014282227
        - -1.0572710037231445
        - -0.1812124252319336
        - -0.19479513168

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
DQNTrainer_foodenv_76743_00000,RUNNING,172.17.0.2:19093,99,131.448,26170,34,34,34,3




Result for DQNTrainer_foodenv_76743_00000:
  agent_timesteps_total: 53564
  custom_metrics: {}
  date: 2022-04-01_04-13-55
  done: false
  episode_len_mean: 3.0
  episode_media: {}
  episode_reward_max: 34.0
  episode_reward_mean: 34.0
  episode_reward_min: 34.0
  episodes_this_iter: 68
  episodes_total: 3486
  experiment_id: 6af836e00d10403b9eacf3717e17bd18
  hostname: 1d29a0c222c3
  info:
    last_target_update_ts: 26686
    learner:
      policy0:
        custom_metrics: {}
        learner_stats:
          cur_lr: 0.009999999776482582
          max_q: 17.840282440185547
          mean_q: 10.432519912719727
          mean_td_error: 0.10497783869504929
          min_q: 3.476055145263672
          model: {}
        td_error:
        - -1.094869613647461
        - -0.07267570495605469
        - -0.20101118087768555
        - -0.5071282386779785
        - 0.7454538345336914
        - 0.33957815170288086
        - -1.3115243911743164
        - -1.3390159606933594
        - -2.012603759765

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
DQNTrainer_foodenv_76743_00000,RUNNING,172.17.0.2:19093,102,135.921,26782,34,34,34,3


Result for DQNTrainer_foodenv_76743_00000:
  agent_timesteps_total: 54788
  custom_metrics: {}
  date: 2022-04-01_04-14-01
  done: false
  episode_len_mean: 3.06
  episode_media: {}
  episode_reward_max: 34.0
  episode_reward_mean: 33.88
  episode_reward_min: 28.0
  episodes_this_iter: 66
  episodes_total: 3688
  experiment_id: 6af836e00d10403b9eacf3717e17bd18
  hostname: 1d29a0c222c3
  info:
    last_target_update_ts: 27298
    learner:
      policy0:
        custom_metrics: {}
        learner_stats:
          cur_lr: 0.009999999776482582
          max_q: 18.397287368774414
          mean_q: 10.352800369262695
          mean_td_error: 0.1370566487312317
          min_q: 1.4363470077514648
          model: {}
        td_error:
        - 0.5709223747253418
        - 0.43103599548339844
        - -0.06715583801269531
        - 0.45772552490234375
        - -0.6518914699554443
        - 0.4334087371826172
        - 0.011240959167480469
        - -0.29571533203125
        - 0.4577255249023

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
DQNTrainer_foodenv_76743_00000,RUNNING,172.17.0.2:19093,106,141.661,27394,33.88,34,28,3.06


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
DQNTrainer_foodenv_76743_00000,RUNNING,172.17.0.2:19093,110,146.142,27802,34,34,34,3


Result for DQNTrainer_foodenv_76743_00000:
  agent_timesteps_total: 56012
  custom_metrics: {}
  date: 2022-04-01_04-14-08
  done: false
  episode_len_mean: 3.0
  episode_media: {}
  episode_reward_max: 34.0
  episode_reward_mean: 34.0
  episode_reward_min: 34.0
  episodes_this_iter: 68
  episodes_total: 3892
  experiment_id: 6af836e00d10403b9eacf3717e17bd18
  hostname: 1d29a0c222c3
  info:
    last_target_update_ts: 27910
    learner:
      policy0:
        custom_metrics: {}
        learner_stats:
          cur_lr: 0.009999999776482582
          max_q: 18.113245010375977
          mean_q: 10.728208541870117
          mean_td_error: 0.4734596610069275
          min_q: 2.3994948863983154
          model: {}
        td_error:
        - 0.4753098487854004
        - 0.7502470016479492
        - -0.6772093772888184
        - -0.034951210021972656
        - 2.0293121337890625
        - 0.2321786880493164
        - -0.15713024139404297
        - -0.034951210021972656
        - -0.03495121002

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
DQNTrainer_foodenv_76743_00000,RUNNING,172.17.0.2:19093,114,152.395,28516,34,34,34,3


Result for DQNTrainer_foodenv_76743_00000:
  agent_timesteps_total: 57440
  custom_metrics: {}
  date: 2022-04-01_04-14-14
  done: false
  episode_len_mean: 3.0
  episode_media: {}
  episode_reward_max: 34.0
  episode_reward_mean: 34.0
  episode_reward_min: 34.0
  episodes_this_iter: 68
  episodes_total: 4130
  experiment_id: 6af836e00d10403b9eacf3717e17bd18
  hostname: 1d29a0c222c3
  info:
    last_target_update_ts: 28624
    learner:
      policy0:
        custom_metrics: {}
        learner_stats:
          cur_lr: 0.009999999776482582
          max_q: 17.77568244934082
          mean_q: 12.024489402770996
          mean_td_error: -0.13698144257068634
          min_q: 2.923290252685547
          model: {}
        td_error:
        - 0.36968135833740234
        - -0.5135736465454102
        - 1.046504020690918
        - -0.10833549499511719
        - -0.6027355194091797
        - 1.6478519439697266
        - 0.03274726867675781
        - 0.3634309768676758
        - -0.108335494995117

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
DQNTrainer_foodenv_76743_00000,RUNNING,172.17.0.2:19093,117,157.308,29128,34,34,34,3


Result for DQNTrainer_foodenv_76743_00000:
  agent_timesteps_total: 58664
  custom_metrics: {}
  date: 2022-04-01_04-14-19
  done: false
  episode_len_mean: 3.0
  episode_media: {}
  episode_reward_max: 34.0
  episode_reward_mean: 34.0
  episode_reward_min: 34.0
  episodes_this_iter: 68
  episodes_total: 4334
  experiment_id: 6af836e00d10403b9eacf3717e17bd18
  hostname: 1d29a0c222c3
  info:
    last_target_update_ts: 29236
    learner:
      policy0:
        custom_metrics: {}
        learner_stats:
          cur_lr: 0.009999999776482582
          max_q: 18.05947494506836
          mean_q: 13.764782905578613
          mean_td_error: 0.17610222101211548
          min_q: 5.303815841674805
          model: {}
        td_error:
        - 2.2279796600341797
        - 0.21038436889648438
        - -1.0966205596923828
        - -0.14990615844726562
        - -1.0966205596923828
        - -0.28546905517578125
        - 0.5570449829101562
        - 0.5570449829101562
        - 0.286959648132324

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
DQNTrainer_foodenv_76743_00000,RUNNING,172.17.0.2:19093,120,162.27,29740,33.8,34,24,3.1


Result for DQNTrainer_foodenv_76743_00000:
  agent_timesteps_total: 60292
  custom_metrics: {}
  date: 2022-04-01_04-14-25
  done: true
  episode_len_mean: 3.01
  episode_media: {}
  episode_reward_max: 34.0
  episode_reward_mean: 33.98
  episode_reward_min: 32.0
  episodes_this_iter: 67
  episodes_total: 4599
  experiment_id: 6af836e00d10403b9eacf3717e17bd18
  hostname: 1d29a0c222c3
  info:
    last_target_update_ts: 30058
    learner:
      policy0:
        custom_metrics: {}
        learner_stats:
          cur_lr: 0.009999999776482582
          max_q: 18.256772994995117
          mean_q: 11.504793167114258
          mean_td_error: 0.5185688138008118
          min_q: 0.4442082643508911
          model: {}
        td_error:
        - 3.143796443939209
        - -0.24910831451416016
        - -0.6808938980102539
        - 0.17530250549316406
        - -0.12719345092773438
        - 0.4777870178222656
        - 0.4033021926879883
        - 0.38840293884277344
        - 0.47778701782226

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
DQNTrainer_foodenv_76743_00000,TERMINATED,172.17.0.2:19093,122,165.434,30146,33.98,34,32,3.01


[2m[36m(RolloutWorker pid=19092)[0m 2022-04-01 04:14:26,433	ERROR worker.py:432 -- SystemExit was raised from the worker.
[2m[36m(RolloutWorker pid=19092)[0m Traceback (most recent call last):
[2m[36m(RolloutWorker pid=19092)[0m   File "python/ray/_raylet.pyx", line 636, in ray._raylet.execute_task
[2m[36m(RolloutWorker pid=19092)[0m   File "python/ray/_raylet.pyx", line 640, in ray._raylet.execute_task
[2m[36m(RolloutWorker pid=19092)[0m   File "python/ray/_raylet.pyx", line 589, in ray._raylet.execute_task.function_executor
[2m[36m(RolloutWorker pid=19092)[0m   File "/home/ray/anaconda3/lib/python3.7/site-packages/ray/_private/function_manager.py", line 639, in actor_method_executor
[2m[36m(RolloutWorker pid=19092)[0m     return method(__ray_actor, *args, **kwargs)
[2m[36m(RolloutWorker pid=19092)[0m   File "/home/ray/anaconda3/lib/python3.7/site-packages/ray/util/tracing/tracing_helper.py", line 451, in _resume_span
[2m[36m(RolloutWorker pid=19092)[0m     

# Results Analysis (Coop + DQN)

In [55]:
# list of lists: one list per checkpoint; each checkpoint list contains
# 1st the path, 2nd the metric value
checkpoints = analysis.get_trial_checkpoints_paths(
    trial=analysis.get_best_trial("episode_reward_mean",mode="max"),
    metric="episode_reward_mean")

best_checkpoint = sorted(checkpoints, key=lambda x: x[1])[-1]

trainer = dqn.DQNTrainer(config=config, env="foodenv")
trainer.restore(best_checkpoint[0])

# get weights: trainer.get_weights()

# run until episode ends
episode_reward = {'agent'+str(i):0 for i in range(len(e.agents))}
done = False
obs = e.reset()
print("Initial state at Time 0:")
e.render()
time = 1
while not done:
    print("Time:",time)
    time += 1
    action = trainer.compute_actions(obs,policy_id='policy0')
    for i in range(1,len(e.agents)):
        a = trainer.compute_actions(obs,policy_id='policy'+str(i))
        action['agent'+str(i)] = a['agent'+str(i)]
    obs, reward, dones, info = e.step(action)
    done = dones['__all__']
    e.render()
    for i in range(len(e.agents)):
        episode_reward['agent'+str(i)] += reward['agent'+str(i)]
print("Episode Ended")
print("Episode Rewards",episode_reward)

2022-04-01 00:36:28,508	INFO trainable.py:473 -- Restored on 172.17.0.2 from checkpoint: /home/ray/cs4246/01_04_2022_00:33:00/DQNTrainer_2022-04-01_00-33-00/DQNTrainer_foodenv_f5013_00000_0_2022-04-01_00-33-00/checkpoint_000100/checkpoint-100
2022-04-01 00:36:28,509	INFO trainable.py:480 -- Current state after restoring: {'_iteration': 100, '_timesteps_total': 3200, '_time_total': 126.66100001335144, '_episodes_total': 4709}


Initial state at Time 0:
[['' '' '' '' '' '' '' '' '' '']
 ['' '' '' '' '' '' '' '' '' '']
 ['' '' '' '' '' '' '' '' '' '']
 ['' '' '' 'F' '' '' '' '' '' '']
 ['' '' '' '' '0' '' '' '' '' '']
 ['' '' '' '' '' 'F' '' '' '' '']
 ['' '' '' '' '' '' '' '1' '' '']]

Time: 1
[['' '' '' '' '' '' '' '' '' '']
 ['' '' '' '' '' '' '' '' '' '']
 ['' '' '' '' '' '' '' '' '' '']
 ['' '' '' 'F' '0' '' '' '' '' '']
 ['' '' '' '' '' '' '' '' '' '']
 ['' '' '' '' '' 'F' '' '1' '' '']
 ['' '' '' '' '' '' '' '' '' '']]

Time: 2
[['' '' '' '' '' '' '' '' '' '']
 ['' '' '' '' '' '' '' '' '' '']
 ['' '' '' '' '' '' '' '' '' '']
 ['' '' '' '0' '' '' '' '' '' '']
 ['' '' '' '' '' '' '' '1' '' '']
 ['' '' '' '' '' 'F' '' '' '' '']
 ['' '' '' '' '' '' '' '' '' '']]

Time: 3
[['' '' '' '' '' '' '' '' '' '']
 ['' '' '' '' '' '' '' '' '' '']
 ['' '' '' '0' '' '' '' '' '' '']
 ['' '' '' '' '' '' '' '' '' '']
 ['' '' '' '' '' '' '' '' '' '']
 ['' '' '' '' '' 'F' '' '1' '' '']
 ['' '' '' '' '' '' '' '' '' '']]

Time:

[2m[36m(RolloutWorker pid=14301)[0m Instructions for updating:
[2m[36m(RolloutWorker pid=14301)[0m Prefer Variable.assign which has equivalent behavior in 2.X.


# DQN Trainer (Comp)

In [114]:
import ray.rllib.agents.dqn as dqn
e = FoodGame(comp_env_config)

# For custom NN models
# def gen_policy(i):
#     config = {
#         "model": {
#             "custom_model": "my_model",
#         },
#         "gamma": 0.99,
#     }
#     return (None, e.observation_space, e.action_space, config)

policies = {"policy"+str(i):  PolicySpec(
                        observation_space= e.observation_space,
                        action_space= e.action_space,
                        config={"agent_id": i}) for i in range(len(e.agents))}

policy_mapping = {'agent'+str(i): "policy"+str(i) for i in range(len(e.agents))}

def p(agent_id):
    print(agent_id)
    return policies[policy_keys[agent_id]]

config = dqn.DEFAULT_CONFIG.copy()
config["num_gpus"] = 0
config["batch_mode"] = "complete_episodes"
config["multiagent"] = {"policies": policies,  "policy_mapping_fn": lambda agent_id: policy_mapping[agent_id]}
config["env"] = "foodenv"
config["num_workers"] = 1
config["env_config"] = comp_env_config

# Buffer params
config["buffer_size"] = 20000 # in batches
config["prioritized_replay"] = False

# Exploration params
config["exploration_config"] = {"type": "EpsilonGreedy", "epsilon_timesteps": 25000, "final_epsilon": 0.00}

# Learning params
config["lr"] = 1e-2
config["target_network_update_freq"] = 100

# Learning duration
config["timesteps_per_iteration"] = 100
config["learning_starts"] = 1000
config["train_batch_size"] = 32
config["rollout_fragment_length"] = 4 # no. of samples to rollout in each sample of the batch to add to buffer each timestep

# eval
# config["evaluation_interval"] = 1
# config["evaluation_duration"] = 1

now = datetime.now()
dt_string = now.strftime("%d_%m_%Y_%H:%M:%S")
os.mkdir('./'+dt_string)
log_dir = './'+dt_string+'/'

stop_criteria = {"timesteps_total": 30000}

analysis = ray.tune.run(
    dqn.DQNTrainer,
    config=config,
    local_dir=log_dir,
    stop=stop_criteria,
    checkpoint_freq=100,
    checkpoint_at_end=True)

Trial name,status,loc
DQNTrainer_foodenv_f3a64_00000,PENDING,


[2m[36m(DQNTrainer pid=20461)[0m 2022-04-01 04:14:57,078	INFO trainer.py:2055 -- Your framework setting is 'tf', meaning you are using static-graph mode. Set framework='tf2' to enable eager execution with tf2.x. You may also then want to set eager_tracing=True in order to reach similar execution speed as with static-graph mode.
[2m[36m(DQNTrainer pid=20461)[0m 2022-04-01 04:14:57,079	INFO simple_q.py:154 -- In multi-agent mode, policies will be optimized sequentially by the multi-GPU optimizer. Consider setting `simple_optimizer=True` if this doesn't work for you.
[2m[36m(DQNTrainer pid=20461)[0m 2022-04-01 04:14:57,079	INFO trainer.py:792 -- Current log_level is WARN. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.


Trial name,status,loc
DQNTrainer_foodenv_f3a64_00000,RUNNING,172.17.0.2:20461




Trial name,status,loc
DQNTrainer_foodenv_f3a64_00000,RUNNING,172.17.0.2:20461


Result for DQNTrainer_foodenv_f3a64_00000:
  agent_timesteps_total: 2038
  custom_metrics: {}
  date: 2022-04-01_04-15-09
  done: false
  episode_len_mean: 17.56896551724138
  episode_media: {}
  episode_reward_max: 14.0
  episode_reward_mean: -24.275862068965516
  episode_reward_min: -40.0
  episodes_this_iter: 58
  episodes_total: 58
  experiment_id: 43f8d33789234291abbd5b44824e2889
  hostname: 1d29a0c222c3
  info:
    last_target_update_ts: 1019
    learner:
      policy0:
        custom_metrics: {}
        learner_stats:
          cur_lr: 0.009999999776482582
          max_q: 0.6649793982505798
          mean_q: 0.06474529951810837
          mean_td_error: 0.7047158479690552
          min_q: -0.5340701937675476
          model: {}
        td_error:
        - 1.0407304763793945
        - 1.7569246292114258
        - 2.4864189624786377
        - 2.121119737625122
        - 2.077812671661377
        - 0.9304206967353821
        - 1.3943556547164917
        - 2.363179922103882
        

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
DQNTrainer_foodenv_f3a64_00000,RUNNING,172.17.0.2:20461,3,6.04135,1654,-24.9247,14,-40,17.7849


Result for DQNTrainer_foodenv_f3a64_00000:
  agent_timesteps_total: 5216
  custom_metrics: {}
  date: 2022-04-01_04-15-16
  done: false
  episode_len_mean: 18.3
  episode_media: {}
  episode_reward_max: 12.0
  episode_reward_mean: -26.8
  episode_reward_min: -40.0
  episodes_this_iter: 19
  episodes_total: 145
  experiment_id: 43f8d33789234291abbd5b44824e2889
  hostname: 1d29a0c222c3
  info:
    last_target_update_ts: 2588
    learner:
      policy0:
        custom_metrics: {}
        learner_stats:
          cur_lr: 0.009999999776482582
          max_q: -10.824976921081543
          mean_q: -11.952216148376465
          mean_td_error: 0.15570113062858582
          min_q: -12.53102970123291
          model: {}
        td_error:
        - 1.1191015243530273
        - 1.0873136520385742
        - 0.6142587661743164
        - 0.8652582168579102
        - 0.9700965881347656
        - 0.7548952102661133
        - 0.7548952102661133
        - 0.8159418106079102
        - 0.7146711349487305
 

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
DQNTrainer_foodenv_f3a64_00000,RUNNING,172.17.0.2:20461,7,10.8767,2927,-25.88,12,-40,18.04


Result for DQNTrainer_foodenv_f3a64_00000:
  agent_timesteps_total: 7830
  custom_metrics: {}
  date: 2022-04-01_04-15-21
  done: false
  episode_len_mean: 17.37
  episode_media: {}
  episode_reward_max: 12.0
  episode_reward_mean: -24.24
  episode_reward_min: -40.0
  episodes_this_iter: 18
  episodes_total: 220
  experiment_id: 43f8d33789234291abbd5b44824e2889
  hostname: 1d29a0c222c3
  info:
    last_target_update_ts: 3815
    learner:
      policy0:
        custom_metrics: {}
        learner_stats:
          cur_lr: 0.009999999776482582
          max_q: -7.777599334716797
          mean_q: -15.26637077331543
          mean_td_error: -0.06331312656402588
          min_q: -16.97015953063965
          model: {}
        td_error:
        - 1.3578014373779297
        - 0.44939613342285156
        - 0.4143810272216797
        - 0.5985050201416016
        - 0.1995105743408203
        - -4.723553657531738
        - 0.44110679626464844
        - 0.01125335693359375
        - 0.27630996704101

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
DQNTrainer_foodenv_f3a64_00000,RUNNING,172.17.0.2:20461,11,15.8036,4230,-25.76,10,-40,17.88


Result for DQNTrainer_foodenv_f3a64_00000:
  agent_timesteps_total: 10996
  custom_metrics: {}
  date: 2022-04-01_04-15-27
  done: false
  episode_len_mean: 18.11
  episode_media: {}
  episode_reward_max: 12.0
  episode_reward_mean: -26.42
  episode_reward_min: -40.0
  episodes_this_iter: 19
  episodes_total: 307
  experiment_id: 43f8d33789234291abbd5b44824e2889
  hostname: 1d29a0c222c3
  info:
    last_target_update_ts: 5398
    learner:
      policy0:
        custom_metrics: {}
        learner_stats:
          cur_lr: 0.009999999776482582
          max_q: -24.400650024414062
          mean_q: -25.36679458618164
          mean_td_error: -0.6194554567337036
          min_q: -26.608016967773438
          model: {}
        td_error:
        - 1.2937183380126953
        - 1.1435909271240234
        - -23.56553077697754
        - 0.8293952941894531
        - 0.3928050994873047
        - 0.5535430908203125
        - 0.7357120513916016
        - 1.158987045288086
        - 0.6201019287109375

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
DQNTrainer_foodenv_f3a64_00000,RUNNING,172.17.0.2:20461,15,20.6673,5498,-26.42,12,-40,18.11


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
DQNTrainer_foodenv_f3a64_00000,RUNNING,172.17.0.2:20461,18,24.6712,6468,-23.62,14,-40,17.31


Result for DQNTrainer_foodenv_f3a64_00000:
  agent_timesteps_total: 13598
  custom_metrics: {}
  date: 2022-04-01_04-15-33
  done: false
  episode_len_mean: 16.72
  episode_media: {}
  episode_reward_max: 14.0
  episode_reward_mean: -21.84
  episode_reward_min: -40.0
  episodes_this_iter: 21
  episodes_total: 386
  experiment_id: 43f8d33789234291abbd5b44824e2889
  hostname: 1d29a0c222c3
  info:
    last_target_update_ts: 6729
    learner:
      policy0:
        custom_metrics: {}
        learner_stats:
          cur_lr: 0.009999999776482582
          max_q: -35.30793762207031
          mean_q: -36.36309814453125
          mean_td_error: -2.5804555416107178
          min_q: -37.02524185180664
          model: {}
        td_error:
        - -0.111236572265625
        - -0.5400886535644531
        - 0.38048553466796875
        - 0.39666748046875
        - 0.24942398071289062
        - -0.12961196899414062
        - -0.3407859802246094
        - -0.1884307861328125
        - 0.101646423339

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
DQNTrainer_foodenv_f3a64_00000,RUNNING,172.17.0.2:20461,22,29.5284,7654,-20.94,14,-40,16.37


Result for DQNTrainer_foodenv_f3a64_00000:
  agent_timesteps_total: 15952
  custom_metrics: {}
  date: 2022-04-01_04-15-38
  done: false
  episode_len_mean: 16.73
  episode_media: {}
  episode_reward_max: 14.0
  episode_reward_mean: -21.76
  episode_reward_min: -40.0
  episodes_this_iter: 18
  episodes_total: 456
  experiment_id: 43f8d33789234291abbd5b44824e2889
  hostname: 1d29a0c222c3
  info:
    last_target_update_ts: 7936
    learner:
      policy0:
        custom_metrics: {}
        learner_stats:
          cur_lr: 0.009999999776482582
          max_q: -42.6901969909668
          mean_q: -44.66306686401367
          mean_td_error: -5.379543781280518
          min_q: -46.06056213378906
          model: {}
        td_error:
        - 0.22397994995117188
        - 0.5646934509277344
        - -0.3327789306640625
        - -0.620513916015625
        - -0.3621978759765625
        - 0.5623207092285156
        - -0.24502182006835938
        - -44.39485549926758
        - -0.3277130126953

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
DQNTrainer_foodenv_f3a64_00000,RUNNING,172.17.0.2:20461,26,34.5079,8924,-23.48,14,-40,17.14


Result for DQNTrainer_foodenv_f3a64_00000:
  agent_timesteps_total: 18484
  custom_metrics: {}
  date: 2022-04-01_04-15-43
  done: false
  episode_len_mean: 17.69
  episode_media: {}
  episode_reward_max: 14.0
  episode_reward_mean: -25.08
  episode_reward_min: -40.0
  episodes_this_iter: 18
  episodes_total: 528
  experiment_id: 43f8d33789234291abbd5b44824e2889
  hostname: 1d29a0c222c3
  info:
    last_target_update_ts: 9242
    learner:
      policy0:
        custom_metrics: {}
        learner_stats:
          cur_lr: 0.009999999776482582
          max_q: -52.877140045166016
          mean_q: -53.72016143798828
          mean_td_error: -2.418001890182495
          min_q: -54.571537017822266
          model: {}
        td_error:
        - 0.036006927490234375
        - -0.23481369018554688
        - 0.9946098327636719
        - 0.07114028930664062
        - -9.174995422363281
        - -0.47945404052734375
        - 1.42645263671875
        - -0.5676383972167969
        - -9.426757812

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
DQNTrainer_foodenv_f3a64_00000,RUNNING,172.17.0.2:20461,30,39.5194,10203,-26.72,14,-40,18.01


Result for DQNTrainer_foodenv_f3a64_00000:
  agent_timesteps_total: 20842
  custom_metrics: {}
  date: 2022-04-01_04-15-48
  done: false
  episode_len_mean: 18.22
  episode_media: {}
  episode_reward_max: 12.0
  episode_reward_mean: -27.74
  episode_reward_min: -40.0
  episodes_this_iter: 13
  episodes_total: 592
  experiment_id: 43f8d33789234291abbd5b44824e2889
  hostname: 1d29a0c222c3
  info:
    last_target_update_ts: 10375
    learner:
      policy0:
        custom_metrics: {}
        learner_stats:
          cur_lr: 0.009999999776482582
          max_q: -62.68682098388672
          mean_q: -66.73705291748047
          mean_td_error: -8.515669822692871
          min_q: -68.62277221679688
          model: {}
        td_error:
        - 1.0878143310546875
        - -75.9627456665039
        - -0.4271659851074219
        - -1.0844573974609375
        - 0.2740325927734375
        - 0.1460418701171875
        - 0.5300140380859375
        - -0.6036300659179688
        - 0.782249450683593

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
DQNTrainer_foodenv_f3a64_00000,RUNNING,172.17.0.2:20461,34,44.3328,11401,-26.1,14,-40,17.45


Result for DQNTrainer_foodenv_f3a64_00000:
  agent_timesteps_total: 23432
  custom_metrics: {}
  date: 2022-04-01_04-15-53
  done: false
  episode_len_mean: 17.53
  episode_media: {}
  episode_reward_max: 14.0
  episode_reward_mean: -26.86
  episode_reward_min: -40.0
  episodes_this_iter: 16
  episodes_total: 666
  experiment_id: 43f8d33789234291abbd5b44824e2889
  hostname: 1d29a0c222c3
  info:
    last_target_update_ts: 11696
    learner:
      policy0:
        custom_metrics: {}
        learner_stats:
          cur_lr: 0.009999999776482582
          max_q: -67.0409927368164
          mean_q: -70.81454467773438
          mean_td_error: -1.9792735576629639
          min_q: -73.5518569946289
          model: {}
        td_error:
        - 0.3636016845703125
        - 0.30025482177734375
        - -0.2123260498046875
        - -0.05278778076171875
        - 0.5384979248046875
        - -72.39453887939453
        - 0.09502410888671875
        - 0.880950927734375
        - 0.27454376220703

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
DQNTrainer_foodenv_f3a64_00000,RUNNING,172.17.0.2:20461,39,50.1303,12837,-30.08,14,-40,18.59


Result for DQNTrainer_foodenv_f3a64_00000:
  agent_timesteps_total: 26274
  custom_metrics: {}
  date: 2022-04-01_04-15-59
  done: false
  episode_len_mean: 19.38
  episode_media: {}
  episode_reward_max: 14.0
  episode_reward_mean: -32.06
  episode_reward_min: -40.0
  episodes_this_iter: 15
  episodes_total: 739
  experiment_id: 43f8d33789234291abbd5b44824e2889
  hostname: 1d29a0c222c3
  info:
    last_target_update_ts: 13097
    learner:
      policy0:
        custom_metrics: {}
        learner_stats:
          cur_lr: 0.009999999776482582
          max_q: -63.89541244506836
          mean_q: -67.17355346679688
          mean_td_error: -11.069683074951172
          min_q: -70.63358306884766
          model: {}
        td_error:
        - -1.449188232421875
        - -1.3108596801757812
        - 0.22731781005859375
        - -63.178855895996094
        - -0.5715408325195312
        - 1.1113853454589844
        - -0.5841445922851562
        - -0.8771591186523438
        - -1.396347045

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
DQNTrainer_foodenv_f3a64_00000,RUNNING,172.17.0.2:20461,43,54.5219,14048,-32.78,14,-40,19.69


Result for DQNTrainer_foodenv_f3a64_00000:
  agent_timesteps_total: 29324
  custom_metrics: {}
  date: 2022-04-01_04-16-05
  done: false
  episode_len_mean: 19.65
  episode_media: {}
  episode_reward_max: 6.0
  episode_reward_mean: -33.1
  episode_reward_min: -40.0
  episodes_this_iter: 16
  episodes_total: 817
  experiment_id: 43f8d33789234291abbd5b44824e2889
  hostname: 1d29a0c222c3
  info:
    last_target_update_ts: 14622
    learner:
      policy0:
        custom_metrics: {}
        learner_stats:
          cur_lr: 0.009999999776482582
          max_q: -65.82879638671875
          mean_q: -66.524169921875
          mean_td_error: -0.4208188056945801
          min_q: -67.37167358398438
          model: {}
        td_error:
        - 0.6931610107421875
        - 0.8508987426757812
        - -0.8996124267578125
        - 0.00043487548828125
        - 0.7117919921875
        - -0.0603790283203125
        - -0.3138885498046875
        - 0.04476165771484375
        - -0.209320068359375
 

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
DQNTrainer_foodenv_f3a64_00000,RUNNING,172.17.0.2:20461,47,59.4329,15262,-33.5,6,-40,19.65


Result for DQNTrainer_foodenv_f3a64_00000:
  agent_timesteps_total: 32324
  custom_metrics: {}
  date: 2022-04-01_04-16-12
  done: false
  episode_len_mean: 19.74
  episode_media: {}
  episode_reward_max: 6.0
  episode_reward_mean: -34.48
  episode_reward_min: -40.0
  episodes_this_iter: 15
  episodes_total: 892
  experiment_id: 43f8d33789234291abbd5b44824e2889
  hostname: 1d29a0c222c3
  info:
    last_target_update_ts: 16062
    learner:
      policy0:
        custom_metrics: {}
        learner_stats:
          cur_lr: 0.009999999776482582
          max_q: -62.212406158447266
          mean_q: -63.86095428466797
          mean_td_error: 2.0004966259002686
          min_q: -65.70736694335938
          model: {}
        td_error:
        - 3.4071807861328125
        - 1.5128097534179688
        - 1.28448486328125
        - 1.3609237670898438
        - 1.5805435180664062
        - 2.103252410888672
        - 1.7797470092773438
        - 2.9673614501953125
        - 0.76031494140625
     

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
DQNTrainer_foodenv_f3a64_00000,RUNNING,172.17.0.2:20461,52,65.3892,16771,-35.28,2,-40,19.89


Result for DQNTrainer_foodenv_f3a64_00000:
  agent_timesteps_total: 35382
  custom_metrics: {}
  date: 2022-04-01_04-16-18
  done: false
  episode_len_mean: 19.49
  episode_media: {}
  episode_reward_max: 12.0
  episode_reward_mean: -34.38
  episode_reward_min: -40.0
  episodes_this_iter: 16
  episodes_total: 971
  experiment_id: 43f8d33789234291abbd5b44824e2889
  hostname: 1d29a0c222c3
  info:
    last_target_update_ts: 17691
    learner:
      policy0:
        custom_metrics: {}
        learner_stats:
          cur_lr: 0.009999999776482582
          max_q: -67.58438110351562
          mean_q: -70.01862335205078
          mean_td_error: -3.298466444015503
          min_q: -73.185546875
          model: {}
        td_error:
        - -0.8498306274414062
        - -1.0848541259765625
        - -0.6150665283203125
        - -0.02811431884765625
        - 0.07430267333984375
        - -1.4709243774414062
        - -1.379974365234375
        - -1.112548828125
        - 0.326416015625
     

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
DQNTrainer_foodenv_f3a64_00000,RUNNING,172.17.0.2:20461,56,70.175,17991,-34.68,12,-40,19.49


Result for DQNTrainer_foodenv_f3a64_00000:
  agent_timesteps_total: 38406
  custom_metrics: {}
  date: 2022-04-01_04-16-24
  done: false
  episode_len_mean: 19.56
  episode_media: {}
  episode_reward_max: 12.0
  episode_reward_mean: -35.52
  episode_reward_min: -40.0
  episodes_this_iter: 16
  episodes_total: 1048
  experiment_id: 43f8d33789234291abbd5b44824e2889
  hostname: 1d29a0c222c3
  info:
    last_target_update_ts: 19103
    learner:
      policy0:
        custom_metrics: {}
        learner_stats:
          cur_lr: 0.009999999776482582
          max_q: -70.66542053222656
          mean_q: -72.38944244384766
          mean_td_error: -7.35831356048584
          min_q: -73.9136962890625
          model: {}
        td_error:
        - 0.9200286865234375
        - 0.4076995849609375
        - -9.436508178710938
        - 0.5185470581054688
        - 0.2406005859375
        - 0.28678131103515625
        - 0.8547515869140625
        - 0.49072265625
        - -0.11667633056640625
      

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
DQNTrainer_foodenv_f3a64_00000,RUNNING,172.17.0.2:20461,60,74.8234,19203,-35.52,12,-40,19.56


Result for DQNTrainer_foodenv_f3a64_00000:
  agent_timesteps_total: 41406
  custom_metrics: {}
  date: 2022-04-01_04-16-30
  done: false
  episode_len_mean: 19.72
  episode_media: {}
  episode_reward_max: 10.0
  episode_reward_mean: -36.94
  episode_reward_min: -40.0
  episodes_this_iter: 15
  episodes_total: 1123
  experiment_id: 43f8d33789234291abbd5b44824e2889
  hostname: 1d29a0c222c3
  info:
    last_target_update_ts: 20663
    learner:
      policy0:
        custom_metrics: {}
        learner_stats:
          cur_lr: 0.009999999776482582
          max_q: -72.29829406738281
          mean_q: -75.0681381225586
          mean_td_error: -0.08209395408630371
          min_q: -77.34774780273438
          model: {}
        td_error:
        - -0.32318878173828125
        - 0.45609283447265625
        - 0.6941909790039062
        - -0.00542449951171875
        - -0.49576568603515625
        - -0.25516510009765625
        - -0.875274658203125
        - -0.303955078125
        - 0.298248291

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
DQNTrainer_foodenv_f3a64_00000,RUNNING,172.17.0.2:20461,65,80.5548,20703,-36.94,10,-40,19.72


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
DQNTrainer_foodenv_f3a64_00000,RUNNING,172.17.0.2:20461,69,85.2213,21903,-38.4,-30,-40,20


Result for DQNTrainer_foodenv_f3a64_00000:
  agent_timesteps_total: 44406
  custom_metrics: {}
  date: 2022-04-01_04-16-36
  done: false
  episode_len_mean: 20.0
  episode_media: {}
  episode_reward_max: -30.0
  episode_reward_mean: -38.7
  episode_reward_min: -40.0
  episodes_this_iter: 15
  episodes_total: 1198
  experiment_id: 43f8d33789234291abbd5b44824e2889
  hostname: 1d29a0c222c3
  info:
    last_target_update_ts: 22103
    learner:
      policy0:
        custom_metrics: {}
        learner_stats:
          cur_lr: 0.009999999776482582
          max_q: -76.02014923095703
          mean_q: -77.9546890258789
          mean_td_error: -9.170734405517578
          min_q: -79.3365707397461
          model: {}
        td_error:
        - -0.445037841796875
        - 0.980438232421875
        - 2.5007171630859375
        - 1.7609176635742188
        - -87.00434112548828
        - 1.213043212890625
        - -75.02014923095703
        - 2.1174163818359375
        - 1.6714096069335938
    

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
DQNTrainer_foodenv_f3a64_00000,RUNNING,172.17.0.2:20461,74,90.8891,23403,-39,-30,-40,20


Result for DQNTrainer_foodenv_f3a64_00000:
  agent_timesteps_total: 47406
  custom_metrics: {}
  date: 2022-04-01_04-16-42
  done: false
  episode_len_mean: 20.0
  episode_media: {}
  episode_reward_max: -30.0
  episode_reward_mean: -39.5
  episode_reward_min: -40.0
  episodes_this_iter: 15
  episodes_total: 1273
  experiment_id: 43f8d33789234291abbd5b44824e2889
  hostname: 1d29a0c222c3
  info:
    last_target_update_ts: 23663
    learner:
      policy0:
        custom_metrics: {}
        learner_stats:
          cur_lr: 0.009999999776482582
          max_q: -75.72018432617188
          mean_q: -79.20428466796875
          mean_td_error: 0.21366405487060547
          min_q: -84.97614288330078
          model: {}
        td_error:
        - -0.1597900390625
        - 0.0472412109375
        - -0.7767257690429688
        - 0.09889984130859375
        - -1.6234893798828125
        - -0.6915740966796875
        - -0.06475067138671875
        - 0.16448211669921875
        - 0.03115081787109

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
DQNTrainer_foodenv_f3a64_00000,RUNNING,172.17.0.2:20461,78,95.8638,24603,-39.6,-30,-40,20


Result for DQNTrainer_foodenv_f3a64_00000:
  agent_timesteps_total: 49806
  custom_metrics: {}
  date: 2022-04-01_04-16-47
  done: false
  episode_len_mean: 20.0
  episode_media: {}
  episode_reward_max: -30.0
  episode_reward_mean: -39.7
  episode_reward_min: -40.0
  episodes_this_iter: 15
  episodes_total: 1333
  experiment_id: 43f8d33789234291abbd5b44824e2889
  hostname: 1d29a0c222c3
  info:
    last_target_update_ts: 24863
    learner:
      policy0:
        custom_metrics: {}
        learner_stats:
          cur_lr: 0.009999999776482582
          max_q: -76.40267944335938
          mean_q: -78.58735656738281
          mean_td_error: -3.6817989349365234
          min_q: -84.9720230102539
          model: {}
        td_error:
        - -1.5561981201171875
        - -1.1321563720703125
        - -1.891021728515625
        - -1.8920059204101562
        - -1.808319091796875
        - -1.5810775756835938
        - -1.791168212890625
        - -1.7445297241210938
        - -1.59833526611

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
DQNTrainer_foodenv_f3a64_00000,RUNNING,172.17.0.2:20461,83,101.74,26103,-40,-40,-40,20


Result for DQNTrainer_foodenv_f3a64_00000:
  agent_timesteps_total: 52806
  custom_metrics: {}
  date: 2022-04-01_04-16-53
  done: false
  episode_len_mean: 20.0
  episode_media: {}
  episode_reward_max: -40.0
  episode_reward_mean: -40.0
  episode_reward_min: -40.0
  episodes_this_iter: 15
  episodes_total: 1408
  experiment_id: 43f8d33789234291abbd5b44824e2889
  hostname: 1d29a0c222c3
  info:
    last_target_update_ts: 26303
    learner:
      policy0:
        custom_metrics: {}
        learner_stats:
          cur_lr: 0.009999999776482582
          max_q: -74.17301940917969
          mean_q: -79.2740478515625
          mean_td_error: -7.266983985900879
          min_q: -87.39598846435547
          model: {}
        td_error:
        - -0.7339324951171875
        - -1.3946914672851562
        - 1.7386245727539062
        - -0.7384262084960938
        - -0.937103271484375
        - -1.3675079345703125
        - 0.2681884765625
        - 0.34789276123046875
        - -0.949104309082031

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
DQNTrainer_foodenv_f3a64_00000,RUNNING,172.17.0.2:20461,87,106.564,27303,-40,-40,-40,20


Result for DQNTrainer_foodenv_f3a64_00000:
  agent_timesteps_total: 55206
  custom_metrics: {}
  date: 2022-04-01_04-16-58
  done: false
  episode_len_mean: 20.0
  episode_media: {}
  episode_reward_max: -40.0
  episode_reward_mean: -40.0
  episode_reward_min: -40.0
  episodes_this_iter: 15
  episodes_total: 1468
  experiment_id: 43f8d33789234291abbd5b44824e2889
  hostname: 1d29a0c222c3
  info:
    last_target_update_ts: 27503
    learner:
      policy0:
        custom_metrics: {}
        learner_stats:
          cur_lr: 0.009999999776482582
          max_q: -71.67064666748047
          mean_q: -76.25967407226562
          mean_td_error: 0.5345897674560547
          min_q: -83.65037536621094
          model: {}
        td_error:
        - -0.04266357421875
        - 0.8610000610351562
        - 0.5261688232421875
        - 0.7689971923828125
        - 0.9368438720703125
        - 0.3939208984375
        - -0.2229766845703125
        - 0.9316635131835938
        - 1.3775253295898438
   

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
DQNTrainer_foodenv_f3a64_00000,RUNNING,172.17.0.2:20461,91,111.301,28503,-40,-40,-40,20


Result for DQNTrainer_foodenv_f3a64_00000:
  agent_timesteps_total: 58206
  custom_metrics: {}
  date: 2022-04-01_04-17-05
  done: false
  episode_len_mean: 20.0
  episode_media: {}
  episode_reward_max: -40.0
  episode_reward_mean: -40.0
  episode_reward_min: -40.0
  episodes_this_iter: 15
  episodes_total: 1543
  experiment_id: 43f8d33789234291abbd5b44824e2889
  hostname: 1d29a0c222c3
  info:
    last_target_update_ts: 29063
    learner:
      policy0:
        custom_metrics: {}
        learner_stats:
          cur_lr: 0.009999999776482582
          max_q: -71.65473937988281
          mean_q: -76.37184143066406
          mean_td_error: -4.601147174835205
          min_q: -83.63836669921875
          model: {}
        td_error:
        - 0.257965087890625
        - 0.6244735717773438
        - 0.257965087890625
        - 1.2918930053710938
        - -0.13196563720703125
        - 2.7148284912109375
        - 0.14471435546875
        - -0.26543426513671875
        - 0.11617279052734375

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
DQNTrainer_foodenv_f3a64_00000,RUNNING,172.17.0.2:20461,95,116.2,29703,-40,-40,-40,20


Result for DQNTrainer_foodenv_f3a64_00000:
  agent_timesteps_total: 60006
  custom_metrics: {}
  date: 2022-04-01_04-17-09
  done: true
  episode_len_mean: 20.0
  episode_media: {}
  episode_reward_max: -40.0
  episode_reward_mean: -40.0
  episode_reward_min: -40.0
  episodes_this_iter: 15
  episodes_total: 1588
  experiment_id: 43f8d33789234291abbd5b44824e2889
  hostname: 1d29a0c222c3
  info:
    last_target_update_ts: 29903
    learner:
      policy0:
        custom_metrics: {}
        learner_stats:
          cur_lr: 0.009999999776482582
          max_q: -71.06665802001953
          mean_q: -74.20606231689453
          mean_td_error: -2.535921573638916
          min_q: -83.14420318603516
          model: {}
        td_error:
        - -0.07349395751953125
        - 0.25847625732421875
        - -0.26969146728515625
        - -0.45507049560546875
        - -1.0364456176757812
        - -1.1384201049804688
        - -0.08036041259765625
        - 0.32134246826171875
        - 0.027549



Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
DQNTrainer_foodenv_f3a64_00000,TERMINATED,172.17.0.2:20461,96,117.54,30003,-40,-40,-40,20


2022-04-01 04:17:09,965	INFO tune.py:636 -- Total run time: 137.78 seconds (137.14 seconds for the tuning loop).


# Results Analysis (Comp + DQN)

In [47]:
# list of lists: one list per checkpoint; each checkpoint list contains
# 1st the path, 2nd the metric value
checkpoints = analysis.get_trial_checkpoints_paths(
    trial=analysis.get_best_trial("episode_reward_mean",mode="max"),
    metric="episode_reward_mean")

best_checkpoint = sorted(checkpoints, key=lambda x: x[1])[-1]
last_checkpoint = checkpoints[-1]

trainer = dqn.DQNTrainer(config=config, env="foodenv")
trainer.restore(last_checkpoint[0])

# get weights: trainer.get_weights()

# run until episode ends
episode_reward = {'agent'+str(i):0 for i in range(len(e.agents))}
done = False
obs = e.reset()
print("Initial state at Time 0:")
e.render()
time = 1
while not done:
    print("Time:",time)
    time += 1
    action = trainer.compute_actions(obs,policy_id='policy0')
    for i in range(1,len(e.agents)):
        a = trainer.compute_actions(obs,policy_id='policy'+str(i))
        action['agent'+str(i)] = a['agent'+str(i)]
    obs, reward, dones, info = e.step(action)
    done = dones['__all__']
    e.render()
    for i in range(len(e.agents)):
        episode_reward['agent'+str(i)] += reward['agent'+str(i)]
print("Episode Ended")
print("Episode Rewards",episode_reward)

2022-04-01 00:04:52,177	INFO trainable.py:473 -- Restored on 172.17.0.2 from checkpoint: /home/ray/cs4246/31_03_2022_23:50:10/DQNTrainer_2022-03-31_23-50-10/DQNTrainer_foodenv_f928e_00000_0_2022-03-31_23-50-10/checkpoint_000100/checkpoint-100
2022-04-01 00:04:52,178	INFO trainable.py:480 -- Current state after restoring: {'_iteration': 100, '_timesteps_total': 3200, '_time_total': 121.95173215866089, '_episodes_total': 2526}


Initial state at Time 0:
[['' '' '' '' '' '' '' '' '' '']
 ['' '' '' '' '' '' '' '' '' '']
 ['' '' '' '' '' '' '' '' '' '']
 ['' '' '' 'F' '' '' '' '' '' '']
 ['' '' '' '' '0' '' '' '' '' '']
 ['' '' '' '' '' 'F' '' '' '' '']
 ['' '' '' '' '' '' '' '1' '' '']]

Time: 1
[['' '' '' '' '' '' '' '' '' '']
 ['' '' '' '' '' '' '' '' '' '']
 ['' '' '' '' '' '' '' '' '' '']
 ['' '' '' 'F' '' '' '' '' '' '']
 ['' '' '' '' '' '0' '' '' '' '']
 ['' '' '' '' '' 'F' '' '1' '' '']
 ['' '' '' '' '' '' '' '' '' '']]

Time: 2
[['' '' '' '' '' '' '' '' '' '']
 ['' '' '' '' '' '' '' '' '' '']
 ['' '' '' '' '' '' '' '' '' '']
 ['' '' '' 'F' '' '' '' '' '' '']
 ['' '' '' '' '' '' '' '' '' '']
 ['' '' '' '' '' '0' '' '' '' '']
 ['' '' '' '' '' '' '' '1' '' '']]

Time: 3
[['' '' '' '' '' '' '' '' '' '']
 ['' '' '' '' '' '' '' '' '' '']
 ['' '' '' '' '' '' '' '' '' '']
 ['' '' '' 'F' '' '' '' '' '' '']
 ['' '' '' '' '' '' '' '' '' '']
 ['' '' '' '' '0' '' '' '1' '' '']
 ['' '' '' '' '' '' '' '' '' '']]

Time:

[2m[36m(RolloutWorker pid=13858)[0m Instructions for updating:
[2m[36m(RolloutWorker pid=13858)[0m Prefer Variable.assign which has equivalent behavior in 2.X.


# MADDPG Trainer (Coop)

In [127]:
coop_env_config['trainer'] = 'maddpg'
comp_env_config['trainer'] = 'maddpg'
e = FoodGame(coop_env_config)

# For custom NN models
# def gen_policy(i):
#     config = {
#         "model": {
#             "custom_model": "my_model",
#         },
#         "gamma": 0.99,
#     }
#     return (None, e.observation_space, e.action_space, config)

policies = {"policy"+str(i):  PolicySpec(
                        observation_space= e.observation_space,
                        action_space= e.action_space,
                        config={"agent_id": i}) for i in range(len(e.agents))}

policy_mapping = {'agent'+str(i): "policy"+str(i) for i in range(len(e.agents))}

def p(agent_id):
    print(agent_id)
    return policies[policy_keys[agent_id]]

config = dict()
config["num_gpus"] = 0
config["multiagent"] = {"policies": policies,  "policy_mapping_fn": lambda agent_id: policy_mapping[agent_id]}
config["env"] = "foodenv"
config["num_workers"] = 1
config["env_config"] = coop_env_config
config["batch_mode"] = "complete_episodes"

# model
config["actor_hiddens"] = [256, 256]
config["actor_hidden_activation"] = "tanh"
config["critic_hiddens"] = [256, 256]
config["critic_hidden_activation"] = "tanh"
config["learning_starts"] = 1000 # in terms of samples
config["critic_lr"] = 1e-2 # in terms of samples
config["actor_lr"] = 1e-2 # in terms of samples

# Buffer params
config["buffer_size"] = 20000 # in batches

# Exploration params
config["exploration_config"] = {"type": "EpsilonGreedy", "epsilon_timesteps": 25000, "final_epsilon": 0.00}

# Learning params
config["lr"] = 1e-2
config["target_network_update_freq"] = 100

# Learning duration
config["timesteps_per_iteration"] = 100
config["learning_starts"] = 1000
config["train_batch_size"] = 32
config["rollout_fragment_length"] = 4 # no. of samples to rollout in each sample of the batch to add to buffer each timestep

now = datetime.now()
dt_string = now.strftime("%d_%m_%Y_%H:%M:%S")
os.mkdir('./'+dt_string)
log_dir = './'+dt_string+'/'

stop_criteria = {"timesteps_total": 30000}

analysis = ray.tune.run(
    "contrib/MADDPG",
    config=config,
    local_dir=log_dir,
    stop=stop_criteria,
    checkpoint_freq=100,
    checkpoint_at_end=True)

Trial name,status,loc
contrib_MADDPG_foodenv_30cc8_00000,PENDING,


[2m[36m(MADDPGTrainer pid=27071)[0m 2022-04-01 05:28:13,922	INFO trainer.py:2055 -- Your framework setting is 'tf', meaning you are using static-graph mode. Set framework='tf2' to enable eager execution with tf2.x. You may also then want to set eager_tracing=True in order to reach similar execution speed as with static-graph mode.
[2m[36m(MADDPGTrainer pid=27071)[0m 2022-04-01 05:28:13,923	INFO trainer.py:792 -- Current log_level is WARN. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.
[2m[36m(RolloutWorker pid=27070)[0m   "Box bound precision lowered by casting to {}".format(self.dtype)
[2m[36m(RolloutWorker pid=27070)[0m   out, units=hidden, activation=activation)
[2m[36m(RolloutWorker pid=27070)[0m   return layer.apply(inputs)
[2m[36m(RolloutWorker pid=27070)[0m   out = tf1.layers.dense(feature, units=1, activation=None)
[2m[36m(RolloutWorker pid=27070)[0m   out, units=hidden, activation=activation)
[2m[36m(RolloutWorker pi

Trial name,status,loc
contrib_MADDPG_foodenv_30cc8_00000,RUNNING,172.17.0.2:27071




Trial name,status,loc
contrib_MADDPG_foodenv_30cc8_00000,RUNNING,172.17.0.2:27071


Result for contrib_MADDPG_foodenv_30cc8_00000:
  agent_timesteps_total: 2014
  custom_metrics: {}
  date: 2022-04-01_05-28-24
  done: false
  episode_len_mean: 19.0
  episode_media: {}
  episode_reward_max: 28.0
  episode_reward_mean: -20.264150943396228
  episode_reward_min: -40.0
  episodes_this_iter: 53
  episodes_total: 53
  experiment_id: 987af3d6744e4009b4bc9d92518314c0
  hostname: 1d29a0c222c3
  info:
    last_target_update_ts: 1007
    learner:
      policy0:
        learner_stats: {}
      policy1:
        learner_stats: {}
    num_agent_steps_sampled: 2014
    num_agent_steps_trained: 64
    num_steps_sampled: 1007
    num_steps_trained: 32
    num_steps_trained_this_iter: 32
    num_target_updates: 1
  iterations_since_restore: 1
  node_ip: 172.17.0.2
  num_healthy_workers: 1
  off_policy_estimator: {}
  perf:
    cpu_util_percent: 14.36
    ram_util_percent: 47.92
  pid: 27071
  policy_reward_max:
    policy0: 14.0
    policy1: 14.0
  policy_reward_mean:
    policy0: -10.13

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
contrib_MADDPG_foodenv_30cc8_00000,RUNNING,172.17.0.2:27071,11,6.05644,2049,-23.16,30,-40,19.18


Result for contrib_MADDPG_foodenv_30cc8_00000:
  agent_timesteps_total: 4950
  custom_metrics: {}
  date: 2022-04-01_05-28-29
  done: false
  episode_len_mean: 18.79
  episode_media: {}
  episode_reward_max: 32.0
  episode_reward_mean: -21.98
  episode_reward_min: -40.0
  episodes_this_iter: 6
  episodes_total: 131
  experiment_id: 987af3d6744e4009b4bc9d92518314c0
  hostname: 1d29a0c222c3
  info:
    last_target_update_ts: 2383
    learner:
      policy0:
        learner_stats: {}
      policy1:
        learner_stats: {}
    num_agent_steps_sampled: 4950
    num_agent_steps_trained: 5056
    num_steps_sampled: 2475
    num_steps_trained: 2528
    num_steps_trained_this_iter: 32
    num_target_updates: 13
  iterations_since_restore: 15
  node_ip: 172.17.0.2
  num_healthy_workers: 1
  off_policy_estimator: {}
  perf:
    cpu_util_percent: 16.1
    ram_util_percent: 48.2
  pid: 27071
  policy_reward_max:
    policy0: 16.0
    policy1: 16.0
  policy_reward_mean:
    policy0: -10.99
    pol

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
contrib_MADDPG_foodenv_30cc8_00000,RUNNING,172.17.0.2:27071,24,10.6955,3469,-17.84,32,-40,18.42


Result for contrib_MADDPG_foodenv_30cc8_00000:
  agent_timesteps_total: 8034
  custom_metrics: {}
  date: 2022-04-01_05-28-34
  done: false
  episode_len_mean: 18.58
  episode_media: {}
  episode_reward_max: 32.0
  episode_reward_mean: -18.36
  episode_reward_min: -40.0
  episodes_this_iter: 6
  episodes_total: 213
  experiment_id: 987af3d6744e4009b4bc9d92518314c0
  hostname: 1d29a0c222c3
  info:
    last_target_update_ts: 3997
    learner:
      policy0:
        learner_stats: {}
      policy1:
        learner_stats: {}
    num_agent_steps_sampled: 8034
    num_agent_steps_trained: 10304
    num_steps_sampled: 4017
    num_steps_trained: 5152
    num_steps_trained_this_iter: 32
    num_target_updates: 27
  iterations_since_restore: 29
  node_ip: 172.17.0.2
  num_healthy_workers: 1
  off_policy_estimator: {}
  perf: {}
  pid: 27071
  policy_reward_max:
    policy0: 16.0
    policy1: 16.0
  policy_reward_mean:
    policy0: -9.18
    policy1: -9.18
  policy_reward_min:
    policy0: -20.0

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
contrib_MADDPG_foodenv_30cc8_00000,RUNNING,172.17.0.2:27071,39,15.6977,5062,-18.86,30,-40,18.73


Result for contrib_MADDPG_foodenv_30cc8_00000:
  agent_timesteps_total: 11170
  custom_metrics: {}
  date: 2022-04-01_05-28-40
  done: false
  episode_len_mean: 18.8
  episode_media: {}
  episode_reward_max: 28.0
  episode_reward_mean: -18.2
  episode_reward_min: -40.0
  episodes_this_iter: 6
  episodes_total: 297
  experiment_id: 987af3d6744e4009b4bc9d92518314c0
  hostname: 1d29a0c222c3
  info:
    last_target_update_ts: 5585
    learner:
      policy0:
        learner_stats: {}
      policy1:
        learner_stats: {}
    num_agent_steps_sampled: 11170
    num_agent_steps_trained: 15680
    num_steps_sampled: 5585
    num_steps_trained: 7840
    num_steps_trained_this_iter: 32
    num_target_updates: 41
  iterations_since_restore: 44
  node_ip: 172.17.0.2
  num_healthy_workers: 1
  off_policy_estimator: {}
  perf:
    cpu_util_percent: 15.2
    ram_util_percent: 48.3
  pid: 27071
  policy_reward_max:
    policy0: 14.0
    policy1: 14.0
  policy_reward_mean:
    policy0: -9.1
    poli

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
contrib_MADDPG_foodenv_30cc8_00000,RUNNING,172.17.0.2:27071,54,20.6252,6661,-19.64,32,-40,18.72


Result for contrib_MADDPG_foodenv_30cc8_00000:
  agent_timesteps_total: 14366
  custom_metrics: {}
  date: 2022-04-01_05-28-45
  done: false
  episode_len_mean: 18.38
  episode_media: {}
  episode_reward_max: 34.0
  episode_reward_mean: -18.76
  episode_reward_min: -40.0
  episodes_this_iter: 5
  episodes_total: 384
  experiment_id: 987af3d6744e4009b4bc9d92518314c0
  hostname: 1d29a0c222c3
  info:
    last_target_update_ts: 7083
    learner:
      policy0:
        learner_stats: {}
      policy1:
        learner_stats: {}
    num_agent_steps_sampled: 14366
    num_agent_steps_trained: 21184
    num_steps_sampled: 7183
    num_steps_trained: 10592
    num_steps_trained_this_iter: 32
    num_target_updates: 54
  iterations_since_restore: 59
  node_ip: 172.17.0.2
  num_healthy_workers: 1
  off_policy_estimator: {}
  perf: {}
  pid: 27071
  policy_reward_max:
    policy0: 17.0
    policy1: 17.0
  policy_reward_mean:
    policy0: -9.38
    policy1: -9.38
  policy_reward_min:
    policy0: -2

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
contrib_MADDPG_foodenv_30cc8_00000,RUNNING,172.17.0.2:27071,69,25.5536,8275,-16.16,34,-40,17.68


Result for contrib_MADDPG_foodenv_30cc8_00000:
  agent_timesteps_total: 17410
  custom_metrics: {}
  date: 2022-04-01_05-28-50
  done: false
  episode_len_mean: 18.42
  episode_media: {}
  episode_reward_max: 34.0
  episode_reward_mean: -18.64
  episode_reward_min: -40.0
  episodes_this_iter: 6
  episodes_total: 468
  experiment_id: 987af3d6744e4009b4bc9d92518314c0
  hostname: 1d29a0c222c3
  info:
    last_target_update_ts: 8685
    learner:
      policy0:
        learner_stats: {}
      policy1:
        learner_stats: {}
    num_agent_steps_sampled: 17410
    num_agent_steps_trained: 26496
    num_steps_sampled: 8705
    num_steps_trained: 13248
    num_steps_trained_this_iter: 32
    num_target_updates: 68
  iterations_since_restore: 73
  node_ip: 172.17.0.2
  num_healthy_workers: 1
  off_policy_estimator: {}
  perf:
    cpu_util_percent: 18.3
    ram_util_percent: 48.5
  pid: 27071
  policy_reward_max:
    policy0: 17.0
    policy1: 17.0
  policy_reward_mean:
    policy0: -9.32
    

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
contrib_MADDPG_foodenv_30cc8_00000,RUNNING,172.17.0.2:27071,84,30.4998,9878,-17.2,34,-40,18.6


Result for contrib_MADDPG_foodenv_30cc8_00000:
  agent_timesteps_total: 20426
  custom_metrics: {}
  date: 2022-04-01_05-28-55
  done: false
  episode_len_mean: 18.54
  episode_media: {}
  episode_reward_max: 34.0
  episode_reward_mean: -16.08
  episode_reward_min: -40.0
  episodes_this_iter: 6
  episodes_total: 550
  experiment_id: 987af3d6744e4009b4bc9d92518314c0
  hostname: 1d29a0c222c3
  info:
    last_target_update_ts: 10153
    learner:
      policy0:
        learner_stats: {}
      policy1:
        learner_stats: {}
    num_agent_steps_sampled: 20426
    num_agent_steps_trained: 31616
    num_steps_sampled: 10213
    num_steps_trained: 15808
    num_steps_trained_this_iter: 32
    num_target_updates: 81
  iterations_since_restore: 87
  node_ip: 172.17.0.2
  num_healthy_workers: 1
  off_policy_estimator: {}
  perf:
    cpu_util_percent: 15.1
    ram_util_percent: 48.6
  pid: 27071
  policy_reward_max:
    policy0: 17.0
    policy1: 17.0
  policy_reward_mean:
    policy0: -8.04
  

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
contrib_MADDPG_foodenv_30cc8_00000,RUNNING,172.17.0.2:27071,97,35.1824,11277,-14.42,34,-40,17.91




Result for contrib_MADDPG_foodenv_30cc8_00000:
  agent_timesteps_total: 23378
  custom_metrics: {}
  date: 2022-04-01_05-29-00
  done: false
  episode_len_mean: 17.91
  episode_media: {}
  episode_reward_max: 34.0
  episode_reward_mean: -14.82
  episode_reward_min: -40.0
  episodes_this_iter: 6
  episodes_total: 632
  experiment_id: 987af3d6744e4009b4bc9d92518314c0
  hostname: 1d29a0c222c3
  info:
    last_target_update_ts: 11603
    learner:
      policy0:
        learner_stats: {}
      policy1:
        learner_stats: {}
    num_agent_steps_sampled: 23378
    num_agent_steps_trained: 36864
    num_steps_sampled: 11689
    num_steps_trained: 18432
    num_steps_trained_this_iter: 32
    num_target_updates: 94
  iterations_since_restore: 101
  node_ip: 172.17.0.2
  num_healthy_workers: 1
  off_policy_estimator: {}
  perf:
    cpu_util_percent: 16.9
    ram_util_percent: 48.7
  pid: 27071
  policy_reward_max:
    policy0: 17.0
    policy1: 17.0
  policy_reward_mean:
    policy0: -7.41
 

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
contrib_MADDPG_foodenv_30cc8_00000,RUNNING,172.17.0.2:27071,110,39.7841,12690,-17.08,32,-40,18.44


Result for contrib_MADDPG_foodenv_30cc8_00000:
  agent_timesteps_total: 26398
  custom_metrics: {}
  date: 2022-04-01_05-29-06
  done: false
  episode_len_mean: 18.4
  episode_media: {}
  episode_reward_max: 32.0
  episode_reward_mean: -16.8
  episode_reward_min: -40.0
  episodes_this_iter: 5
  episodes_total: 714
  experiment_id: 987af3d6744e4009b4bc9d92518314c0
  hostname: 1d29a0c222c3
  info:
    last_target_update_ts: 13179
    learner:
      policy0:
        learner_stats: {}
      policy1:
        learner_stats: {}
    num_agent_steps_sampled: 26398
    num_agent_steps_trained: 42112
    num_steps_sampled: 13199
    num_steps_trained: 21056
    num_steps_trained_this_iter: 32
    num_target_updates: 108
  iterations_since_restore: 115
  node_ip: 172.17.0.2
  num_healthy_workers: 1
  off_policy_estimator: {}
  perf: {}
  pid: 27071
  policy_reward_max:
    policy0: 16.0
    policy1: 16.0
  policy_reward_mean:
    policy0: -8.4
    policy1: -8.4
  policy_reward_min:
    policy0: -2

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
contrib_MADDPG_foodenv_30cc8_00000,RUNNING,172.17.0.2:27071,125,44.7518,14273,-15.82,32,-40,18.21


Result for contrib_MADDPG_foodenv_30cc8_00000:
  agent_timesteps_total: 29408
  custom_metrics: {}
  date: 2022-04-01_05-29-11
  done: false
  episode_len_mean: 18.14
  episode_media: {}
  episode_reward_max: 34.0
  episode_reward_mean: -15.28
  episode_reward_min: -40.0
  episodes_this_iter: 7
  episodes_total: 798
  experiment_id: 987af3d6744e4009b4bc9d92518314c0
  hostname: 1d29a0c222c3
  info:
    last_target_update_ts: 14632
    learner:
      policy0:
        learner_stats: {}
      policy1:
        learner_stats: {}
    num_agent_steps_sampled: 29408
    num_agent_steps_trained: 47424
    num_steps_sampled: 14704
    num_steps_trained: 23712
    num_steps_trained_this_iter: 32
    num_target_updates: 121
  iterations_since_restore: 129
  node_ip: 172.17.0.2
  num_healthy_workers: 1
  off_policy_estimator: {}
  perf: {}
  pid: 27071
  policy_reward_max:
    policy0: 17.0
    policy1: 17.0
  policy_reward_mean:
    policy0: -7.64
    policy1: -7.64
  policy_reward_min:
    policy0

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
contrib_MADDPG_foodenv_30cc8_00000,RUNNING,172.17.0.2:27071,139,49.4219,15768,-12.64,34,-40,17.32


Result for contrib_MADDPG_foodenv_30cc8_00000:
  agent_timesteps_total: 32374
  custom_metrics: {}
  date: 2022-04-01_05-29-16
  done: false
  episode_len_mean: 17.7
  episode_media: {}
  episode_reward_max: 34.0
  episode_reward_mean: -14.6
  episode_reward_min: -40.0
  episodes_this_iter: 6
  episodes_total: 881
  experiment_id: 987af3d6744e4009b4bc9d92518314c0
  hostname: 1d29a0c222c3
  info:
    last_target_update_ts: 16103
    learner:
      policy0:
        learner_stats: {}
      policy1:
        learner_stats: {}
    num_agent_steps_sampled: 32374
    num_agent_steps_trained: 52672
    num_steps_sampled: 16187
    num_steps_trained: 26336
    num_steps_trained_this_iter: 32
    num_target_updates: 134
  iterations_since_restore: 143
  node_ip: 172.17.0.2
  num_healthy_workers: 1
  off_policy_estimator: {}
  perf: {}
  pid: 27071
  policy_reward_max:
    policy0: 17.0
    policy1: 17.0
  policy_reward_mean:
    policy0: -7.3
    policy1: -7.3
  policy_reward_min:
    policy0: -2

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
contrib_MADDPG_foodenv_30cc8_00000,RUNNING,172.17.0.2:27071,154,54.4137,17360,-19.34,32,-40,18.77


Result for contrib_MADDPG_foodenv_30cc8_00000:
  agent_timesteps_total: 35340
  custom_metrics: {}
  date: 2022-04-01_05-29-21
  done: false
  episode_len_mean: 18.89
  episode_media: {}
  episode_reward_max: 32.0
  episode_reward_mean: -20.38
  episode_reward_min: -40.0
  episodes_this_iter: 6
  episodes_total: 959
  experiment_id: 987af3d6744e4009b4bc9d92518314c0
  hostname: 1d29a0c222c3
  info:
    last_target_update_ts: 17600
    learner:
      policy0:
        learner_stats: {}
      policy1:
        learner_stats: {}
    num_agent_steps_sampled: 35340
    num_agent_steps_trained: 57664
    num_steps_sampled: 17670
    num_steps_trained: 28832
    num_steps_trained_this_iter: 32
    num_target_updates: 147
  iterations_since_restore: 157
  node_ip: 172.17.0.2
  num_healthy_workers: 1
  off_policy_estimator: {}
  perf:
    cpu_util_percent: 16.5
    ram_util_percent: 49.0
  pid: 27071
  policy_reward_max:
    policy0: 16.0
    policy1: 16.0
  policy_reward_mean:
    policy0: -10.19

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
contrib_MADDPG_foodenv_30cc8_00000,RUNNING,172.17.0.2:27071,168,59.1061,18850,-17.14,30,-40,18.57


Result for contrib_MADDPG_foodenv_30cc8_00000:
  agent_timesteps_total: 38344
  custom_metrics: {}
  date: 2022-04-01_05-29-26
  done: false
  episode_len_mean: 18.12
  episode_media: {}
  episode_reward_max: 32.0
  episode_reward_mean: -14.84
  episode_reward_min: -40.0
  episodes_this_iter: 7
  episodes_total: 1043
  experiment_id: 987af3d6744e4009b4bc9d92518314c0
  hostname: 1d29a0c222c3
  info:
    last_target_update_ts: 19172
    learner:
      policy0:
        learner_stats: {}
      policy1:
        learner_stats: {}
    num_agent_steps_sampled: 38344
    num_agent_steps_trained: 63040
    num_steps_sampled: 19172
    num_steps_trained: 31520
    num_steps_trained_this_iter: 32
    num_target_updates: 161
  iterations_since_restore: 171
  node_ip: 172.17.0.2
  num_healthy_workers: 1
  off_policy_estimator: {}
  perf:
    cpu_util_percent: 17.3
    ram_util_percent: 49.0
  pid: 27071
  policy_reward_max:
    policy0: 16.0
    policy1: 16.0
  policy_reward_mean:
    policy0: -7.42

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
contrib_MADDPG_foodenv_30cc8_00000,RUNNING,172.17.0.2:27071,182,64.0865,20371,-11.16,32,-40,17.18


Result for contrib_MADDPG_foodenv_30cc8_00000:
  agent_timesteps_total: 41380
  custom_metrics: {}
  date: 2022-04-01_05-29-31
  done: false
  episode_len_mean: 16.95
  episode_media: {}
  episode_reward_max: 32.0
  episode_reward_mean: -10.3
  episode_reward_min: -40.0
  episodes_this_iter: 7
  episodes_total: 1132
  experiment_id: 987af3d6744e4009b4bc9d92518314c0
  hostname: 1d29a0c222c3
  info:
    last_target_update_ts: 20690
    learner:
      policy0:
        learner_stats: {}
      policy1:
        learner_stats: {}
    num_agent_steps_sampled: 41380
    num_agent_steps_trained: 68736
    num_steps_sampled: 20690
    num_steps_trained: 34368
    num_steps_trained_this_iter: 32
    num_target_updates: 175
  iterations_since_restore: 185
  node_ip: 172.17.0.2
  num_healthy_workers: 1
  off_policy_estimator: {}
  perf:
    cpu_util_percent: 16.2
    ram_util_percent: 49.1
  pid: 27071
  policy_reward_max:
    policy0: 16.0
    policy1: 16.0
  policy_reward_mean:
    policy0: -5.15


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
contrib_MADDPG_foodenv_30cc8_00000,RUNNING,172.17.0.2:27071,197,68.9823,21930,-14.56,32,-40,18.38


Result for contrib_MADDPG_foodenv_30cc8_00000:
  agent_timesteps_total: 44514
  custom_metrics: {}
  date: 2022-04-01_05-29-37
  done: false
  episode_len_mean: 18.73
  episode_media: {}
  episode_reward_max: 32.0
  episode_reward_mean: -17.06
  episode_reward_min: -40.0
  episodes_this_iter: 6
  episodes_total: 1214
  experiment_id: 987af3d6744e4009b4bc9d92518314c0
  hostname: 1d29a0c222c3
  info:
    last_target_update_ts: 22177
    learner:
      policy0:
        learner_stats: {}
      policy1:
        learner_stats: {}
    num_agent_steps_sampled: 44514
    num_agent_steps_trained: 73984
    num_steps_sampled: 22257
    num_steps_trained: 36992
    num_steps_trained_this_iter: 32
    num_target_updates: 188
  iterations_since_restore: 200
  node_ip: 172.17.0.2
  num_healthy_workers: 1
  off_policy_estimator: {}
  perf: {}
  pid: 27071
  policy_reward_max:
    policy0: 16.0
    policy1: 16.0
  policy_reward_mean:
    policy0: -8.53
    policy1: -8.53
  policy_reward_min:
    policy

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
contrib_MADDPG_foodenv_30cc8_00000,RUNNING,172.17.0.2:27071,211,73.6286,23399,-20.16,34,-40,19.08


Result for contrib_MADDPG_foodenv_30cc8_00000:
  agent_timesteps_total: 47416
  custom_metrics: {}
  date: 2022-04-01_05-29-42
  done: false
  episode_len_mean: 19.18
  episode_media: {}
  episode_reward_max: 34.0
  episode_reward_mean: -20.56
  episode_reward_min: -40.0
  episodes_this_iter: 5
  episodes_total: 1290
  experiment_id: 987af3d6744e4009b4bc9d92518314c0
  hostname: 1d29a0c222c3
  info:
    last_target_update_ts: 23688
    learner:
      policy0:
        learner_stats: {}
      policy1:
        learner_stats: {}
    num_agent_steps_sampled: 47416
    num_agent_steps_trained: 78784
    num_steps_sampled: 23708
    num_steps_trained: 39392
    num_steps_trained_this_iter: 32
    num_target_updates: 201
  iterations_since_restore: 214
  node_ip: 172.17.0.2
  num_healthy_workers: 1
  off_policy_estimator: {}
  perf: {}
  pid: 27071
  policy_reward_max:
    policy0: 17.0
    policy1: 17.0
  policy_reward_mean:
    policy0: -10.28
    policy1: -10.28
  policy_reward_min:
    poli

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
contrib_MADDPG_foodenv_30cc8_00000,RUNNING,172.17.0.2:27071,225,78.4127,24919,-17.34,32,-40,18.47


Result for contrib_MADDPG_foodenv_30cc8_00000:
  agent_timesteps_total: 50466
  custom_metrics: {}
  date: 2022-04-01_05-29-47
  done: false
  episode_len_mean: 18.21
  episode_media: {}
  episode_reward_max: 34.0
  episode_reward_mean: -16.42
  episode_reward_min: -40.0
  episodes_this_iter: 7
  episodes_total: 1375
  experiment_id: 987af3d6744e4009b4bc9d92518314c0
  hostname: 1d29a0c222c3
  info:
    last_target_update_ts: 25147
    learner:
      policy0:
        learner_stats: {}
      policy1:
        learner_stats: {}
    num_agent_steps_sampled: 50466
    num_agent_steps_trained: 84160
    num_steps_sampled: 25233
    num_steps_trained: 42080
    num_steps_trained_this_iter: 32
    num_target_updates: 214
  iterations_since_restore: 228
  node_ip: 172.17.0.2
  num_healthy_workers: 1
  off_policy_estimator: {}
  perf:
    cpu_util_percent: 17.1
    ram_util_percent: 49.1
  pid: 27071
  policy_reward_max:
    policy0: 17.0
    policy1: 17.0
  policy_reward_mean:
    policy0: -8.21

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
contrib_MADDPG_foodenv_30cc8_00000,RUNNING,172.17.0.2:27071,239,83.3531,26423,-16.26,34,-40,18.23


Result for contrib_MADDPG_foodenv_30cc8_00000:
  agent_timesteps_total: 53468
  custom_metrics: {}
  date: 2022-04-01_05-29-52
  done: false
  episode_len_mean: 17.95
  episode_media: {}
  episode_reward_max: 34.0
  episode_reward_mean: -14.3
  episode_reward_min: -40.0
  episodes_this_iter: 6
  episodes_total: 1458
  experiment_id: 987af3d6744e4009b4bc9d92518314c0
  hostname: 1d29a0c222c3
  info:
    last_target_update_ts: 26724
    learner:
      policy0:
        learner_stats: {}
      policy1:
        learner_stats: {}
    num_agent_steps_sampled: 53468
    num_agent_steps_trained: 89472
    num_steps_sampled: 26734
    num_steps_trained: 44736
    num_steps_trained_this_iter: 32
    num_target_updates: 228
  iterations_since_restore: 242
  node_ip: 172.17.0.2
  num_healthy_workers: 1
  off_policy_estimator: {}
  perf: {}
  pid: 27071
  policy_reward_max:
    policy0: 17.0
    policy1: 17.0
  policy_reward_mean:
    policy0: -7.15
    policy1: -7.15
  policy_reward_min:
    policy0

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
contrib_MADDPG_foodenv_30cc8_00000,RUNNING,172.17.0.2:27071,253,88.2034,27910,-15.68,30,-40,18.04


Result for contrib_MADDPG_foodenv_30cc8_00000:
  agent_timesteps_total: 56442
  custom_metrics: {}
  date: 2022-04-01_05-29-57
  done: false
  episode_len_mean: 17.98
  episode_media: {}
  episode_reward_max: 30.0
  episode_reward_mean: -15.16
  episode_reward_min: -40.0
  episodes_this_iter: 6
  episodes_total: 1540
  experiment_id: 987af3d6744e4009b4bc9d92518314c0
  hostname: 1d29a0c222c3
  info:
    last_target_update_ts: 28167
    learner:
      policy0:
        learner_stats: {}
      policy1:
        learner_stats: {}
    num_agent_steps_sampled: 56442
    num_agent_steps_trained: 94720
    num_steps_sampled: 28221
    num_steps_trained: 47360
    num_steps_trained_this_iter: 32
    num_target_updates: 241
  iterations_since_restore: 256
  node_ip: 172.17.0.2
  num_healthy_workers: 1
  off_policy_estimator: {}
  perf:
    cpu_util_percent: 15.3
    ram_util_percent: 49.1
  pid: 27071
  policy_reward_max:
    policy0: 15.0
    policy1: 15.0
  policy_reward_mean:
    policy0: -7.58

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
contrib_MADDPG_foodenv_30cc8_00000,RUNNING,172.17.0.2:27071,267,93.1012,29395,-16.38,30,-40,18.29


Result for contrib_MADDPG_foodenv_30cc8_00000:
  agent_timesteps_total: 59428
  custom_metrics: {}
  date: 2022-04-01_05-30-03
  done: false
  episode_len_mean: 18.32
  episode_media: {}
  episode_reward_max: 34.0
  episode_reward_mean: -16.64
  episode_reward_min: -40.0
  episodes_this_iter: 6
  episodes_total: 1621
  experiment_id: 987af3d6744e4009b4bc9d92518314c0
  hostname: 1d29a0c222c3
  info:
    last_target_update_ts: 29651
    learner:
      policy0:
        learner_stats: {}
      policy1:
        learner_stats: {}
    num_agent_steps_sampled: 59428
    num_agent_steps_trained: 99840
    num_steps_sampled: 29714
    num_steps_trained: 49920
    num_steps_trained_this_iter: 32
    num_target_updates: 254
  iterations_since_restore: 270
  node_ip: 172.17.0.2
  num_healthy_workers: 1
  off_policy_estimator: {}
  perf: {}
  pid: 27071
  policy_reward_max:
    policy0: 17.0
    policy1: 17.0
  policy_reward_mean:
    policy0: -8.32
    policy1: -8.32
  policy_reward_min:
    policy

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
contrib_MADDPG_foodenv_30cc8_00000,TERMINATED,172.17.0.2:27071,273,95.1373,30025,-16.56,34,-40,18.38


2022-04-01 05:30:04,765	INFO tune.py:636 -- Total run time: 115.02 seconds (114.58 seconds for the tuning loop).


# Results Analysis (Coop + MADDPG)

In [49]:
import ray.rllib.contrib.maddpg as maddpg

# list of lists: one list per checkpoint; each checkpoint list contains
# 1st the path, 2nd the metric value
checkpoints = analysis.get_trial_checkpoints_paths(
    trial=analysis.get_best_trial("episode_reward_mean",mode="max"),
    metric="episode_reward_mean")

best_checkpoint = sorted(checkpoints, key=lambda x: x[1])[-1]
last_checkpoint = checkpoints[-1]

trainer = maddpg.MADDPGTrainer(config=config, env="foodenv")
trainer.restore(last_checkpoint[0])

# get weights: trainer.get_weights()

# run until episode ends
episode_reward = {'agent'+str(i):0 for i in range(len(e.agents))}
done = False
obs = e.reset()
print("Initial state at Time 0:")
e.render()
time = 1
while not done:
    print("Time:",time)
    time += 1
    action = trainer.compute_actions(obs,policy_id='policy0')
    for i in range(1,len(e.agents)):
        a = trainer.compute_actions(obs,policy_id='policy'+str(i))
        action['agent'+str(i)] = a['agent'+str(i)]
    obs, reward, dones, info = e.step(action)
    done = dones['__all__']
    e.render()
    for i in range(len(e.agents)):
        episode_reward['agent'+str(i)] += reward['agent'+str(i)]
print("Episode Ended")
print("Episode Rewards",episode_reward)

[2m[36m(RolloutWorker pid=2860)[0m   "Box bound precision lowered by casting to {}".format(self.dtype)
[2m[36m(RolloutWorker pid=2860)[0m   out, units=hidden, activation=activation)
[2m[36m(RolloutWorker pid=2860)[0m   return layer.apply(inputs)
[2m[36m(RolloutWorker pid=2860)[0m   out = tf1.layers.dense(feature, units=1, activation=None)
[2m[36m(RolloutWorker pid=2860)[0m   out, units=hidden, activation=activation)
[2m[36m(RolloutWorker pid=2860)[0m   out, units=act_space.shape[0], activation=None)
2022-03-06 05:54:38,128	INFO trainable.py:473 -- Restored on 172.17.0.2 from checkpoint: /home/ray/cs4246/06_03_2022_05:42:24/contrib/MADDPG/contrib_MADDPG_foodenv_41590_00000_0_2022-03-06_05-42-24/checkpoint_000020/checkpoint-20
2022-03-06 05:54:38,130	INFO trainable.py:480 -- Current state after restoring: {'_iteration': 20, '_timesteps_total': 20480, '_time_total': 681.5899374485016, '_episodes_total': 1157}


Initial state at Time 0:
[['' '' '' '' '' '' '' '' '' '']
 ['' '' '' '' '' '' '' '' '' '']
 ['' '' '' '' '' '' '' '' '' '']
 ['' '' '' 'F' '' '' '' '' '' '']
 ['' '' '' '' '0' '' '' '' '' '']
 ['' '' '' '' '' 'F' '' '' '' '']
 ['' '' '' '' '' '' '' '1' '' '']]

Time: 1
[['' '' '' '' '' '' '' '' '' '']
 ['' '' '' '' '' '' '' '' '' '']
 ['' '' '' '' '' '' '' '' '' '']
 ['' '' '' 'F' '0' '' '' '' '' '']
 ['' '' '' '' '' '' '' '' '' '']
 ['' '' '' '' '' 'F' '' '' '' '']
 ['' '' '' '' '' '' '' '1' '' '']]

Time: 2
[['' '' '' '' '' '' '' '' '' '']
 ['' '' '' '' '' '' '' '' '' '']
 ['' '' '' '' '0' '' '' '' '' '']
 ['' '' '' 'F' '' '' '' '' '' '']
 ['' '' '' '' '' '' '' '' '' '']
 ['' '' '' '' '' 'F' '' '1' '' '']
 ['' '' '' '' '' '' '' '' '' '']]

Time: 3
[['' '' '' '' '' '' '' '' '' '']
 ['' '' '' '' '0' '' '' '' '' '']
 ['' '' '' '' '' '' '' '' '' '']
 ['' '' '' 'F' '' '' '' '' '' '']
 ['' '' '' '' '' '' '' '' '' '']
 ['' '' '' '' '' 'F' '1' '' '' '']
 ['' '' '' '' '' '' '' '' '' '']]

Tim

# MADDPG Trainer (Comp)

In [126]:
coop_env_config['trainer'] = 'maddpg'
comp_env_config['trainer'] = 'maddpg'

e = FoodGame(comp_env_config)

# For custom NN models
# def gen_policy(i):
#     config = {
#         "model": {
#             "custom_model": "my_model",
#         },
#         "gamma": 0.99,
#     }
#     return (None, e.observation_space, e.action_space, config)

policies = {"policy"+str(i):  PolicySpec(
                        observation_space= e.observation_space,
                        action_space= e.action_space,
                        config={"agent_id": i}) for i in range(len(e.agents))}

policy_mapping = {'agent'+str(i): "policy"+str(i) for i in range(len(e.agents))}

def p(agent_id):
    print(agent_id)
    return policies[policy_keys[agent_id]]

config = dict()
config["num_gpus"] = 0
config["multiagent"] = {"policies": policies,  "policy_mapping_fn": lambda agent_id: policy_mapping[agent_id]}
config["env"] = "foodenv"
config["num_workers"] = 1
config["env_config"] = coop_env_config
config["batch_mode"] = "complete_episodes"

# model
config["actor_hiddens"] = [256, 256]
config["actor_hidden_activation"] = "tanh"
config["critic_hiddens"] = [256, 256]
config["critic_hidden_activation"] = "tanh"
config["learning_starts"] = 1000 # in terms of samples
config["critic_lr"] = 1e-2 # in terms of samples
config["actor_lr"] = 1e-2 # in terms of samples

# Buffer params
config["buffer_size"] = 20000 # in batches

# Exploration params
config["exploration_config"] = {"type": "EpsilonGreedy", "epsilon_timesteps": 25000, "final_epsilon": 0.00}

# Learning params
config["lr"] = 1e-2
config["target_network_update_freq"] = 100

# Learning duration
config["timesteps_per_iteration"] = 100
config["learning_starts"] = 1000
config["train_batch_size"] = 32
config["rollout_fragment_length"] = 4 # no. of samples to rollout in each sample of the batch to add to buffer each timestep

now = datetime.now()
dt_string = now.strftime("%d_%m_%Y_%H:%M:%S")
os.mkdir('./'+dt_string)
log_dir = './'+dt_string+'/'

stop_criteria = {"timesteps_total": 30000}

analysis = ray.tune.run(
    "contrib/MADDPG",
    config=config,
    local_dir=log_dir,
    stop=stop_criteria,
#     checkpoint_freq=100,
    checkpoint_at_end=True)

Trial name,status,loc
contrib_MADDPG_foodenv_c2dbf_00000,PENDING,


[2m[36m(MADDPGTrainer pid=26986)[0m 2022-04-01 05:25:09,635	INFO trainer.py:2055 -- Your framework setting is 'tf', meaning you are using static-graph mode. Set framework='tf2' to enable eager execution with tf2.x. You may also then want to set eager_tracing=True in order to reach similar execution speed as with static-graph mode.
[2m[36m(MADDPGTrainer pid=26986)[0m 2022-04-01 05:25:09,637	INFO trainer.py:792 -- Current log_level is WARN. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.
[2m[36m(RolloutWorker pid=26985)[0m   "Box bound precision lowered by casting to {}".format(self.dtype)
[2m[36m(RolloutWorker pid=26985)[0m   out, units=hidden, activation=activation)
[2m[36m(RolloutWorker pid=26985)[0m   return layer.apply(inputs)
[2m[36m(RolloutWorker pid=26985)[0m   out = tf1.layers.dense(feature, units=1, activation=None)
[2m[36m(RolloutWorker pid=26985)[0m   out, units=hidden, activation=activation)
[2m[36m(RolloutWorker pi

Trial name,status,loc
contrib_MADDPG_foodenv_c2dbf_00000,RUNNING,172.17.0.2:26986




Trial name,status,loc
contrib_MADDPG_foodenv_c2dbf_00000,RUNNING,172.17.0.2:26986


Result for contrib_MADDPG_foodenv_c2dbf_00000:
  agent_timesteps_total: 2020
  custom_metrics: {}
  date: 2022-04-01_05-25-20
  done: false
  episode_len_mean: 19.80392156862745
  episode_media: {}
  episode_reward_max: 18.0
  episode_reward_mean: -28.627450980392158
  episode_reward_min: -40.0
  episodes_this_iter: 51
  episodes_total: 51
  experiment_id: eb5006d1bbc74e1d929a5571fd0b2b83
  hostname: 1d29a0c222c3
  info:
    last_target_update_ts: 1010
    learner:
      policy0:
        learner_stats: {}
      policy1:
        learner_stats: {}
    num_agent_steps_sampled: 2020
    num_agent_steps_trained: 64
    num_steps_sampled: 1010
    num_steps_trained: 32
    num_steps_trained_this_iter: 32
    num_target_updates: 1
  iterations_since_restore: 1
  node_ip: 172.17.0.2
  num_healthy_workers: 1
  off_policy_estimator: {}
  perf:
    cpu_util_percent: 25.24
    ram_util_percent: 53.9
  pid: 26986
  policy_reward_max:
    policy0: 9.0
    policy1: 9.0
  policy_reward_mean:
    polic

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
contrib_MADDPG_foodenv_c2dbf_00000,RUNNING,172.17.0.2:26986,10,5.99324,1910,-29.7917,18,-40,19.8958


Result for contrib_MADDPG_foodenv_c2dbf_00000:
  agent_timesteps_total: 4820
  custom_metrics: {}
  date: 2022-04-01_05-25-25
  done: false
  episode_len_mean: 20.0
  episode_media: {}
  episode_reward_max: -20.0
  episode_reward_mean: -32.4
  episode_reward_min: -40.0
  episodes_this_iter: 5
  episodes_total: 121
  experiment_id: eb5006d1bbc74e1d929a5571fd0b2b83
  hostname: 1d29a0c222c3
  info:
    last_target_update_ts: 2330
    learner:
      policy0:
        learner_stats: {}
      policy1:
        learner_stats: {}
    num_agent_steps_sampled: 4820
    num_agent_steps_trained: 4544
    num_steps_sampled: 2410
    num_steps_trained: 2272
    num_steps_trained_this_iter: 32
    num_target_updates: 12
  iterations_since_restore: 15
  node_ip: 172.17.0.2
  num_healthy_workers: 1
  off_policy_estimator: {}
  perf: {}
  pid: 26986
  policy_reward_max:
    policy0: -10.0
    policy1: -10.0
  policy_reward_mean:
    policy0: -16.2
    policy1: -16.2
  policy_reward_min:
    policy0: -20.0

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
contrib_MADDPG_foodenv_c2dbf_00000,RUNNING,172.17.0.2:26986,25,10.8762,3410,-35.8,-20,-40,20


Result for contrib_MADDPG_foodenv_c2dbf_00000:
  agent_timesteps_total: 7820
  custom_metrics: {}
  date: 2022-04-01_05-25-30
  done: false
  episode_len_mean: 19.8
  episode_media: {}
  episode_reward_max: 22.0
  episode_reward_mean: -36.0
  episode_reward_min: -40.0
  episodes_this_iter: 5
  episodes_total: 197
  experiment_id: eb5006d1bbc74e1d929a5571fd0b2b83
  hostname: 1d29a0c222c3
  info:
    last_target_update_ts: 3890
    learner:
      policy0:
        learner_stats: {}
      policy1:
        learner_stats: {}
    num_agent_steps_sampled: 7820
    num_agent_steps_trained: 9408
    num_steps_sampled: 3910
    num_steps_trained: 4704
    num_steps_trained_this_iter: 32
    num_target_updates: 25
  iterations_since_restore: 30
  node_ip: 172.17.0.2
  num_healthy_workers: 1
  off_policy_estimator: {}
  perf: {}
  pid: 26986
  policy_reward_max:
    policy0: 11.0
    policy1: 11.0
  policy_reward_mean:
    policy0: -18.0
    policy1: -18.0
  policy_reward_min:
    policy0: -20.0
  

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
contrib_MADDPG_foodenv_c2dbf_00000,RUNNING,172.17.0.2:26986,40,15.6459,4925,-32.9,34,-40,19.55


Result for contrib_MADDPG_foodenv_c2dbf_00000:
  agent_timesteps_total: 10700
  custom_metrics: {}
  date: 2022-04-01_05-25-35
  done: false
  episode_len_mean: 19.2
  episode_media: {}
  episode_reward_max: 34.0
  episode_reward_mean: -27.2
  episode_reward_min: -40.0
  episodes_this_iter: 6
  episodes_total: 272
  experiment_id: eb5006d1bbc74e1d929a5571fd0b2b83
  hostname: 1d29a0c222c3
  info:
    last_target_update_ts: 5310
    learner:
      policy0:
        learner_stats: {}
      policy1:
        learner_stats: {}
    num_agent_steps_sampled: 10700
    num_agent_steps_trained: 14144
    num_steps_sampled: 5350
    num_steps_trained: 7072
    num_steps_trained_this_iter: 32
    num_target_updates: 37
  iterations_since_restore: 44
  node_ip: 172.17.0.2
  num_healthy_workers: 1
  off_policy_estimator: {}
  perf: {}
  pid: 26986
  policy_reward_max:
    policy0: 17.0
    policy1: 17.0
  policy_reward_mean:
    policy0: -13.6
    policy1: -13.6
  policy_reward_min:
    policy0: -20.0

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
contrib_MADDPG_foodenv_c2dbf_00000,RUNNING,172.17.0.2:26986,53,20.3021,6323,-20.06,34,-40,18.73


Result for contrib_MADDPG_foodenv_c2dbf_00000:
  agent_timesteps_total: 13706
  custom_metrics: {}
  date: 2022-04-01_05-25-41
  done: false
  episode_len_mean: 18.88
  episode_media: {}
  episode_reward_max: 24.0
  episode_reward_mean: -19.16
  episode_reward_min: -40.0
  episodes_this_iter: 6
  episodes_total: 351
  experiment_id: eb5006d1bbc74e1d929a5571fd0b2b83
  hostname: 1d29a0c222c3
  info:
    last_target_update_ts: 6797
    learner:
      policy0:
        learner_stats: {}
      policy1:
        learner_stats: {}
    num_agent_steps_sampled: 13706
    num_agent_steps_trained: 19200
    num_steps_sampled: 6853
    num_steps_trained: 9600
    num_steps_trained_this_iter: 32
    num_target_updates: 50
  iterations_since_restore: 58
  node_ip: 172.17.0.2
  num_healthy_workers: 1
  off_policy_estimator: {}
  perf: {}
  pid: 26986
  policy_reward_max:
    policy0: 12.0
    policy1: 12.0
  policy_reward_mean:
    policy0: -9.58
    policy1: -9.58
  policy_reward_min:
    policy0: -20

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
contrib_MADDPG_foodenv_c2dbf_00000,RUNNING,172.17.0.2:26986,66,25.0404,7715,-22.52,32,-40,19.26


Result for contrib_MADDPG_foodenv_c2dbf_00000:
  agent_timesteps_total: 16644
  custom_metrics: {}
  date: 2022-04-01_05-25-46
  done: false
  episode_len_mean: 19.25
  episode_media: {}
  episode_reward_max: 32.0
  episode_reward_mean: -22.9
  episode_reward_min: -40.0
  episodes_this_iter: 7
  episodes_total: 428
  experiment_id: eb5006d1bbc74e1d929a5571fd0b2b83
  hostname: 1d29a0c222c3
  info:
    last_target_update_ts: 8314
    learner:
      policy0:
        learner_stats: {}
      policy1:
        learner_stats: {}
    num_agent_steps_sampled: 16644
    num_agent_steps_trained: 24128
    num_steps_sampled: 8322
    num_steps_trained: 12064
    num_steps_trained_this_iter: 32
    num_target_updates: 63
  iterations_since_restore: 72
  node_ip: 172.17.0.2
  num_healthy_workers: 1
  off_policy_estimator: {}
  perf:
    cpu_util_percent: 29.7
    ram_util_percent: 54.8
  pid: 26986
  policy_reward_max:
    policy0: 16.0
    policy1: 16.0
  policy_reward_mean:
    policy0: -11.45
    

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
contrib_MADDPG_foodenv_c2dbf_00000,RUNNING,172.17.0.2:26986,80,29.8205,9158,-18.8,34,-40,18.6


Result for contrib_MADDPG_foodenv_c2dbf_00000:
  agent_timesteps_total: 19552
  custom_metrics: {}
  date: 2022-04-01_05-25-51
  done: false
  episode_len_mean: 18.81
  episode_media: {}
  episode_reward_max: 34.0
  episode_reward_mean: -20.62
  episode_reward_min: -40.0
  episodes_this_iter: 6
  episodes_total: 505
  experiment_id: eb5006d1bbc74e1d929a5571fd0b2b83
  hostname: 1d29a0c222c3
  info:
    last_target_update_ts: 9716
    learner:
      policy0:
        learner_stats: {}
      policy1:
        learner_stats: {}
    num_agent_steps_sampled: 19552
    num_agent_steps_trained: 28928
    num_steps_sampled: 9776
    num_steps_trained: 14464
    num_steps_trained_this_iter: 32
    num_target_updates: 75
  iterations_since_restore: 86
  node_ip: 172.17.0.2
  num_healthy_workers: 1
  off_policy_estimator: {}
  perf:
    cpu_util_percent: 27.9
    ram_util_percent: 54.9
  pid: 26986
  policy_reward_max:
    policy0: 17.0
    policy1: 17.0
  policy_reward_mean:
    policy0: -10.31
   

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
contrib_MADDPG_foodenv_c2dbf_00000,RUNNING,172.17.0.2:26986,94,34.4281,10610,-21.84,34,-40,18.52


Result for contrib_MADDPG_foodenv_c2dbf_00000:
  agent_timesteps_total: 22672
  custom_metrics: {}
  date: 2022-04-01_05-25-57
  done: false
  episode_len_mean: 18.18
  episode_media: {}
  episode_reward_max: 34.0
  episode_reward_mean: -20.76
  episode_reward_min: -40.0
  episodes_this_iter: 5
  episodes_total: 592
  experiment_id: eb5006d1bbc74e1d929a5571fd0b2b83
  hostname: 1d29a0c222c3
  info:
    last_target_update_ts: 11316
    learner:
      policy0:
        learner_stats: {}
      policy1:
        learner_stats: {}
    num_agent_steps_sampled: 22672
    num_agent_steps_trained: 34240
    num_steps_sampled: 11336
    num_steps_trained: 17120
    num_steps_trained_this_iter: 32
    num_target_updates: 89
  iterations_since_restore: 101
  node_ip: 172.17.0.2
  num_healthy_workers: 1
  off_policy_estimator: {}
  perf:
    cpu_util_percent: 28.1
    ram_util_percent: 55.0
  pid: 26986
  policy_reward_max:
    policy0: 17.0
    policy1: 17.0
  policy_reward_mean:
    policy0: -10.38


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
contrib_MADDPG_foodenv_c2dbf_00000,RUNNING,172.17.0.2:26986,109,39.2179,12144,-19.68,34,-40,18.34


Result for contrib_MADDPG_foodenv_c2dbf_00000:
  agent_timesteps_total: 25742
  custom_metrics: {}
  date: 2022-04-01_05-26-02
  done: false
  episode_len_mean: 18.71
  episode_media: {}
  episode_reward_max: 28.0
  episode_reward_mean: -19.02
  episode_reward_min: -40.0
  episodes_this_iter: 7
  episodes_total: 674
  experiment_id: eb5006d1bbc74e1d929a5571fd0b2b83
  hostname: 1d29a0c222c3
  info:
    last_target_update_ts: 12824
    learner:
      policy0:
        learner_stats: {}
      policy1:
        learner_stats: {}
    num_agent_steps_sampled: 25742
    num_agent_steps_trained: 39488
    num_steps_sampled: 12871
    num_steps_trained: 19744
    num_steps_trained_this_iter: 32
    num_target_updates: 102
  iterations_since_restore: 116
  node_ip: 172.17.0.2
  num_healthy_workers: 1
  off_policy_estimator: {}
  perf: {}
  pid: 26986
  policy_reward_max:
    policy0: 14.0
    policy1: 14.0
  policy_reward_mean:
    policy0: -9.51
    policy1: -9.51
  policy_reward_min:
    policy0

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
contrib_MADDPG_foodenv_c2dbf_00000,RUNNING,172.17.0.2:26986,123,43.9627,13625,-15.78,32,-40,18.09


Result for contrib_MADDPG_foodenv_c2dbf_00000:
  agent_timesteps_total: 28730
  custom_metrics: {}
  date: 2022-04-01_05-26-07
  done: false
  episode_len_mean: 18.21
  episode_media: {}
  episode_reward_max: 32.0
  episode_reward_mean: -15.62
  episode_reward_min: -40.0
  episodes_this_iter: 5
  episodes_total: 755
  experiment_id: eb5006d1bbc74e1d929a5571fd0b2b83
  hostname: 1d29a0c222c3
  info:
    last_target_update_ts: 14285
    learner:
      policy0:
        learner_stats: {}
      policy1:
        learner_stats: {}
    num_agent_steps_sampled: 28730
    num_agent_steps_trained: 44672
    num_steps_sampled: 14365
    num_steps_trained: 22336
    num_steps_trained_this_iter: 32
    num_target_updates: 115
  iterations_since_restore: 130
  node_ip: 172.17.0.2
  num_healthy_workers: 1
  off_policy_estimator: {}
  perf:
    cpu_util_percent: 29.1
    ram_util_percent: 55.2
  pid: 26986
  policy_reward_max:
    policy0: 16.0
    policy1: 16.0
  policy_reward_mean:
    policy0: -7.81


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
contrib_MADDPG_foodenv_c2dbf_00000,RUNNING,172.17.0.2:26986,137,48.681,15080,-17.26,34,-40,18.23


Result for contrib_MADDPG_foodenv_c2dbf_00000:
  agent_timesteps_total: 31658
  custom_metrics: {}
  date: 2022-04-01_05-26-12
  done: false
  episode_len_mean: 17.72
  episode_media: {}
  episode_reward_max: 34.0
  episode_reward_mean: -13.84
  episode_reward_min: -40.0
  episodes_this_iter: 6
  episodes_total: 839
  experiment_id: eb5006d1bbc74e1d929a5571fd0b2b83
  hostname: 1d29a0c222c3
  info:
    last_target_update_ts: 15760
    learner:
      policy0:
        learner_stats: {}
      policy1:
        learner_stats: {}
    num_agent_steps_sampled: 31658
    num_agent_steps_trained: 49920
    num_steps_sampled: 15829
    num_steps_trained: 24960
    num_steps_trained_this_iter: 32
    num_target_updates: 128
  iterations_since_restore: 144
  node_ip: 172.17.0.2
  num_healthy_workers: 1
  off_policy_estimator: {}
  perf:
    cpu_util_percent: 26.5
    ram_util_percent: 55.2
  pid: 26986
  policy_reward_max:
    policy0: 17.0
    policy1: 17.0
  policy_reward_mean:
    policy0: -6.92


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
contrib_MADDPG_foodenv_c2dbf_00000,RUNNING,172.17.0.2:26986,151,53.4469,16564,-14.34,34,-40,17.87


Result for contrib_MADDPG_foodenv_c2dbf_00000:
  agent_timesteps_total: 34636
  custom_metrics: {}
  date: 2022-04-01_05-26-17
  done: false
  episode_len_mean: 18.07
  episode_media: {}
  episode_reward_max: 32.0
  episode_reward_mean: -14.94
  episode_reward_min: -40.0
  episodes_this_iter: 6
  episodes_total: 920
  experiment_id: eb5006d1bbc74e1d929a5571fd0b2b83
  hostname: 1d29a0c222c3
  info:
    last_target_update_ts: 17251
    learner:
      policy0:
        learner_stats: {}
      policy1:
        learner_stats: {}
    num_agent_steps_sampled: 34636
    num_agent_steps_trained: 55104
    num_steps_sampled: 17318
    num_steps_trained: 27552
    num_steps_trained_this_iter: 32
    num_target_updates: 141
  iterations_since_restore: 158
  node_ip: 172.17.0.2
  num_healthy_workers: 1
  off_policy_estimator: {}
  perf: {}
  pid: 26986
  policy_reward_max:
    policy0: 16.0
    policy1: 16.0
  policy_reward_mean:
    policy0: -7.47
    policy1: -7.47
  policy_reward_min:
    policy0

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
contrib_MADDPG_foodenv_c2dbf_00000,RUNNING,172.17.0.2:26986,165,58.3755,18069,-18.54,32,-40,18.67


Result for contrib_MADDPG_foodenv_c2dbf_00000:
  agent_timesteps_total: 37616
  custom_metrics: {}
  date: 2022-04-01_05-26-23
  done: false
  episode_len_mean: 18.1
  episode_media: {}
  episode_reward_max: 32.0
  episode_reward_mean: -15.6
  episode_reward_min: -40.0
  episodes_this_iter: 5
  episodes_total: 1002
  experiment_id: eb5006d1bbc74e1d929a5571fd0b2b83
  hostname: 1d29a0c222c3
  info:
    last_target_update_ts: 18808
    learner:
      policy0:
        learner_stats: {}
      policy1:
        learner_stats: {}
    num_agent_steps_sampled: 37616
    num_agent_steps_trained: 60352
    num_steps_sampled: 18808
    num_steps_trained: 30176
    num_steps_trained_this_iter: 32
    num_target_updates: 155
  iterations_since_restore: 172
  node_ip: 172.17.0.2
  num_healthy_workers: 1
  off_policy_estimator: {}
  perf:
    cpu_util_percent: 27.5
    ram_util_percent: 55.4
  pid: 26986
  policy_reward_max:
    policy0: 16.0
    policy1: 16.0
  policy_reward_mean:
    policy0: -7.8
  

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
contrib_MADDPG_foodenv_c2dbf_00000,RUNNING,172.17.0.2:26986,179,63.2207,19560,-11.86,34,-40,17.03


Result for contrib_MADDPG_foodenv_c2dbf_00000:
  agent_timesteps_total: 40638
  custom_metrics: {}
  date: 2022-04-01_05-26-28
  done: false
  episode_len_mean: 17.53
  episode_media: {}
  episode_reward_max: 34.0
  episode_reward_mean: -13.46
  episode_reward_min: -40.0
  episodes_this_iter: 6
  episodes_total: 1089
  experiment_id: eb5006d1bbc74e1d929a5571fd0b2b83
  hostname: 1d29a0c222c3
  info:
    last_target_update_ts: 20247
    learner:
      policy0:
        learner_stats: {}
      policy1:
        learner_stats: {}
    num_agent_steps_sampled: 40638
    num_agent_steps_trained: 65728
    num_steps_sampled: 20319
    num_steps_trained: 32864
    num_steps_trained_this_iter: 32
    num_target_updates: 168
  iterations_since_restore: 186
  node_ip: 172.17.0.2
  num_healthy_workers: 1
  off_policy_estimator: {}
  perf: {}
  pid: 26986
  policy_reward_max:
    policy0: 17.0
    policy1: 17.0
  policy_reward_mean:
    policy0: -6.73
    policy1: -6.73
  policy_reward_min:
    policy

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
contrib_MADDPG_foodenv_c2dbf_00000,RUNNING,172.17.0.2:26986,194,67.968,21160,-15.74,34,-40,18.27


Result for contrib_MADDPG_foodenv_c2dbf_00000:
  agent_timesteps_total: 43840
  custom_metrics: {}
  date: 2022-04-01_05-26-33
  done: false
  episode_len_mean: 17.93
  episode_media: {}
  episode_reward_max: 34.0
  episode_reward_mean: -16.46
  episode_reward_min: -40.0
  episodes_this_iter: 7
  episodes_total: 1179
  experiment_id: eb5006d1bbc74e1d929a5571fd0b2b83
  hostname: 1d29a0c222c3
  info:
    last_target_update_ts: 21820
    learner:
      policy0:
        learner_stats: {}
      policy1:
        learner_stats: {}
    num_agent_steps_sampled: 43840
    num_agent_steps_trained: 71360
    num_steps_sampled: 21920
    num_steps_trained: 35680
    num_steps_trained_this_iter: 32
    num_target_updates: 182
  iterations_since_restore: 201
  node_ip: 172.17.0.2
  num_healthy_workers: 1
  off_policy_estimator: {}
  perf:
    cpu_util_percent: 15.0
    ram_util_percent: 49.1
  pid: 26986
  policy_reward_max:
    policy0: 17.0
    policy1: 17.0
  policy_reward_mean:
    policy0: -8.23

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
contrib_MADDPG_foodenv_c2dbf_00000,RUNNING,172.17.0.2:26986,209,72.8069,22776,-16.6,34,-40,18


Result for contrib_MADDPG_foodenv_c2dbf_00000:
  agent_timesteps_total: 47032
  custom_metrics: {}
  date: 2022-04-01_05-26-38
  done: false
  episode_len_mean: 18.41
  episode_media: {}
  episode_reward_max: 28.0
  episode_reward_mean: -16.82
  episode_reward_min: -40.0
  episodes_this_iter: 5
  episodes_total: 1265
  experiment_id: eb5006d1bbc74e1d929a5571fd0b2b83
  hostname: 1d29a0c222c3
  info:
    last_target_update_ts: 23436
    learner:
      policy0:
        learner_stats: {}
      policy1:
        learner_stats: {}
    num_agent_steps_sampled: 47032
    num_agent_steps_trained: 76864
    num_steps_sampled: 23516
    num_steps_trained: 38432
    num_steps_trained_this_iter: 32
    num_target_updates: 196
  iterations_since_restore: 216
  node_ip: 172.17.0.2
  num_healthy_workers: 1
  off_policy_estimator: {}
  perf: {}
  pid: 26986
  policy_reward_max:
    policy0: 14.0
    policy1: 14.0
  policy_reward_mean:
    policy0: -8.41
    policy1: -8.41
  policy_reward_min:
    policy

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
contrib_MADDPG_foodenv_c2dbf_00000,RUNNING,172.17.0.2:26986,223,77.4875,24270,-17.14,26,-40,18.47


Result for contrib_MADDPG_foodenv_c2dbf_00000:
  agent_timesteps_total: 50094
  custom_metrics: {}
  date: 2022-04-01_05-26-43
  done: false
  episode_len_mean: 18.27
  episode_media: {}
  episode_reward_max: 34.0
  episode_reward_mean: -15.94
  episode_reward_min: -40.0
  episodes_this_iter: 7
  episodes_total: 1349
  experiment_id: eb5006d1bbc74e1d929a5571fd0b2b83
  hostname: 1d29a0c222c3
  info:
    last_target_update_ts: 25010
    learner:
      policy0:
        learner_stats: {}
      policy1:
        learner_stats: {}
    num_agent_steps_sampled: 50094
    num_agent_steps_trained: 82176
    num_steps_sampled: 25047
    num_steps_trained: 41088
    num_steps_trained_this_iter: 32
    num_target_updates: 210
  iterations_since_restore: 230
  node_ip: 172.17.0.2
  num_healthy_workers: 1
  off_policy_estimator: {}
  perf: {}
  pid: 26986
  policy_reward_max:
    policy0: 17.0
    policy1: 17.0
  policy_reward_mean:
    policy0: -7.97
    policy1: -7.97
  policy_reward_min:
    policy

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
contrib_MADDPG_foodenv_c2dbf_00000,RUNNING,172.17.0.2:26986,237,82.3997,25805,-17.18,34,-40,18.49


Result for contrib_MADDPG_foodenv_c2dbf_00000:
  agent_timesteps_total: 53120
  custom_metrics: {}
  date: 2022-04-01_05-26-48
  done: false
  episode_len_mean: 18.57
  episode_media: {}
  episode_reward_max: 34.0
  episode_reward_mean: -17.54
  episode_reward_min: -40.0
  episodes_this_iter: 6
  episodes_total: 1429
  experiment_id: eb5006d1bbc74e1d929a5571fd0b2b83
  hostname: 1d29a0c222c3
  info:
    last_target_update_ts: 26501
    learner:
      policy0:
        learner_stats: {}
      policy1:
        learner_stats: {}
    num_agent_steps_sampled: 53120
    num_agent_steps_trained: 87296
    num_steps_sampled: 26560
    num_steps_trained: 43648
    num_steps_trained_this_iter: 32
    num_target_updates: 223
  iterations_since_restore: 244
  node_ip: 172.17.0.2
  num_healthy_workers: 1
  off_policy_estimator: {}
  perf:
    cpu_util_percent: 17.4
    ram_util_percent: 49.1
  pid: 26986
  policy_reward_max:
    policy0: 17.0
    policy1: 17.0
  policy_reward_mean:
    policy0: -8.77

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
contrib_MADDPG_foodenv_c2dbf_00000,RUNNING,172.17.0.2:26986,252,87.2383,27416,-18.26,30,-40,18.93


Result for contrib_MADDPG_foodenv_c2dbf_00000:
  agent_timesteps_total: 56302
  custom_metrics: {}
  date: 2022-04-01_05-26-54
  done: false
  episode_len_mean: 18.5
  episode_media: {}
  episode_reward_max: 32.0
  episode_reward_mean: -17.2
  episode_reward_min: -40.0
  episodes_this_iter: 8
  episodes_total: 1516
  experiment_id: eb5006d1bbc74e1d929a5571fd0b2b83
  hostname: 1d29a0c222c3
  info:
    last_target_update_ts: 28111
    learner:
      policy0:
        learner_stats: {}
      policy1:
        learner_stats: {}
    num_agent_steps_sampled: 56302
    num_agent_steps_trained: 92864
    num_steps_sampled: 28151
    num_steps_trained: 46432
    num_steps_trained_this_iter: 32
    num_target_updates: 237
  iterations_since_restore: 259
  node_ip: 172.17.0.2
  num_healthy_workers: 1
  off_policy_estimator: {}
  perf: {}
  pid: 26986
  policy_reward_max:
    policy0: 16.0
    policy1: 16.0
  policy_reward_mean:
    policy0: -8.6
    policy1: -8.6
  policy_reward_min:
    policy0: -

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
contrib_MADDPG_foodenv_c2dbf_00000,RUNNING,172.17.0.2:26986,267,92.0076,29003,-16.52,32,-40,18.26


Result for contrib_MADDPG_foodenv_c2dbf_00000:
  agent_timesteps_total: 59490
  custom_metrics: {}
  date: 2022-04-01_05-26-59
  done: false
  episode_len_mean: 18.46
  episode_media: {}
  episode_reward_max: 32.0
  episode_reward_mean: -16.32
  episode_reward_min: -40.0
  episodes_this_iter: 5
  episodes_total: 1601
  experiment_id: eb5006d1bbc74e1d929a5571fd0b2b83
  hostname: 1d29a0c222c3
  info:
    last_target_update_ts: 29705
    learner:
      policy0:
        learner_stats: {}
      policy1:
        learner_stats: {}
    num_agent_steps_sampled: 59490
    num_agent_steps_trained: 98304
    num_steps_sampled: 29745
    num_steps_trained: 49152
    num_steps_trained_this_iter: 32
    num_target_updates: 251
  iterations_since_restore: 274
  node_ip: 172.17.0.2
  num_healthy_workers: 1
  off_policy_estimator: {}
  perf:
    cpu_util_percent: 15.2
    ram_util_percent: 49.1
  pid: 26986
  policy_reward_max:
    policy0: 16.0
    policy1: 16.0
  policy_reward_mean:
    policy0: -8.16



Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
contrib_MADDPG_foodenv_c2dbf_00000,TERMINATED,172.17.0.2:26986,277,95.1727,30057,-18.12,30,-40,18.86


[2m[36m(RolloutWorker pid=26985)[0m 2022-04-01 05:27:00,748	ERROR worker.py:432 -- SystemExit was raised from the worker.
[2m[36m(RolloutWorker pid=26985)[0m Traceback (most recent call last):
[2m[36m(RolloutWorker pid=26985)[0m   File "python/ray/_raylet.pyx", line 636, in ray._raylet.execute_task
[2m[36m(RolloutWorker pid=26985)[0m   File "python/ray/_raylet.pyx", line 640, in ray._raylet.execute_task
[2m[36m(RolloutWorker pid=26985)[0m   File "python/ray/_raylet.pyx", line 589, in ray._raylet.execute_task.function_executor
[2m[36m(RolloutWorker pid=26985)[0m   File "/home/ray/anaconda3/lib/python3.7/site-packages/ray/_private/function_manager.py", line 639, in actor_method_executor
[2m[36m(RolloutWorker pid=26985)[0m     return method(__ray_actor, *args, **kwargs)
[2m[36m(RolloutWorker pid=26985)[0m   File "/home/ray/anaconda3/lib/python3.7/site-packages/ray/util/tracing/tracing_helper.py", line 451, in _resume_span
[2m[36m(RolloutWorker pid=26985)[0m     

# Results Analysis (Comp + MADDPG)

In [32]:
import ray.rllib.contrib.maddpg as maddpg

# list of lists: one list per checkpoint; each checkpoint list contains
# 1st the path, 2nd the metric value
checkpoints = analysis.get_trial_checkpoints_paths(
    trial=analysis.get_best_trial("episode_reward_mean",mode="max"),
    metric="episode_reward_mean")

best_checkpoint = sorted(checkpoints, key=lambda x: x[1])[-1]
last_checkpoint = checkpoints[-1]

trainer = maddpg.MADDPGTrainer(config=config, env="foodenv")
trainer.restore(last_checkpoint[0])

# get weights: trainer.get_weights()

# run until episode ends
episode_reward = {'agent'+str(i):0 for i in range(len(e.agents))}
done = False
obs = e.reset()
print("Initial state at Time 0:")
e.render()
time = 1
while not done:
    print("Time:",time)
    time += 1
    action = trainer.compute_actions(obs,policy_id='policy0')
    for i in range(1,len(e.agents)):
        a = trainer.compute_actions(obs,policy_id='policy'+str(i))
        action['agent'+str(i)] = a['agent'+str(i)]
    obs, reward, dones, info = e.step(action)
    done = dones['__all__']
    e.render()
    for i in range(len(e.agents)):
        episode_reward['agent'+str(i)] += reward['agent'+str(i)]
print("Episode Ended")
print("Episode Rewards",episode_reward)

[2m[36m(RolloutWorker pid=1858)[0m   "Box bound precision lowered by casting to {}".format(self.dtype)
[2m[36m(RolloutWorker pid=1858)[0m   out, units=hidden, activation=activation)
[2m[36m(RolloutWorker pid=1858)[0m   return layer.apply(inputs)
[2m[36m(RolloutWorker pid=1858)[0m   out = tf1.layers.dense(feature, units=1, activation=None)
[2m[36m(RolloutWorker pid=1858)[0m   out, units=hidden, activation=activation)
[2m[36m(RolloutWorker pid=1858)[0m   out, units=act_space.shape[0], activation=None)
2022-03-06 05:13:20,798	INFO trainable.py:473 -- Restored on 172.17.0.2 from checkpoint: /home/ray/cs4246/06_03_2022_05:10:47/contrib/MADDPG/contrib_MADDPG_foodenv_d6889_00000_0_2022-03-06_05-10-47/checkpoint_000020/checkpoint-20
2022-03-06 05:13:20,799	INFO trainable.py:480 -- Current state after restoring: {'_iteration': 20, '_timesteps_total': 640, '_time_total': 130.02117204666138, '_episodes_total': 1095}


Initial state at Time 0:
[['' '' '' '' '' '' '' '' '' '']
 ['' '' '' '' '' '' '' '' '' '']
 ['' '' '' '' '' '' '' '' '' '']
 ['' '' '' 'F' '' '' '' '' '' '']
 ['' '' '' '' '0' '' '' '' '' '']
 ['' '' '' '' '' 'F' '' '' '' '']
 ['' '' '' '' '' '' '' '1' '' '']]

Time: 1
[['' '' '' '' '' '' '' '' '' '']
 ['' '' '' '' '' '' '' '' '' '']
 ['' '' '' '' '' '' '' '' '' '']
 ['' '' '' 'F' '' '' '' '' '' '']
 ['' '' '' '' '' '0' '' '' '' '']
 ['' '' '' '' '' 'F' '' '' '' '']
 ['' '' '' '' '' '' '' '' '1' '']]

Time: 2
[['' '' '' '' '' '' '' '' '' '']
 ['' '' '' '' '' '' '' '' '' '']
 ['' '' '' '' '' '' '' '' '' '']
 ['' '' '' 'F' '' '' '' '' '' '']
 ['' '' '' '' '0' '' '' '' '' '']
 ['' '' '' '' '' 'F' '' '' '' '']
 ['' '' '' '' '' '' '' '1' '' '']]

Time: 3
[['' '' '' '' '' '' '' '' '' '']
 ['' '' '' '' '' '' '' '' '' '']
 ['' '' '' '' '' '' '' '' '' '']
 ['' '' '' 'F' '' '' '' '' '' '']
 ['' '' '' '0' '' '' '' '' '' '']
 ['' '' '' '' '' 'F' '' '' '' '']
 ['' '' '' '' '' '' '1' '' '' '']]

Tim

# QMIX Trainer (Coop)

In [133]:
coop_env_config['trainer'] = 'qmix'
comp_env_config['trainer'] = 'qmix'

In [123]:
e = FoodGame(coop_env_config)
tuple_obs_space = Tuple([e.observation_space for i in range(len(e.agents))])
tuple_act_space = Tuple([e.action_space for i in range(len(e.agents))])

register_env("grouped_foodenv", lambda config: FoodGame(config).with_agent_groups(
            groups={"agents": e.agent_names},
            obs_space=tuple_obs_space,
            act_space=tuple_act_space,
        ))

config = dict()
config["num_gpus"] = 0
config["env"] = "grouped_foodenv"
config["num_workers"] = 1
config["env_config"] = coop_env_config

# Learning params
config["lr"] = 1e-3
config["optim_alpha"] = 0.99
config["optim_eps"] = 0.00001
config["grad_norm_clipping"] = 10
config["target_network_update_freq"] = 100

# Buffer params
config["buffer_size"] = 20000 # in batches

# Exploration params
config["exploration_config"] = {"type": "EpsilonGreedy", "epsilon_timesteps": 25000, "final_epsilon": 0.00}

# Learning duration
config["timesteps_per_iteration"] = 100
config["learning_starts"] = 1000
config["train_batch_size"] = 32
config["rollout_fragment_length"] = 4 # no. of samples to rollout in each sample of the batch to add to buffer each timestep
config["model"]={
        "lstm_cell_size": 256,
        "max_seq_len": 20,
    }

config["mixing_embed_dim"] = 256
# eval
# config["evaluation_interval"] = 1
# config["evaluation_duration"] = 1

now = datetime.now()
dt_string = now.strftime("%d_%m_%Y_%H:%M:%S")
os.mkdir('./'+dt_string)
log_dir = './'+dt_string+'/'

stop_criteria = {"timesteps_total": 30000}
analysis = ray.tune.run(
    "QMIX",
    config=config,
    local_dir=log_dir,
    stop=stop_criteria,
    checkpoint_freq=100,
    checkpoint_at_end=True)

Trial name,status,loc
QMIX_grouped_foodenv_5371f_00000,PENDING,


[2m[36m(QMixTrainer pid=26724)[0m 2022-04-01 05:14:53,115	INFO simple_q.py:154 -- In multi-agent mode, policies will be optimized sequentially by the multi-GPU optimizer. Consider setting `simple_optimizer=True` if this doesn't work for you.
[2m[36m(QMixTrainer pid=26724)[0m 2022-04-01 05:14:53,115	INFO trainer.py:792 -- Current log_level is WARN. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.


Trial name,status,loc
QMIX_grouped_foodenv_5371f_00000,RUNNING,172.17.0.2:26724


[2m[36m(RolloutWorker pid=26723)[0m   for k, v in state_dict.items()


Result for QMIX_grouped_foodenv_5371f_00000:
  agent_timesteps_total: 118
  custom_metrics: {}
  date: 2022-04-01_05-14-57
  done: false
  episode_len_mean: 19.666666666666668
  episode_media: {}
  episode_reward_max: 4.0
  episode_reward_mean: -22.666666666666668
  episode_reward_min: -40.0
  episodes_this_iter: 6
  episodes_total: 6
  experiment_id: d11fbc22d1c74b2682ad267be9ec3218
  hostname: 1d29a0c222c3
  info:
    last_target_update_ts: 118
    learner:
      default_policy:
        learner_stats:
          grad_norm: 37.676612854003906
          loss: 10.890652656555176
          q_taken_mean: -1.9169570521304482
          target_mean: -0.15125996188113564
          td_error_abs: 2.2955468830309416
    num_agent_steps_sampled: 118
    num_steps_sampled: 118
    num_steps_trained: 236
    num_steps_trained_this_iter: 38
    num_target_updates: 1
  iterations_since_restore: 1
  node_ip: 172.17.0.2
  num_healthy_workers: 1
  off_policy_estimator: {}
  perf:
    cpu_util_percent: 25

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
QMIX_grouped_foodenv_5371f_00000,RUNNING,172.17.0.2:26724,3,2.49261,559,-19.931,12,-40,19.2759


Result for QMIX_grouped_foodenv_5371f_00000:
  agent_timesteps_total: 1512
  custom_metrics: {}
  date: 2022-04-01_05-15-03
  done: false
  episode_len_mean: 18.9
  episode_media: {}
  episode_reward_max: 24.0
  episode_reward_mean: -17.55
  episode_reward_min: -40.0
  episodes_this_iter: 17
  episodes_total: 80
  experiment_id: d11fbc22d1c74b2682ad267be9ec3218
  hostname: 1d29a0c222c3
  info:
    last_target_update_ts: 1492
    learner:
      default_policy:
        learner_stats:
          grad_norm: 6.091006278991699
          loss: 1.0168132781982422
          q_taken_mean: -2.2065654754638673
          target_mean: -2.2836544036865236
          td_error_abs: 0.7652877807617188
    num_agent_steps_sampled: 1512
    num_steps_sampled: 1512
    num_steps_trained: 3127
    num_steps_trained_this_iter: 40
    num_target_updates: 13
  iterations_since_restore: 6
  node_ip: 172.17.0.2
  num_healthy_workers: 1
  off_policy_estimator: {}
  perf:
    cpu_util_percent: 24.45
    ram_util_per

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
QMIX_grouped_foodenv_5371f_00000,RUNNING,172.17.0.2:26724,7,8.07036,1847,-17.898,24,-40,18.8469


Result for QMIX_grouped_foodenv_5371f_00000:
  agent_timesteps_total: 2618
  custom_metrics: {}
  date: 2022-04-01_05-15-08
  done: false
  episode_len_mean: 18.69
  episode_media: {}
  episode_reward_max: 30.0
  episode_reward_mean: -18.18
  episode_reward_min: -40.0
  episodes_this_iter: 12
  episodes_total: 139
  experiment_id: d11fbc22d1c74b2682ad267be9ec3218
  hostname: 1d29a0c222c3
  info:
    last_target_update_ts: 2530
    learner:
      default_policy:
        learner_stats:
          grad_norm: 19.242727279663086
          loss: 4.682595252990723
          q_taken_mean: 1.179016399383545
          target_mean: 1.6498706817626954
          td_error_abs: 1.6385274887084962
    num_agent_steps_sampled: 2618
    num_steps_sampled: 2618
    num_steps_trained: 5453
    num_steps_trained_this_iter: 40
    num_target_updates: 22
  iterations_since_restore: 10
  node_ip: 172.17.0.2
  num_healthy_workers: 1
  off_policy_estimator: {}
  perf:
    cpu_util_percent: 23.9
    ram_util_perc

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
QMIX_grouped_foodenv_5371f_00000,RUNNING,172.17.0.2:26724,12,13.6798,3064,-17.06,30,-40,18.33


Result for QMIX_grouped_foodenv_5371f_00000:
  agent_timesteps_total: 3952
  custom_metrics: {}
  date: 2022-04-01_05-15-14
  done: false
  episode_len_mean: 17.63
  episode_media: {}
  episode_reward_max: 32.0
  episode_reward_mean: -13.26
  episode_reward_min: -40.0
  episodes_this_iter: 19
  episodes_total: 216
  experiment_id: d11fbc22d1c74b2682ad267be9ec3218
  hostname: 1d29a0c222c3
  info:
    last_target_update_ts: 3952
    learner:
      default_policy:
        learner_stats:
          grad_norm: 15.08895206451416
          loss: 2.8729515075683594
          q_taken_mean: 1.2467153549194336
          target_mean: 1.651203727722168
          td_error_abs: 1.2765398979187013
    num_agent_steps_sampled: 3952
    num_steps_sampled: 3952
    num_steps_trained: 8583
    num_steps_trained_this_iter: 40
    num_target_updates: 35
  iterations_since_restore: 15
  node_ip: 172.17.0.2
  num_healthy_workers: 1
  off_policy_estimator: {}
  perf:
    cpu_util_percent: 23.75
    ram_util_per

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
QMIX_grouped_foodenv_5371f_00000,RUNNING,172.17.0.2:26724,16,19.2474,4271,-9.66,34,-40,16.73


Result for QMIX_grouped_foodenv_5371f_00000:
  agent_timesteps_total: 5157
  custom_metrics: {}
  date: 2022-04-01_05-15-20
  done: false
  episode_len_mean: 16.06
  episode_media: {}
  episode_reward_max: 34.0
  episode_reward_mean: -5.92
  episode_reward_min: -40.0
  episodes_this_iter: 15
  episodes_total: 293
  experiment_id: d11fbc22d1c74b2682ad267be9ec3218
  hostname: 1d29a0c222c3
  info:
    last_target_update_ts: 5157
    learner:
      default_policy:
        learner_stats:
          grad_norm: 26.302133560180664
          loss: 5.062967777252197
          q_taken_mean: 9.802593994140626
          target_mean: 9.133711242675782
          td_error_abs: 1.7761575698852539
    num_agent_steps_sampled: 5157
    num_steps_sampled: 5157
    num_steps_trained: 11546
    num_steps_trained_this_iter: 40
    num_target_updates: 46
  iterations_since_restore: 19
  node_ip: 172.17.0.2
  num_healthy_workers: 1
  off_policy_estimator: {}
  perf:
    cpu_util_percent: 25.0
    ram_util_perce

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
QMIX_grouped_foodenv_5371f_00000,RUNNING,172.17.0.2:26724,20,24.6401,5358,-5,34,-40,15.9


Result for QMIX_grouped_foodenv_5371f_00000:
  agent_timesteps_total: 6308
  custom_metrics: {}
  date: 2022-04-01_05-15-26
  done: false
  episode_len_mean: 15.45
  episode_media: {}
  episode_reward_max: 34.0
  episode_reward_mean: -2.5
  episode_reward_min: -40.0
  episodes_this_iter: 19
  episodes_total: 368
  experiment_id: d11fbc22d1c74b2682ad267be9ec3218
  hostname: 1d29a0c222c3
  info:
    last_target_update_ts: 6208
    learner:
      default_policy:
        learner_stats:
          grad_norm: 21.795682907104492
          loss: 3.997018575668335
          q_taken_mean: 8.517296685112846
          target_mean: 8.993734571668837
          td_error_abs: 1.499956766764323
    num_agent_steps_sampled: 6308
    num_steps_sampled: 6308
    num_steps_trained: 14566
    num_steps_trained_this_iter: 36
    num_target_updates: 56
  iterations_since_restore: 24
  node_ip: 172.17.0.2
  num_healthy_workers: 1
  off_policy_estimator: {}
  perf:
    cpu_util_percent: 25.5
    ram_util_percent

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
QMIX_grouped_foodenv_5371f_00000,RUNNING,172.17.0.2:26724,25,30.4504,6514,-1.6,34,-40,15.1


Result for QMIX_grouped_foodenv_5371f_00000:
  agent_timesteps_total: 7388
  custom_metrics: {}
  date: 2022-04-01_05-15-32
  done: false
  episode_len_mean: 14.62
  episode_media: {}
  episode_reward_max: 34.0
  episode_reward_mean: 0.96
  episode_reward_min: -40.0
  episodes_this_iter: 17
  episodes_total: 444
  experiment_id: d11fbc22d1c74b2682ad267be9ec3218
  hostname: 1d29a0c222c3
  info:
    last_target_update_ts: 7313
    learner:
      default_policy:
        learner_stats:
          grad_norm: 26.471118927001953
          loss: 4.46091365814209
          q_taken_mean: 6.800485992431641
          target_mean: 6.2359130859375
          td_error_abs: 1.539873218536377
    num_agent_steps_sampled: 7388
    num_steps_sampled: 7388
    num_steps_trained: 17573
    num_steps_trained_this_iter: 40
    num_target_updates: 66
  iterations_since_restore: 29
  node_ip: 172.17.0.2
  num_healthy_workers: 1
  off_policy_estimator: {}
  perf:
    cpu_util_percent: 23.85
    ram_util_percent: 

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
QMIX_grouped_foodenv_5371f_00000,RUNNING,172.17.0.2:26724,30,36.2056,7609,2.78,34,-40,14.21


Result for QMIX_grouped_foodenv_5371f_00000:
  agent_timesteps_total: 8464
  custom_metrics: {}
  date: 2022-04-01_05-15-38
  done: false
  episode_len_mean: 13.1
  episode_media: {}
  episode_reward_max: 34.0
  episode_reward_mean: 8.4
  episode_reward_min: -40.0
  episodes_this_iter: 14
  episodes_total: 526
  experiment_id: d11fbc22d1c74b2682ad267be9ec3218
  hostname: 1d29a0c222c3
  info:
    last_target_update_ts: 8412
    learner:
      default_policy:
        learner_stats:
          grad_norm: 20.04689598083496
          loss: 2.567929744720459
          q_taken_mean: 6.935249328613281
          target_mean: 6.176515367296007
          td_error_abs: 1.228915320502387
    num_agent_steps_sampled: 8464
    num_steps_sampled: 8464
    num_steps_trained: 20806
    num_steps_trained_this_iter: 36
    num_target_updates: 76
  iterations_since_restore: 34
  node_ip: 172.17.0.2
  num_healthy_workers: 1
  off_policy_estimator: {}
  perf:
    cpu_util_percent: 22.85
    ram_util_percent: 

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
QMIX_grouped_foodenv_5371f_00000,RUNNING,172.17.0.2:26724,35,42.1776,8696,7.94,34,-40,13.23


Result for QMIX_grouped_foodenv_5371f_00000:
  agent_timesteps_total: 9552
  custom_metrics: {}
  date: 2022-04-01_05-15-44
  done: false
  episode_len_mean: 13.46
  episode_media: {}
  episode_reward_max: 34.0
  episode_reward_mean: 6.48
  episode_reward_min: -40.0
  episodes_this_iter: 18
  episodes_total: 610
  experiment_id: d11fbc22d1c74b2682ad267be9ec3218
  hostname: 1d29a0c222c3
  info:
    last_target_update_ts: 9490
    learner:
      default_policy:
        learner_stats:
          grad_norm: 13.64406681060791
          loss: 2.415139675140381
          q_taken_mean: 4.395039367675781
          target_mean: 4.784038543701172
          td_error_abs: 1.1995048522949219
    num_agent_steps_sampled: 9552
    num_steps_sampled: 9552
    num_steps_trained: 23952
    num_steps_trained_this_iter: 40
    num_target_updates: 86
  iterations_since_restore: 39
  node_ip: 172.17.0.2
  num_healthy_workers: 1
  off_policy_estimator: {}
  perf:
    cpu_util_percent: 23.5
    ram_util_percent

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
QMIX_grouped_foodenv_5371f_00000,RUNNING,172.17.0.2:26724,39,47.0019,9552,6.48,34,-40,13.46


Result for QMIX_grouped_foodenv_5371f_00000:
  agent_timesteps_total: 10399
  custom_metrics: {}
  date: 2022-04-01_05-15-49
  done: false
  episode_len_mean: 11.89
  episode_media: {}
  episode_reward_max: 34.0
  episode_reward_mean: 10.82
  episode_reward_min: -40.0
  episodes_this_iter: 18
  episodes_total: 682
  experiment_id: d11fbc22d1c74b2682ad267be9ec3218
  hostname: 1d29a0c222c3
  info:
    last_target_update_ts: 10345
    learner:
      default_policy:
        learner_stats:
          grad_norm: 14.509479522705078
          loss: 2.0883240699768066
          q_taken_mean: 8.024468463400137
          target_mean: 7.6453990106997285
          td_error_abs: 1.13653216154679
    num_agent_steps_sampled: 10399
    num_steps_sampled: 10399
    num_steps_trained: 26621
    num_steps_trained_this_iter: 46
    num_target_updates: 94
  iterations_since_restore: 43
  node_ip: 172.17.0.2
  num_healthy_workers: 1
  off_policy_estimator: {}
  perf:
    cpu_util_percent: 24.05
    ram_util_

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
QMIX_grouped_foodenv_5371f_00000,RUNNING,172.17.0.2:26724,43,52.3757,10399,10.82,34,-40,11.89


Result for QMIX_grouped_foodenv_5371f_00000:
  agent_timesteps_total: 11252
  custom_metrics: {}
  date: 2022-04-01_05-15-55
  done: false
  episode_len_mean: 11.71
  episode_media: {}
  episode_reward_max: 34.0
  episode_reward_mean: 12.38
  episode_reward_min: -40.0
  episodes_this_iter: 19
  episodes_total: 756
  experiment_id: d11fbc22d1c74b2682ad267be9ec3218
  hostname: 1d29a0c222c3
  info:
    last_target_update_ts: 11192
    learner:
      default_policy:
        learner_stats:
          grad_norm: 14.41120433807373
          loss: 2.503575563430786
          q_taken_mean: 5.074590725368924
          target_mean: 5.487166680230034
          td_error_abs: 1.2776251051161025
    num_agent_steps_sampled: 11252
    num_steps_sampled: 11252
    num_steps_trained: 29301
    num_steps_trained_this_iter: 45
    num_target_updates: 102
  iterations_since_restore: 47
  node_ip: 172.17.0.2
  num_healthy_workers: 1
  off_policy_estimator: {}
  perf:
    cpu_util_percent: 24.65
    ram_util_

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
QMIX_grouped_foodenv_5371f_00000,RUNNING,172.17.0.2:26724,47,57.508,11252,12.38,34,-40,11.71


Result for QMIX_grouped_foodenv_5371f_00000:
  agent_timesteps_total: 12119
  custom_metrics: {}
  date: 2022-04-01_05-16-00
  done: false
  episode_len_mean: 10.48
  episode_media: {}
  episode_reward_max: 34.0
  episode_reward_mean: 15.44
  episode_reward_min: -40.0
  episodes_this_iter: 20
  episodes_total: 840
  experiment_id: d11fbc22d1c74b2682ad267be9ec3218
  hostname: 1d29a0c222c3
  info:
    last_target_update_ts: 12047
    learner:
      default_policy:
        learner_stats:
          grad_norm: 12.133341789245605
          loss: 2.030532121658325
          q_taken_mean: 4.134061431884765
          target_mean: 4.340554809570312
          td_error_abs: 1.1806828498840332
    num_agent_steps_sampled: 12119
    num_steps_sampled: 12119
    num_steps_trained: 32371
    num_steps_trained_this_iter: 40
    num_target_updates: 110
  iterations_since_restore: 51
  node_ip: 172.17.0.2
  num_healthy_workers: 1
  off_policy_estimator: {}
  perf:
    cpu_util_percent: 24.8
    ram_util_

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
QMIX_grouped_foodenv_5371f_00000,RUNNING,172.17.0.2:26724,51,63.2108,12119,15.44,34,-40,10.48


Result for QMIX_grouped_foodenv_5371f_00000:
  agent_timesteps_total: 12971
  custom_metrics: {}
  date: 2022-04-01_05-16-06
  done: false
  episode_len_mean: 10.79
  episode_media: {}
  episode_reward_max: 34.0
  episode_reward_mean: 15.02
  episode_reward_min: -40.0
  episodes_this_iter: 17
  episodes_total: 919
  experiment_id: d11fbc22d1c74b2682ad267be9ec3218
  hostname: 1d29a0c222c3
  info:
    last_target_update_ts: 12914
    learner:
      default_policy:
        learner_stats:
          grad_norm: 22.82637596130371
          loss: 3.8069634437561035
          q_taken_mean: 7.1233573913574215
          target_mean: 6.523442077636719
          td_error_abs: 1.2393315315246582
    num_agent_steps_sampled: 12971
    num_steps_sampled: 12971
    num_steps_trained: 35173
    num_steps_trained_this_iter: 40
    num_target_updates: 118
  iterations_since_restore: 55
  node_ip: 172.17.0.2
  num_healthy_workers: 1
  off_policy_estimator: {}
  perf:
    cpu_util_percent: 24.4
    ram_util

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
QMIX_grouped_foodenv_5371f_00000,RUNNING,172.17.0.2:26724,55,68.4804,12971,15.02,34,-40,10.79


Result for QMIX_grouped_foodenv_5371f_00000:
  agent_timesteps_total: 13834
  custom_metrics: {}
  date: 2022-04-01_05-16-11
  done: false
  episode_len_mean: 10.04
  episode_media: {}
  episode_reward_max: 34.0
  episode_reward_mean: 17.32
  episode_reward_min: -20.0
  episodes_this_iter: 21
  episodes_total: 1007
  experiment_id: d11fbc22d1c74b2682ad267be9ec3218
  hostname: 1d29a0c222c3
  info:
    last_target_update_ts: 13779
    learner:
      default_policy:
        learner_stats:
          grad_norm: 31.711084365844727
          loss: 3.842224597930908
          q_taken_mean: 4.40436019897461
          target_mean: 5.074214935302734
          td_error_abs: 1.5944124221801759
    num_agent_steps_sampled: 13834
    num_steps_sampled: 13834
    num_steps_trained: 38213
    num_steps_trained_this_iter: 40
    num_target_updates: 126
  iterations_since_restore: 59
  node_ip: 172.17.0.2
  num_healthy_workers: 1
  off_policy_estimator: {}
  perf:
    cpu_util_percent: 26.3
    ram_util_

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
QMIX_grouped_foodenv_5371f_00000,RUNNING,172.17.0.2:26724,59,73.8078,13834,17.32,34,-20,10.04


Result for QMIX_grouped_foodenv_5371f_00000:
  agent_timesteps_total: 14690
  custom_metrics: {}
  date: 2022-04-01_05-16-17
  done: false
  episode_len_mean: 9.35
  episode_media: {}
  episode_reward_max: 34.0
  episode_reward_mean: 19.3
  episode_reward_min: -20.0
  episodes_this_iter: 26
  episodes_total: 1100
  experiment_id: d11fbc22d1c74b2682ad267be9ec3218
  hostname: 1d29a0c222c3
  info:
    last_target_update_ts: 14641
    learner:
      default_policy:
        learner_stats:
          grad_norm: 22.439298629760742
          loss: 2.7357242107391357
          q_taken_mean: 6.140370941162109
          target_mean: 6.692203521728516
          td_error_abs: 1.2784499168395995
    num_agent_steps_sampled: 14690
    num_steps_sampled: 14690
    num_steps_trained: 41317
    num_steps_trained_this_iter: 40
    num_target_updates: 134
  iterations_since_restore: 63
  node_ip: 172.17.0.2
  num_healthy_workers: 1
  off_policy_estimator: {}
  perf:
    cpu_util_percent: 24.950000000000003

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
QMIX_grouped_foodenv_5371f_00000,RUNNING,172.17.0.2:26724,63,79.5796,14690,19.3,34,-20,9.35


Result for QMIX_grouped_foodenv_5371f_00000:
  agent_timesteps_total: 15522
  custom_metrics: {}
  date: 2022-04-01_05-16-23
  done: false
  episode_len_mean: 7.88
  episode_media: {}
  episode_reward_max: 34.0
  episode_reward_mean: 23.84
  episode_reward_min: -20.0
  episodes_this_iter: 28
  episodes_total: 1204
  experiment_id: d11fbc22d1c74b2682ad267be9ec3218
  hostname: 1d29a0c222c3
  info:
    last_target_update_ts: 15478
    learner:
      default_policy:
        learner_stats:
          grad_norm: 19.723146438598633
          loss: 2.3781943321228027
          q_taken_mean: 11.560941808363971
          target_mean: 12.204405840705423
          td_error_abs: 1.204710231107824
    num_agent_steps_sampled: 15522
    num_steps_sampled: 15522
    num_steps_trained: 44681
    num_steps_trained_this_iter: 34
    num_target_updates: 142
  iterations_since_restore: 67
  node_ip: 172.17.0.2
  num_healthy_workers: 1
  off_policy_estimator: {}
  perf:
    cpu_util_percent: 24.15
    ram_ut

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
QMIX_grouped_foodenv_5371f_00000,RUNNING,172.17.0.2:26724,67,85.7437,15522,23.84,34,-20,7.88


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
QMIX_grouped_foodenv_5371f_00000,RUNNING,172.17.0.2:26724,70,90.6404,16159,23.86,34,-20,7.77


Result for QMIX_grouped_foodenv_5371f_00000:
  agent_timesteps_total: 16361
  custom_metrics: {}
  date: 2022-04-01_05-16-30
  done: false
  episode_len_mean: 7.82
  episode_media: {}
  episode_reward_max: 34.0
  episode_reward_mean: 23.56
  episode_reward_min: -20.0
  episodes_this_iter: 25
  episodes_total: 1310
  experiment_id: d11fbc22d1c74b2682ad267be9ec3218
  hostname: 1d29a0c222c3
  info:
    last_target_update_ts: 16324
    learner:
      default_policy:
        learner_stats:
          grad_norm: 16.057506561279297
          loss: 1.4599661827087402
          q_taken_mean: 10.563768174913195
          target_mean: 10.343276977539062
          td_error_abs: 0.9056981404622396
    num_agent_steps_sampled: 16361
    num_steps_sampled: 16361
    num_steps_trained: 48092
    num_steps_trained_this_iter: 36
    num_target_updates: 150
  iterations_since_restore: 71
  node_ip: 172.17.0.2
  num_healthy_workers: 1
  off_policy_estimator: {}
  perf:
    cpu_util_percent: 25.950000000000

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
QMIX_grouped_foodenv_5371f_00000,RUNNING,172.17.0.2:26724,74,97.1288,17001,26.06,34,-20,6.77


Result for QMIX_grouped_foodenv_5371f_00000:
  agent_timesteps_total: 17641
  custom_metrics: {}
  date: 2022-04-01_05-16-40
  done: false
  episode_len_mean: 6.19
  episode_media: {}
  episode_reward_max: 34.0
  episode_reward_mean: 27.22
  episode_reward_min: -20.0
  episodes_this_iter: 36
  episodes_total: 1511
  experiment_id: d11fbc22d1c74b2682ad267be9ec3218
  hostname: 1d29a0c222c3
  info:
    last_target_update_ts: 17584
    learner:
      default_policy:
        learner_stats:
          grad_norm: 14.343544960021973
          loss: 1.4925106763839722
          q_taken_mean: 8.486500331333705
          target_mean: 8.867338634672619
          td_error_abs: 0.956913811819894
    num_agent_steps_sampled: 17641
    num_steps_sampled: 17641
    num_steps_trained: 54147
    num_steps_trained_this_iter: 42
    num_target_updates: 162
  iterations_since_restore: 77
  node_ip: 172.17.0.2
  num_healthy_workers: 1
  off_policy_estimator: {}
  perf:
    cpu_util_percent: 24.8
    ram_util_

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
QMIX_grouped_foodenv_5371f_00000,RUNNING,172.17.0.2:26724,77,102.314,17641,27.22,34,-20,6.19


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
QMIX_grouped_foodenv_5371f_00000,RUNNING,172.17.0.2:26724,80,107.173,18160,29.34,34,4,5.33


Result for QMIX_grouped_foodenv_5371f_00000:
  agent_timesteps_total: 18264
  custom_metrics: {}
  date: 2022-04-01_05-16-46
  done: false
  episode_len_mean: 5.42
  episode_media: {}
  episode_reward_max: 34.0
  episode_reward_mean: 29.16
  episode_reward_min: 0.0
  episodes_this_iter: 21
  episodes_total: 1629
  experiment_id: d11fbc22d1c74b2682ad267be9ec3218
  hostname: 1d29a0c222c3
  info:
    last_target_update_ts: 18218
    learner:
      default_policy:
        learner_stats:
          grad_norm: 26.16466522216797
          loss: 4.510859489440918
          q_taken_mean: 7.341932678222657
          target_mean: 6.622486877441406
          td_error_abs: 1.4319839477539062
    num_agent_steps_sampled: 18264
    num_steps_sampled: 18264
    num_steps_trained: 57540
    num_steps_trained_this_iter: 40
    num_target_updates: 168
  iterations_since_restore: 81
  node_ip: 172.17.0.2
  num_healthy_workers: 1
  off_policy_estimator: {}
  perf:
    cpu_util_percent: 24.3
    ram_util_per

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
QMIX_grouped_foodenv_5371f_00000,RUNNING,172.17.0.2:26724,84,113.376,18884,28.86,34,8,5.57


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
QMIX_grouped_foodenv_5371f_00000,RUNNING,172.17.0.2:26724,87,117.988,19397,29.82,34,4,5.09


Result for QMIX_grouped_foodenv_5371f_00000:
  agent_timesteps_total: 19502
  custom_metrics: {}
  date: 2022-04-01_05-16-57
  done: false
  episode_len_mean: 4.95
  episode_media: {}
  episode_reward_max: 34.0
  episode_reward_mean: 30.1
  episode_reward_min: 4.0
  episodes_this_iter: 21
  episodes_total: 1862
  experiment_id: d11fbc22d1c74b2682ad267be9ec3218
  hostname: 1d29a0c222c3
  info:
    last_target_update_ts: 19473
    learner:
      default_policy:
        learner_stats:
          grad_norm: 9.443055152893066
          loss: 1.2100400924682617
          q_taken_mean: 9.628464642693014
          target_mean: 9.384708180147058
          td_error_abs: 0.8583131677964154
    num_agent_steps_sampled: 19502
    num_steps_sampled: 19502
    num_steps_trained: 64198
    num_steps_trained_this_iter: 51
    num_target_updates: 180
  iterations_since_restore: 88
  node_ip: 172.17.0.2
  num_healthy_workers: 1
  off_policy_estimator: {}
  perf:
    cpu_util_percent: 24.950000000000003
  

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
QMIX_grouped_foodenv_5371f_00000,RUNNING,172.17.0.2:26724,91,124.009,20015,31.1,34,2,4.45


Result for QMIX_grouped_foodenv_5371f_00000:
  agent_timesteps_total: 20115
  custom_metrics: {}
  date: 2022-04-01_05-17-03
  done: false
  episode_len_mean: 4.47
  episode_media: {}
  episode_reward_max: 34.0
  episode_reward_mean: 31.06
  episode_reward_min: 2.0
  episodes_this_iter: 23
  episodes_total: 1996
  experiment_id: d11fbc22d1c74b2682ad267be9ec3218
  hostname: 1d29a0c222c3
  info:
    last_target_update_ts: 20115
    learner:
      default_policy:
        learner_stats:
          grad_norm: 13.17340087890625
          loss: 1.2109262943267822
          q_taken_mean: 8.927580043247767
          target_mean: 9.20964093889509
          td_error_abs: 0.8973127092633929
    num_agent_steps_sampled: 20115
    num_steps_sampled: 20115
    num_steps_trained: 67610
    num_steps_trained_this_iter: 35
    num_target_updates: 186
  iterations_since_restore: 92
  node_ip: 172.17.0.2
  num_healthy_workers: 1
  off_policy_estimator: {}
  perf:
    cpu_util_percent: 25.75
    ram_util_pe

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
QMIX_grouped_foodenv_5371f_00000,RUNNING,172.17.0.2:26724,94,128.099,20420,30.58,34,2,4.71


Result for QMIX_grouped_foodenv_5371f_00000:
  agent_timesteps_total: 20629
  custom_metrics: {}
  date: 2022-04-01_05-17-08
  done: false
  episode_len_mean: 4.75
  episode_media: {}
  episode_reward_max: 34.0
  episode_reward_mean: 30.5
  episode_reward_min: 12.0
  episodes_this_iter: 46
  episodes_total: 2105
  experiment_id: d11fbc22d1c74b2682ad267be9ec3218
  hostname: 1d29a0c222c3
  info:
    last_target_update_ts: 20534
    learner:
      default_policy:
        learner_stats:
          grad_norm: 12.280204772949219
          loss: 0.8589314222335815
          q_taken_mean: 9.103630065917969
          target_mean: 9.511116981506348
          td_error_abs: 0.7040891647338867
    num_agent_steps_sampled: 20629
    num_steps_sampled: 20629
    num_steps_trained: 70406
    num_steps_trained_this_iter: 32
    num_target_updates: 190
  iterations_since_restore: 95
  node_ip: 172.17.0.2
  num_healthy_workers: 1
  off_policy_estimator: {}
  perf:
    cpu_util_percent: 24.400000000000002


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
QMIX_grouped_foodenv_5371f_00000,RUNNING,172.17.0.2:26724,97,133.12,20938,31.7,34,20,4.15


Result for QMIX_grouped_foodenv_5371f_00000:
  agent_timesteps_total: 21146
  custom_metrics: {}
  date: 2022-04-01_05-17-13
  done: false
  episode_len_mean: 4.14
  episode_media: {}
  episode_reward_max: 34.0
  episode_reward_mean: 31.72
  episode_reward_min: 12.0
  episodes_this_iter: 25
  episodes_total: 2226
  experiment_id: d11fbc22d1c74b2682ad267be9ec3218
  hostname: 1d29a0c222c3
  info:
    last_target_update_ts: 21046
    learner:
      default_policy:
        learner_stats:
          grad_norm: 9.541802406311035
          loss: 0.928259551525116
          q_taken_mean: 9.25687717747044
          target_mean: 9.338269207928631
          td_error_abs: 0.7277887189710462
    num_agent_steps_sampled: 21146
    num_steps_sampled: 21146
    num_steps_trained: 73353
    num_steps_trained_this_iter: 37
    num_target_updates: 195
  iterations_since_restore: 99
  node_ip: 172.17.0.2
  num_healthy_workers: 1
  off_policy_estimator: {}
  perf:
    cpu_util_percent: 24.7
    ram_util_per



Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
QMIX_grouped_foodenv_5371f_00000,RUNNING,172.17.0.2:26724,101,138.4,21449,32,34,16,4


Result for QMIX_grouped_foodenv_5371f_00000:
  agent_timesteps_total: 21760
  custom_metrics: {}
  date: 2022-04-01_05-17-20
  done: false
  episode_len_mean: 4.47
  episode_media: {}
  episode_reward_max: 34.0
  episode_reward_mean: 31.06
  episode_reward_min: 8.0
  episodes_this_iter: 42
  episodes_total: 2369
  experiment_id: d11fbc22d1c74b2682ad267be9ec3218
  hostname: 1d29a0c222c3
  info:
    last_target_update_ts: 21674
    learner:
      default_policy:
        learner_stats:
          grad_norm: 14.616578102111816
          loss: 1.419653058052063
          q_taken_mean: 9.31376720610119
          target_mean: 9.039560953776041
          td_error_abs: 0.9107217334565663
    num_agent_steps_sampled: 21760
    num_steps_sampled: 21760
    num_steps_trained: 76820
    num_steps_trained_this_iter: 42
    num_target_updates: 201
  iterations_since_restore: 103
  node_ip: 172.17.0.2
  num_healthy_workers: 1
  off_policy_estimator: {}
  perf:
    cpu_util_percent: 24.266666666666666
 

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
QMIX_grouped_foodenv_5371f_00000,RUNNING,172.17.0.2:26724,105,143.472,21968,31.14,34,8,4.43


Result for QMIX_grouped_foodenv_5371f_00000:
  agent_timesteps_total: 22276
  custom_metrics: {}
  date: 2022-04-01_05-17-25
  done: false
  episode_len_mean: 3.74
  episode_media: {}
  episode_reward_max: 34.0
  episode_reward_mean: 32.52
  episode_reward_min: 24.0
  episodes_this_iter: 30
  episodes_total: 2503
  experiment_id: d11fbc22d1c74b2682ad267be9ec3218
  hostname: 1d29a0c222c3
  info:
    last_target_update_ts: 22202
    learner:
      default_policy:
        learner_stats:
          grad_norm: 16.694255828857422
          loss: 1.7530947923660278
          q_taken_mean: 9.015603807237413
          target_mean: 8.86401621500651
          td_error_abs: 1.0679110421074762
    num_agent_steps_sampled: 22276
    num_steps_sampled: 22276
    num_steps_trained: 80041
    num_steps_trained_this_iter: 36
    num_target_updates: 206
  iterations_since_restore: 108
  node_ip: 172.17.0.2
  num_healthy_workers: 1
  off_policy_estimator: {}
  perf:
    cpu_util_percent: 25.700000000000003

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
QMIX_grouped_foodenv_5371f_00000,RUNNING,172.17.0.2:26724,110,149.985,22584,32.94,34,16,3.53


Result for QMIX_grouped_foodenv_5371f_00000:
  agent_timesteps_total: 22787
  custom_metrics: {}
  date: 2022-04-01_05-17-31
  done: false
  episode_len_mean: 3.56
  episode_media: {}
  episode_reward_max: 34.0
  episode_reward_mean: 32.88
  episode_reward_min: 20.0
  episodes_this_iter: 29
  episodes_total: 2646
  experiment_id: d11fbc22d1c74b2682ad267be9ec3218
  hostname: 1d29a0c222c3
  info:
    last_target_update_ts: 22730
    learner:
      default_policy:
        learner_stats:
          grad_norm: 24.606687545776367
          loss: 4.224145412445068
          q_taken_mean: 11.769339537009214
          target_mean: 11.389540452223558
          td_error_abs: 1.0418500655736678
    num_agent_steps_sampled: 22787
    num_steps_sampled: 22787
    num_steps_trained: 83151
    num_steps_trained_this_iter: 39
    num_target_updates: 211
  iterations_since_restore: 112
  node_ip: 172.17.0.2
  num_healthy_workers: 1
  off_policy_estimator: {}
  perf:
    cpu_util_percent: 25.7000000000000

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
QMIX_grouped_foodenv_5371f_00000,RUNNING,172.17.0.2:26724,114,154.275,22992,32.72,34,20,3.64


Result for QMIX_grouped_foodenv_5371f_00000:
  agent_timesteps_total: 23301
  custom_metrics: {}
  date: 2022-04-01_05-17-36
  done: false
  episode_len_mean: 3.4
  episode_media: {}
  episode_reward_max: 34.0
  episode_reward_mean: 33.2
  episode_reward_min: 20.0
  episodes_this_iter: 31
  episodes_total: 2794
  experiment_id: d11fbc22d1c74b2682ad267be9ec3218
  hostname: 1d29a0c222c3
  info:
    last_target_update_ts: 23248
    learner:
      default_policy:
        learner_stats:
          grad_norm: 7.695291042327881
          loss: 0.7724437713623047
          q_taken_mean: 10.666925048828125
          target_mean: 10.58785400390625
          td_error_abs: 0.6321249961853027
    num_agent_steps_sampled: 23301
    num_steps_sampled: 23301
    num_steps_trained: 86242
    num_steps_trained_this_iter: 40
    num_target_updates: 216
  iterations_since_restore: 117
  node_ip: 172.17.0.2
  num_healthy_workers: 1
  off_policy_estimator: {}
  perf:
    cpu_util_percent: 24.799999999999997


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
QMIX_grouped_foodenv_5371f_00000,RUNNING,172.17.0.2:26724,119,160.437,23613,33.18,34,24,3.41


Result for QMIX_grouped_foodenv_5371f_00000:
  agent_timesteps_total: 23923
  custom_metrics: {}
  date: 2022-04-01_05-17-42
  done: false
  episode_len_mean: 3.29
  episode_media: {}
  episode_reward_max: 34.0
  episode_reward_mean: 33.42
  episode_reward_min: 24.0
  episodes_this_iter: 62
  episodes_total: 2979
  experiment_id: d11fbc22d1c74b2682ad267be9ec3218
  hostname: 1d29a0c222c3
  info:
    last_target_update_ts: 23868
    learner:
      default_policy:
        learner_stats:
          grad_norm: 23.289308547973633
          loss: 2.72460675239563
          q_taken_mean: 7.788140296936035
          target_mean: 7.172396659851074
          td_error_abs: 1.328223466873169
    num_agent_steps_sampled: 23923
    num_steps_sampled: 23923
    num_steps_trained: 90042
    num_steps_trained_this_iter: 32
    num_target_updates: 222
  iterations_since_restore: 121
  node_ip: 172.17.0.2
  num_healthy_workers: 1
  off_policy_estimator: {}
  perf:
    cpu_util_percent: 14.4
    ram_util_pe

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
QMIX_grouped_foodenv_5371f_00000,RUNNING,172.17.0.2:26724,123,165.457,24133,33.48,34,24,3.26


Result for QMIX_grouped_foodenv_5371f_00000:
  agent_timesteps_total: 24541
  custom_metrics: {}
  date: 2022-04-01_05-17-48
  done: false
  episode_len_mean: 3.05
  episode_media: {}
  episode_reward_max: 34.0
  episode_reward_mean: 33.9
  episode_reward_min: 28.0
  episodes_this_iter: 67
  episodes_total: 3178
  experiment_id: d11fbc22d1c74b2682ad267be9ec3218
  hostname: 1d29a0c222c3
  info:
    last_target_update_ts: 24487
    learner:
      default_policy:
        learner_stats:
          grad_norm: 11.861846923828125
          loss: 1.2316670417785645
          q_taken_mean: 8.936837332589286
          target_mean: 9.080030168805804
          td_error_abs: 0.9232434953962053
    num_agent_steps_sampled: 24541
    num_steps_sampled: 24541
    num_steps_trained: 93961
    num_steps_trained_this_iter: 35
    num_target_updates: 228
  iterations_since_restore: 126
  node_ip: 172.17.0.2
  num_healthy_workers: 1
  off_policy_estimator: {}
  perf:
    cpu_util_percent: 15.200000000000001

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
QMIX_grouped_foodenv_5371f_00000,RUNNING,172.17.0.2:26724,127,170.412,24643,34,34,34,3


Result for QMIX_grouped_foodenv_5371f_00000:
  agent_timesteps_total: 25155
  custom_metrics: {}
  date: 2022-04-01_05-17-54
  done: false
  episode_len_mean: 3.02
  episode_media: {}
  episode_reward_max: 34.0
  episode_reward_mean: 33.96
  episode_reward_min: 30.0
  episodes_this_iter: 34
  episodes_total: 3381
  experiment_id: d11fbc22d1c74b2682ad267be9ec3218
  hostname: 1d29a0c222c3
  info:
    last_target_update_ts: 25101
    learner:
      default_policy:
        learner_stats:
          grad_norm: 16.98355484008789
          loss: 2.1304714679718018
          q_taken_mean: 6.199623919547872
          target_mean: 6.531167537608045
          td_error_abs: 1.0865519097510805
    num_agent_steps_sampled: 25155
    num_steps_sampled: 25155
    num_steps_trained: 97906
    num_steps_trained_this_iter: 47
    num_target_updates: 234
  iterations_since_restore: 130
  node_ip: 172.17.0.2
  num_healthy_workers: 1
  off_policy_estimator: {}
  perf:
    cpu_util_percent: 14.5
    ram_util_

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
QMIX_grouped_foodenv_5371f_00000,RUNNING,172.17.0.2:26724,130,175.286,25155,33.96,34,30,3.02


Result for QMIX_grouped_foodenv_5371f_00000:
  agent_timesteps_total: 25665
  custom_metrics: {}
  date: 2022-04-01_05-17-59
  done: false
  episode_len_mean: 3.0
  episode_media: {}
  episode_reward_max: 34.0
  episode_reward_mean: 34.0
  episode_reward_min: 34.0
  episodes_this_iter: 34
  episodes_total: 3551
  experiment_id: d11fbc22d1c74b2682ad267be9ec3218
  hostname: 1d29a0c222c3
  info:
    last_target_update_ts: 25611
    learner:
      default_policy:
        learner_stats:
          grad_norm: 6.21291971206665
          loss: 0.5799204707145691
          q_taken_mean: 11.448091052827381
          target_mean: 11.515809558686756
          td_error_abs: 0.5690237681070963
    num_agent_steps_sampled: 25665
    num_steps_sampled: 25665
    num_steps_trained: 101177
    num_steps_trained_this_iter: 42
    num_target_updates: 239
  iterations_since_restore: 134
  node_ip: 172.17.0.2
  num_healthy_workers: 1
  off_policy_estimator: {}
  perf:
    cpu_util_percent: 15.6
    ram_util_

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
QMIX_grouped_foodenv_5371f_00000,RUNNING,172.17.0.2:26724,134,180.509,25665,34,34,34,3


Result for QMIX_grouped_foodenv_5371f_00000:
  agent_timesteps_total: 26277
  custom_metrics: {}
  date: 2022-04-01_05-18-05
  done: false
  episode_len_mean: 3.0
  episode_media: {}
  episode_reward_max: 34.0
  episode_reward_mean: 34.0
  episode_reward_min: 34.0
  episodes_this_iter: 68
  episodes_total: 3755
  experiment_id: d11fbc22d1c74b2682ad267be9ec3218
  hostname: 1d29a0c222c3
  info:
    last_target_update_ts: 26223
    learner:
      default_policy:
        learner_stats:
          grad_norm: 38.28487777709961
          loss: 1.0348012447357178
          q_taken_mean: 12.858234405517578
          target_mean: 13.833185195922852
          td_error_abs: 0.9793069958686829
    num_agent_steps_sampled: 26277
    num_steps_sampled: 26277
    num_steps_trained: 105063
    num_steps_trained_this_iter: 32
    num_target_updates: 245
  iterations_since_restore: 137
  node_ip: 172.17.0.2
  num_healthy_workers: 1
  off_policy_estimator: {}
  perf:
    cpu_util_percent: 15.93333333333333

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
QMIX_grouped_foodenv_5371f_00000,RUNNING,172.17.0.2:26724,137,186.293,26277,34,34,34,3


Result for QMIX_grouped_foodenv_5371f_00000:
  agent_timesteps_total: 26991
  custom_metrics: {}
  date: 2022-04-01_05-18-12
  done: false
  episode_len_mean: 3.0
  episode_media: {}
  episode_reward_max: 34.0
  episode_reward_mean: 34.0
  episode_reward_min: 34.0
  episodes_this_iter: 68
  episodes_total: 3993
  experiment_id: d11fbc22d1c74b2682ad267be9ec3218
  hostname: 1d29a0c222c3
  info:
    last_target_update_ts: 26937
    learner:
      default_policy:
        learner_stats:
          grad_norm: 26.726844787597656
          loss: 2.6935086250305176
          q_taken_mean: 9.050679206848145
          target_mean: 8.311569213867188
          td_error_abs: 1.1577026844024658
    num_agent_steps_sampled: 26991
    num_steps_sampled: 26991
    num_steps_trained: 109464
    num_steps_trained_this_iter: 32
    num_target_updates: 252
  iterations_since_restore: 141
  node_ip: 172.17.0.2
  num_healthy_workers: 1
  off_policy_estimator: {}
  perf:
    cpu_util_percent: 13.65
    ram_util

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
QMIX_grouped_foodenv_5371f_00000,RUNNING,172.17.0.2:26724,141,192.863,26991,34,34,34,3


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
QMIX_grouped_foodenv_5371f_00000,RUNNING,172.17.0.2:26724,145,197.774,27501,34,34,34,3


Result for QMIX_grouped_foodenv_5371f_00000:
  agent_timesteps_total: 27705
  custom_metrics: {}
  date: 2022-04-01_05-18-19
  done: false
  episode_len_mean: 3.0
  episode_media: {}
  episode_reward_max: 34.0
  episode_reward_mean: 34.0
  episode_reward_min: 34.0
  episodes_this_iter: 68
  episodes_total: 4231
  experiment_id: d11fbc22d1c74b2682ad267be9ec3218
  hostname: 1d29a0c222c3
  info:
    last_target_update_ts: 27651
    learner:
      default_policy:
        learner_stats:
          grad_norm: 13.317123413085938
          loss: 0.8504083156585693
          q_taken_mean: 12.401097153172348
          target_mean: 12.223786325165719
          td_error_abs: 0.6288442900686553
    num_agent_steps_sampled: 27705
    num_steps_sampled: 27705
    num_steps_trained: 113999
    num_steps_trained_this_iter: 33
    num_target_updates: 259
  iterations_since_restore: 146
  node_ip: 172.17.0.2
  num_healthy_workers: 1
  off_policy_estimator: {}
  perf:
    cpu_util_percent: 14.5
    ram_uti

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
QMIX_grouped_foodenv_5371f_00000,RUNNING,172.17.0.2:26724,149,202.886,28011,34,34,34,3


Result for QMIX_grouped_foodenv_5371f_00000:
  agent_timesteps_total: 28215
  custom_metrics: {}
  date: 2022-04-01_05-18-24
  done: false
  episode_len_mean: 3.0
  episode_media: {}
  episode_reward_max: 34.0
  episode_reward_mean: 34.0
  episode_reward_min: 34.0
  episodes_this_iter: 68
  episodes_total: 4401
  experiment_id: d11fbc22d1c74b2682ad267be9ec3218
  hostname: 1d29a0c222c3
  info:
    last_target_update_ts: 28161
    learner:
      default_policy:
        learner_stats:
          grad_norm: 12.745993614196777
          loss: 1.1133652925491333
          q_taken_mean: 7.323152715509588
          target_mean: 7.683316317471591
          td_error_abs: 0.7976856231689453
    num_agent_steps_sampled: 28215
    num_steps_sampled: 28215
    num_steps_trained: 117194
    num_steps_trained_this_iter: 44
    num_target_updates: 264
  iterations_since_restore: 150
  node_ip: 172.17.0.2
  num_healthy_workers: 1
  off_policy_estimator: {}
  perf:
    cpu_util_percent: 15.433333333333332

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
QMIX_grouped_foodenv_5371f_00000,RUNNING,172.17.0.2:26724,154,208.911,28623,34,34,34,3


Result for QMIX_grouped_foodenv_5371f_00000:
  agent_timesteps_total: 28725
  custom_metrics: {}
  date: 2022-04-01_05-18-29
  done: false
  episode_len_mean: 3.0
  episode_media: {}
  episode_reward_max: 34.0
  episode_reward_mean: 34.0
  episode_reward_min: 34.0
  episodes_this_iter: 34
  episodes_total: 4571
  experiment_id: d11fbc22d1c74b2682ad267be9ec3218
  hostname: 1d29a0c222c3
  info:
    last_target_update_ts: 28671
    learner:
      default_policy:
        learner_stats:
          grad_norm: 16.859647750854492
          loss: 0.932975709438324
          q_taken_mean: 11.696784019470215
          target_mean: 12.15079116821289
          td_error_abs: 0.6500687599182129
    num_agent_steps_sampled: 28725
    num_steps_sampled: 28725
    num_steps_trained: 120398
    num_steps_trained_this_iter: 32
    num_target_updates: 269
  iterations_since_restore: 155
  node_ip: 172.17.0.2
  num_healthy_workers: 1
  off_policy_estimator: {}
  perf:
    cpu_util_percent: 14.5
    ram_util_

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
QMIX_grouped_foodenv_5371f_00000,RUNNING,172.17.0.2:26724,157,212.956,29031,34,34,34,3


Result for QMIX_grouped_foodenv_5371f_00000:
  agent_timesteps_total: 29337
  custom_metrics: {}
  date: 2022-04-01_05-18-35
  done: false
  episode_len_mean: 3.0
  episode_media: {}
  episode_reward_max: 34.0
  episode_reward_mean: 34.0
  episode_reward_min: 34.0
  episodes_this_iter: 68
  episodes_total: 4775
  experiment_id: d11fbc22d1c74b2682ad267be9ec3218
  hostname: 1d29a0c222c3
  info:
    last_target_update_ts: 29283
    learner:
      default_policy:
        learner_stats:
          grad_norm: 30.223304748535156
          loss: 3.38950514793396
          q_taken_mean: 10.827490604285037
          target_mean: 11.640173709753787
          td_error_abs: 1.3852162216648911
    num_agent_steps_sampled: 29337
    num_steps_sampled: 29337
    num_steps_trained: 124167
    num_steps_trained_this_iter: 33
    num_target_updates: 275
  iterations_since_restore: 159
  node_ip: 172.17.0.2
  num_healthy_workers: 1
  off_policy_estimator: {}
  perf:
    cpu_util_percent: 15.833333333333334

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
QMIX_grouped_foodenv_5371f_00000,RUNNING,172.17.0.2:26724,161,218.812,29643,34,34,34,3


Result for QMIX_grouped_foodenv_5371f_00000:
  agent_timesteps_total: 29949
  custom_metrics: {}
  date: 2022-04-01_05-18-41
  done: false
  episode_len_mean: 3.0
  episode_media: {}
  episode_reward_max: 34.0
  episode_reward_mean: 34.0
  episode_reward_min: 34.0
  episodes_this_iter: 34
  episodes_total: 4979
  experiment_id: d11fbc22d1c74b2682ad267be9ec3218
  hostname: 1d29a0c222c3
  info:
    last_target_update_ts: 29895
    learner:
      default_policy:
        learner_stats:
          grad_norm: 14.164641380310059
          loss: 0.9975647926330566
          q_taken_mean: 12.541200183686756
          target_mean: 12.88115728469122
          td_error_abs: 0.8174606504894438
    num_agent_steps_sampled: 29949
    num_steps_sampled: 29949
    num_steps_trained: 128027
    num_steps_trained_this_iter: 42
    num_target_updates: 281
  iterations_since_restore: 163
  node_ip: 172.17.0.2
  num_healthy_workers: 1
  off_policy_estimator: {}
  perf:
    cpu_util_percent: 15.2
    ram_util

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
QMIX_grouped_foodenv_5371f_00000,TERMINATED,172.17.0.2:26724,164,223.68,30153,34,34,34,3


2022-04-01 05:18:44,048	INFO tune.py:636 -- Total run time: 235.17 seconds (234.67 seconds for the tuning loop).


# Results Analysis (QMIX + Coop)

In [39]:
import ray.rllib.agents.qmix as qmix

# list of lists: one list per checkpoint; each checkpoint list contains
# 1st the path, 2nd the metric value
checkpoints = analysis.get_trial_checkpoints_paths(
    trial=analysis.get_best_trial("episode_reward_mean",mode="max"),
    metric="episode_reward_mean")

best_checkpoint = sorted(checkpoints, key=lambda x: x[1])[-1]
last_checkpoint = checkpoints[-1]

trainer = qmix.QMixTrainer(config=config, env="grouped_foodenv")
trainer.restore(last_checkpoint[0])

# get weights: trainer.get_weights()

# run until episode ends

grouped_e = FoodGame(coop_env_config).with_agent_groups(
            groups={"agents": FoodGame(coop_env_config).agent_names},
            obs_space=tuple_obs_space,
            act_space=tuple_act_space,
        )

# episode_reward = {'agent'+str(i):0 for i in range(len(grouped_e.agents))}
# done = False
# obs = grouped_e.reset()
# print("Initial state at Time 0:")
# grouped_e.render()
# time = 1
# while not done:
#     print("Time:",time)
#     time += 1
#     action = trainer.compute_actions(obs,policy_id='default_policy')
#     for i in range(1,len(grouped_e.agents)):
#         a = trainer.compute_actions(obs,policy_id='default_policy')
#         action['agent'+str(i)] = a['agent'+str(i)]
#     obs, reward, dones, info = grouped_e.step(action)
#     done = dones['__all__']
#     grouped_e.render()
#     for i in range(len(grouped_e.agents)):
#         episode_reward['agent'+str(i)] += reward['agent'+str(i)]
# print("Episode Ended")
# print("Episode Rewards",episode_reward)

2022-03-06 05:20:59,824	INFO trainable.py:473 -- Restored on 172.17.0.2 from checkpoint: /home/ray/cs4246/06_03_2022_05:19:09/QMIX/QMIX_grouped_foodenv_01f18_00000_0_2022-03-06_05-19-09/checkpoint_000020/checkpoint-20
2022-03-06 05:20:59,825	INFO trainable.py:480 -- Current state after restoring: {'_iteration': 20, '_timesteps_total': 818, '_time_total': 64.86543583869934, '_episodes_total': 2417}
[2m[36m(RolloutWorker pid=2256)[0m   for k, v in state_dict.items()


# QMIX Trainer (Comp)

In [137]:
e = FoodGame(comp_env_config)
tuple_obs_space = Tuple([e.observation_space for i in range(len(e.agents))])
tuple_act_space = Tuple([e.action_space for i in range(len(e.agents))])

register_env("grouped_foodenv", lambda config: FoodGame(config).with_agent_groups(
            groups={'group'+str(i):[e.agent_names[i]] for i in range(len(e.agents))},
            obs_space=tuple_obs_space,
            act_space=tuple_act_space,
        ))

config = dict()
config["num_gpus"] = 0
config["env"] = "grouped_foodenv"
config["num_workers"] = 1
config["env_config"] = comp_env_config
config["lr"] = 1e-2
config["learning_starts"] = 1000
config["target_network_update_freq"] = 500
config["rollout_fragment_length"] = 4
config["train_batch_size"] = 32
config["timesteps_per_iteration"] = 1000
# config["exploration_config"] = {"epsilon_timesteps": 10000, }
config["exploration_config"] = {"epsilon_timesteps": 15000, "final_epsilon": 0.0}


now = datetime.now()
dt_string = now.strftime("%d_%m_%Y_%H:%M:%S")
os.mkdir('./'+dt_string)
log_dir = './'+dt_string+'/'

stop_criteria = {"training_iteration": 20}

analysis = ray.tune.run(
    "QMIX",
    config=config,
    local_dir=log_dir,
    stop=stop_criteria,
    checkpoint_freq=1,
    checkpoint_at_end=True)

Trial name,status,loc
QMIX_grouped_foodenv_8e6da_00000,PENDING,


[2m[36m(QMixTrainer pid=27216)[0m 2022-04-01 05:38:00,381	INFO simple_q.py:154 -- In multi-agent mode, policies will be optimized sequentially by the multi-GPU optimizer. Consider setting `simple_optimizer=True` if this doesn't work for you.
[2m[36m(QMixTrainer pid=27216)[0m 2022-04-01 05:38:00,381	INFO trainer.py:792 -- Current log_level is WARN. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.


Trial name,status,loc
QMIX_grouped_foodenv_8e6da_00000,RUNNING,172.17.0.2:27216


2022-04-01 05:38:03,688	ERROR trial_runner.py:927 -- Trial QMIX_grouped_foodenv_8e6da_00000: Error processing event.
Traceback (most recent call last):
  File "/home/ray/anaconda3/lib/python3.7/site-packages/ray/tune/trial_runner.py", line 893, in _process_trial
    results = self.trial_executor.fetch_result(trial)
  File "/home/ray/anaconda3/lib/python3.7/site-packages/ray/tune/ray_trial_executor.py", line 707, in fetch_result
    result = ray.get(trial_future[0], timeout=DEFAULT_GET_TIMEOUT)
  File "/home/ray/anaconda3/lib/python3.7/site-packages/ray/_private/client_mode_hook.py", line 105, in wrapper
    return func(*args, **kwargs)
  File "/home/ray/anaconda3/lib/python3.7/site-packages/ray/worker.py", line 1733, in get
    raise value.as_instanceof_cause()
ray.exceptions.RayTaskError(ValueError): [36mray::QMixTrainer.train()[39m (pid=27216, ip=172.17.0.2, repr=QMixTrainer)
  File "/home/ray/anaconda3/lib/python3.7/site-packages/ray/tune/trainable.py", line 315, in train
    resu

Result for QMIX_grouped_foodenv_8e6da_00000:
  date: 2022-04-01_05-38-03
  experiment_id: adc7f78f897e493d94465bc561c0a724
  hostname: 1d29a0c222c3
  node_ip: 172.17.0.2
  pid: 27216
  timestamp: 1648816683
  trial_id: 8e6da_00000
  


Trial name,status,loc
QMIX_grouped_foodenv_8e6da_00000,ERROR,172.17.0.2:27216

Trial name,# failures,error file
QMIX_grouped_foodenv_8e6da_00000,1,/home/ray/cs4246/01_04_2022_05:37:56/QMIX/QMIX_grouped_foodenv_8e6da_00000_0_2022-04-01_05-37-56/error.txt


[2m[36m(RolloutWorker pid=27215)[0m   for k, v in state_dict.items()


TuneError: ('Trials did not complete', [QMIX_grouped_foodenv_8e6da_00000])

In [136]:
{'group'+str(i):e.agent_names[i] for i in range(len(e.agents))}

{'group0': 'agent0', 'group1': 'agent1'}