In [None]:
import ray
import torch
from rllib_setup import get_env_continuous

env = get_env_continuous()
env_name = "VJS"
ray.init()
ray.rllib.utils.check_env(env)
from ray.tune import register_env
register_env(env_name,lambda config: get_env_continuous())


In [None]:
from ray.rllib.policy.policy import PolicySpec
test_env = get_env_continuous()
obs_space = test_env.observation_space
act_space = test_env.action_space
def policies(agent_ids):
    return {
        str(i): PolicySpec(
            # observation_space=obs_space,
            # action_space=act_space,
            config=config.overrides(agent_id=int(i[8:])),
        )
        for i in agent_ids
    }


In [None]:
from ray.tune.registry import get_trainable_cls
import os
config = (
    get_trainable_cls("MADDPG")
    .get_default_config()
    .environment(env=env_name)
    # Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0.
    .resources(num_gpus=int(os.environ.get("RLLIB_NUM_GPUS", "0")))
)
(
    config.framework("tf")
    # .environment(env_config={"actions_are_logits": True})
    .training(num_steps_sampled_before_learning_starts=100)
    .multi_agent(
        policies=policies(test_env._agent_ids),
        policy_mapping_fn=lambda agent_id, episode, worker, **kwargs: str(
            agent_id
        ),
    )
)


In [None]:
from ray import air, tune
stop = {
    "training_iteration": 10000,
}
results = tune.Tuner(
    "MADDPG",
    run_config=air.RunConfig(stop=stop, verbose=2),
    param_space=config,
).fit()


In [None]:
import ray
from ray.rllib.algorithms.ddpg.ddpg import DDPGConfig
from ray.rllib.policy.policy import PolicySpec
from ray.tune import register_env
import rich  
import rllib_setup

env_name = "VJS"
register_env(
    env_name,
    lambda config: rllib_setup.get_env_continuous(),
)
test_env = rllib_setup.get_env_continuous()


def policies(agent_ids):
    obs_space = test_env.observation_space
    act_space = test_env.action_space
    return {
        str(i): PolicySpec(
            observation_space=obs_space,
            action_space=act_space,
            # config=config.overrides(agent_id=int(i[8:])),
        )
        for i in agent_ids
    }




In [None]:
config = (
    DDPGConfig()
    .training(lr=0.01)
    .resources(num_gpus=1)
    .multi_agent(
        policies=policies(test_env._agent_ids),
        policy_mapping_fn=lambda agent_id, episode, **kwargs: str(agent_id),
    )
)
# config.batch_mode = "complete_episodes"
rich.print(config.to_dict())


In [None]:
algo = config.build(env=env_name)


In [None]:
algo.train()


In [None]:
from ray import tune
from ray import air


In [None]:
tune.Tuner(  
    "DDPG",
    run_config=air.RunConfig(stop={"episode_reward_mean": 200}),
    param_space=config.to_dict(),
).fit()


In [None]:
from ray.rllib.algorithms.sac import SACConfig
from ray.tune import register_env
import rllib_setup

env_name = "VJS"
register_env(
    env_name,
    lambda config: rllib_setup.get_env_continuous(),
)
test_env = rllib_setup.get_env_continuous()


def policies(agent_ids):
    obs_space = test_env.observation_space
    act_space = test_env.action_space
    return {
        str(i): (
            None,
            obs_space,
            act_space,
            {}
            # config=config.overrides(agent_id=int(i[8:])),
        )
        for i in agent_ids
    }
    
    
config = SACConfig().training(gamma=0.9, lr=0.01)
config = config.resources(num_gpus=0)  
config = config.rollouts(num_rollout_workers=4).multi_agent(
        policies=policies(test_env._agent_ids),
        policy_mapping_fn=lambda agent_id, episode, **kwargs: str(agent_id),
    )
config.batch_mode = "complete_episodes"
print(config.to_dict())  
# Build a Algorithm object from the config and run 1 training iteration.
algo = config.build(env=env_name)  
algo.train()


In [None]:
from ray.rllib.algorithms.maddpg.maddpg import MADDPGConfig
config = MADDPGConfig()
print(config.replay_buffer_config)  
replay_config = config.replay_buffer_config.update(  
    {
        "capacity": 100000,
        "prioritized_replay_alpha": 0.8,
        "prioritized_replay_beta": 0.45,
        "prioritized_replay_eps": 2e-6,
    }
)
config.training(replay_buffer_config=replay_config)   
config = config.resources(num_gpus=0)   
config = config.rollouts(num_rollout_workers=4)   
config = config.environment(env=env_name)  
algo = config.build()  
algo.train()  


In [None]:
import ray
from ray.rllib.algorithms.ddpg.ddpg import DDPGConfig
from ray.rllib.policy.policy import PolicySpec
from ray.tune import register_env

import rllib_setup

ray.init()
env_name = "VJS"
register_env(
    env_name,
    lambda config: rllib_setup.get_env_continuous(60),
)
test_env = rllib_setup.get_env_continuous()


def policies(agent_ids):
    obs_space = test_env.observation_space
    act_space = test_env.action_space
    return {
        str(i): (
            None,
            obs_space,
            act_space,
            {}
            # config=config.overrides(agent_id=int(i[8:])),
        )
        for i in agent_ids
    }


config = (
    DDPGConfig().rollouts(num_rollout_workers=20)
    .training(lr=0.01)
    .resources(num_gpus=1)
    .multi_agent(
        policies=policies(test_env._agent_ids),
        policy_mapping_fn=lambda agent_id, episode, **kwargs: str(agent_id),
    )
    .environment(disable_env_checking=True)
)
config.batch_mode = "complete_episodes"
print(config.to_dict())
# Build a Algorithm object from the config and run one training iteration.
algo = config.build(env=env_name)
algo.train()
algo.evaluate()


In [None]:
for i in range(10):
    print(algo.training_step()['Machine_0']['learner_stats']['actor_loss'])


In [None]:
algo.evaluate()


In [None]:
from ray.rllib.algorithms.dqn import DQNConfig
from rllib_setup import get_env
from ray.tune import register_env
import ray

test_env = get_env()
env_name = "VJS_discrete"
register_env(env_name,lambda config: get_env())

ray.init()



In [None]:
def policies(agent_ids):
    obs_space = test_env.observation_space
    act_space = test_env.action_space
    return {
        str(i): (
            None,
            obs_space,
            act_space,
            {}
            # config=config.overrides(agent_id=int(i[8:])),
        )
        for i in agent_ids
    }

config = DQNConfig().environment(env_name,disable_env_checking=True).rollouts(num_rollout_workers=4).training(model={"fcnet_hiddens": [64, 64]}).evaluation(evaluation_num_workers=1).multi_agent(
        policies=policies(test_env._agent_ids),
        policy_mapping_fn=lambda agent_id, episode, **kwargs: str(agent_id),
    )


In [None]:
algo = config.build()


In [None]:
for _ in range(5):
    print(algo.train())

algo.evaluate()


## DQN

In [15]:
from cgi import test
from rllib_setup import get_env
import ray
from ray import tune
from ray.tune.registry import register_env
from ray.rllib.algorithms.dqn import DQNConfig

alg_name = "DQN"
env_name = "VJS"
register_env(env_name,lambda config: get_env())

test_env = get_env()
obs_space = test_env.observation_space
act_space = test_env.action_space
def policies(agent_ids):
    return {
        str(i): (
            None,
            obs_space,
            act_space,
            {}
            # config=config.overrides(agent_id=int(i[8:])),
        )
        for i in agent_ids
    }






In [16]:
config = (
    DQNConfig()
    .environment(env=env_name, disable_env_checking=True)
    .rollouts(num_rollout_workers=10,create_env_on_local_worker=True,num_envs_per_worker=1,)
    .training(
        train_batch_size=200,
    )
    .multi_agent(
        policies=policies(test_env._agent_ids),
        policy_mapping_fn=(lambda agent_id, *args, **kwargs: agent_id),
    )
    .framework(framework="torch")
    .exploration(
        exploration_config={
            # The Exploration class to use.
            "type": "EpsilonGreedy",
            # Config for the Exploration class' constructor:
            "initial_epsilon": 0.1,
            "final_epsilon": 0.0,
            "epsilon_timesteps": 100000,  # Timesteps over which to anneal epsilon.
        }
    )
)
algo = config.build()
algo.train()
algo.evaluate()


2023-09-29 08:59:45,727	INFO algorithm.py:536 -- Current log_level is WARN. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.


In [22]:
for j in range(10):
    for i in range(10):
        info = 
    print(info['info']['learner']['Machine_0']['mean_td_error'])




2695.794677734375




-6843.23046875
3328.9599609375




-3521.87451171875




-1483.706298828125
-5857.744140625




-4401.5693359375
1435.29736328125




188.28355407714844
1375.504150390625


In [23]:
info['info']['learner']['Machine_0']['mean_td_error']


1375.504150390625

ValueError: Cannot evaluate w/o an evaluation worker set in the Trainer or w/o an env on the local worker!
Try one of the following:
1) Set `evaluation_interval` >= 0 to force creating a separate evaluation worker set.
2) Set `create_env_on_driver=True` to force the local (non-eval) worker to have an environment to evaluate on.

[2m[33m(raylet)[0m [2023-09-29 10:23:59,570 E 3526324 3526324] (raylet) node_manager.cc:3069: 15 Workers (tasks / actors) killed due to memory pressure (OOM), 0 Workers crashed due to other reasons at node (ID: 080095a5d5fb8ac87d836ac0e986b1e0de2a090de452cfe9d630aeee, IP: 192.168.3.6) over the last time period. To see more information about the Workers killed on this node, use `ray logs raylet.out -ip 192.168.3.6`
[2m[33m(raylet)[0m 
[2m[33m(raylet)[0m Refer to the documentation on how to address the out of memory issue: https://docs.ray.io/en/latest/ray-core/scheduling/ray-oom-prevention.html. Consider provisioning more memory on this node or reducing task parallelism by requesting more CPUs per task. To adjust the kill threshold, set the environment variable `RAY_memory_usage_threshold` when starting Ray. To disable worker killing, set the environment variable `RAY_memory_monitor_refresh_ms` to zero.
[2m[33m(raylet)[0m [2023-09-29 10:25:00,786 E 3526324 3526324] (raylet) 

In [None]:
tune.run(
    alg_name,
    name="DQN",
    stop={"timesteps_total": 10000000},
    checkpoint_freq=10,
    config=config.to_dict(),
)


In [1]:
from ray.rllib.algorithms.algorithm import Algorithm
import rllib_setup
from ray.tune import register_env

env = rllib_setup.get_env(20,6)
env_name = "VJS"
register_env(
    env_name,
    lambda config: rllib_setup.get_env_continuous(
        20,6
    ),
)



In [22]:
algo = Algorithm.from_checkpoint('/home/yuan/ray_results/PPO6_20/PPO_VJS_aec24_00000_0_2023-10-06_11-08-54/checkpoint_003700')


algo.evaluate()




{'evaluation': {'sampler_results': {'episode_reward_max': 17974712.205365896,
   'episode_reward_min': 15759992.593395708,
   'episode_reward_mean': 16920473.39129135,
   'episode_len_mean': 7624.7,
   'episode_media': {},
   'episodes_this_iter': 10,
   'policy_reward_min': {'Machine_3': 1253082.302954793,
    'Machine_1': 1798822.2193907949,
    'Machine_2': 2878983.3106533387,
    'Machine_0': 1264634.8219270706,
    'Machine_5': 2647229.782274436,
    'Machine_4': 5036013.070080873},
   'policy_reward_max': {'Machine_3': 1660536.4115624428,
    'Machine_1': 3526111.0551816756,
    'Machine_2': 3299545.8714431,
    'Machine_0': 1561459.655945778,
    'Machine_5': 3134297.2315452183,
    'Machine_4': 6040116.070522367},
   'policy_reward_mean': {'Machine_3': 1364091.5848776877,
    'Machine_1': 2731664.328045578,
    'Machine_2': 3022144.0399926035,
    'Machine_0': 1426882.344321102,
    'Machine_5': 2888914.6108362232,
    'Machine_4': 5486776.483218161},
   'custom_metrics': {},
 

In [2]:
algo = Algorithm.from_checkpoint('/home/yuan/ray_results/PPO6_20_lstm/PPO_VJS_b6407_00000_0_2023-10-10_10-07-06/checkpoint_001120')




`UnifiedLogger` will be removed in Ray 2.7.
  return UnifiedLogger(config, logdir, loggers=None)
The `JsonLogger interface is deprecated in favor of the `ray.tune.json.JsonLoggerCallback` interface and will be removed in Ray 2.7.
  self._loggers.append(cls(self.config, self.logdir, self.trial))
The `CSVLogger interface is deprecated in favor of the `ray.tune.csv.CSVLoggerCallback` interface and will be removed in Ray 2.7.
  self._loggers.append(cls(self.config, self.logdir, self.trial))
The `TBXLogger interface is deprecated in favor of the `ray.tune.tensorboardx.TBXLoggerCallback` interface and will be removed in Ray 2.7.
  self._loggers.append(cls(self.config, self.logdir, self.trial))
2023-10-11 10:26:27,133	INFO worker.py:1627 -- Started a local Ray instance. View the dashboard at [1m[32m127.0.0.1:8265 [39m[22m
2023-10-11 10:26:39,078	INFO trainable.py:173 -- Trainable.setup took 14.118 seconds. If your trainable is slow to initialize, consider setting reuse_actors=True to redu

In [3]:
algo.evaluate()




{'evaluation': {'sampler_results': {'episode_reward_max': 17666806.658014834,
   'episode_reward_min': 16162207.896256506,
   'episode_reward_mean': 17037157.78231591,
   'episode_len_mean': 7618.8,
   'episode_media': {},
   'episodes_this_iter': 10,
   'policy_reward_min': {'Machine_1': 4302490.050000007,
    'Machine_0': 1117759.6995043159,
    'Machine_5': 1809157.5759562994,
    'Machine_2': 2467031.3549277303,
    'Machine_3': 942102.8003883362,
    'Machine_4': 4439623.050000007},
   'policy_reward_max': {'Machine_1': 5516603.550000009,
    'Machine_0': 1535569.8467799425,
    'Machine_5': 2516708.8859967114,
    'Machine_2': 3246056.7200411037,
    'Machine_3': 1547587.2218572497,
    'Machine_4': 5125526.550000009},
   'policy_reward_mean': {'Machine_1': 4791143.700000008,
    'Machine_0': 1351937.0583658218,
    'Machine_5': 2172449.5036802194,
    'Machine_2': 2783395.1844836688,
    'Machine_3': 1105832.6357861995,
    'Machine_4': 4832399.700000009},
   'custom_metrics': {

In [3]:
import numpy as np




In [4]:
algo.get_policy("Machine_0").compute_actions(np.array([ 72., 144.,  72., 144.,  72., 144.,  72., 144.,  72., 144.,  72.,
         144.,  72., 144.,  72., 144.,  72., 144.,  72., 144.,   0.,   0.,
           1.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   1.,
           0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.]).reshape(1,43))


(array([[-0.18600105]], dtype=float32),
 [],
 {'vf_preds': array([-0.2056104], dtype=float32),
  'action_dist_inputs': array([[-0.18465035, -5.2225056 ]], dtype=float32),
  'action_prob': array([71.68005], dtype=float32),
  'action_logp': array([4.2722125], dtype=float32)})

In [13]:
[[-0.18600105]][0][0]


-0.18600105

In [5]:
from environment import Environment
def get_env_continuous(average_per_slot=50, machine_num=12):
    para = Environment.VehicleJobSchedulingParameters(
        average_per_slot=average_per_slot, machine_numbers=machine_num
    )
    para.action_space_continuous = True
    env = Environment.VehicleJobSchedulingEnvACE(parameter=para)
    return env

env = get_env_continuous(20,6)
env.reset()
reward_sums = {a: 0.0 for a in env.possible_agents}


In [21]:
env.action_space("Machine_0")


Box(1.0, 2.0, (1,), float32)

In [17]:
i=0
env.reset()
for agent in env.agent_iter():
    observation, reward, termination, truncation, info = env.last()
    reward_sums[agent] += reward
    action, *_ = algo.get_policy(agent).compute_actions(observation.reshape(1,43))
    print(action)
    env.step(action)
    i+=1
    if i>100:
        break


[[-0.18025556]]
[[-0.93157697]]
[[1521.1565]]
[[-18.84367]]
[[-0.97843575]]
[[-4329.4097]]
[[-0.9351009]]
[[-1349.0253]]
[[-0.9849549]]
[[36024.918]]
[[5203.2603]]
[[-0.92977226]]
[[13212.924]]
[[-1976.8417]]
[[10135.784]]
[[-16700.852]]
[[-24796.709]]
[[-0.9912483]]
[[17433.479]]
[[-0.9362991]]
[[1787.8755]]
[[-8938.839]]
[[-0.97973603]]
[[101436.06]]
[[-27568.69]]
[[-0.9444496]]
[[-6981.4136]]
[[12389.975]]
[[4186.49]]
[[-0.9907693]]
[[-955.98236]]
[[-23271.064]]
[[-1.0105983]]
[[-0.9424525]]
[[-48649.883]]
[[1397.5411]]
[[-8374.554]]
[[-0.94007516]]
[[-0.9783103]]
[[-3256.9146]]
[[1190.9998]]
[[-0.94110984]]
[[-0.9966648]]
[[35427.016]]
[[-775.4127]]
[[402.48297]]
[[-0.98352057]]
[[-0.938028]]
[[2363.547]]
[[1412.9741]]
[[-28900.477]]
[[-2839.2832]]
[[469.6115]]
[[-4158.781]]
[[4130.4185]]
[[-0.9419674]]
[[-3442.056]]
[[-0.92484224]]
[[-3797.473]]
[[-0.98916173]]
[[-0.9377638]]
[[8161.059]]
[[18311.672]]
[[-131.02962]]
[[-1.0004109]]
[[81.4695]]
[[-30173.771]]
[[27482.553]]
[[-82.31

In [11]:
reward_sums


{'Machine_0': -35766100743.842094,
 'Machine_1': -3064942.814621845,
 'Machine_2': -9359617387.80679,
 'Machine_3': -8970435469.520561,
 'Machine_4': -5179407.267839285,
 'Machine_5': -24161432801.123013}