# Training with SAC (rllib)

In [1]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import numpy as np
import time

import ray
import ray.rllib.agents as agents
from ray.tune.registry import register_env
from ray.tune.logger import pretty_print

import gym.spaces as spaces

import matplotlib.pyplot as plt
from IPython.display import clear_output

%matplotlib inline

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
# This part implements live plotting

def live_plot(data_dict,
              xlabel='',
              figsize=(7,5),
              title='',
              save=False,
              fname='figure.pdf',
              erase=True):
    """Draw every curve of ``data_dict`` in a new matplotlib figure.

    Args:
        data_dict: mapping from a curve label to an indexable pair; item 0
            holds the y values and item 1 holds the x values.
        xlabel: label of the x axis.
        figsize: figure size forwarded to ``plt.figure``.
        title: title of the figure.
        save: when truthy, the figure is written to ``fname`` before it is shown.
        fname: output path used when ``save`` is truthy.
        erase: when truthy, the current cell output is cleared first, so that
            successive calls replace the previous plot.
    """
    if erase:
        clear_output(wait=True)

    plt.figure(figsize=figsize)
    for label, series in data_dict.items():
        # series[1] goes on the x axis, series[0] on the y axis
        plt.plot(series[1], series[0], label=label)

    plt.title(title)
    plt.grid(True)
    plt.xlabel(xlabel)
    plt.legend(loc='upper left')  # the plot evolves to the right

    if save:
        plt.savefig(fname, bbox_inches='tight')
    plt.show()

In [3]:
# These are the common rllib settings, worth reading.
# The `if True` branch prints the defaults of the installed version. The `else`
# branch is never executed: it only keeps a pasted copy of the defaults, with
# their explanatory comments, as reading material (it references
# MODEL_DEFAULTS, which is not defined in the visible cells).
# NOTE(review): the pasted copy may not match the installed version -- e.g. the
# recorded output of this cell shows `log_level: WARN` where the copy has "INFO".

if True:
    print(pretty_print(agents.trainer.COMMON_CONFIG.copy()))
else:
    COMMON_CONFIG = {
    # === Debugging ===
    # Whether to write episode stats and videos to the agent log dir
    "monitor": False,
    # Set the ray.rllib.* log level for the agent process and its workers.
    # Should be one of DEBUG, INFO, WARN, or ERROR. The DEBUG level will also
    # periodically print out summaries of relevant internal dataflow (this is
    # also printed out once at startup at the INFO level).
    "log_level": "INFO",
    # Callbacks that will be run during various phases of training. These all
    # take a single "info" dict as an argument. For episode callbacks, custom
    # metrics can be attached to the episode by updating the episode object's
    # custom metrics dict (see examples/custom_metrics_and_callbacks.py). You
    # may also mutate the passed in batch data in your callback.
    "callbacks": {
        "on_episode_start": None,     # arg: {"env": .., "episode": ...}
        "on_episode_step": None,      # arg: {"env": .., "episode": ...}
        "on_episode_end": None,       # arg: {"env": .., "episode": ...}
        "on_sample_end": None,        # arg: {"samples": .., "worker": ...}
        "on_train_result": None,      # arg: {"trainer": ..., "result": ...}
        "on_postprocess_traj": None,  # arg: {
                                      #   "agent_id": ..., "episode": ...,
                                      #   "pre_batch": (before processing),
                                      #   "post_batch": (after processing),
                                      #   "all_pre_batches": (other agent ids),
                                      # }
    },
    # Whether to attempt to continue training if a worker crashes.
    "ignore_worker_failures": False,
    # Log system resource metrics to results.
    "log_sys_usage": True,
    # Enable TF eager execution (TF policies only).
    "eager": False,
    # Enable tracing in eager mode. This greatly improves performance, but
    # makes it slightly harder to debug since Python code won't be evaluated
    # after the initial eager pass.
    "eager_tracing": False,
    # Disable eager execution on workers (but allow it on the driver). This
    # only has an effect if eager is enabled.
    "no_eager_on_workers": False,

    # === Policy ===
    # Arguments to pass to model. See models/catalog.py for a full list of the
    # available model options.
    "model": MODEL_DEFAULTS,
    # Arguments to pass to the policy optimizer. These vary by optimizer.
    "optimizer": {},

    # === Environment ===
    # Discount factor of the MDP
    "gamma": 0.99,
    # Number of steps after which the episode is forced to terminate. Defaults
    # to `env.spec.max_episode_steps` (if present) for Gym envs.
    "horizon": None,
    # Calculate rewards but don't reset the environment when the horizon is
    # hit. This allows value estimation and RNN state to span across logical
    # episodes denoted by horizon. This only has an effect if horizon != inf.
    "soft_horizon": False,
    # Don't set 'done' at the end of the episode. Note that you still need to
    # set this if soft_horizon=True, unless your env is actually running
    # forever without returning done=True.
    "no_done_at_end": False,
    # Arguments to pass to the env creator
    "env_config": {},
    # Environment name can also be passed via config
    "env": None,
    # Whether to clip rewards prior to experience postprocessing. Setting to
    # None means clip for Atari only.
    "clip_rewards": None,
    # Whether to np.clip() actions to the action space low/high range spec.
    "clip_actions": True,
    # Whether to use rllib or deepmind preprocessors by default
    "preprocessor_pref": "deepmind",
    # The default learning rate
    "lr": 0.0001,

    # === Evaluation ===
    # Evaluate with every `evaluation_interval` training iterations.
    # The evaluation stats will be reported under the "evaluation" metric key.
    # Note that evaluation is currently not parallelized, and that for Ape-X
    # metrics are already only reported for the lowest epsilon workers.
    "evaluation_interval": None,
    # Number of episodes to run per evaluation period.
    "evaluation_num_episodes": 10,
    # Extra arguments to pass to evaluation workers.
    # Typical usage is to pass extra args to evaluation env creator
    # and to disable exploration by computing deterministic actions
    # TODO(kismuz): implement determ. actions and include relevant keys hints
    "evaluation_config": {},

    # === Resources ===
    # Number of actors used for parallelism
    "num_workers": 2,
    # Number of GPUs to allocate to the trainer process. Note that not all
    # algorithms can take advantage of trainer GPUs. This can be fractional
    # (e.g., 0.3 GPUs).
    "num_gpus": 0,
    # Number of CPUs to allocate per worker.
    "num_cpus_per_worker": 1,
    # Number of GPUs to allocate per worker. This can be fractional.
    "num_gpus_per_worker": 0,
    # Any custom resources to allocate per worker.
    "custom_resources_per_worker": {},
    # Number of CPUs to allocate for the trainer. Note: this only takes effect
    # when running in Tune.
    "num_cpus_for_driver": 1,

    # === Memory quota ===
    # You can set these memory quotas to tell Ray to reserve memory for your
    # training run. This guarantees predictable execution, but the tradeoff is
    # if your workload exceeds the memory quota it will fail.
    # Heap memory to reserve for the trainer process (0 for unlimited). This
    # can be large if you are using large train batches, replay buffers, etc.
    "memory": 0,
    # Object store memory to reserve for the trainer process. Being large
    # enough to fit a few copies of the model weights should be sufficient.
    # This is enabled by default since models are typically quite small.
    "object_store_memory": 0,
    # Heap memory to reserve for each worker. Should generally be small unless
    # your environment is very heavyweight.
    "memory_per_worker": 0,
    # Object store memory to reserve for each worker. This only needs to be
    # large enough to fit a few sample batches at a time. This is enabled
    # by default since it almost never needs to be larger than ~200MB.
    "object_store_memory_per_worker": 0,

    # === Execution ===
    # Number of environments to evaluate vectorwise per worker.
    "num_envs_per_worker": 1,
    # Default sample batch size (unroll length). Batches of this size are
    # collected from workers until train_batch_size is met. When using
    # multiple envs per worker, this is multiplied by num_envs_per_worker.
    "sample_batch_size": 200,
    # Training batch size, if applicable. Should be >= sample_batch_size.
    # Samples batches will be concatenated together to this size for training.
    "train_batch_size": 200,
    # Whether to rollout "complete_episodes" or "truncate_episodes"
    "batch_mode": "truncate_episodes",
    # Use a background thread for sampling (slightly off-policy, usually not
    # advisable to turn on unless your env specifically requires it)
    "sample_async": False,
    # Element-wise observation filter, either "NoFilter" or "MeanStdFilter"
    "observation_filter": "NoFilter",
    # Whether to synchronize the statistics of remote filters.
    "synchronize_filters": True,
    # Configure TF for single-process operation by default
    "tf_session_args": {
        # note: overridden by `local_tf_session_args`
        "intra_op_parallelism_threads": 2,
        "inter_op_parallelism_threads": 2,
        "gpu_options": {
            "allow_growth": True,
        },
        "log_device_placement": False,
        "device_count": {
            "CPU": 1
        },
        "allow_soft_placement": True,  # required by PPO multi-gpu
    },
    # Override the following tf session args on the local worker
    "local_tf_session_args": {
        # Allow a higher level of parallelism by default, but not unlimited
        # since that can cause crashes with many concurrent drivers.
        "intra_op_parallelism_threads": 8,
        "inter_op_parallelism_threads": 8,
    },
    # Whether to LZ4 compress individual observations
    "compress_observations": False,
    # Wait for metric batches for at most this many seconds. Those that
    # have not returned in time will be collected in the next iteration.
    "collect_metrics_timeout": 180,
    # Smooth metrics over this many episodes.
    "metrics_smoothing_episodes": 100,
    # If using num_envs_per_worker > 1, whether to create those new envs in
    # remote processes instead of in the same worker. This adds overheads, but
    # can make sense if your envs can take much time to step / reset
    # (e.g., for StarCraft). Use this cautiously; overheads are significant.
    "remote_worker_envs": False,
    # Timeout that remote workers are waiting when polling environments.
    # 0 (continue when at least one env is ready) is a reasonable default,
    # but optimal value could be obtained by measuring your environment
    # step / reset and model inference perf.
    "remote_env_batch_wait_ms": 0,
    # Minimum time per iteration
    "min_iter_time_s": 0,
    # Minimum env steps to optimize for per train call. This value does
    # not affect learning, only the length of iterations.
    "timesteps_per_iteration": 0,
    # This argument, in conjunction with worker_index, sets the random seed of
    # each worker, so that identically configured trials will have identical
    # results. This makes experiments reproducible.
    "seed": None,

    # === Offline Datasets ===
    # Specify how to generate experiences:
    #  - "sampler": generate experiences via online simulation (default)
    #  - a local directory or file glob expression (e.g., "/tmp/*.json")
    #  - a list of individual file paths/URIs (e.g., ["/tmp/1.json",
    #    "s3://bucket/2.json"])
    #  - a dict with string keys and sampling probabilities as values (e.g.,
    #    {"sampler": 0.4, "/tmp/*.json": 0.4, "s3://bucket/expert.json": 0.2}).
    #  - a function that returns a rllib.offline.InputReader
    "input": "sampler",
    # Specify how to evaluate the current policy. This only has an effect when
    # reading offline experiences. Available options:
    #  - "wis": the weighted step-wise importance sampling estimator.
    #  - "is": the step-wise importance sampling estimator.
    #  - "simulation": run the environment in the background, but use
    #    this data for evaluation only and not for learning.
    "input_evaluation": ["is", "wis"],
    # Whether to run postprocess_trajectory() on the trajectory fragments from
    # offline inputs. Note that postprocessing will be done using the *current*
    # policy, not the *behaviour* policy, which is typically undesirable for
    # on-policy algorithms.
    "postprocess_inputs": False,
    # If positive, input batches will be shuffled via a sliding window buffer
    # of this number of batches. Use this if the input data is not in random
    # enough order. Input is delayed until the shuffle buffer is filled.
    "shuffle_buffer_size": 0,
    # Specify where experiences should be saved:
    #  - None: don't save any experiences
    #  - "logdir" to save to the agent log dir
    #  - a path/URI to save to a custom output directory (e.g., "s3://bucket/")
    #  - a function that returns a rllib.offline.OutputWriter
    "output": None,
    # What sample batch columns to LZ4 compress in the output data.
    "output_compress_columns": ["obs", "new_obs"],
    # Max output file size before rolling over to a new file.
    "output_max_file_size": 64 * 1024 * 1024,

    # === Multiagent ===
    "multiagent": {
        # Map from policy ids to tuples of (policy_cls, obs_space,
        # act_space, config). See rollout_worker.py for more info.
        "policies": {},
        # Function mapping agent ids to policy ids.
        "policy_mapping_fn": None,
        # Optional whitelist of policies to train, or None for all policies.
        "policies_to_train": None,
    },}

batch_mode: truncate_episodes
callbacks: <class 'ray.rllib.agents.callbacks.DefaultCallbacks'>
clip_actions: true
collect_metrics_timeout: 180
compress_observations: false
custom_resources_per_worker: {}
eager: -1
eager_tracing: false
env_config: {}
evaluation_config: {}
evaluation_num_episodes: 10
evaluation_num_workers: 0
exploration_config:
  type: StochasticSampling
explore: true
extra_python_environs_for_driver: {}
extra_python_environs_for_worker: {}
fake_sampler: false
framework: tf
gamma: 0.99
ignore_worker_failures: false
in_evaluation: false
input: sampler
input_evaluation:
- is
- wis
local_tf_session_args:
  inter_op_parallelism_threads: 8
  intra_op_parallelism_threads: 8
log_level: WARN
log_sys_usage: true
lr: 0.0001
memory: 0
memory_per_worker: 0
metrics_smoothing_episodes: 100
min_iter_time_s: 0
model:
  conv_activation: relu
  conv_filters: null
  custom_action_dist: null
  custom_model: null
  custom_model_config: {}
  custom_options: -1
  custom_preprocessor: null
  d

In [4]:
# SAC default parameters, worth reading too.
# The `if True` branch prints the SAC defaults of the installed version. The
# `else` branch is never executed: it only keeps a pasted, commented copy of
# the defaults as reading material (it calls with_common_config, which is not
# defined in the visible cells).
# NOTE(review): the pasted copy may not match the installed version -- e.g. the
# recorded output of this cell lists `fcnet_hiddens` under Q_model and shows
# `hidden_layer_sizes: -1`, where the copy has "hidden_layer_sizes": (256, 256).

if True:
    print(pretty_print(agents.sac.DEFAULT_CONFIG.copy()))
else:
    DEFAULT_CONFIG = with_common_config({
    # === Model ===
    "twin_q": True,
    "use_state_preprocessor": False,
    "policy": "GaussianLatentSpacePolicy",
    # RLlib model options for the Q function
    "Q_model": {
        "hidden_activation": "relu",
        "hidden_layer_sizes": (256, 256),
    },
    # RLlib model options for the policy function
    "policy_model": {
        "hidden_activation": "relu",
        "hidden_layer_sizes": (256, 256),
    },
    # Unsquash actions to the upper and lower bounds of env's action space
    "normalize_actions": True,

    # === Learning ===
    # Update the target by \tau * policy + (1-\tau) * target_policy
    "tau": 5e-3,
    # Target entropy lower bound. This is the inverse of reward scale,
    # and will be optimized automatically.
    "target_entropy": "auto",
    # Disable setting done=True at end of episode.
    "no_done_at_end": True,
    # N-step target updates
    "n_step": 1,

    # === Evaluation ===
    # The evaluation stats will be reported under the "evaluation" metric key.
    "evaluation_interval": 1,
    # Number of episodes to run per evaluation period.
    "evaluation_num_episodes": 1,
    # Extra configuration that disables exploration.
    "evaluation_config": {
        "exploration_enabled": False,
    },

    # === Exploration ===
    # Number of env steps to optimize for before returning
    "timesteps_per_iteration": 100,
    "exploration_enabled": True,

    # === Replay buffer ===
    # Size of the replay buffer. Note that if async_updates is set, then
    # each worker will have a replay buffer of this size.
    "buffer_size": int(1e6),
    # If True prioritized replay buffer will be used.
    # TODO(hartikainen): Make sure this works or remove the option.
    "prioritized_replay": False,
    "prioritized_replay_alpha": 0.6,
    "prioritized_replay_beta": 0.4,
    "prioritized_replay_eps": 1e-6,
    "prioritized_replay_beta_annealing_timesteps": 20000,
    "final_prioritized_replay_beta": 0.4,
    "compress_observations": False,

    # === Optimization ===
    "optimization": {
        "actor_learning_rate": 3e-4,
        "critic_learning_rate": 3e-4,
        "entropy_learning_rate": 3e-4,
    },
    # If not None, clip gradients during optimization at this value
    "grad_norm_clipping": None,
    # How many steps of the model to sample before learning starts.
    "learning_starts": 1500,
    # Update the replay buffer with this many samples at once. Note that this
    # setting applies per-worker if num_workers > 1.
    "sample_batch_size": 1,
    # Size of a batch sampled from replay buffer for training. Note that
    # if async_updates is set, then each worker returns gradients for a
    # batch of this size.
    "train_batch_size": 256,
    # Update the target network every `target_network_update_freq` steps.
    "target_network_update_freq": 0,

    # === Parallelism ===
    # Whether to use a GPU for local optimization.
    "num_gpus": 0,
    # Number of workers for collecting samples with. This only makes sense
    # to increase if your environment is particularly slow to sample, or if
    # you're using the Async or Ape-X optimizers.
    "num_workers": 0,
    # Whether to allocate GPUs for workers (if > 0).
    "num_gpus_per_worker": 0,
    # Whether to allocate CPUs for workers (if > 0).
    "num_cpus_per_worker": 1,
    # Whether to compute priorities on workers.
    "worker_side_prioritization": False,
    # Prevent iterations from going lower than this time span.
    "min_iter_time_s": 1,

    # DEPRECATED:
    "per_worker_exploration": -1,
    "exploration_fraction": -1,
    "schedule_max_timesteps": -1,
    "exploration_initial_eps": -1,
    "exploration_final_eps": -1,
    })

Q_model:
  fcnet_activation: relu
  fcnet_hiddens:
  - 256
  - 256
  hidden_activation: -1
  hidden_layer_sizes: -1
_deterministic_loss: false
_use_beta_distribution: false
batch_mode: truncate_episodes
buffer_size: 1000000
callbacks: <class 'ray.rllib.agents.callbacks.DefaultCallbacks'>
clip_actions: true
collect_metrics_timeout: 180
compress_observations: false
custom_resources_per_worker: {}
eager: -1
eager_tracing: false
env_config: {}
evaluation_config: {}
evaluation_num_episodes: 10
evaluation_num_workers: 0
exploration_config:
  type: StochasticSampling
explore: true
extra_python_environs_for_driver: {}
extra_python_environs_for_worker: {}
fake_sampler: false
final_prioritized_replay_beta: 0.4
framework: tf
gamma: 0.99
grad_norm_clipping: -1
ignore_worker_failures: false
in_evaluation: false
initial_alpha: 1.0
input: sampler
input_evaluation:
- is
- wis
learning_starts: 1500
local_tf_session_args:
  inter_op_parallelism_threads: 8
  intra_op_parallelism_threads: 8
log_level: WAR

In [5]:
# This command starts ray.
# One GPU is declared and the object store is capped at 1e9 bytes.
# NOTE(review): num_gpus=1 is hard-coded; adjust it on a machine without a GPU.
ray.init(num_gpus=1, object_store_memory=1000000000)  # can also be useful to set redis_max_memory

2020-07-11 18:54:57,138	INFO resource_spec.py:212 -- Starting Ray with 9.28 GiB memory available for workers and up to 0.93 GiB for objects. You can adjust these settings with ray.init(memory=<bytes>, object_store_memory=<bytes>).
2020-07-11 18:54:58,028	INFO services.py:1165 -- View the Ray dashboard at [1m[32mlocalhost:8265[39m[22m


{'node_ip_address': '192.168.0.100',
 'raylet_ip_address': '192.168.0.100',
 'redis_address': '192.168.0.100:6379',
 'object_store_address': 'tcp://127.0.0.1:55833',
 'raylet_socket_name': 'tcp://127.0.0.1:59087',
 'webui_url': 'localhost:8265',
 'session_dir': 'C:\\Users\\Yann\\AppData\\Local\\Temp\\ray\\session_2020-07-11_18-54-57_132772_11936'}

Traceback (most recent call last):
  File "C:\Users\Yann\anaconda3\lib\site-packages\ray\dashboard/dashboard.py", line 960, in <module>
    metrics_export_address=metrics_export_address)
  File "C:\Users\Yann\anaconda3\lib\site-packages\ray\dashboard/dashboard.py", line 513, in __init__
    build_dir = setup_static_dir(self.app)
  File "C:\Users\Yann\anaconda3\lib\site-packages\ray\dashboard/dashboard.py", line 414, in setup_static_dir
    "&& npm run build)", build_dir)
FileNotFoundError: [Errno 2] Dashboard build directory not found. If installing from source, please follow the additional steps required to build the dashboard(cd python/ray/dashboard/client && npm ci && npm run build): 'C:\\Users\\Yann\\anaconda3\\lib\\site-packages\\ray\\dashboard\\client/build'



In [None]:
# Episode length in time steps; passed to the environment as "ep_max_length".
NB_TIME_STEPS_EPISODE = 100

# environment:
# gym id handed to RllibMultiAgentEnv by env_creator
env_name = 'gym_tmrl:gym-tmrl-v0'

In [None]:
# Policies definition:

# these must be copy-pasted from the gym environment definition,
# because we don't want to instantiate a simulator just to retrieve them:

# NOTE(review): low_bounds and high_bounds are not defined in any visible cell;
# they must be defined before this cell for "Restart & Run All" to work.
obs_space = spaces.Dict({
    'objective_position': spaces.Box(low=-np.inf, high=np.inf, shape=(2,)),
    'sensor_value': spaces.Box(low=-np.inf, high=np.inf, shape=(2,))  # vector toward the closest drone within the detection radius
    })
act_space = spaces.Box(low_bounds, high_bounds)

# Policy spec tuple: (policy class, observation space, action space, config).
policy_1 = (None,   # None means we use the default policy
            obs_space,  # observation space
            act_space,  # action space
            {"gamma": 0.99})  # policy config

## Multi-Agent training: classic RL setting
Now, we train our policy in the non-real-time setting.

In [None]:
# configuration dictionary for the gym environment:
# NOTE(review): low_bounds, high_bounds and COLLISION_RADIUS are not defined in
# any visible cell; they must exist before this cell is run.
env_config = {
    "low_bounds": low_bounds,
    "high_bounds": high_bounds,
    "ep_max_length": NB_TIME_STEPS_EPISODE,
    "drones_names": ['d1', 'd2'],  # one agent per name
    # "detection_radius": DETECTION_RADIUS,
    "collision_radius": COLLISION_RADIUS,
    "real_time": False,  # non-real-time setting
}

In [None]:
def env_creator(env_config):
    """Build the multi-agent environment for the given configuration.

    Args:
        env_config: configuration dict forwarded as ``config`` to
            ``RllibMultiAgentEnv``.

    Returns:
        A ``RllibMultiAgentEnv`` built for the module-level ``env_name``.
    """
    # NOTE(review): RllibMultiAgentEnv is not imported in any visible cell.
    return RllibMultiAgentEnv(env_name, config=env_config)

# Make the environment available to rllib trainers under this name.
register_env("my_multiagent_env", env_creator)

In [None]:
# rllib trainer config:
# Start from the SAC defaults and override top-level keys only (the copy is
# shallow, so nested default dicts must be replaced, never mutated in place).

config = agents.sac.DEFAULT_CONFIG.copy()

# --- Memory quotas ---
# Heap memory to reserve for the trainer process (0 for unlimited). This
# can be large if you are using large train batches, replay buffers, etc.
# NOTE(review): this must not exceed the memory Ray reports as available at
# startup, otherwise the trainer may not be schedulable -- confirm on your machine.
config["memory"] = 12000000000
# Object store memory to reserve for the trainer process. Being large
# enough to fit a few copies of the model weights should be sufficient.
# This is enabled by default since models are typically quite small.
config["object_store_memory"] = 1000000000
# Heap memory to reserve for each worker. Should generally be small unless
# your environment is very heavyweight.
config["memory_per_worker"] = 0
# Object store memory to reserve for each worker. This only needs to be
# large enough to fit a few sample batches at a time. This is enabled
# by default since it almost never needs to be larger than ~200MB.
config["object_store_memory_per_worker"] = 0

# --- Resources ---
config["num_gpus"] = 1.0
config["num_gpus_per_worker"] = 0

# --- Episodes and iterations ---
# The horizon is tied to the episode length given to the environment
# ("ep_max_length"), so that the two cannot silently diverge.
config["horizon"] = NB_TIME_STEPS_EPISODE
config["timesteps_per_iteration"] = 100
config["train_batch_size"] = 256

# --- SAC specifics ---
config["normalize_actions"] = False  # bug if set to true? default true
config["buffer_size"] = int(1e6)  # default 1e6

config["learning_starts"] = int(1e5) # default 1500 no effect?

# config["batch_size"] = ...
# ...

# --- Environment and policies ---
config["env_config"] = env_config
# we can use several policies, each agent must be mapped to its policy
config["multiagent"] = {
    "policies": {
        # the first tuple value is None -> uses default policy
        "policy1": policy_1
    },
    # every agent shares the single policy
    "policy_mapping_fn": lambda agent_id: "policy1"
}

In [None]:
# rllib trainer:
# SAC trainer running on the environment registered as "my_multiagent_env".
trainer = agents.sac.SACTrainer(env="my_multiagent_env", config=config)

In [None]:
# Resume training from a previously saved SAC checkpoint.
# NOTE(review): hard-coded absolute path; this cell fails on a machine where the
# checkpoint does not exist. Skip it to train from scratch.
checkpoint = "/home/yann/ray_results/SAC_my_multiagent_env_2020-02-14_18-24-33jnyhtjgx/checkpoint_2578/checkpoint-2578"
trainer.restore(checkpoint)

In [None]:
# State used for live plotting (run once, before the first training run).
# Every curve is stored as [y_values, x_values].
time_start = time.time()
time_end = time_start  # no training run has ended yet
idle_time = 0
data_dict = {
    curve: [[], []]
    for curve in ('mean_reward', 'max_reward', 'min_reward')
}

In [None]:
# This is to append live plotting:
# the wall-clock time elapsed since the previous run of this cell ended is
# counted as idle, so that re-running the cell extends the curves without a gap.
idle_time += time.time() - time_end

# --- Training Algorithm: ---

max_training_iterations = 10000

# This is for early stopping: training stops after this many consecutive
# iterations without a new best mean reward.
early_stopping_iterations = 1000

max_avg_reward = -1.0 * np.inf
best_iteration = -1
early_stopping_counter = 0

try:
    for i in range(max_training_iterations):
        result = trainer.train()  # trainer step
        # early stopping:
        early_stopping_counter += 1
        # Iterations whose result has no entry for 'policy1' are not plotted
        # (presumably no episode has completed yet -- TODO confirm).
        if 'policy1' in result['policy_reward_mean']:
            rew_mean = result['policy_reward_mean']['policy1']
            if rew_mean > max_avg_reward:
                max_avg_reward = rew_mean
                best_iteration = i
                early_stopping_counter = 0
            # training time only: idle time between runs is subtracted
            timestamp = time.time() - time_start - idle_time
            data_dict['mean_reward'][0].append(rew_mean)
            data_dict['mean_reward'][1].append(timestamp)
            data_dict['max_reward'][0].append(result['policy_reward_max']['policy1'])
            data_dict['max_reward'][1].append(timestamp)
            data_dict['min_reward'][0].append(result['policy_reward_min']['policy1'])
            data_dict['min_reward'][1].append(timestamp)
            live_plot(data_dict, xlabel='time (s)', figsize=(7,5), title='SAC multiagent training')
            print(f"iteration {i} - policy reward mean: {result['policy_reward_mean']} max: {result['policy_reward_max']}")
            print(pretty_print(result))
        if early_stopping_counter >= early_stopping_iterations:
            print(f"The agent did not learn for the past {early_stopping_counter} iterations")
            break
        # checkpoints:
        if i % 100 == 0:
            checkpoint = trainer.save()
            print("checkpoint saved at", checkpoint)
finally:
    # Recorded even when the run is interrupted (e.g. kernel interrupt), so
    # that the idle-time accounting of the next run stays correct.
    time_end = time.time()

live_plot(data_dict, xlabel='time (s)', figsize=(7,5), title='SAC multiagent usual setting', save=False, fname='ppo1testsuite.pdf', erase=False)
checkpoint = trainer.save()
print("checkpoint saved at", checkpoint)

The algorithm may not have converged yet, even after many more iterations than in the single-agent setting! This is to be expected, as adding a second learning agent makes the environment much more difficult and non-Markov (we must study the effect of a non-Markov environment on PPO). Also, the more the training progresses, the more often the two drones collide in at least one episode. This needs further interpretation (check whether PPO has been implemented with the entropy term meant to ensure exploration).

## Visual evaluation of the learnt policy

RLlib can load a policy directly from a checkpoint:

In [None]:
# ppo 1 agent:
# NOTE(review): this path points to a PPO run, whereas `trainer` was built with
# agents.sac.SACTrainer -- confirm that the checkpoint matches the trainer.
# The absolute path is also machine-specific.
checkpoint ="/home/yann/ray_results/PPO_my_multiagent_env_2020-02-14_13-18-02j59jujiv/checkpoint_10/checkpoint-10"
trainer.restore(checkpoint)

Let us test this policy against its training setting:

In [None]:
# Agent ids used by the test environments below.
DRONE1_NAME, DRONE2_NAME, DRONE3_NAME = 'd1', 'd2', 'd3'

In [None]:
# Test environment with a single drone.
# Note: this overwrites the "drones_names" entry of the shared env_config dict.
env_config["drones_names"] = [DRONE1_NAME]
test_env = env_creator(env_config)

In [None]:
# Reset the test environment, show the initial observations and render it.
obs_n_init = test_env.reset()
print(pretty_print(obs_n_init))
test_env.env.render()

In [None]:
# Roll out the restored policy for at most 50 steps (or until the environment
# reports '__all__' done), rendering the cumulated rewards after every step.
obs_n = obs_n_init
# cumulated reward of each drone, starting at 0
hist_rew_n = {drone_name: [0] for drone_name in env_config['drones_names']}
done = False
i = 0
while not done and i < 50:
    # every drone picks its action with the shared policy
    actions = {}
    for drone_name in env_config['drones_names']:
        actions[drone_name] = trainer.compute_action(observation=obs_n[drone_name], policy_id='policy1')
        print("DEBUG: action ", actions[drone_name])
    obs_n, rew_n, done_n, _ = test_env.step(action_dict=actions)
    for drone_name in env_config['drones_names']:
        hist_rew_n[drone_name].append(rew_n[drone_name]+hist_rew_n[drone_name][-1])
    done = done_n['__all__']
    clear_output(wait=True)
    test_env.env.render(datadict=hist_rew_n,
                        xlabel="time step",
                        title="Cumulated rewards",
                        figsize=(10, 5),
                        save=False,
                        fname=f"imgsav/ppo2train1test{i}.png",
                        dpi=200)
    time.sleep(0.1)  # actually matplotlib is long enough to render the environment
    i += 1

The drone has learnt not to get too close to the objective, even if it sees no other drone!

In [None]:
# Test environment with three drones, all driven by the same policy.
# Note: this overwrites the "drones_names" entry of the shared env_config dict.
env_config["drones_names"] = [DRONE1_NAME, DRONE2_NAME, DRONE3_NAME]
test_env = env_creator(env_config)

# Reset the test environment, show the initial observations and render it.
obs_n_init = test_env.reset()
print(pretty_print(obs_n_init))
test_env.env.render()

In [None]:
# Roll out the restored policy for at most 100 steps (or until the environment
# reports '__all__' done), rendering the cumulated rewards after every step.
# Each rendered frame is saved (save=True) under imgsav/.
# NOTE(review): presumably the imgsav/ directory must already exist -- confirm.
obs_n = obs_n_init
# cumulated reward of each drone, starting at 0
hist_rew_n = {drone_name: [0] for drone_name in env_config['drones_names']}
done = False
i = 0
while not done and i < 100:
    # every drone picks its action with the shared policy
    actions = {}
    for drone_name in env_config['drones_names']:
        actions[drone_name] = trainer.compute_action(observation=obs_n[drone_name], policy_id='policy1')
        print("DEBUG: act ", actions[drone_name], "obs ", obs_n[drone_name])
    obs_n, rew_n, done_n, _ = test_env.step(action_dict=actions)
    for drone_name in env_config['drones_names']:
        hist_rew_n[drone_name].append(rew_n[drone_name]+hist_rew_n[drone_name][-1])
    done = done_n['__all__']
    clear_output(wait=True)
    test_env.env.render(datadict=hist_rew_n,
                        xlabel="time step",
                        title="Cumulated rewards",
                        figsize=(10, 5),
                        save=True,
                        fname=f"imgsav/ppo2train2test{i}.png",
                        dpi=200)
    time.sleep(0.1)  # actually matplotlib is long enough to render the environment
    i += 1

The two drones seem to have learnt a cooperative behavior in which they settle at specific positions on either side of the objective.

In [None]:
# Stop ray once the experiments are finished.
ray.shutdown()