## What would happen if consensus / coverage agents are trained with reinforcement learning algorithms?

### Import packages

In [70]:
import abc
import tensorflow as tf
import numpy as np
import random

from tf_agents.environments import py_environment
from tf_agents.environments import tf_environment
from tf_agents.environments import tf_py_environment
from tf_agents.environments import utils
from tf_agents.specs import array_spec
from tf_agents.environments import wrappers
# from tf_agents.environments import suite_gym
from tf_agents.specs import tensor_spec
from tf_agents.networks import network
from tf_agents.networks import q_network
from tf_agents.networks import actor_distribution_network

from tf_agents.policies import py_policy
from tf_agents.policies import random_py_policy
from tf_agents.policies import scripted_py_policy

from tf_agents.policies import tf_policy
from tf_agents.policies import random_tf_policy
from tf_agents.policies import actor_policy
from tf_agents.policies import q_policy
from tf_agents.policies import greedy_policy

from tf_agents.trajectories import time_step as ts
from tf_agents.agents.dqn import dqn_agent
from tf_agents.agents.reinforce import reinforce_agent
from tf_agents.agents.ppo import ppo_agent
from tf_agents.drivers import dynamic_step_driver
from tf_agents.eval import metric_utils
from tf_agents.metrics import tf_metrics
from tf_agents.replay_buffers import tf_uniform_replay_buffer
from tf_agents.trajectories import trajectory
from tf_agents.utils import common

In [3]:
tf.__version__

'2.0.0'

In [53]:
# Hyperparameters
num_iterations = 20000 # @param {type:"integer"}

initial_collect_steps = 1000  # @param {type:"integer"} 
collect_steps_per_iteration = 1  # @param {type:"integer"}
replay_buffer_max_length = 100000  # @param {type:"integer"}

batch_size = 64  # @param {type:"integer"}
learning_rate = 1e-3  # @param {type:"number"}
log_interval = 200  # @param {type:"integer"}

num_eval_episodes = 10  # @param {type:"integer"}
eval_interval = 1000  # @param {type:"integer"}

### Build the environment for agents

In [57]:

# ref: https://render.githubusercontent.com/view/ipynb?commit=0fe0a2acf8efc1402ece6a49547951ce178cf2a3&enc_url=68747470733a2f2f7261772e67697468756275736572636f6e74656e742e636f6d2f74656e736f72666c6f772f6167656e74732f306665306132616366386566633134303265636536613439353437393531636531373863663261332f646f63732f7475746f7269616c732f325f656e7669726f6e6d656e74735f7475746f7269616c2e6970796e62&nwo=tensorflow%2Fagents&path=docs%2Ftutorials%2F2_environments_tutorial.ipynb&repository_id=157936206&repository_type=Repository#Creating-your-own-Python-Environment
class NetworkControlEnv(py_environment.PyEnvironment):

    def __init__(self, N=5, dt=0.1):
        # Number of agents in the environment. Hope it works...
        self.N = N
        self.dt = dt
        # Action space: Probably set it to be a continuous 2D space for agents velocity now.
        # Ref: https://github.com/tensorflow/agents/issues/105
        self._action_spec = array_spec.BoundedArraySpec(shape=(2,self.N), dtype=np.float32, 
                                                        minimum=-1, maximum=1, name='action')
        # Observation space: The agent should be able to get information from other agents' locations, so the
        # observation space would be possibly all the relative distances from others.
        # The distances would be expressed in terms of locations, so it's going to be 2D for each agent.
        self._observation_spec = array_spec.BoundedArraySpec(shape=(2,self.N), dtype=np.float32, 
                                                             name='observation')
        # CAVEAT: Tensorflow doesn't want numpy arrays for states. It only wants tensors for some reason.
        self.boundaries = [-1.6, 1.6, -1, 1] # xmin, xmax, ymin, ymax
        self._state = [ [ random.uniform(self.boundaries[0],self.boundaries[1]) for i in range(self.N) ],
                        [ random.uniform(self.boundaries[2],self.boundaries[3]) for i in range(self.N) ] ]

        # Optional: specify state boundaries
        
        self._episode_ended = False
        self.max_episode = 1000
        self.episode_count = 0

    def action_spec(self):
        return self._action_spec

    def observation_spec(self):
        return self._observation_spec

    def _reset(self):
        self._state = [ [ random.uniform(self.boundaries[0],self.boundaries[1]) for i in range(self.N) ],
                        [ random.uniform(self.boundaries[2],self.boundaries[3]) for i in range(self.N) ] ]
        # self._state = [ [ 0*i for i in range(self.N) ] for j in range(2)] #np.zeros((2,self.N))
        self._episode_ended = False
        self.episode_count = 0
        return ts.restart(np.array(self._state, dtype=np.float32))

    def _step(self, action):

        if self._episode_ended:
            # The last action ended the episode. Ignore the current action and start a new episode.
            return self.reset()

        # Update agent locations based on actions (2xN matrix). This call might also change self._episode_ended.
        rewards = self.move(action)

        # Observation is the stuff agents would collect to inform their decision, so I let it be the locations for now.
        # Reward is the cost function that we determine based on agent state. 
        self.episode_count += 1
        # print(self.episode_count)
        if self._episode_ended or self.episode_count >= self.max_episode:
            # print("ended!")
            return ts.termination(np.array(self._state, dtype=np.float32), rewards)
        else:
            # First argument is observation
            return ts.transition(np.array(self._state, dtype=np.float32), reward=rewards, discount=1.0)
        
    def move(self, action):
        # Moves the agents for one time step.
        # Optional: Include boundary checking and collision checking for different tasks.
        # Outputs / Affects: 
        # 1. The function returns a list of rewards for each agent;
        # 2. The function modifies the self._episode_ended field if needed.
        
        # Update agent locations
        # self._state += self.dt * action
        for i in range(self.N):
            self._state[0][i] += self.dt * action[0][i]
            self._state[1][i] += self.dt * action[1][i]
        
        # For the simplest consensus task, each agent get the same reward.
        # We can also be more advanced, and assign different rewards.
        # Attempt 1: Set the reward as the negative sum of distances from each other. Max(reward) = 0 at consensus.
        #            Give different reward to different agents based on the total distance.
        # Attempt 2: Turns out Tensorflor_agents currently can't handle non-scalar rewards (or they could, but the
        # documentation is really lacking). I'll try to return a scalar but let the meta-agent handle the reward
        # distribution instead.
        rewards = 0
        # rewards = np.zeros((self.N,))
        state = np.array(self._state)
        for i in range(self.N):
            rewards -= np.sum( np.linalg.norm(state[:,[i]] - state, axis=0) )
        return rewards

In [58]:
# Verify that the environment is built correctly
testEnv = NetworkControlEnv()
utils.validate_py_environment(testEnv, episodes=5)
# If it doesn't print anything, then it worked

In [59]:
# To actually use the environment, wrappers might be needed. E.g. TFPyEnvironment, TimeLimit, etc.
# Why? For example, TF environment is different: https://render.githubusercontent.com/view/ipynb?commit=9b6f28dd282639d856a71772fcb597e62d4b888b&enc_url=68747470733a2f2f7261772e67697468756275736572636f6e74656e742e636f6d2f74656e736f72666c6f772f6167656e74732f396236663238646432383236333964383536613731373732666362353937653632643462383838622f646f63732f7475746f7269616c732f325f656e7669726f6e6d656e74735f7475746f7269616c2e6970796e62&nwo=tensorflow%2Fagents&path=docs%2Ftutorials%2F2_environments_tutorial.ipynb&repository_id=157936206&repository_type=Repository#TensorFlow-Environments
# We might also need to discretize action space via this wrapper: https://github.com/tensorflow/agents/blob/cf1a4a67950aaaf9c24b138b08f516b8a895fa8d/tf_agents/environments/wrappers.py#L333

# Construct two copies of environments, one for training and one for testing.
N = 5
dt = 0.1
dur = 100 # Duration
nac = [[100 for i in range(N)] for j in range(2)] # Number of discretized actions
train_env = tf_py_environment.TFPyEnvironment( wrappers.TimeLimit( wrappers.ActionDiscretizeWrapper(NetworkControlEnv(N,dt), num_actions=nac), duration=100 ) )
eval_env = tf_py_environment.TFPyEnvironment( wrappers.TimeLimit( wrappers.ActionDiscretizeWrapper(NetworkControlEnv(N,dt), num_actions=nac), duration=100 ) )

### Construct agents

#### Individual, homogeneous agents

In [75]:
# Let's just assume that the agent knows everything and coordinates everything for now. 
fc_layer_params = (20,) # layer params inside the agent's network

# q_net = q_network.QNetwork(
#     train_env.observation_spec(),
#     train_env.action_spec(),
#     fc_layer_params=fc_layer_params)

actor_net = actor_distribution_network.ActorDistributionNetwork(
    train_env.observation_spec(),
    train_env.action_spec(),
    fc_layer_params=fc_layer_params)
value_net = actor_distribution_network.ActorDistributionNetwork(
    train_env.observation_spec(),
    train_env.action_spec(),
    fc_layer_params=fc_layer_params)

optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=learning_rate)

train_step_counter = tf.Variable(0)

# agent = dqn_agent.DqnAgent(
#     train_env.time_step_spec(),
#     train_env.action_spec(),
#     q_network=q_net,
#     optimizer=optimizer,
#     td_errors_loss_fn=common.element_wise_squared_loss,
#     train_step_counter=train_step_counter)

agent = ppo_agent.PPOAgent( # reinforce_agent.ReinforceAgent(
    train_env.time_step_spec(),
    train_env.action_spec(),
    actor_net=actor_net,
    value_net=value_net,
#     actor_network=actor_net,
    optimizer=optimizer,
#     normalize_returns=True,
    train_step_counter=train_step_counter)

agent.initialize()




#### Write a policy script to properly handle the agent network output (and input??)

In [None]:
# Define a scripted policy
# Ref: https://www.tensorflow.org/agents/tutorials/3_policies_tutorial#example_2_scripted_python_policy


In [None]:
N = 5 # Number of agents
agents = []
for i in range(N):
    pass

#### Question: Should I train only one agent, or should I train several copies of it separately together?

### Define a cost function

In [None]:
# The cost function is used to train agents to go to a specific formation, while avoiding hitting any existing obstacles.
# For example, if we expect consensus, then the cost function should make sure the reward is maximized when all agents
# convene to one single point.
# At the same time, any collision with obstacle will lose points / rewards.

### Train the agents

#### Replay buffer and replay observer for the agent during training

In [61]:
# Replay buffer "is used to contain the observation and action pairs so they can be used for training".
# Ref: https://towardsdatascience.com/tf-agents-tutorial-a63399218309
replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
    data_spec=agent.collect_data_spec,
    batch_size=train_env.batch_size,
    max_length=replay_buffer_max_length) # replay_buffer_capacity
replay_observer = [replay_buffer.add_batch]

dataset = replay_buffer.as_dataset(
            num_parallel_calls=3,
            sample_batch_size=batch_size,
    num_steps=2).prefetch(3) # num_steps which specifies the number of consecutive items to return, after permutation
    
iterator = iter(dataset)

Instructions for updating:
Use `as_dataset(..., single_deterministic_pass=False) instead.


#### Training metrics and driver (Driver is optional)

In [76]:
train_metrics = [
            tf_metrics.NumberOfEpisodes(),
            tf_metrics.EnvironmentSteps(),
            tf_metrics.AverageReturnMetric(),
            tf_metrics.AverageEpisodeLengthMetric(),
]

def compute_avg_return(environment, policy, num_episodes=10):

    total_return = 0.0
    for _ in range(num_episodes):
        time_step = environment.reset()
        episode_return = 0.0

        while not time_step.is_last():
            action_step = policy.action(time_step)
            time_step = environment.step(action_step.action)
            episode_return += time_step.reward
        total_return += episode_return

    avg_return = total_return / num_episodes
    return avg_return.numpy()[0]


eval_policy = agent.policy
collect_policy = agent.collect_policy

# Ref: https://www.tensorflow.org/agents/api_docs/python/tf_agents/drivers/driver/Driver
# Driver executes a common data collection loop
driver = dynamic_step_driver.DynamicStepDriver(
            train_env,
            collect_policy,
            observers=replay_observer + train_metrics,
    num_steps=1)

In [77]:
def collect_step(environment, policy, buffer):
    time_step = environment.current_time_step()
    action_step = policy.action(time_step)
    next_time_step = environment.step(action_step.action)
    traj = trajectory.from_transition(time_step, action_step, next_time_step)

    # Add trajectory to the replay buffer
    buffer.add_batch(traj)

def collect_data(env, policy, buffer, steps):
    for _ in range(steps):
        collect_step(env, policy, buffer)


#### Training loop

In [78]:
episode_len = []

final_time_step, policy_state = driver.run()

# Two things must happen during the training loop:
#    1. collect data from the environment
#    2. use that data to train the agent's neural network(s)

# Reset the train step
agent.train_step_counter.assign(0)

# Evaluate the agent's policy once before training.
avg_return = compute_avg_return(eval_env, agent.policy, num_eval_episodes)
returns = [avg_return]

for _ in range(num_iterations):

    # Collect a few steps using collect_policy and save to the replay buffer.
    for _ in range(collect_steps_per_iteration):
        collect_step(train_env, agent.collect_policy, replay_buffer)

    # Sample a batch of data from the buffer and update the agent's network.
    experience, unused_info = next(iterator)
    train_loss = agent.train(experience).loss

    step = agent.train_step_counter.numpy()

    if step % log_interval == 0:
        print('step = {0}: loss = {1}'.format(step, train_loss))

    if step % eval_interval == 0:
        avg_return = compute_avg_return(eval_env, agent.policy, num_eval_episodes)
        print('step = {0}: Average Return = {1}'.format(step, avg_return))
        returns.append(avg_return)


# for i in range(num_iterations):
#     final_time_step, _ = driver.run(final_time_step, policy_state)

#     experience, _ = next(iterator)
#     train_loss = agent.train(experience=experience)
#     step = agent.train_step_counter.numpy()

#     if step % log_interval == 0:
#         print('step = {0}: loss = {1}'.format(step, train_loss.loss))
#         episode_len.append(train_metrics[3].result().numpy())
#         print('Average episode length: {}'.format(train_metrics[3].result().numpy()))

#     if step % eval_interval == 0:
#         avg_return = compute_avg_return(eval_env, agent.policy, num_eval_episodes)
#         print('step = {0}: Average Return = {1}'.format(step, avg_return))
# plt.plot(episode_len)
# plt.show()

TypeError: The two structures do not match:
  Trajectory(step_type=., observation=., action=., policy_info={'dist_params': {'logits': .}}, next_step_type=., reward=., discount=.)
vs.
  Trajectory(step_type=., observation=., action=., policy_info=(), next_step_type=., reward=., discount=.)

### Deploy the trained agents in a new envionment and initial values to check their performance

### Can we interperate the learned policies?

In [None]:
# Daft Draft

In [3]:
# Pass-by-reference test
class testObject():
    def __init__(self):
        self.x = 0
    def increment(self):
        self.x += 1
        
class testEnvironment():
    def __init__(self, x):
        self.x = x
    def inc(self):
        self.x.increment()
    def inc2(self, x):
        x.increment()
        
x = testObject()
y = testObject()
z = testEnvironment(x)
z.inc()
print(x.x)
z.inc2(y)
print(y.x)

1
1


In [7]:
np.linalg.norm(np.array([[1,2,3],[4,5,6]]),axis=0)

array([4.12310563, 5.38516481, 6.70820393])