# Usage and Plotting of the Multi-Agent Environment

In [1]:
%matplotlib inline

In [2]:
from flock_env import DiscreteActionFlock

In [3]:
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
from matplotlib import animation
from IPython.display import HTML, Image
from tqdm import tqdm

In [4]:
# Switch from the inline backend (selected by `%matplotlib inline` above) to
# the non-interactive Agg backend, so figures built for the animations are
# rendered off-screen.
matplotlib.use('Agg')
# Make animation objects display as HTML5 video when shown as cell output.
matplotlib.rc('animation', html='html5')

In [5]:
from dqn import Agent
from collections import deque
import torch

## Global Variables

- `N_AGENTS`: Number of boid agents to include in the flock environment
- `RECORD_STEP`: Number of episodes before producing a new plot

In [6]:
# Number of boid agents in the flock; passed to the environment as `n_agents`.
N_AGENTS = 30

In [7]:
# An animated plot is produced every RECORD_STEP training episodes
# (whenever `i_episode % RECORD_STEP == 0` in the training loop).
RECORD_STEP = 20

## Initialize Training Environment

The environment is initialized with:

- `n_agents`: The number of agents included in the flock
- `speed`: Distance agents move each step (the size of environment is 1.0x1.0)
- `n_steps`: Number of simulation steps
- `rotation_size`: Size of one unit of rotation, given as a fraction of π (e.g. 0.1 is a rotation size of 0.1π)
- `n_actions`: Number of rotation actions, e.g. with a rotation size of 0.1, a value of 3 gives the action space `[-0.1π, 0, 0.1π]`
- `proximity_threshold`: Distance under which boids are penalised for being too close
- `obstacles`: List/tuple of triples describing circular environmental obstacles. Each triple should be in the format `(x, y, radius)`

In [9]:
# Build the flock environment used for both training and the evaluation plots.
env = DiscreteActionFlock(
    n_agents=N_AGENTS,
    speed=0.0125,  # distance moved per step (the arena is 1.0 x 1.0)
    n_steps=10_000,  # number of simulation steps
    flock_reward_scaling=1,
    obstacle_penalty_scaling=100,
    rotation_size=0.0225,  # one unit of rotation, as a fraction of pi
    n_actions=5,  # number of discrete rotation actions
    distant_threshold=0.05,
    proximity_threshold=0.008,  # boids closer than this are penalised
    obstacles=[  # circular obstacles given as (x, y, radius)
        (0.5, 0.5, 0.2),
        (0.1, 0.1, 0.1),
    ],
)

## Initialize DQN Agent

The DQN agent is initialized with parameters taken from the environment as well as parameters controlling the DQN itself. Here the `state_size` and `action_size` parameters are the size of the observation and action spaces for a single agent, although the environment requires an array of values for all the agents.

In [10]:
# `state_size` and `action_size` are the per-agent observation and action
# sizes read from the environment; the remaining arguments are DQN
# hyperparameters handed straight to `Agent`.
agent = Agent(
    state_size=env.observation_space.shape[0],
    action_size=env.action_space.n,
    n_agents=env.n_agents,
    buffer_size=100_000,
    batch_size=512,
    gamma=0.99,
    tau=1e-3,
    learning_rate=5e-4,
    update_every=2,
)

## Plotting

This function resets and runs the environment with actions from the network. The result is returned as an animated matplotlib quiver plot.

In [11]:
def test_plot(steps=2000, eps=0):
    """Roll out the current policy once and return the run as an animation.

    The module-level `env` is reset and then stepped with actions from
    `agent.act(state, eps)`. After every step the agent positions (`env.x`),
    headings (`env.theta`) and the step rewards are recorded and turned into
    an animated quiver plot, with the environment's obstacles drawn as
    translucent red circles.

    Args:
        steps: Maximum number of environment steps to simulate. The rollout
            stops earlier if the environment reports `done`.
        eps: Epsilon value forwarded to `agent.act` (presumably 0 means a
            fully greedy policy -- confirm against `Agent.act`).

    Returns:
        A `matplotlib.animation.FuncAnimation` whose arrows are coloured by
        the reward of each agent, scaled to 0-255 over the whole rollout.
    """
    state = env.reset()

    # Record positions, headings and rewards
    pos = []
    rot = []
    rwd = []

    # Run the model taking actions from the RL agent
    for _ in range(steps):
        state, reward, done, _info = env.step(agent.act(state, eps))
        pos.append(env.x[:, :env.n_agents].copy())
        rot.append(env.theta.copy())
        rwd.append(reward)
        # Do not keep stepping an episode the environment has finished
        if done:
            break

    pos = np.stack(pos)
    rot = np.stack(rot)
    rwd = np.stack(rwd)

    # Scale rewards to 0-255 to use as colours for the plot. When every
    # reward is identical the range is zero; use a constant colour instead
    # of dividing by zero and producing NaNs.
    rwd_min = rwd.min()
    rwd_span = rwd.max() - rwd_min
    if rwd_span > 0:
        rwd = 255 * (rwd - rwd_min) / rwd_span
    else:
        rwd = np.zeros_like(rwd, dtype=float)

    # Stack into one array indexed as d[frame][row], where rows 0 and 1 are
    # the position components, row 2 the heading and row 3 the colour value
    d = np.append(pos, rot[:, np.newaxis, :], axis=1)
    d = np.append(d, rwd[:, np.newaxis, :], axis=1)

    fig, ax = plt.subplots(1, 1, figsize=(8, 8))

    fig.tight_layout()
    ax.set_xticks([])
    ax.set_yticks([])
    ax.set_xlim(0, 1)
    ax.set_ylim(0, 1)

    # Obstacles occupy the trailing columns of env.x
    obstacles = env.x[:, -env.n_obstacles:].T
    radii = env.obstacle_radii[0]
    for centre, radius in zip(obstacles, radii):
        draw_circle = plt.Circle((centre[0], centre[1]),
                                 radius,
                                 fill=True,
                                 alpha=0.2,
                                 color='r')
        ax.add_artist(draw_circle)

    q = ax.quiver(d[0][0], d[0][1],
                  np.cos(d[0][2]),
                  np.sin(d[0][2]), d[0][3],
                  cmap=plt.get_cmap('winter'))
    # Pin the colour range to the scaled reward range. Without this the
    # colour limits are autoscaled from the first frame only, so colours in
    # later frames would not be comparable.
    q.set_clim(0, 255)

    def update_quiver(f):
        """Updates the values of the quiver plot"""
        q.set_offsets(f[:2].T)
        q.set_UVC(np.cos(f[2]), np.sin(f[2]), f[3])
        return q,

    # Frame 0 is already drawn by the quiver call above
    anim = animation.FuncAnimation(fig,
                                   update_quiver,
                                   frames=d[1:],
                                   interval=50,
                                   blit=False)

    return anim

## Training Loop

In [12]:
def dqn(n_episodes=200, max_t=10_000, eps_start=1.0, eps_end=0.01, eps_decay=0.95):
    """Train the module-level `agent` on the module-level `env`.

    Args:
        n_episodes: Index of the last training episode. Episodes
            0..n_episodes inclusive are run, i.e. `n_episodes + 1` in total.
        max_t: Maximum number of environment steps per episode.
        eps_start: Initial epsilon passed to `agent.act`.
        eps_end: Lower bound on epsilon.
        eps_decay: Multiplicative factor applied to epsilon after every
            non-terminal environment step.

    Returns:
        A tuple `(scores, plots)`. `scores` is `np.array` of the per-episode
        lists of cumulative mean reward after each step; `plots` is the list
        of animations returned by `test_plot()`, one for every episode whose
        index is a multiple of `RECORD_STEP`.
    """
    scores = []
    plots = []

    eps = eps_start

    # range(n_episodes + 1) includes episode `n_episodes` itself, so a final
    # plot is recorded when n_episodes is a multiple of RECORD_STEP.
    for i_episode in tqdm(range(n_episodes + 1)):
        states = env.reset()
        inner_scores = []
        score = 0

        for _ in range(max_t):
            # Call the agent with the local observations for each agent
            # then actions is a 2d array of actions for each agent
            actions = agent.act(states, eps)

            next_states, rewards, done, _info = env.step(actions)
            agent.step(states, actions, rewards, next_states, done)
            states = next_states
            score += np.mean(rewards)

            # Record the running score before testing `done`, so the reward
            # of the terminal step is not dropped from the history.
            inner_scores.append(score)

            if done:
                break

            # NOTE(review): epsilon decays on every environment step, not
            # once per episode. With the default eps_decay=0.95 it reaches
            # eps_end after roughly 90 steps of the first episode. Move this
            # to the episode loop if per-episode annealing was intended.
            eps = max(eps * eps_decay, eps_end)

        scores.append(inner_scores)

        # Generate a new animated plot after a fixed number of episodes
        if i_episode % RECORD_STEP == 0:
            plots.append(test_plot())

    # NOTE(review): assumes every episode recorded the same number of steps;
    # rows of differing length would not form a regular 2-D array.
    return np.array(scores), plots

## Run Agent

**NOTE: The cell below takes about 1hr to run on my machine**

In [13]:
# Train for episodes 0..300 (301 in total) of at most 5,000 steps each.
# `plots` receives one animation every RECORD_STEP episodes.
scores, plots = dqn(max_t=5_000, n_episodes=300, eps_decay=0.98)

100%|██████████| 301/301 [1:02:25<00:00, 12.44s/it]


In [14]:
# Number of recorded animations (one per RECORD_STEP episodes, including episode 0)
len(plots)

16

In [17]:
# Display the animation recorded at episode index 4 * RECORD_STEP
plots[4]

## Save Animations

In [None]:
# Optional: write every recorded animation to an mp4 file under `videos/`.
# NOTE(review): `{22:03}` formats the literal 22, so every file name starts
# with `022` -- presumably a hand-edited run identifier; confirm before use.
# for i, p in enumerate(plots):
#     p.save(f"videos/{22:03}_{RECORD_STEP*i+1:03}.mp4")