# Reinforcement Learning Project

## Setup

To be able to run this notebook properly please make sure to install the pettingzoo package and dependencies. This can be done by running the following command

`pip install pettingzoo[mpe]`

### Imports

In [13]:
from pettingzoo.mpe import simple_world_comm_v2
import random
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

2022-05-02 11:41:28.861045: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-05-02 11:41:28.861063: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


### Environment Initialisation

In [68]:
MAX_CYCLES = 250
NUM_OF_EPISODES = 1


env = simple_world_comm_v2.env(num_good=2, num_adversaries=4, num_obstacles=1,
                num_food=2, max_cycles=MAX_CYCLES, num_forests=2, continuous_actions=False)
env.reset()
print(f"Agents: {env.agents}")
print()
agent_mapping = {k: v for v, k in enumerate(env.agents)}

Agents: ['leadadversary_0', 'adversary_0', 'adversary_1', 'adversary_2', 'agent_0', 'agent_1']



### Policy Function

In [69]:
def random_policy(actions):
    return random.randint(0, actions-1)

### Inspection Functions

A collection of inspection functions to help minimise clutter in the training loop below.

In [94]:
def matching_agent_only(agent, desired_agent, function, function_args):
    if agent == desired_agent:
        function(agent, *function_args)


def print_agent_rewards(agent, reward):
    print(f"{agent}:{reward}")
    
def print_agent_state(agent, observation):
    print(f"{agent}: {observation}")
    
def print_iter_info(agent,observation,reward,done,info):
    print(f"Current Agent: {agent}")
    print(f"Obs: {observation}")
    print(f"Rew: {reward}")      
    print(f"Done: {done}")
    print(f"Info: {info}")

### Running the environment

The `env.render(mode='human')` call will pop open a new window that shows the environment at each time step.

On my machine at least this window can only be closed while the cell is running but then freezes and is unable to be closed afterwards. In these cases restarting the kernel closed the window and any others which may have been opened due to running the cell multiple times.

Eventually running the cell enough times without restarting the kernal will cause the render call to throw an exception and not run. In this case just restart the kernal and it will begin working again.

In [70]:
class agent_stub:
    
    def policy(self, state):
        return random_policy(env.action_space(agent).n)
    
    def save_action_state(self, action, state):
        pass
    
    def save_memory(self, state, reward, done):
        pass

In [90]:
class good_agent:
    
    def __init__(self, agent_name, epsilon=1, epsilon_min = 0.1, epsilon_decay = 0.95, batch_size=8):
        self.action_space = 5
        self.model = self.build_model()
        self.target_model = self.build_model()
        self.optimizer = keras.optimizers.Adam(learning_rate=0.00025, clipnorm=1.0)
        self.loss_function = keras.losses.Huber()
        self.epsilon = epsilon
        self.epsilon_min = epsilon_min
        self.epsilon_decay = epsilon_decay
        self.agent_name = agent_name
        self.agent_file = agent_name+".h5"
        self.agent_target_file = agent_name+"_target.h5"
        self.history = []
        self.action_taken = None
        self.previous_state = None
        self.batch_size = batch_size

    
    def build_model(self):
        inputs = layers.Input(shape=(28,))
        layer1 = layers.Dense(64, activation="relu")(inputs)
        layer2 = layers.Dense(64, activation="relu")(layer1)
        layer3 = layers.Dense(64, activation="relu")(layer2)
        action = layers.Dense(self.action_space, activation="linear")(layer3)

        return keras.Model(inputs=inputs, outputs=action)
    
    def exploration_decay(self):
        if self.epsilon > self.epsilon_min:
            self.epsilon = min(self.epsilon_min, self.epsilon*self.epsilon_decay)
            
    def save_model(self):
        self.model.save(self.agent_file)
        self.target_model.save(self.agent_target_file)
    
    def load_model(self):
        self.model.load_weights(self.agent_file)
        self.target_model.load_weights(self.agent_target_file)
        
    def policy(self, state):
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_space)
        action_vals = self.model.predict(state)
        return np.argmax(action_vals[0])
    
    def save_action_state(self, action, state):
        self.action_taken = action
        self.previous_state = state
        
    def save_memory(self, state, reward, done):
        if self.previous_state is None and self.action_taken is None:
            self.previous_state = state
            self.action_taken = 0
        self.history.append((self.previous_state, self.action_taken, reward, state, done))
    
    def get_memory(self):
        return self.history
    
    def replay(self):
        if len(self.history) < self.batch_size:
            pass
        sample_batch = random.sample(self.history, self.batch_size)
        for state, action, reward, next_state, done in sample_batch:
            target = reward
            if not done:
                target = reward + self.gamma * np.amax(self.brain.predict(next_state)[0])
            target_f = self.model.predict(state)
            target_f[0][action] = target
            self.model.fit(state, target_f, epochs=1, verbose=0)
        self.exploration_decay()

In [100]:
env.reset()
print(NUM_OF_EPISODES)
print(len(env.agents))
reward_array = np.zeros((NUM_OF_EPISODES,len(env.agents)))

cumulative_reward = np.zeros(len(env.agents))
i = 0
agent_models = {}
for key in agent_mapping:
    if key == "leadadversary_0":
        agent_models[key] = agent_stub()
    if "adversary" in key:
        agent_models[key] = agent_stub()
    if "agent" in key:
        if "agent_0" == key:
            print("Making object")
            agent_models[key] = good_agent(key)
        else:
            agent_models[key] = agent_stub()
cycle = 0
for episode in range(NUM_OF_EPISODES):
    env.reset()
    for agent in env.agent_iter():
        observation, reward, done, info = env.last()
        cumulative_reward[agent_mapping[agent]] += reward
        agent_models[agent].save_memory(observation, reward, done)      

    #     Renders the environment for each step in a seperate window.
#         env.render(mode='human')

    #     Steps the environment forward.
        if done:
            env.step(None)
            reward_array[episode,agent_mapping[agent]] = cumulative_reward[agent_mapping[agent]]
        else:
            action_to_take = agent_models[agent].policy(observation)
            env.step(action_to_take)
            agent_models[agent].save_action_state(action_to_take, observation)


1
6
Making object
251.0


In [97]:
# Closs the render window.
env.close()

### Print Reward Array

In [76]:
print(reward_array)

[[  -22.80785677   -24.78345896   -11.25015002   -37.30979195
  -1036.55329913 -3737.49148099]]


In [89]:
agent_models["agent_0"].history

[(array([ 0.        ,  0.        , -0.0851061 , -0.7582769 ,  0.01539359,
          0.04526638, -0.02942275,  0.40837634,  0.49800873,  0.539667  ,
          0.8872498 ,  1.3266324 , -0.3086799 ,  1.1537169 ,  0.        ,
          0.        ,  0.07954748,  0.10836259, -0.57221186,  0.301394  ,
          0.78135043,  0.41936207, -0.59125865,  0.4112703 , -1.        ,
         -1.        ,  0.        ,  0.        ], dtype=float32),
  0,
  0.0,
  array([ 0.        ,  0.        , -0.0851061 , -0.7582769 ,  0.01539359,
          0.04526638, -0.02942275,  0.40837634,  0.49800873,  0.539667  ,
          0.8872498 ,  1.3266324 , -0.3086799 ,  1.1537169 ,  0.        ,
          0.        ,  0.07954748,  0.10836259, -0.57221186,  0.301394  ,
          0.78135043,  0.41936207, -0.59125865,  0.4112703 , -1.        ,
         -1.        ,  0.        ,  0.        ], dtype=float32),
  False),
 (array([ 0.        ,  0.        , -0.0851061 , -0.7582769 ,  0.01539359,
          0.04526638, -0.02942275,