# Reinforcement Learning Project

## Setup

To run this notebook properly, first install the `pettingzoo` package together with its MPE dependencies by running the following command:

`pip install pettingzoo[mpe]`

### Imports

In [None]:
from pettingzoo.mpe import simple_world_comm_v2
import random
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from pprint import pprint

### Environment Initialisation

In [None]:
# Cycle limit handed to the environment, and the number of episodes to run.
MAX_CYCLES = 250
NUM_OF_EPISODES = 10


# Build the simple_world_comm environment with discrete actions and seed=42.
env = simple_world_comm_v2.env(
    num_good=2,
    num_adversaries=4,
    num_obstacles=1,
    num_food=2,
    max_cycles=MAX_CYCLES,
    num_forests=2,
    continuous_actions=False,
    seed=42,
)
env.reset()
print(f"Agents: {env.agents}")
print()

# Map every agent name to its position in env.agents.
agent_mapping = dict((name, index) for index, name in enumerate(env.agents))

### Policy Function

In [None]:
def random_policy(actions):
    """Return a uniformly random action index in ``[0, actions - 1]``.

    ``actions`` is the number of discrete actions to choose from.
    """
    highest_action = actions - 1
    return random.randint(0, highest_action)

### Inspection Functions

A collection of inspection functions to help minimise clutter in the training loop below.

In [None]:
def matching_agent_only(agent, desired_agent, function, function_args):
    """Call ``function(*function_args)`` only when ``agent == desired_agent``.

    Pass the current agent and the agent of interest as the first two
    arguments, then the function reference and a tuple holding that
    function's arguments. The function's return value is discarded.
    """
    if agent != desired_agent:
        return
    function(*function_args)


def print_agent_rewards(agent, reward):
    """Print ``agent`` and ``reward`` on one line, separated by a colon."""
    line = f"{agent}:{reward}"
    print(line)
    
def print_agent_state(agent, observation):
    """Print ``agent`` followed by its ``observation`` on one line."""
    line = f"{agent}: {observation}"
    print(line)
    
def print_iter_info(agent,observation,reward,done,info):
    """Print the agent name and its observation, reward, done flag and info.

    Each value is written on its own labelled line.
    """
    report_lines = [
        f"Current Agent: {agent}",
        f"Obs: {observation}",
        f"Rew: {reward}",
        f"Done: {done}",
        f"Info: {info}",
    ]
    print("\n".join(report_lines))

def get_current_step(env):
    """Return the ``steps`` attribute found two ``.env`` levels below ``env``."""
    first_inner = env.env
    second_inner = first_inner.env
    return second_inner.steps

def np_array_no_e(array):
    """Print ``array`` with NumPy's scientific notation suppressed.

    The ``suppress`` print option is switched on only while printing. The
    print options that were active before the call are restored afterwards,
    even if printing raises, instead of being forced back to
    ``suppress=False``.
    """
    with np.printoptions(suppress=True):
        print(array)

### Running the environment

The `env.render(mode='human')` call will pop open a new window that shows the environment at each time step.

On my machine, at least, this window can only be closed while the cell is running; afterwards it freezes and cannot be closed. In that case, restarting the kernel closes the window, along with any others that were opened by running the cell multiple times.

Eventually, running the cell enough times without restarting the kernel will cause the render call to throw an exception and not run. In this case, just restart the kernel and it will begin working again.

In [None]:
class agent_stub:
    """Placeholder agent that picks random actions and never learns.

    It offers the same methods as ``good_agent`` that the training loop
    calls, so all agents can be driven through one code path.
    """

    def __init__(self):
        # Presumably a sentinel-sized interval so that a step-modulo learning
        # gate would practically never trigger for this stub.
        self.learning_steps = 123456789123456789

    def policy(self, state):
        """Return a random action; ``state`` is ignored.

        NOTE: relies on the module-level names ``env`` and ``agent`` (the
        agent currently being iterated) to look up the action count.
        """
        action_count = env.action_space(agent).n
        return random_policy(action_count)

    def save_action_state(self, action, state):
        """Do nothing; the stub keeps no record of its actions."""
        return None

    def save_memory(self, state, reward, done):
        """Do nothing; the stub has no replay memory."""
        return None

    def replay(self):
        """Do nothing; the stub does not train."""
        return None

    def predict(self, state):
        """Do nothing and return ``None``; the stub has no model."""
        return None

In [None]:
class good_agent_stub:
    """Placeholder good agent that always takes action ``0`` and never learns.

    It offers the same methods as ``good_agent`` that the training loop
    calls, so all agents can be driven through one code path.
    """

    def __init__(self):
        # Presumably a sentinel-sized interval so that a step-modulo learning
        # gate would practically never trigger for this stub.
        self.learning_steps = 123456789123456789

    def policy(self, state):
        """Return action ``0`` regardless of ``state``."""
        fixed_action = 0
        return fixed_action

    def save_action_state(self, action, state):
        """Do nothing; the stub keeps no record of its actions."""
        return None

    def save_memory(self, state, reward, done):
        """Do nothing; the stub has no replay memory."""
        return None

    def replay(self):
        """Do nothing; the stub does not train."""
        return None

    def predict(self, state):
        """Do nothing and return ``None``; the stub has no model."""
        return None

In [None]:
class good_agent:
    """Epsilon-greedy deep Q-learning agent backed by a small Keras MLP.

    The agent records ``(previous_state, action, reward, state, done)``
    transitions and trains its online network on random mini-batches of them.

    NOTE(review): ``target_model`` is built, saved and loaded, but it is never
    used to compute targets nor synchronised with ``model``.
    """

    def __init__(self, agent_name, epsilon=1, epsilon_min=0.1,
                 epsilon_decay=0.95, batch_size=16, learning_decay_rate=0.95,
                 learning_steps=25):
        """Build the networks and initialise exploration and replay settings.

        Args:
            agent_name: Name of the agent; also used as the weight-file stem.
            epsilon: Initial probability of taking a random action.
            epsilon_min: Lower bound that epsilon decays towards.
            epsilon_decay: Factor applied to epsilon after each replay.
            batch_size: Number of transitions sampled per replay.
            learning_decay_rate: Discount factor applied to future rewards.
            learning_steps: Interval exposed through ``get_learning_steps``.
        """
        # Sizes are fixed here; build_model() reads them, so set them first.
        self.state_size = 28
        self.action_space = 5
        self.model = self.build_model()
        self.target_model = self.build_model()
        self.optimizer = keras.optimizers.Adam(learning_rate=0.00025, clipnorm=1.0)
        self.loss_function = keras.losses.Huber()
        self.epsilon = epsilon
        self.epsilon_min = epsilon_min
        self.epsilon_decay = epsilon_decay
        self.agent_name = agent_name
        self.agent_file = agent_name+".h5"
        self.agent_target_file = agent_name+"_target.h5"
        self.history = []
        self.action_taken = None
        self.previous_state = None
        self.batch_size = batch_size
        self.gamma = learning_decay_rate
        self.learning_steps = learning_steps

    def build_model(self):
        """Return a compiled MLP mapping a state to one Q-value per action."""
        model = keras.Sequential()
        model.add(layers.Dense(32, input_dim=self.state_size, activation='relu'))
        model.add(layers.Dense(64, activation="relu"))
        model.add(layers.Dense(64, activation="relu"))
        # Linear output: Q-values may be negative, which a ReLU output layer
        # could never represent.
        model.add(layers.Dense(self.action_space, activation="linear"))
        model.compile(loss=keras.losses.Huber(),
                      optimizer=keras.optimizers.Adam(learning_rate=0.00025, clipnorm=1.0))

        return model

    def exploration_decay(self):
        """Multiply epsilon by ``epsilon_decay``, never going below ``epsilon_min``."""
        if self.epsilon > self.epsilon_min:
            # max() clamps at the floor; min() would jump straight to
            # epsilon_min on the very first decay.
            self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)

    def save_model(self):
        """Save the online and target models to their ``.h5`` files."""
        self.model.save(self.agent_file)
        self.target_model.save(self.agent_target_file)

    def load_model(self):
        """Load weights for the online and target models from their ``.h5`` files."""
        self.model.load_weights(self.agent_file)
        self.target_model.load_weights(self.agent_target_file)

    def policy(self, state):
        """Return an action index for ``state`` using an epsilon-greedy rule."""
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_space)
        action_vals = self.predict(state)
        # Plain int so both branches return the same type.
        return int(np.argmax(action_vals[0]))

    def save_action_state(self, action, state):
        """Remember the action just taken and the state it was taken in."""
        self.action_taken = action
        self.previous_state = state

    def save_memory(self, state, reward, done):
        """Append the transition that led to ``state`` to the replay history.

        When no previous step is recorded (first observation of an episode),
        the current state and action ``0`` are used as placeholders.
        """
        if self.previous_state is None and self.action_taken is None:
            self.previous_state = state
            self.action_taken = 0
        self.history.append((self.previous_state, self.action_taken, reward, state, done))
        if done:
            # Forget the finished episode's last step so the next episode's
            # first observation is not paired with a state from this one.
            self.previous_state = None
            self.action_taken = None

    def get_memory(self):
        """Return the list of stored transitions."""
        return self.history

    def replay(self):
        """Train on a random mini-batch of transitions, then decay epsilon.

        Does nothing until at least ``batch_size`` transitions are stored.
        """
        if len(self.history) < self.batch_size:
            return
        sample_batch = random.sample(self.history, self.batch_size)
        for state, action, reward, next_state, done in sample_batch:
            # Q-learning target: the reward alone for terminal transitions,
            # otherwise reward plus the discounted best next-state value.
            target = reward
            if not done:
                target = reward + self.gamma * np.amax(self.predict(next_state)[0])
            target_f = self.predict(state)
            target_f[0][action] = target
            state = np.reshape(state, (1, self.state_size))
            self.model.fit(state, target_f, epochs=1, verbose=0)
        self.exploration_decay()

    def get_learning_steps(self):
        """Return the configured ``learning_steps`` value."""
        return self.learning_steps

    def predict(self, state):
        """Return the online model's output for a single ``state``.

        The state is reshaped to one row of ``state_size`` values first.
        """
        state = np.reshape(state, (1, self.state_size))
        # verbose=0 keeps Keras from printing a progress bar on every call.
        return self.model.predict(state, verbose=0)

In [None]:
# Reset first so env.agents is populated before sizing the reward arrays.
env.reset()
# One row per episode, one column per agent (column order from agent_mapping).
reward_array = np.zeros((NUM_OF_EPISODES, len(env.agents)))

# NOTE(review): this running total is never reset between episodes, so each
# reward_array row holds the reward accumulated over all episodes so far.
# Confirm that is intended.
cumulative_reward = np.zeros(len(env.agents))

# Only agent_0 learns; adversaries act randomly and the other good agents
# always take action 0.
agent_models = {}
for key in agent_mapping:
    if "adversary" in key:
        # Also matches "leadadversary_0".
        agent_models[key] = agent_stub()
    elif "agent" in key:
        if key == "agent_0":
            agent_models[key] = good_agent(key)
        else:
            agent_models[key] = good_agent_stub()


for episode in range(NUM_OF_EPISODES):
    print(f"Episode {episode+1} out of {NUM_OF_EPISODES}")
    env.reset()
    # The loop variable must stay named `agent`: agent_stub.policy reads that
    # module-level name to look up the current agent's action space.
    for agent in env.agent_iter():
        observation, reward, done, info = env.last()
        agent_index = agent_mapping[agent]
        cumulative_reward[agent_index] += reward
        agent_models[agent].save_memory(observation, reward, done)

        # Replay runs on every step; the per-agent learning_steps interval is
        # currently not applied.
        agent_models[agent].replay()

        # Uncomment to render the environment for each step in a separate window.
        # env.render(mode='human')

        # Step the environment forward.
        if done:
            # A finished agent must be stepped with None.
            env.step(None)
            reward_array[episode, agent_index] = cumulative_reward[agent_index]
        else:
            action_to_take = agent_models[agent].policy(observation)
            env.step(action_to_take)
            agent_models[agent].save_action_state(action_to_take, observation)


In [None]:
# Close the render window.
env.close()

### Print Reward Array

In [None]:
# Print the episode-by-agent reward table without scientific notation.
np_array_no_e(reward_array)