# Reinforcement Learning Project

## Setup

To run this notebook properly, please make sure the `pettingzoo` package and its MPE dependencies are installed. This can be done by running the following command:

`pip install pettingzoo[mpe]`

### Imports

In [None]:
from pettingzoo.mpe import simple_world_comm_v2
import random
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from pprint import pprint
from matplotlib import pyplot as plt
import time
import datetime

### Environment Initialisation

In [None]:
# --- Experiment constants ---------------------------------------------------
MAX_CYCLES = 50
NUM_OF_EPISODES = 500  # Keep as a multiple of 10
MEANING_OF_LIFE = 42   # Seed handed to env.seed()
ENVIRONMENT_NAME = "simple_world_comm"

# --- Environment construction -----------------------------------------------
env = simple_world_comm_v2.env(
    num_good=2,
    num_adversaries=4,
    num_obstacles=1,
    num_food=2,
    max_cycles=MAX_CYCLES,
    num_forests=2,
    continuous_actions=False,
)

env.seed(seed=MEANING_OF_LIFE)
env.reset()
print(f"Agents: {env.agents}")
print()

# Two-way lookup between agent names and the column index used in result arrays.
agent_mapping = {name: index for index, name in enumerate(env.agents)}
inv_agent_map = {index: name for name, index in agent_mapping.items()}
NUM_OF_AGENTS = len(env.agents)


### Policy Function

In [None]:
def random_policy(actions):
    """Return a uniformly random action index in ``[0, actions - 1]``."""
    highest_action = actions - 1
    return random.randint(0, highest_action)

### Inspection Functions

A collection of inspection functions to help minimise clutter in the training loop below.

In [None]:
def matching_agent_only(agent, desired_agent, function, function_args):
    """Call ``function(*function_args)`` only when ``agent`` is ``desired_agent``.

    Pass the current agent and the agent of interest as the first two
    arguments, then the function reference and a tuple holding that
    function's arguments. Nothing is returned; the call's result is discarded.
    """
    if agent != desired_agent:
        return
    function(*function_args)


def print_agent_rewards(agent, reward):
    """Print one ``agent:reward`` line (no space after the colon)."""
    line = f"{agent}:{reward}"
    print(line)
    
def print_agent_state(agent, observation):
    """Print one ``agent: observation`` line."""
    line = f"{agent}: {observation}"
    print(line)
    
def print_iter_info(agent, observation, reward, done, info):
    """Print the acting agent and the four values returned by ``env.last()``.

    One ``Label: value`` line is written per field, in a fixed order.
    """
    fields = (
        ("Current Agent", agent),
        ("Obs", observation),
        ("Rew", reward),
        ("Done", done),
        ("Info", info),
    )
    for label, value in fields:
        print(f"{label}: {value}")

def get_current_step(env):
    """Return the ``steps`` counter stored two ``.env`` levels below ``env``.

    NOTE(review): presumably the two hops unwrap PettingZoo's wrappers to reach
    the raw environment -- confirm if the wrapper stack changes.
    """
    first_inner = env.env
    second_inner = first_inner.env
    return second_inner.steps

def np_array_no_e(array):
    """Print ``array`` with NumPy's scientific ("e") notation suppressed.

    The ``suppress`` print option is enabled only for the duration of the
    print. The caller's previous setting is restored afterwards (even if
    printing raises) instead of being unconditionally reset to ``False``.
    """
    with np.printoptions(suppress=True):
        print(array)

### Running the environment

The `env.render(mode='human')` call will pop open a new window that shows the environment at each time step.

On my machine, at least, this window can only be closed while the cell is running; once the cell finishes, the window freezes and can no longer be closed. In that case, restarting the kernel closes the window, along with any others that were opened by running the cell multiple times.

Eventually, running the cell enough times without restarting the kernel will cause the render call to throw an exception and not run. In this case, just restart the kernel and it will begin working again.

In [None]:
class agent_stub:
    """Non-learning stand-in for ``agent`` that acts uniformly at random.

    It exposes the methods the training loop calls on a learner, including
    ``save_history`` and ``save_model``, so it can be dropped into
    ``agent_models``. All learning-related methods are no-ops.

    Args:
        num_actions: Size of the discrete action space to sample from. When
            omitted, ``policy`` falls back to looking the size up through the
            module-level globals ``env`` and ``agent``.
    """

    def __init__(self, num_actions=None):
        # Huge interval so a `step % target_update_steps == 0` check is only
        # ever true at step 0.
        self.target_update_steps = 123456789123456789
        self.num_actions = num_actions

    def policy(self, state):
        """Return a random action index; ``state`` is ignored."""
        if self.num_actions is not None:
            return random_policy(self.num_actions)
        # Legacy path: `agent` must be bound at module level to the name of
        # the agent that is currently acting.
        return random_policy(env.action_space(agent).n)

    def save_action_state(self, action, state):
        """No-op."""
        pass

    def save_memory(self, state, reward, done):
        """No-op."""
        pass

    def save_history(self, state, reward, done):
        """No-op counterpart of ``agent.save_history``; defers to ``save_memory``."""
        return self.save_memory(state, reward, done)

    def replay(self):
        """No-op."""
        pass

    def predict(self, state):
        """No-op; returns ``None``."""
        pass

    def update_target_network_weights(self):
        """No-op."""
        pass

    def save_model(self, agent_filename, target_filename):
        """No-op: the stub has no model, so nothing is written."""
        pass

In [None]:
class good_agent_stub:
    """Non-learning stand-in for ``agent`` that always picks action ``0``.

    It exposes the methods the training loop calls on a learner, including
    ``save_history`` and ``save_model``, so it can be dropped into
    ``agent_models``. All learning-related methods are no-ops.
    """

    def __init__(self):
        # Huge interval so a `step % target_update_steps == 0` check is only
        # ever true at step 0.
        self.target_update_steps = 123456789123456789

    def policy(self, state):
        """Return action ``0`` regardless of ``state``."""
        return 0

    def save_action_state(self, action, state):
        """No-op."""
        pass

    def save_memory(self, state, reward, done):
        """No-op."""
        pass

    def save_history(self, state, reward, done):
        """No-op counterpart of ``agent.save_history``; defers to ``save_memory``."""
        return self.save_memory(state, reward, done)

    def replay(self):
        """No-op."""
        pass

    def predict(self, state):
        """No-op; returns ``None``."""
        pass

    def update_target_network_weights(self):
        """No-op."""
        pass

    def save_model(self, agent_filename, target_filename):
        """No-op: the stub has no model, so nothing is written."""
        pass

In [None]:
class agent:
    """DQN learner with a separate target network and optional experience replay.

    Args:
        agent_name: Label stored on the instance.
        state_size: Length of the flat observation vector fed to the network.
        epsilon: Initial probability of taking a random action.
        epsilon_min: Floor that ``epsilon`` decays towards.
        epsilon_decay: Multiplicative decay applied to ``epsilon`` after each
            training call.
        batch_size: Number of transitions sampled per replay update.
        learning_decay_rate: Discount factor (gamma) for bootstrapped targets.
        target_update_steps: Stored for the caller, which decides when to call
            ``update_target_network_weights``.
        action_space: Number of discrete actions (size of the network output).
        experience_replay: When true, train on a random mini-batch of the
            history; otherwise train on the most recent transition only.
    """

    def __init__(self, agent_name, state_size, epsilon=1, epsilon_min=0.1,
                 epsilon_decay=0.975, batch_size=16, learning_decay_rate=0.95,
                 target_update_steps=10, action_space=5, experience_replay=True):

        self.state_size = state_size
        # Honour the caller's action count (it was previously hard-coded to 5,
        # which silently ignored larger action spaces).
        self.action_space = action_space
        self.model = self.build_model()
        self.target_model = self.build_model()
        self.epsilon = epsilon
        self.epsilon_min = epsilon_min
        self.epsilon_decay = epsilon_decay
        self.agent_name = agent_name
        self.history = []
        self.action_taken = None
        self.previous_state = None
        self.batch_size = batch_size
        self.gamma = learning_decay_rate
        self.target_update_steps = target_update_steps
        # Number of gradient updates performed by replay().
        self.learning_steps = 0
        self.update_target_network_weights()
        self.replay_enabled = experience_replay

    def build_model(self):
        """Build and compile the Q-network: state vector in, one Q-value per action out."""
        model = keras.Sequential()
        model.add(layers.Dense(self.state_size, input_dim=self.state_size, activation='relu'))
        model.add(layers.Dense(32, activation="relu"))
        model.add(layers.Dense(32, activation="relu"))
        model.add(layers.Dense(32, activation="relu"))
        # Linear output: Q-values are unbounded and may be negative, which a
        # ReLU output layer cannot represent.
        model.add(layers.Dense(self.action_space, activation="linear"))
        model.compile(loss=keras.losses.Huber(),
                      optimizer=keras.optimizers.Adam(learning_rate=0.0005))

        return model

    def exploration_decay(self):
        """Multiply ``epsilon`` by ``epsilon_decay``, never dropping below ``epsilon_min``."""
        if self.epsilon > self.epsilon_min:
            # max() clamps at the floor; min() would jump straight to it.
            self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)

    def save_model(self, agent_filename, target_filename):
        """Save the online and target networks to ``<name>.h5`` files."""
        self.model.save(agent_filename+".h5")
        self.target_model.save(target_filename+".h5")

    def load_model(self, agent_filename, target_filename):
        """Load weights for both networks from the given paths.

        The paths are used exactly as given. ``save_model`` appends ``".h5"``,
        so pass the full file name (including that suffix) when loading files
        it wrote.
        """
        self.model.load_weights(agent_filename)
        self.target_model.load_weights(target_filename)

    def policy(self, state):
        """Epsilon-greedy action: random with probability ``epsilon``, else argmax Q."""
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_space)
        action_vals = self.predict(state)
        return np.argmax(action_vals[0])

    def save_action_state(self, action, state):
        """Remember the action just taken and the state it was taken in."""
        self.action_taken = action
        self.previous_state = state

    def save_history(self, state, reward, done):
        """Append ``(previous_state, action, reward, state, done)`` to the history.

        If no action/state has been recorded yet, the current state and
        action ``0`` are used as the "previous" values.
        """
        if self.previous_state is None and self.action_taken is None:
            self.previous_state = state
            self.action_taken = 0
        self.history.append((self.previous_state, self.action_taken, reward, state, done))

    def get_history(self):
        """Return the list of recorded transitions (the live list, not a copy)."""
        return self.history

    def replay(self):
        """Run one training update on the online network, then decay epsilon.

        With experience replay enabled, a random mini-batch of ``batch_size``
        transitions is used and the call is a no-op until that many have been
        recorded. Otherwise only the latest transition is used, and the call
        is a no-op while the history is empty.
        """
        if self.replay_enabled:
            if len(self.history) < self.batch_size:
                return
            batch = random.sample(self.history, self.batch_size)
        else:
            if not self.history:
                return
            batch = [self.history[-1]]

        batch_len = len(batch)
        states, actions, rewards, next_states, dones = zip(*batch)
        states = np.reshape(states, (batch_len, self.state_size))
        next_states = np.reshape(next_states, (batch_len, self.state_size))
        actions = np.asarray(actions, dtype=int)
        rewards = np.asarray(rewards, dtype=np.float32)
        dones = np.asarray(dones, dtype=bool)

        # One batched forward pass per network instead of two per transition.
        next_q = self.target_model.predict(next_states, verbose=0)
        bootstrapped = rewards + self.gamma * np.amax(next_q, axis=1)
        # Terminal transitions get the raw reward with no bootstrapping.
        td_targets = np.where(dones, rewards, bootstrapped)

        # Start from current predictions so that only the taken action's
        # Q-value contributes to the loss.
        q_values = self.model.predict(states, verbose=0)
        q_values[np.arange(batch_len), actions] = td_targets

        self.model.fit(states, q_values, epochs=1, verbose=0, batch_size=batch_len)
        self.learning_steps += 1

        self.exploration_decay()

    def get_learning_steps(self):
        """Return how many training updates ``replay`` has performed."""
        return self.learning_steps

    def predict(self, state):
        """Return the online network's Q-values for one state, shape ``(1, action_space)``."""
        state = np.reshape(state, (1, self.state_size))
        return self.model.predict(state, verbose=0)

    def target_predict(self, state):
        """Return the target network's Q-values for one state, shape ``(1, action_space)``."""
        state = np.reshape(state, (1, self.state_size))
        return self.target_model.predict(state, verbose=0)

    def update_target_network_weights(self):
        """Copy the online network's weights into the target network."""
        self.target_model.set_weights(self.model.get_weights())

In [None]:
env.seed(seed=MEANING_OF_LIFE)
env.reset()

# Per-episode total reward, one column per agent (indexed via agent_mapping).
reward_array = np.zeros((NUM_OF_EPISODES, len(env.agents)))
cumulative_reward = np.zeros(len(env.agents))
# Per-step reward: [agent index][episode][step].
cycle_scores = np.zeros((len(env.agents), NUM_OF_EPISODES, MAX_CYCLES+1))
agent_models = {}
episode_times = []
step = -1

# Checkpoint every tenth of the run; max() avoids a modulo-by-zero when
# NUM_OF_EPISODES is below 10.
ten_percent_episodes = max(1, NUM_OF_EPISODES // 10)
save_models = True

# One training update per cycle for each distinct learner: the lead adversary,
# and the last-listed agent of each group that shares a learner.
TRAINED_AGENTS = ("leadadversary_0", "adversary_2", "agent_1")

# Creates the learners. A learner is shared between all plain adversaries and
# between all good agents: the first agent of each type creates it and the
# remaining agents of that type get a reference to the same object.
# NOTE(review): because the object is shared, previous_state/action_taken set
# by save_action_state are overwritten by whichever sharing agent acted last,
# so save_history may pair one agent's previous state with another agent's
# next state -- confirm this is intended.
for key in agent_mapping:
    if key == "leadadversary_0":
        agent_models[key] = agent(key, env.observation_space(key).shape[0],
                                  action_space=env.action_space(key).n)
    elif "adversary" in key:
        if key == "adversary_0":
            agent_models[key] = agent(key, env.observation_space(key).shape[0])
        else:
            agent_models[key] = agent_models["adversary_0"]
    elif "agent" in key:
        if key == "agent_0":
            agent_models[key] = agent(key, env.observation_space(key).shape[0])
        else:
            agent_models[key] = agent_models["agent_0"]


for episode in range(NUM_OF_EPISODES):
    print(f"Episode {episode+1} out of {NUM_OF_EPISODES}")
    env.seed(seed=MEANING_OF_LIFE)
    env.reset()
    cumulative_reward.fill(0)
    if episode > 0:
        print(f"Time taken for previous episode: {time.time()-episode_start_time}")
    episode_start_time = time.time()

    # The loop variable is deliberately not called `agent`: that would rebind
    # the `agent` class and break any re-run of this cell.
    for agent_name in env.agent_iter():
        step = get_current_step(env)
        agent_index = agent_mapping[agent_name]
        learner = agent_models[agent_name]

        observation, reward, done, info = env.last()
        cumulative_reward[agent_index] += reward
        cycle_scores[agent_index][episode][step] = reward

        learner.save_history(observation, reward, done)

        if agent_name in TRAINED_AGENTS:
            learner.replay()
            if (step % learner.target_update_steps) == 0:
                learner.update_target_network_weights()

        # Renders the environment for each step in a separate window.
        # env.render(mode='human')

        # Steps the environment forward.
        if done:
            env.step(None)
            reward_array[episode, agent_index] = cumulative_reward[agent_index]
        else:
            action_to_take = learner.policy(observation)
            env.step(action_to_take)
            learner.save_action_state(action_to_take, observation)

    # Checkpoint once per checkpoint episode (after its last update) instead
    # of rewriting the same files on every step.
    if save_models and (episode % ten_percent_episodes == 0):
        for trained_name in TRAINED_AGENTS:
            checkpoint_prefix = (ENVIRONMENT_NAME+"_"+trained_name+"_"+str(episode)
                                 + "_of_"+str(NUM_OF_EPISODES))
            agent_models[trained_name].save_model(checkpoint_prefix+"_model",
                                                  checkpoint_prefix+"_target_model")

    episode_time = time.time()-episode_start_time
    episode_times.append(episode_time)
    print(f"Episode Time: {episode_time}")


In [None]:
# Closes the render window.
# env.close()

### Print Reward Array

In [None]:
np_array_no_e(reward_array)

# One CSV column per agent, named and ordered like reward_array's columns, so
# the header always matches the rows whatever the number of agents.
agent_columns = [inv_agent_map[index] for index in range(reward_array.shape[1])]

with open(f"{ENVIRONMENT_NAME}_DQN_{NUM_OF_EPISODES}ep.csv", "w") as f:
    f.write(",".join(["episode", *agent_columns]) + "\n")
    # Episodes are numbered from 1 in the file.
    for episode_number, episode_rewards in enumerate(reward_array, start=1):
        row = [str(episode_number), *(str(value) for value in episode_rewards)]
        f.write(",".join(row) + "\n")

In [None]:
# np_array_no_e(cycle_scores[0])
# print()
# print()
# print()
# np_array_no_e(cycle_scores[1])
# np_array_no_e(cycle_scores[1].sum(axis=1))

In [None]:
# Mean wall-clock episode duration (seconds in episode_times), printed as a timedelta.
print(str(datetime.timedelta(seconds=np.mean(episode_times))))

In [None]:
# Draw, save and show one reward-per-episode curve for every agent.
episode_axis = range(1, NUM_OF_EPISODES + 1)
for agent_index in range(NUM_OF_AGENTS):
    agent_label = inv_agent_map[agent_index]
    plt.figure(figsize=(15, 15))
    plt.plot(episode_axis, reward_array[:, agent_index])
    plt.title(agent_label)
    plt.xlabel("Episode")
    plt.ylabel("Score")
    plt.savefig(f"{ENVIRONMENT_NAME}_{agent_label}_learning_{NUM_OF_EPISODES}_episodes.png")
    plt.show()

In [None]:
# Show which learner object each agent name maps to.
print(agent_models)

In [None]:
# Dump every attribute of the object two `.env` levels below `env`
# (the same object get_current_step reads `steps` from).
pprint(vars(env.env.env))

In [None]:
# Persist the final online/target networks of the shared adversary learner.
adversary_prefix = f"{ENVIRONMENT_NAME}_adversary_0_{NUM_OF_EPISODES}"
agent_models["adversary_0"].save_model(
    adversary_prefix + "_ep_final_model",
    adversary_prefix + "_final_target_model",
)

In [None]:
# Persist the final online/target networks of the shared good-agent learner.
good_agent_prefix = f"{ENVIRONMENT_NAME}_agent_0_{NUM_OF_EPISODES}"
agent_models["agent_0"].save_model(
    good_agent_prefix + "_final_model",
    good_agent_prefix + "_final_target_model",
)