## Code Implementation of the PPO Clip Algorithm on Pettingzoo's Simple World Comm Multi-Agent Environment

In [None]:
!pip install tensorflow
!pip install tensorflow_probability
!pip install pettingzoo[mpe]

In [None]:
import os
import tensorflow as tf
import tensorflow.keras as keras
import tensorflow_probability as tfp
import numpy as np
import matplotlib.pyplot as plt
import random

from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
from pettingzoo.mpe import simple_world_comm_v2, simple_push_v2

### Neural Network Classes
These classes initialise and call the separate actor and critic networks of the algorithm. Both use the same network structure: an input layer that takes the current agent's state, followed by two dense hidden layers of 256 neurons each (by default), both using the Rectified Linear Unit (ReLU) activation function. For the actor, the final output layer is a dense layer with as many neurons as there are actions in the current agent's action space. It uses a softmax activation function, so the output values sum to one, giving a probability distribution over actions in which larger values mark the actions thought to obtain the largest reward. The critic has a single output, as it outputs the "value" of the particular state being evaluated.

In [None]:
class Actor_Net(keras.Model):
    """Policy network: two ReLU hidden layers and a softmax output layer.

    Args:
        n_actions: number of units in the softmax output layer.
        layer_1_neur: number of units in the first hidden layer.
        layer_2_neur: number of units in the second hidden layer.
    """

    def __init__(self, n_actions, layer_1_neur=256, layer_2_neur=256):
        super().__init__()

        # Layers are created in forward-pass order.
        self.layer_1 = Dense(layer_1_neur, activation='relu')
        self.layer_2 = Dense(layer_2_neur, activation='relu')
        self.layer_3 = Dense(n_actions, activation='softmax')

    def call(self, state):
        """Return the softmax output of the network for ``state``."""
        hidden = self.layer_2(self.layer_1(state))
        return self.layer_3(hidden)


class Critic_Net(keras.Model):
    """Value network: two ReLU hidden layers and one linear output unit.

    Args:
        layer_1_neur: number of units in the first hidden layer.
        layer_2_neur: number of units in the second hidden layer.
    """

    def __init__(self, layer_1_neur=256, layer_2_neur=256):
        super().__init__()
        self.layer_1 = Dense(layer_1_neur, activation='relu')
        self.layer_2 = Dense(layer_2_neur, activation='relu')
        # No activation: the output is an unbounded scalar per input row.
        self.layer_3 = Dense(1, activation=None)

    def call(self, state):
        """Return the single-unit output of the network for ``state``."""
        hidden = self.layer_2(self.layer_1(state))
        return self.layer_3(hidden)

### Creating the Networks
To deal with the multi-agent environment, in which each type of agent needs its own set of actor and critic networks, the following functions were created to initialise the three actor/critic network pairs required for learning: one for the lead adversary, one for the adversaries and one for the agents.

In [None]:
def compile_nets(env, agent, alpha):
    """Build and compile one actor/critic pair for ``agent``.

    Args:
        env: environment exposing ``action_space(agent).n``.
        agent: agent name; stored as the first element of the result.
        alpha: learning rate given to both Adam optimizers.

    Returns:
        The list ``[agent, actor_net, critic_net]``.
    """
    # The actor's output width is the size of the agent's action space.
    policy = Actor_Net(env.action_space(agent).n)
    policy.compile(optimizer=Adam(learning_rate=alpha))

    value_fn = Critic_Net()
    value_fn.compile(optimizer=Adam(learning_rate=alpha))

    return [agent, policy, value_fn]

def create_agent_nets(env, alpha):
    """Create one actor/critic pair per agent type present in ``env``.

    The type of an agent is decided by which marker its name contains:
    "leadadversary_", "adversary_" or "agent_". Only the first agent of each
    type (in ``env.agents`` order) gets networks; later agents of the same
    type are skipped, and names matching no marker are ignored.

    Args:
        env: environment exposing ``agents`` and ``action_space(agent)``.
        alpha: learning rate passed on to ``compile_nets``.

    Returns:
        A list of ``[agent_name, actor_net, critic_net]`` entries, one per
        agent type, in order of first appearance.
    """
    # "leadadversary_" must be tested before "adversary_", because the
    # latter is a substring of the former.
    role_markers = ("leadadversary_", "adversary_", "agent_")

    seen_roles = set()
    agent_nets = []
    for agent in env.agents:
        role = next((marker for marker in role_markers if marker in agent), None)
        if role is None or role in seen_roles:
            continue
        seen_roles.add(role)
        agent_nets.append(compile_nets(env, agent, alpha))
    return agent_nets

### Memory Storing Class
Stores each batch of episode steps before learning takes place.
Generates "minibatches" of shuffled data to run in each epoch during learning.

In [None]:
class n_step_Memory:
    """Buffer of stored transitions that can be split into shuffled minibatches.

    Six parallel lists are kept (states, actions, probs, vals, rewards,
    dones); every ``store_memory`` call appends one entry to each.
    """

    def __init__(self, batch_size):
        self.batch_size = batch_size
        self.clear_memory()

    def create_batches(self):
        """Return the stored data as arrays plus shuffled index minibatches.

        Returns:
            A 7-tuple ``(states, actions, probs, vals, rewards, dones,
            batches)``. The first six are ``np.array`` copies of the stored
            lists; ``batches`` is a list of index arrays that together cover
            every stored step once, each at most ``batch_size`` long.
        """
        total = len(self.states)
        starts = np.arange(0, total, self.batch_size)
        order = np.arange(total, dtype=np.int64)
        np.random.shuffle(order)
        batches = [order[lo:lo + self.batch_size] for lo in starts]

        columns = (self.states, self.actions, self.probs,
                   self.vals, self.rewards, self.dones)
        return tuple(np.array(column) for column in columns) + (batches,)

    def store_memory(self, state, action, probs, vals, reward, done):
        """Append one transition to the six parallel lists."""
        targets = (self.states, self.actions, self.probs,
                   self.vals, self.rewards, self.dones)
        items = (state, action, probs, vals, reward, done)
        for target, item in zip(targets, items):
            target.append(item)

    def clear_memory(self):
        """Replace all six lists with fresh empty ones."""
        self.states, self.probs, self.actions = [], [], []
        self.rewards, self.dones, self.vals = [], [], []

### PPO agent Class
Contains functions which:
- Assign the agent the appropriate set of neural nets, to make sure each type of agent only trains one set
- Choose the next action of the agent using its policy and its current state
- Update the agent's actor and critic networks, the actor using the clip objective function and the critic using a mean squared error function

In [None]:
class PPOAgent:
    """PPO-Clip learner bound to one named environment agent.

    Each instance owns its own ``n_step_Memory`` buffer, but takes its actor
    and critic from ``agent_nets``, so instances of the same agent type train
    the same pair of networks.

    Args:
        agent: name of the environment agent this learner controls.
        agent_nets: list of ``[agent_name, actor_net, critic_net]`` entries.
        n_actions: unused; kept for interface compatibility.
        input_dims: unused; kept for interface compatibility.
        gamma: reward discount factor.
        alpha: unused here; the networks in ``agent_nets`` already carry
            their optimizers.
        lamda: GAE smoothing factor.
        epsilon: clip range of the PPO probability ratio.
        batch_size: minibatch size handed to the memory buffer.
        n_epochs: number of passes over the stored data per ``learn`` call.

    Raises:
        ValueError: if ``agent_nets`` has no entry for this agent's type.
    """

    # Ordered so that "leadadversary_" is tested before its substring
    # "adversary_".
    _ROLE_MARKERS = ("leadadversary_", "adversary_", "agent_")

    def __init__(self, agent, agent_nets, n_actions, input_dims, gamma=0.99, alpha=0.0003,
                 lamda=0.95, epsilon=0.2, batch_size=64,
                 n_epochs=10):
        self.gamma = gamma
        self.epsilon = epsilon
        self.n_epochs = n_epochs
        self.lamda = lamda

        self.agent = agent
        self.actor, self.critic = self.assign_agent_nets(agent_nets)
        self.memory = n_step_Memory(batch_size)
        self.score_history = []
        self.score = 0
        self.action = None
        self.probs = None
        # The training loop stores the sampled log-probability as ``prob``.
        self.prob = None
        self.val = None

        self.learn_iters = 0
        self.avg_score = 0
        self.n_steps = 0
        self.observation = None

    @classmethod
    def _role_of(cls, name):
        """Return the first role marker contained in ``name``, or None."""
        for marker in cls._ROLE_MARKERS:
            if marker in name:
                return marker
        return None

    def assign_agent_nets(self, agent_nets):
        """Return the ``(actor, critic)`` pair matching this agent's type.

        Matching is by role marker, so the result does not depend on the
        order of ``agent_nets``.

        Raises:
            ValueError: if no entry of ``agent_nets`` has the same role.
        """
        own_role = self._role_of(self.agent)
        if own_role is not None:
            for agent_net in agent_nets:
                if self._role_of(agent_net[0]) == own_role:
                    return agent_net[1], agent_net[2]
        raise ValueError("no actor/critic networks found for agent " + repr(self.agent))

    def choose_action(self, observation):
        """Sample an action for ``observation`` from the current policy.

        Returns:
            ``(action, log_prob, crit_val)``: the sampled action index, its
            log-probability under the policy, and the critic output for the
            state (first row of the critic's batch output).
        """
        state = tf.convert_to_tensor([observation])

        action_probs = self.actor(state)
        # The actor ends in a softmax, so its output must go in as ``probs``;
        # the first positional argument of Categorical is ``logits``.
        cat_probs = tfp.distributions.Categorical(probs=action_probs)
        action = cat_probs.sample()
        log_prob = cat_probs.log_prob(action)
        crit_val = self.critic(state)

        action = action.numpy()[0]
        crit_val = crit_val.numpy()[0]
        log_prob = log_prob.numpy()[0]

        return action, log_prob, crit_val

    @staticmethod
    def _compute_advantages(rewards, values, dones, gamma, lamda):
        """Return GAE advantage estimates as a flat float32 array.

        With ``delta_k = r_k + gamma * V_{k+1} * (1 - done_k) - V_k`` this
        computes ``A_t = sum_{k=t}^{n-2} (gamma * lamda)^(k-t) * delta_k`` by
        a single backward pass. The last step has no successor value and
        keeps an advantage of 0.

        NOTE(review): only the bootstrap term is masked by ``done``; the
        accumulation itself is not cut at episode boundaries.
        """
        rewards = np.asarray(rewards, dtype=np.float64).reshape(-1)
        values = np.asarray(values, dtype=np.float64).reshape(-1)
        not_done = 1.0 - np.asarray(dones, dtype=np.float64).reshape(-1)

        n_steps = len(rewards)
        advantages = np.zeros(n_steps, dtype=np.float32)
        running = 0.0
        for t in range(n_steps - 2, -1, -1):
            delta = rewards[t] + gamma * values[t + 1] * not_done[t] - values[t]
            running = delta + gamma * lamda * running
            advantages[t] = running
        return advantages

    def learn(self):
        """Run ``n_epochs`` of PPO-Clip updates on the stored steps, then clear them.

        Each epoch reshuffles the stored steps into minibatches. The actor is
        updated with the clipped surrogate objective and the critic with a
        mean squared error against ``advantage + stored value``.
        """
        values = A_hat = None
        for epoch in range(self.n_epochs):
            state_arr, action_arr, old_prob_arr, vals_arr,\
                reward_arr, dones_arr, batches = \
                self.memory.create_batches()

            if epoch == 0:
                # The stored data does not change between epochs, so the
                # advantages are computed once. Values are flattened because
                # each stored critic value is a 1-element array; left as a
                # column they would broadcast the returns to (batch, batch).
                values = np.asarray(vals_arr, dtype=np.float32).reshape(-1)
                A_hat = self._compute_advantages(
                    reward_arr, values, dones_arr, self.gamma, self.lamda)

            # start each epoch of learning
            for minibatch in batches:
                with tf.GradientTape(persistent=True) as tape:
                    states = tf.convert_to_tensor(state_arr[minibatch])
                    old_probs = tf.convert_to_tensor(old_prob_arr[minibatch])
                    actions = tf.convert_to_tensor(action_arr[minibatch])

                    probs = self.actor(states)
                    dist = tfp.distributions.Categorical(probs=probs)
                    new_probs = dist.log_prob(actions)

                    critic_value = self.critic(states)
                    critic_value = tf.squeeze(critic_value, 1)

                    # Probability ratio from the two log-probabilities;
                    # exp(new - old) equals exp(new) / exp(old).
                    r_theta = tf.math.exp(new_probs - old_probs)

                    # this is the implementation of the Lclip objective function
                    weighted_r_theta = A_hat[minibatch] * r_theta
                    weighted_clipped_r_theta = A_hat[minibatch] * tf.clip_by_value(
                        r_theta, 1-self.epsilon, 1+self.epsilon)

                    # negative as it is doing gradient ascent rather than descent
                    actor_loss = -tf.math.minimum(weighted_r_theta,
                                                  weighted_clipped_r_theta)
                    actor_loss = tf.math.reduce_mean(actor_loss)

                    # used mean square error for the critic loss
                    returns = A_hat[minibatch] + values[minibatch]
                    critic_loss = keras.losses.MSE(critic_value, returns)

                actor_params = self.actor.trainable_variables
                actor_grads = tape.gradient(actor_loss, actor_params)
                critic_params = self.critic.trainable_variables
                critic_grads = tape.gradient(critic_loss, critic_params)
                # A persistent tape holds its resources until it is dropped.
                del tape

                self.actor.optimizer.apply_gradients(
                        zip(actor_grads, actor_params))
                self.critic.optimizer.apply_gradients(
                        zip(critic_grads, critic_params))

        self.memory.clear_memory()

### House Keeping Functions

In [None]:
def save_models(agent_net, episode):
    """Save the actor and critic of one ``[name, actor, critic]`` entry.

    Both models are written under ``save_model_path + "ep_<episode>/"``,
    with the agent name and the episode number in the file name.
    """
    print('... saving models ...')
    ep = str(episode)
    # Shared leading part of both target paths.
    stem = save_model_path + "ep_" + ep + "/" + agent_net[0] + "_"
    agent_net[1].save(stem + 'PPO_actor_simp_world_ep_' + ep)
    agent_net[2].save(stem + 'PPO_critic_simp_world_ep_' + ep)

def load_models(agent_name, episode):
    """Load the actor and critic saved for ``agent_name`` at ``episode``.

    Returns:
        The list ``[agent_name, actor, critic]``.
    """
    print('... loading models ...')
    ep = str(episode)
    # Shared leading part of both source paths.
    stem = save_model_path + "ep_" + ep + "/" + agent_name + "_"
    loaded_actor = keras.models.load_model(stem + 'PPO_actor_simp_world_ep_' + ep)
    loaded_critic = keras.models.load_model(stem + 'PPO_critic_simp_world_ep_' + ep)
    return [agent_name, loaded_actor, loaded_critic]

def running_average(scores, window=100):
    """Return the trailing mean of ``scores`` over at most ``window`` entries.

    Element ``i`` of the result is the mean of ``scores[i-window+1 : i+1]``,
    truncated at the start of the sequence, so it covers exactly ``window``
    scores once enough history exists.
    """
    scores = np.asarray(scores, dtype=np.float64)
    averages = np.zeros(len(scores))
    for i in range(len(scores)):
        averages[i] = scores[max(0, i - window + 1):i + 1].mean()
    return averages


def plot_learning_curve(x, scores, figure_file):
    """Plot the 100-score running average of ``scores`` and save the figure.

    Args:
        x: x-axis values, one per score.
        scores: sequence of episode scores.
        figure_file: path the current pyplot figure is saved to.
    """
    plt.plot(x, running_average(scores, window=100))
    plt.title('Running average of previous 100 scores')
    plt.savefig(figure_file)

def run_plot(agents, figure_file):
    """Plot every agent's running-average score curve into ``figure_file``.

    All agents are drawn on the same figure; the file is rewritten after each
    agent is added, so it ends up holding one curve per agent.

    Args:
        agents: objects exposing a ``score_history`` list.
        figure_file: path handed to ``plot_learning_curve`` for saving.
    """
    # Start from an empty figure, so curves drawn by an earlier call do not
    # pile up underneath the current ones.
    plt.clf()
    for agent in agents:
        x = list(range(1, len(agent.score_history) + 1))
        plot_learning_curve(x, agent.score_history, figure_file)
        
def create_dir(dir):
    """Make sure the directory ``dir`` exists and return its path.

    Prints whether the directory was created or already existed.
    """
    if os.path.exists(dir):
        print("Directory already existed : ", dir)
        return dir

    os.makedirs(dir)
    print("Created Directory : ", dir)
    return dir

### Training Algorithm
Below runs the PPO-Clip algorithm for 
- the Simple World Comm environment https://www.pettingzoo.ml/mpe/simple_world_comm
- the simple push environment https://www.pettingzoo.ml/mpe/simple_push 

In [None]:
if __name__ == '__main__':
    # Training script: builds the environment and one PPOAgent per
    # environment agent, then runs `episodes` episodes. Each agent learns
    # every N stored steps; models and plots are saved every 50 episodes.
    episodes = 2001

    #################### below runs simple world comm ###########################

    MAX_STEPS = 500
    save_model_path = "models/simple_world_comm/"
    env = simple_world_comm_v2.env(num_good=2, num_adversaries=4, num_obstacles=1,
                num_food=2, max_cycles=MAX_STEPS, num_forests=2, continuous_actions=False)

    ##################uncomment below and comment above to run simple push ###################

    #MAX_STEPS = 250
    #save_model_path = "models/simple_push/"
    #env = simple_push_v2.env(max_cycles=MAX_STEPS, continuous_actions=False)

    #########################################################################################

    create_dir(save_model_path)
    log_path = save_model_path+"learning_log.txt"
    env.reset()
    N = 20  # every agent runs a learning update after this many of its steps
    batch_size = 5
    n_epochs = 4
    alpha = 0.0003
    agent_nets = create_agent_nets(env, alpha)

    # To resume from saved models after a crash, replace the
    # create_agent_nets call above with the saved networks, e.g.:
    #   model_names = ["leadadversary_0", "adversary_0", "agent_0"]
    #   agent_nets = [load_models(name, 700) for name in model_names]

    agents = []
    for agent in env.agents:
        agents.append(PPOAgent(agent, agent_nets, env.action_space(agent).n, env.observation_space(agent).shape[0], batch_size=batch_size,
                  alpha=alpha, n_epochs=n_epochs))

    for episode in range(0, episodes):
        env.reset()
        for agent in agents:
            agent.score = 0
        done = False

        while not done:
            # Every agent observes, picks an action and steps the environment
            # in turn.
            for agent in agents:
                agent.observation = env.observe(agent.agent)
                agent.action, agent.prob, agent.val = agent.choose_action(agent.observation)
                env.step(agent.action)

            # Once all agents have acted, record each agent's transition.
            # `done` ends up holding the flag of the last agent in the list.
            for agent in agents:
                done = env.dones[agent.agent]
                reward = env.rewards[agent.agent]
                agent.n_steps += 1
                agent.score += reward
                agent.memory.store_memory(agent.observation, agent.action, agent.prob, agent.val, reward, done)
                if agent.n_steps % N == 0:
                    agent.learn()
                    agent.learn_iters += 1

        # The context manager closes the log file even if a write fails.
        with open(log_path, "a") as text_file:
            for agent in agents:
                agent.score_history.append(agent.score)
                agent.avg_score = np.mean(agent.score_history[-100:])

                log = agent.agent + " episode " + str(episode) + " score " + str(agent.score) + " avg score " + str(agent.avg_score) +" time_steps " + str(agent.n_steps) + " learning_steps " + str(agent.learn_iters)
                print(log)
                text_file.write(log + "\n")

        if episode % 50 == 0:
            # The models are saved before the plot, and both go under the
            # same "ep_<episode>/" path.
            for agent in agent_nets:
                save_models(agent, episode)
            figure_file = save_model_path + 'ep_'+ str(episode)+'/Avg_Learing_plot_ep_' + str(episode)+ '.png'
            run_plot(agents,figure_file)

    figure_file = save_model_path +'Avg_Learing_plot_FINAL.png'
    run_plot(agents, figure_file)

### Credits
philtabor - https://github.com/philtabor/Youtube-Code-Repository/tree/master/ReinforcementLearning