In [1]:
from pettingzoo.mpe import simple_world_comm_v2, simple_push_v2
import random
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from pprint import pprint
from matplotlib import pyplot as plt
import time
import datetime

In [2]:
# --- Playback configuration for the simple_world_comm environment ---
MEANING_OF_LIFE = 42  # seed handed to env.seed() before every reset
ENVIRONMENT_NAME = "simple_world_comm"
MAX_CYCLES = 50
# Number of episodes to play (original note: keep as a multiple of 10)
NUM_OF_EPISODES = 1

env = simple_world_comm_v2.env(
    num_good=2,
    num_adversaries=4,
    num_obstacles=1,
    num_food=2,
    max_cycles=MAX_CYCLES,
    num_forests=2,
    continuous_actions=False,
)

env.seed(seed=MEANING_OF_LIFE)
env.reset()

# Show the agent roster followed by one blank line.
print(f"Agents: {env.agents}", end="\n\n")

# agent name -> position in env.agents, and the reverse lookup.
agent_mapping = {name: index for index, name in enumerate(env.agents)}
inv_agent_map = {index: name for name, index in agent_mapping.items()}
NUM_OF_AGENTS = len(env.agents)


Agents: ['leadadversary_0', 'adversary_0', 'adversary_1', 'adversary_2', 'agent_0', 'agent_1']



In [3]:
def random_policy(actions):
    """Pick a uniformly random action index in ``[0, actions)``.

    Args:
        actions: Number of discrete actions available.

    Returns:
        An int between 0 and ``actions - 1`` inclusive.
    """
    return random.randrange(actions)

In [4]:
class agent:
    """DQN-style agent holding an online Keras network and a target network.

    The agent keeps a list of ``(previous_state, action, reward, state, done)``
    transitions in ``self.history`` and fits the online network either on a
    random mini-batch of that history (experience replay) or on the most
    recent transition only.
    """

    def __init__(self, agent_name, state_size, epsilon=1, epsilon_min = 0.1, 
                 epsilon_decay = 0.975, batch_size=16, learning_decay_rate = 0.95,
                target_update_steps = 10, action_space=5, experience_replay = True):
        """Build both networks and initialise exploration / replay settings.

        Args:
            agent_name: Identifier stored on the instance.
            state_size: Length of the flat observation vector; also used as
                the width of the first Dense layer.
            epsilon: Initial probability of taking a random action.
            epsilon_min: Floor that ``exploration_decay`` never goes below.
            epsilon_decay: Multiplicative factor applied to epsilon per decay.
            batch_size: Mini-batch size used by ``replay``.
            learning_decay_rate: Discount factor (stored as ``self.gamma``).
            target_update_steps: Stored on the instance; not used by any
                method of this class.
            action_space: Number of discrete actions (network output width).
            experience_replay: If True ``replay`` samples from the history,
                otherwise it learns from the latest transition only.
        """
        self.agent_name = agent_name
        self.state_size = state_size
        # Honour the constructor argument (it used to be overwritten with 5).
        # NOTE(review): checkpoints written by a version that always built
        # 5 outputs will not load into a network with a different width.
        self.action_space = action_space
        self.epsilon = epsilon
        self.epsilon_min = epsilon_min
        self.epsilon_decay = epsilon_decay
        self.batch_size = batch_size
        self.gamma = learning_decay_rate
        self.target_update_steps = target_update_steps
        self.replay_enabled = experience_replay
        self.history = []
        self.action_taken = None
        self.previous_state = None
        # Number of fit() calls performed by replay(); read by
        # get_learning_steps(), which previously raised AttributeError.
        self.learning_steps = 0
        self.model = self.build_model()
        self.target_model = self.build_model()
        self.update_target_network_weights()
    
    def build_model(self):
        """Return a compiled MLP mapping a state vector to per-action values.

        Compiled with Huber loss and Adam (learning rate 0.00025).
        """
        model = keras.Sequential()
        model.add(layers.Dense(self.state_size, input_dim=self.state_size, activation='relu'))
        model.add(layers.Dense(32, activation="relu"))
        model.add(layers.Dense(32, activation="relu"))
        model.add(layers.Dense(32, activation="relu"))
        # NOTE(review): a relu output cannot represent negative action values;
        # a linear output is the usual choice. Left unchanged so existing
        # checkpoints keep producing the outputs they were trained to give.
        model.add(layers.Dense(self.action_space, activation="relu"))
        model.compile(loss=keras.losses.Huber(), 
                      optimizer = keras.optimizers.Adam(learning_rate=0.00025))
 
        return model
    
    def exploration_decay(self):
        """Multiply epsilon by ``epsilon_decay``, never dropping below ``epsilon_min``."""
        if self.epsilon > self.epsilon_min:
            # max(), not min(): min() snapped epsilon straight to the floor
            # on the very first call.
            self.epsilon = max(self.epsilon_min, self.epsilon*self.epsilon_decay)
            
    def save_model(self, agent_filename, target_filename):
        """Save both networks; ``.h5`` is appended to each given name."""
        self.model.save(agent_filename+".h5")
        self.target_model.save(target_filename+".h5")
    
    def load_model(self, agent_filename, target_filename):
        """Load weights into both networks.

        Unlike ``save_model`` the paths are used verbatim, so callers pass
        the full file name including its extension.
        """
        self.model.load_weights(agent_filename)
        self.target_model.load_weights(target_filename)
        
    def policy(self, state):
        """Epsilon-greedy action: random with probability epsilon, else argmax."""
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_space)
        action_vals = self.predict(state)
        return np.argmax(action_vals[0])
    
    def save_action_state(self, action, state):
        """Remember the action just taken and the state it was taken in."""
        self.action_taken = action
        self.previous_state = state
        
    def save_history(self, state, reward, done):
        """Append the transition that ends in ``state`` to the history.

        If no action/state has been recorded yet, the current state and
        action 0 are used as the starting point of the transition.
        """
        if self.previous_state is None and self.action_taken is None:
            self.previous_state = state
            self.action_taken = 0
        self.history.append((self.previous_state, self.action_taken, reward, state, done))
    
    def get_history(self):
        """Return the list of stored transitions."""
        return self.history

    def _training_target(self, state, action, reward, next_state, done):
        """Return the online network's output for ``state`` with the entry
        for ``action`` replaced by the one-step target (shape (1, actions))."""
        target = reward
        if not done:
            target = reward + self.gamma * np.amax(self.target_predict(next_state)[0])
        target_f = self.predict(state)
        target_f[0][action] = target
        return target_f
    
    def replay(self):
        """Run one training step and then decay epsilon.

        With replay enabled, nothing happens (no fit, no decay) until the
        history holds at least ``batch_size`` transitions. With replay
        disabled the latest transition is used; an empty history is a no-op.
        """
        if self.replay_enabled:
            if len(self.history) < self.batch_size:
                return
            sample_batch = random.sample(self.history, self.batch_size)
            targets = []
            states = []

            for state, action, reward, next_state, done in sample_batch:
                targets.append(self._training_target(state, action, reward, next_state, done)[0])
                states.append(state)

            self.model.fit(np.asarray(states),np.asarray(targets), epochs=1, verbose=0, batch_size=self.batch_size)
        else:
            if not self.history:
                # Nothing recorded yet; history[-1] would raise IndexError.
                return
            state, action, reward, next_state, done = self.history[-1]
            target_f = self._training_target(state, action, reward, next_state, done)
            state = np.reshape(state, (1,self.state_size))
            self.model.fit(state,target_f, epochs=1, verbose=0,)

        self.learning_steps += 1
        self.exploration_decay()
    
    def get_learning_steps(self):
        """Return how many fit() calls ``replay`` has performed so far."""
        return self.learning_steps
    
    def predict(self, state):
        """Online-network action values for one state, shape (1, actions)."""
        state = np.reshape(state, (1,self.state_size))
        # verbose=0 keeps per-call progress output out of the notebook.
        return self.model.predict(state, verbose=0)

    def target_predict(self, state):
        """Target-network action values for one state, shape (1, actions)."""
        state = np.reshape(state, (1,self.state_size))
        return self.target_model.predict(state, verbose=0)

    def update_target_network_weights(self):
        """Copy the online network's weights into the target network."""
        self.target_model.set_weights(self.model.get_weights())

In [5]:

def play_episode(adversary_leader_model, adversary_leader_target_model, 
                 adversary_model, adversary_target_model,
                 agent_model, agent_target_model):
    """Render NUM_OF_EPISODES episodes of the global ``env`` using saved weights.

    One network pair is built per role and shared by every agent of that
    role: ``leadadversary_0`` gets its own, ``adversary_0`` is shared by all
    other adversaries, and ``agent_0`` is shared by all other good agents.

    Args:
        adversary_leader_model: Weights file for the lead adversary network.
        adversary_leader_target_model: Weights file for its target network.
        adversary_model: Weights file for the shared adversary network.
        adversary_target_model: Weights file for its target network.
        agent_model: Weights file for the shared good-agent network.
        agent_target_model: Weights file for its target network.

    Reads the module-level ``env``, ``agent_mapping``, ``NUM_OF_EPISODES``
    and ``MEANING_OF_LIFE``. Returns None.
    """

    def _load_agent(name, model_path, target_path):
        """Build an agent sized for ``name``'s spaces and load its weights."""
        loaded = agent(name, env.observation_space(name).shape[0],
                       # epsilon=0: play the learned policy. The default
                       # epsilon of 1 makes policy() pick a random action on
                       # every step, so the loaded weights were never used.
                       epsilon=0,
                       action_space=env.action_space(name).n)
        loaded.load_model(model_path, target_path)
        return loaded

    # Creates the models. Models are shared between multiple adversaries and
    # agents, so the first one of each type creates the model while the
    # subsequent ones get a reference to that same object.
    agent_models = {}
    for key in agent_mapping:
        if key == "leadadversary_0":
            agent_models[key] = _load_agent(key, adversary_leader_model,
                                            adversary_leader_target_model)
        elif "adversary" in key:
            if key == "adversary_0":
                agent_models[key] = _load_agent(key, adversary_model,
                                                adversary_target_model)
            else:
                agent_models[key] = agent_models["adversary_0"]
        elif "agent" in key:
            if key == "agent_0":
                agent_models[key] = _load_agent(key, agent_model,
                                                agent_target_model)
            else:
                agent_models[key] = agent_models["agent_0"]

    for _ in range(NUM_OF_EPISODES):
        env.seed(seed=MEANING_OF_LIFE)
        env.reset()

        # The loop variable must not be called `agent`: that would make the
        # name local to this function and hide the `agent` class used above
        # (UnboundLocalError when the models are built).
        for agent_name in env.agent_iter():
            observation, _reward, done, _info = env.last()
            # Renders the environment for each step in a separate window.
            env.render(mode='human')

            # Steps the environment forward; finished agents must step None.
            if done:
                env.step(None)
            else:
                env.step(agent_models[agent_name].policy(observation))



In [None]:
# Checkpoint files tagged "0_of_500" (presumably the networks as saved at the
# very start of a 500-episode training run - inferred from the file names).
adversary_leader_model = "models/simple_world_comm/simple_world_comm_adversaryleader_0_0_of_500_model.h5"
adversary_leader_target_model = "models/simple_world_comm/simple_world_comm_adversaryleader_0_0_of_500_target_model.h5"
adversary_model = "models/simple_world_comm/simple_world_comm_adversary_2_0_of_500_model.h5"
adversary_target_model = "models/simple_world_comm/simple_world_comm_adversary_2_0_of_500_target_model.h5"
agent_model = "models/simple_world_comm/simple_world_comm_agent_1_0_of_500_model.h5"
agent_target_model = "models/simple_world_comm/simple_world_comm_agent_1_0_of_500_target_model.h5"

play_episode(
    adversary_leader_model=adversary_leader_model,
    adversary_leader_target_model=adversary_leader_target_model,
    adversary_model=adversary_model,
    adversary_target_model=adversary_target_model,
    agent_model=agent_model,
    agent_target_model=agent_target_model,
)

In [None]:
# Checkpoint files tagged "250_of_500" (presumably saved half-way through a
# 500-episode training run - inferred from the file names).
adversary_leader_model = "models/simple_world_comm/simple_world_comm_adversaryleader_0_250_of_500_model.h5"
adversary_leader_target_model = "models/simple_world_comm/simple_world_comm_adversaryleader_0_250_of_500_target_model.h5"
adversary_model = "models/simple_world_comm/simple_world_comm_adversary_2_250_of_500_model.h5"
adversary_target_model = "models/simple_world_comm/simple_world_comm_adversary_2_250_of_500_target_model.h5"
agent_model = "models/simple_world_comm/simple_world_comm_agent_1_250_of_500_model.h5"
agent_target_model = "models/simple_world_comm/simple_world_comm_agent_1_250_of_500_target_model.h5"

play_episode(
    adversary_leader_model=adversary_leader_model,
    adversary_leader_target_model=adversary_leader_target_model,
    adversary_model=adversary_model,
    adversary_target_model=adversary_target_model,
    agent_model=agent_model,
    agent_target_model=agent_target_model,
)

In [None]:
# Checkpoint files tagged "500_final" (presumably the networks at the end of
# training - inferred from the file names).
adversary_leader_model = "models/simple_world_comm/simple_world_comm_adversaryleader_0_500_final_model.h5"
adversary_leader_target_model = "models/simple_world_comm/simple_world_comm_adversaryleader_0_500_final_target_model.h5"
adversary_model = "models/simple_world_comm/simple_world_comm_adversary_2_500_final_model.h5"
adversary_target_model = "models/simple_world_comm/simple_world_comm_adversary_2_500_final_target_model.h5"
agent_model = "models/simple_world_comm/simple_world_comm_agent_1_500_final_model.h5"
agent_target_model = "models/simple_world_comm/simple_world_comm_agent_1_500_final_target_model.h5"

play_episode(
    adversary_leader_model=adversary_leader_model,
    adversary_leader_target_model=adversary_leader_target_model,
    adversary_model=adversary_model,
    adversary_target_model=adversary_target_model,
    agent_model=agent_model,
    agent_target_model=agent_target_model,
)

In [None]:

# --- Switch the notebook over to the simple_push environment ---
# Everything below rebinds the module-level names that play_episode reads
# (env, agent_mapping, NUM_OF_EPISODES, MEANING_OF_LIFE).
MAX_CYCLES = 50
# Keep as a multiple of 10
NUM_OF_EPISODES = 100
MEANING_OF_LIFE = 42
ENVIRONMENT_NAME = "simple_push"

env = simple_push_v2.env(max_cycles=MAX_CYCLES, continuous_actions=False)


env.seed(seed=MEANING_OF_LIFE)
env.reset()
print(f"Agents: {env.agents}")
print()
agent_mapping = {k: v for v, k in enumerate(env.agents)}
inv_agent_map = {v: k for k, v in agent_mapping.items()}
# Recompute alongside the mappings; otherwise NUM_OF_AGENTS keeps the value
# from the previously configured environment.
NUM_OF_AGENTS = len(env.agents)



In [None]:
# NOTE(review): these are the simple_world_comm "500_final" checkpoints, but
# the cell above has switched `env` to simple_push. Presumably the paths
# should point at simple_push checkpoints instead - confirm before running.
adversary_leader_model = "models/simple_world_comm/simple_world_comm_adversaryleader_0_500_final_model.h5"
adversary_leader_target_model = "models/simple_world_comm/simple_world_comm_adversaryleader_0_500_final_target_model.h5"
adversary_model = "models/simple_world_comm/simple_world_comm_adversary_2_500_final_model.h5"
adversary_target_model = "models/simple_world_comm/simple_world_comm_adversary_2_500_final_target_model.h5"
agent_model = "models/simple_world_comm/simple_world_comm_agent_1_500_final_model.h5"
agent_target_model = "models/simple_world_comm/simple_world_comm_agent_1_500_final_target_model.h5"

play_episode(
    adversary_leader_model=adversary_leader_model,
    adversary_leader_target_model=adversary_leader_target_model,
    adversary_model=adversary_model,
    adversary_target_model=adversary_target_model,
    agent_model=agent_model,
    agent_target_model=agent_target_model,
)

In [None]:
# NOTE(review): this cell repeats the preceding playback cell verbatim (same
# simple_world_comm "500_final" paths, same call). Presumably it was meant to
# load a different set of checkpoints - confirm or delete.
adversary_leader_model = "models/simple_world_comm/simple_world_comm_adversaryleader_0_500_final_model.h5"
adversary_leader_target_model = "models/simple_world_comm/simple_world_comm_adversaryleader_0_500_final_target_model.h5"
adversary_model = "models/simple_world_comm/simple_world_comm_adversary_2_500_final_model.h5"
adversary_target_model = "models/simple_world_comm/simple_world_comm_adversary_2_500_final_target_model.h5"
agent_model = "models/simple_world_comm/simple_world_comm_agent_1_500_final_model.h5"
agent_target_model = "models/simple_world_comm/simple_world_comm_agent_1_500_final_target_model.h5"

play_episode(
    adversary_leader_model=adversary_leader_model,
    adversary_leader_target_model=adversary_leader_target_model,
    adversary_model=adversary_model,
    adversary_target_model=adversary_target_model,
    agent_model=agent_model,
    agent_target_model=agent_target_model,
)