# Reinforcement Learning Project

## Setup

To be able to run this notebook properly please make sure to install the pettingzoo package and dependencies. This can be done by running the following command

`pip install pettingzoo[mpe]`

### Imports

In [1]:
from pettingzoo.mpe import simple_world_comm_v2
import random
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from pprint import pprint
from matplotlib import pyplot as plt
import time

### Environment Initialisation

In [2]:
MAX_CYCLES = 250
NUM_OF_EPISODES = 10
MEANING_OF_LIFE = 42

env = simple_world_comm_v2.env(num_good=2, num_adversaries=4, num_obstacles=1,
                num_food=2, max_cycles=MAX_CYCLES, num_forests=2, continuous_actions=False)
env.reset()
print(f"Agents: {env.agents}")
print()
agent_mapping = {k: v for v, k in enumerate(env.agents)}

Agents: ['leadadversary_0', 'adversary_0', 'adversary_1', 'adversary_2', 'agent_0', 'agent_1']



### Policy Function

In [3]:
def random_policy(actions):
    return random.randint(0, actions-1)

### Inspection Functions

A collection of inspection functions to help minimise clutter in the training loop below.

In [4]:
def matching_agent_only(agent, desired_agent, function, function_args):
#     To use this function, pass the agent and desired agent as the first two arguments,
#     then pass the function reference and the arguments for the function as a tuple for the final argument.
    if agent == desired_agent:
        function(*function_args)


def print_agent_rewards(agent, reward):
    print(f"{agent}:{reward}")
    
def print_agent_state(agent, observation):
    print(f"{agent}: {observation}")
    
def print_iter_info(agent,observation,reward,done,info):
    print(f"Current Agent: {agent}")
    print(f"Obs: {observation}")
    print(f"Rew: {reward}")      
    print(f"Done: {done}")
    print(f"Info: {info}")

def get_current_step(env):
    return env.env.env.steps

def np_array_no_e(array):
    np.set_printoptions(suppress=True)
    print(array)
    np.set_printoptions(suppress=False)

### Running the environment

The `env.render(mode='human')` call will pop open a new window that shows the environment at each time step.

On my machine at least this window can only be closed while the cell is running but then freezes and is unable to be closed afterwards. In these cases restarting the kernel closed the window and any others which may have been opened due to running the cell multiple times.

Eventually running the cell enough times without restarting the kernal will cause the render call to throw an exception and not run. In this case just restart the kernal and it will begin working again.

In [5]:
class agent_stub:
    def __init__(self):
        self.target_update_steps = 123456789123456789
    
    def policy(self, state):
        return random_policy(env.action_space(agent).n)
    
    def save_action_state(self, action, state):
        pass
    
    def save_memory(self, state, reward, done):
        pass
    
    def replay(self):
        pass

    def predict(self, state):
        pass

    def update_target_network_weights(self):
        pass

In [6]:
class good_agent_stub:
    def __init__(self):
        self.target_update_steps = 123456789123456789
    
    def policy(self, state):
        return 0
    
    def save_action_state(self, action, state):
        pass
    
    def save_memory(self, state, reward, done):
        pass
    
    def replay(self):
        pass

    def predict(self, state):
        pass
    
    def update_target_network_weights(self):
        pass

In [7]:
class good_agent:
    
    def __init__(self, agent_name, epsilon=1, epsilon_min = 0.1, 
                 epsilon_decay = 0.95, batch_size=16, learning_decay_rate = 0.95,
                target_update_steps = 25):
        
        self.state_size = 28
        self.action_space = 5
        self.model = self.build_model()
        self.target_model = self.build_model()
        self.optimizer = keras.optimizers.Adam(learning_rate=0.00025, clipnorm=1.0)
        self.loss_function = keras.losses.Huber()
        self.epsilon = epsilon
        self.epsilon_min = epsilon_min
        self.epsilon_decay = epsilon_decay
        self.agent_name = agent_name
        self.agent_file = agent_name+".h5"
        self.agent_target_file = agent_name+"_target.h5"
        self.history = []
        self.action_taken = None
        self.previous_state = None
        self.batch_size = batch_size
        self.gamma = learning_decay_rate
        self.target_update_steps = target_update_steps
        self.update_target_network_weights()

    
    def build_model(self):
        model = keras.Sequential()
        model.add(layers.Dense(32, input_dim=self.state_size, activation='relu'))
        model.add(layers.Dense(64, activation="relu"))
        model.add(layers.Dense(64, activation="relu"))
        model.add(layers.Dense(self.action_space, activation="relu"))
        model.compile(loss=keras.losses.Huber(), 
                      optimizer = keras.optimizers.Adam(learning_rate=0.00025, clipnorm=1.0))
 
        return model
    
    def exploration_decay(self):
        if self.epsilon > self.epsilon_min:
            self.epsilon = min(self.epsilon_min, self.epsilon*self.epsilon_decay)
            
    def save_model(self):
        self.model.save(self.agent_file)
        self.target_model.save(self.agent_target_file)
    
    def load_model(self):
        self.model.load_weights(self.agent_file)
        self.target_model.load_weights(self.agent_target_file)
        
    def policy(self, state):
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_space)
        action_vals = self.predict(state)
        return np.argmax(action_vals[0])
    
    def save_action_state(self, action, state):
        self.action_taken = action
        self.previous_state = state
        
    def save_memory(self, state, reward, done):
        if self.previous_state is None and self.action_taken is None:
            self.previous_state = state
            self.action_taken = 0
        self.history.append((self.previous_state, self.action_taken, reward, state, done))
    
    def get_memory(self):
        return self.history
    
    def replay(self):
        if len(self.history) < self.batch_size:
            return
        sample_batch = random.sample(self.history, self.batch_size)
#         i = 0
        for state, action, reward, next_state, done in sample_batch:
            target = reward
            if not done:
                target = reward + self.gamma * np.amax(self.target_predict(next_state)[0])
            target_f = self.predict(state)
            target_f[0][action] = target
            state = np.reshape(state, (1,28))
            self.model.fit(state, target_f, epochs=1, verbose=0)
            
        self.exploration_decay()
    
    def get_learning_steps(self):
        return self.learning_steps
    
    def predict(self, state):
#         print(f"Before Reshape: {state}")
        state = np.reshape(state, (1,28))
#         print(f"After Reshape: {state}")
        return self.model.predict(state)

    def target_predict(self, state):
        state = np.reshape(state, (1,28))
        return self.target_model.predict(state)

    def update_target_network_weights(self):
        self.target_model.set_weights(self.model.get_weights())

In [None]:
env.reset()
# print(NUM_OF_EPISODES)
# print(len(env.agents))
reward_array = np.zeros((NUM_OF_EPISODES,len(env.agents)))

cumulative_reward = np.zeros(len(env.agents))
i = 0
agent_models = {}
for key in agent_mapping:
    if key == "leadadversary_0":
        agent_models[key] = agent_stub()
    if "adversary" in key:
        agent_models[key] = agent_stub()
    if "agent" in key:
        if "agent_0" == key:
#             print("Making object")
            agent_models[key] = good_agent(key)
        else:
            agent_models[key] = agent_models
    
step = -1
            
for episode in range(NUM_OF_EPISODES):
    print(f"Episode {episode+1} out of {NUM_OF_EPISODES}")
    env.reset()
    if episode > 0:
        print(f"Time taken for previous episode: {time.time()-start_time}")
    start_time = time.time()
    
    for agent in env.agent_iter():
        step_time = time.time()    
        if step != get_current_step(env):
            print(f"Step: {get_current_step(env)}")
            step = get_current_step(env)
        
        observation, reward, done, info = env.last()
        cumulative_reward[agent_mapping[agent]] += reward
        agent_models[agent].save_memory(observation, reward, done)

    #     Renders the environment for each step in a seperate window.
        
        
        agent_models[agent].replay()
        
        
        if (get_current_step(env) % agent_models[agent].target_update_steps) == 0:
#             if agent == "agent_0":
#                 print("MODULO OUTPUT")
#                 print(get_current_step(env) % agent_models[agent].target_update_steps)
            agent_models[agent].update_target_network_weights()
#             print(get_current_step(env) % agent_models[agent].learning_steps)
#             prediction = agent_models[agent].predict(observation)
#             if prediction is not None:
#                 print(prediction)
#                 print(np.argmax(prediction))
            
            
#         env.render(mode='human')
    #     Steps the environment forward.
        if done:
            env.step(None)
            reward_array[episode,agent_mapping[agent]] = cumulative_reward[agent_mapping[agent]]
        else:
            action_to_take = agent_models[agent].policy(observation)
            env.step(action_to_take)
            agent_models[agent].save_action_state(action_to_take, observation)
        print(f"Step Time: {time.time()-step_time}")


2022-05-02 17:20:14.849402: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:939] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-05-02 17:20:14.857587: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:939] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-05-02 17:20:14.858264: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:939] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-05-02 17:20:14.858922: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags

Episode 1 out of 10
Step: 0
Step Time: 0.00030541419982910156
Step Time: 0.00022935867309570312
Step Time: 0.00036334991455078125
Step Time: 0.0002028942108154297
Step Time: 0.0033245086669921875
Step Time: 0.0021584033966064453
Step: 1
Step Time: 0.0003094673156738281
Step Time: 0.00024509429931640625
Step Time: 0.00025725364685058594
Step Time: 0.0002396106719970703
Step Time: 0.0002536773681640625
Step Time: 0.0010058879852294922
Step: 2
Step Time: 0.0002593994140625
Step Time: 0.00027298927307128906
Step Time: 0.0002231597900390625
Step Time: 0.0002224445343017578
Step Time: 0.00022721290588378906
Step Time: 0.0016944408416748047
Step: 3
Step Time: 0.00025653839111328125
Step Time: 0.0002262592315673828
Step Time: 0.00023651123046875
Step Time: 0.00027179718017578125
Step Time: 0.00022745132446289062
Step Time: 0.001538991928100586
Step: 4
Step Time: 0.0004892349243164062
Step Time: 0.0008559226989746094
Step Time: 0.0002930164337158203
Step Time: 0.00022792816162109375
Step Time: 

Step Time: 1.605837345123291
Step Time: 0.0013720989227294922
Step: 41
Step Time: 0.0004513263702392578
Step Time: 0.00032830238342285156
Step Time: 0.0003082752227783203
Step Time: 0.00031280517578125
Step Time: 1.6688580513000488
Step Time: 0.0020248889923095703
Step: 42
Step Time: 0.0002651214599609375
Step Time: 0.00043463706970214844
Step Time: 0.0002639293670654297
Step Time: 0.0002646446228027344
Step Time: 1.8487491607666016
Step Time: 0.0016286373138427734
Step: 43
Step Time: 0.0003631114959716797
Step Time: 0.0002493858337402344
Step Time: 0.00022840499877929688
Step Time: 0.0002200603485107422
Step Time: 1.7629616260528564
Step Time: 0.0010979175567626953
Step: 44
Step Time: 0.00025391578674316406
Step Time: 0.0003933906555175781
Step Time: 0.00030803680419921875
Step Time: 0.00021648406982421875
Step Time: 1.588315725326538
Step Time: 0.0012638568878173828
Step: 45
Step Time: 0.0002849102020263672
Step Time: 0.0002491474151611328
Step Time: 0.00023245811462402344
Step Time:

Step Time: 1.587284803390503
Step Time: 0.0010941028594970703
Step: 82
Step Time: 0.0002760887145996094
Step Time: 0.00022172927856445312
Step Time: 0.00023937225341796875
Step Time: 0.00022745132446289062
Step Time: 1.5722827911376953
Step Time: 0.0013849735260009766
Step: 83
Step Time: 0.0005829334259033203
Step Time: 0.0002682209014892578
Step Time: 0.0002219676971435547
Step Time: 0.00023698806762695312
Step Time: 1.7144432067871094
Step Time: 0.0015382766723632812
Step: 84
Step Time: 0.0003159046173095703
Step Time: 0.00030541419982910156
Step Time: 0.0008261203765869141
Step Time: 0.0004353523254394531
Step Time: 1.7954769134521484
Step Time: 0.0011620521545410156
Step: 85
Step Time: 0.00022077560424804688
Step Time: 0.00087738037109375
Step Time: 0.00039887428283691406
Step Time: 0.00020384788513183594
Step Time: 1.897106409072876
Step Time: 0.0014426708221435547
Step: 86
Step Time: 0.0003695487976074219
Step Time: 0.0003418922424316406
Step Time: 0.0002570152282714844
Step Time

In [None]:
# Closs the render window.
env.close()

### Print Reward Array

In [None]:
np_array_no_e(reward_array)