# Unity ML Agents
## Proximal Policy Optimization (PPO)
This notebook trains the brains of a Unity environment with Proximal Policy Optimization (PPO), as described in [Schulman et al., 2017](https://arxiv.org/abs/1707.06347).

In [1]:
import numpy as np
import os
import tensorflow as tf
import re

from ppo_multi.history import *
from ppo_multi.models import *
from ppo_multi.trainer import Trainer
from unityagents import *

### Hyperparameters

In [2]:
### General parameters
max_steps = 5e5 # Set maximum number of steps to run environment.
run_path = "ppo-hunter13" # The sub-directory name for model and summary statistics
load_model = True # Whether to load a saved model.
train_model =  True # Whether to train the model.
summary_freq = 1000 # Frequency at which to save training statistics.
save_freq = 5000 # Frequency at which to save model.
env_name = "hunter" # Name of the training environment file.
curriculum_file = None # No curriculum is used for this run.

### Algorithm-specific parameters for tuning
gamma = 0.997 # Reward discount rate.
lambd = 0.95 # Lambda parameter for GAE.
time_horizon = 2048 # How many steps to collect per agent before adding to buffer.
beta = 1e-3 # Strength of entropy regularization
num_epoch = 5 # Number of gradient descent steps per batch of experiences.
epsilon = 0.2 # Acceptable threshold around ratio of old and new policy probabilities.
buffer_size = 10240 # How large the experience buffer should be before gradient descent.
learning_rate = 5e-5 # Model learning rate.
hidden_units = 64 # Number of units in hidden layer.
batch_size = 1024 # How many experiences per gradient descent update step.

### Logging dictionary for hyperparameters
# Every key is spelled exactly like the variable it records, so the logged
# names can be matched back to the settings above.
hyperparameter_dict = {
    'max_steps': max_steps,
    'run_path': run_path,
    'env_name': env_name,
    'curriculum_file': curriculum_file,
    'gamma': gamma,
    'lambd': lambd,
    'time_horizon': time_horizon,
    'beta': beta,
    'num_epoch': num_epoch,
    'epsilon': epsilon,
    'buffer_size': buffer_size,
    'learning_rate': learning_rate,
    'hidden_units': hidden_units,
    'batch_size': batch_size,
}

### Load the environment

In [3]:
# Start the Unity environment named by `env_name`, with the optional curriculum file.
# NOTE(review): worker_id=2 is hard-coded; presumably it selects the communication
# port so several environments can run side by side -- confirm against unityagents.
env = UnityEnvironment(file_name=env_name, curriculum=curriculum_file, worker_id=2)

# Show the academy and brain description reported by the environment.
print(env)


INFO:unityagents.environment:
'HunterAcademy' started successfully!


Unity Academy name: HunterAcademy
        Number of brains: 2
        Reset Parameters :
		hunterSpeed -> 0.1
		ringRadius -> 20.0
		hunteeSpeed -> 0.15
		sphereRadius -> 2.0
Unity brain name: HunteeBrain
        Number of observations (per agent): 0
        State space type: continuous
        State space size (per agent): 6
        Action space type: continuous
        Action space size (per agent): 2
        Memory space size (per agent): 0
        Action descriptions: , 
Unity brain name: HunterBrain
        Number of observations (per agent): 0
        State space type: continuous
        State space size (per agent): 6
        Action space type: continuous
        Action space size (per agent): 2
        Memory space size (per agent): 0
        Action descriptions: , 


### Train the Agent(s)

In [4]:
# Train one PPO model per external brain, checkpointing and writing summaries
# periodically, then export the resulting graph.
tf.reset_default_graph()

# Treat the literal string "None" as "no curriculum file".
if curriculum_file == "None":
    curriculum_file = None

# NOTE: curriculum progress reporting is disabled in this notebook, so every
# env.reset() below is called without a `progress` argument.

# Snapshot the brain names once so they are still available after env.close().
brain_names = list(env.external_brain_names)

# Create the Tensorflow model graph: one model per brain, each built inside a
# variable scope derived from the brain name (non-alphanumeric runs become '-').
models = {}
for brain in brain_names:
    with tf.variable_scope(re.sub('[^0-9a-zA-Z]+', '-', brain)):
        models[brain] = create_agent_model(env.brains[brain], lr=learning_rate,
                               h_size=hidden_units, epsilon=epsilon,
                               beta=beta, max_step=max_steps)

# Checkpoints are shared by all brains; summaries get one directory per brain.
model_path = './models/{}'.format(run_path)
summary_paths = {}
for brain in brain_names:
    summary_paths[brain] = './summaries/{}'.format(run_path+'_'+brain)
    if not os.path.exists(summary_paths[brain]):
        os.makedirs(summary_paths[brain])

if not os.path.exists(model_path):
    os.makedirs(model_path)

init = tf.global_variables_initializer()
saver = tf.train.Saver()

try:
    with tf.Session() as sess:
        # Instantiate model parameters
        if load_model:
            print('Loading Model...')
            ckpt = tf.train.get_checkpoint_state(model_path)
            # get_checkpoint_state returns None when the directory holds no
            # checkpoint; fail with an actionable message instead of an
            # AttributeError on `ckpt.model_checkpoint_path`.
            if ckpt is None or not ckpt.model_checkpoint_path:
                raise IOError(
                    "No checkpoint found in '{}'. Set load_model = False to "
                    "start a new run.".format(model_path))
            saver.restore(sess, ckpt.model_checkpoint_path)
        else:
            sess.run(init)

        # Per-brain step counters and last recorded mean rewards, read back
        # from the (possibly restored) model variables.
        steps = {}
        last_rewards = {}
        summary_writers = {}
        for brain in brain_names:
            steps[brain], last_rewards[brain] = sess.run([models[brain].global_step, models[brain].last_reward])
            summary_writers[brain] = tf.summary.FileWriter(summary_paths[brain])
        # NOTE: hyperparameter logging (Trainer.write_text) is currently disabled.

        info = env.reset(train_mode=train_model)

        # One Trainer per brain. The three boolean arguments are, in order:
        # is_continuous, use_observations, use_states.
        trainers = {}
        for brain in brain_names:
            trainers[brain] = Trainer(models[brain], sess, info[brain],
                (env.brains[brain].action_space_type == "continuous"),
                (env.brains[brain].number_observations > 0),
                (env.brains[brain].state_space_size > 0),
                train_model)

        # Run until the slowest brain has exceeded max_steps.
        while min([steps[b] for b in brain_names]) <= max_steps:
            if env.global_done:
                info = env.reset(train_mode=train_model)

            # Decide and take an action for every brain before stepping the
            # environment once with all actions together.
            take_action_epsi = {}
            take_action_actions = {}
            take_action_a_dist = {}
            take_action_value = {}
            for brain in brain_names:
                (take_action_epsi[brain], take_action_actions[brain], take_action_a_dist[brain], take_action_value[brain]
                    ) = trainers[brain].take_action(info[brain], env, brain, steps[brain])

            new_info = env.step(take_action_actions)
            for brain in brain_names:
                trainers[brain].add_experiences(info[brain], new_info[brain], take_action_epsi[brain],
                                take_action_actions[brain], take_action_a_dist[brain], take_action_value[brain])

            info = new_info
            for brain in brain_names:
                trainers[brain].process_experiences(info[brain], time_horizon, gamma, lambd)
                if len(trainers[brain].training_buffer['actions']) > buffer_size and train_model:
                    # Perform gradient descent with experience buffer
                    trainers[brain].update_model(batch_size, num_epoch)
                if steps[brain] % summary_freq == 0 and steps[brain] != 0 and train_model:
                    # Write training statistics to tensorboard.
                    trainers[brain].write_summary(summary_writers[brain], brain, steps[brain], env._curriculum.lesson_number)
                if steps[brain] % save_freq == 0 and steps[brain] != 0 and train_model:
                    # Save Tensorflow model
                    # This does not need to be for each brain
                    save_model(sess, model_path=model_path, steps=steps[brain], saver=saver)
                steps[brain] += 1
                sess.run(models[brain].increment_step)
                if len(trainers[brain].stats['cumulative_reward']) > 0:
                    mean_reward = np.mean(trainers[brain].stats['cumulative_reward'])
                    sess.run(models[brain].update_reward, feed_dict={models[brain].new_reward: mean_reward})
                    # Keep the per-brain record current; `last_reward` is kept
                    # as well for any code that still reads the scalar name.
                    last_reward = last_rewards[brain] = sess.run(models[brain].last_reward)

        for brain in brain_names:
            # Final save Tensorflow model
            if steps[brain] != 0 and train_model:
                save_model(sess, model_path=model_path, steps=steps[brain], saver=saver)
finally:
    # Always release the Unity environment, even when training fails or the
    # kernel is interrupted, so it is not left running.
    env.close()

# Export the graph, listing the scoped output nodes of every brain.
nodes = []
for brain in brain_names:
    scope = (re.sub('[^0-9a-zA-Z]+', '-', brain)) + '/'
    nodes +=[scope + x for x in ["action","value_estimate","action_probs"]]

export_graph(model_path, env_name, target_nodes=','.join(nodes))

Loading Model...
Saved Model
Brain : HunteeBrain. Step: 356000. Mean Reward: 0.497947019868. Std of Reward: 0.712351128057.
Brain : HunterBrain. Step: 356000. Mean Reward: 0.0417762592641. Std of Reward: 0.401382803244.
Brain : HunteeBrain. Step: 357000. Mean Reward: 0.396325301205. Std of Reward: 0.756975825376.
Brain : HunterBrain. Step: 357000. Mean Reward: 0.0762353173678. Std of Reward: 0.433021941047.
Brain : HunteeBrain. Step: 358000. Mean Reward: 0.45987804878. Std of Reward: 0.741301580943.
Brain : HunterBrain. Step: 358000. Mean Reward: 0.028860659191. Std of Reward: 0.39179231734.
Brain : HunteeBrain. Step: 359000. Mean Reward: 0.353964497041. Std of Reward: 0.767354032867.
Brain : HunterBrain. Step: 359000. Mean Reward: 0.0594347695723. Std of Reward: 0.425898706917.
Brain : HunteeBrain. Step: 360000. Mean Reward: 0.345380116959. Std of Reward: 0.765549339483.
Saved Model
Brain : HunterBrain. Step: 360000. Mean Reward: -0.00399866081727. Std of Reward: 0.369251863612.
Saved

Brain : HunteeBrain. Step: 399000. Mean Reward: 0.184139784946. Std of Reward: 0.817327510873.
Brain : HunterBrain. Step: 399000. Mean Reward: 0.141110768333. Std of Reward: 0.477831141802.
Brain : HunteeBrain. Step: 400000. Mean Reward: 0.212043010753. Std of Reward: 0.817636324683.
Saved Model
Brain : HunterBrain. Step: 400000. Mean Reward: 0.0988611592677. Std of Reward: 0.447513207149.
Saved Model
Brain : HunteeBrain. Step: 401000. Mean Reward: 0.1615625. Std of Reward: 0.823448450275.
Brain : HunterBrain. Step: 401000. Mean Reward: 0.198772913329. Std of Reward: 0.494851075607.
Brain : HunteeBrain. Step: 402000. Mean Reward: 0.249450549451. Std of Reward: 0.821705359666.
Brain : HunterBrain. Step: 402000. Mean Reward: 0.138545031366. Std of Reward: 0.473024148064.
Brain : HunteeBrain. Step: 403000. Mean Reward: 0.348779069767. Std of Reward: 0.774636612498.
Brain : HunterBrain. Step: 403000. Mean Reward: 0.0824715727771. Std of Reward: 0.434559299813.
Brain : HunteeBrain. Step: 40

Brain : HunteeBrain. Step: 442000. Mean Reward: 0.407469879518. Std of Reward: 0.767628711979.
Brain : HunterBrain. Step: 442000. Mean Reward: 0.0741918242941. Std of Reward: 0.429181214875.
Brain : HunteeBrain. Step: 443000. Mean Reward: 0.444397590361. Std of Reward: 0.737355620987.
Brain : HunterBrain. Step: 443000. Mean Reward: 0.0736255252455. Std of Reward: 0.432352731415.
Brain : HunteeBrain. Step: 444000. Mean Reward: 0.345941176471. Std of Reward: 0.790313785083.
Brain : HunterBrain. Step: 444000. Mean Reward: 0.104331216611. Std of Reward: 0.451442055361.
Brain : HunteeBrain. Step: 445000. Mean Reward: 0.203031914894. Std of Reward: 0.829454357332.
Saved Model
Brain : HunterBrain. Step: 445000. Mean Reward: 0.136348454197. Std of Reward: 0.480524328617.
Saved Model
Brain : HunteeBrain. Step: 446000. Mean Reward: 0.217978723404. Std of Reward: 0.829147960242.
Brain : HunterBrain. Step: 446000. Mean Reward: 0.16445971593. Std of Reward: 0.49211619186.
Brain : HunteeBrain. Step:

Brain : HunteeBrain. Step: 485000. Mean Reward: 0.251847826087. Std of Reward: 0.817524004897.
Saved Model
Brain : HunterBrain. Step: 485000. Mean Reward: 0.117762823046. Std of Reward: 0.46877182885.
Saved Model
Brain : HunteeBrain. Step: 486000. Mean Reward: 0.196577540107. Std of Reward: 0.817359808666.
Brain : HunterBrain. Step: 486000. Mean Reward: 0.138558606508. Std of Reward: 0.47525467229.
Brain : HunteeBrain. Step: 487000. Mean Reward: 0.170263157895. Std of Reward: 0.830860324072.
Brain : HunterBrain. Step: 487000. Mean Reward: 0.136927962586. Std of Reward: 0.475939920654.
Brain : HunteeBrain. Step: 488000. Mean Reward: 0.0887128712871. Std of Reward: 0.843947189196.
Brain : HunterBrain. Step: 488000. Mean Reward: 0.197277541843. Std of Reward: 0.504824574117.
Brain : HunteeBrain. Step: 489000. Mean Reward: 0.190322580645. Std of Reward: 0.828462043975.
Brain : HunterBrain. Step: 489000. Mean Reward: 0.145994898074. Std of Reward: 0.483076845901.
Brain : HunteeBrain. Step: 

AssertionError: action is not in graph

### Export the trained Tensorflow graph
Once the model has been trained and saved, we can export it as a .bytes file which Unity can embed.

In [5]:
# Names of the ops to export for each brain; every brain's ops live under a
# scope derived from its name (runs of non-alphanumeric characters become '-').
output_ops = ("action", "value_estimate", "action_probs")

nodes = []
for brain in env.external_brain_names:
    scope = re.sub('[^0-9a-zA-Z]+', '-', brain) + '/'
    nodes.extend([scope + op_name for op_name in output_ops])

# Pass the scoped node names to export_graph as one comma-separated string.
target_node_names = ','.join(nodes)
export_graph(model_path, env_name, target_nodes=target_node_names)

INFO:tensorflow:Froze 20 variables.


INFO:tensorflow:Froze 20 variables.


Converted 20 variables to const ops.
134 ops in the final graph.
