## Import the required packages

In [1]:
import tensorflow as tf
import torch
import numpy as np
import matplotlib.pyplot as plt
from training.utils import get_var_name, training_session_unity, plot_multiple_sessions, get_root_dir

# Record the framework versions this notebook was executed with (one per line).
print(torch.__version__, tf.__version__, np.__version__, sep='\n')

0.4.0
1.13.1
1.15.3


## Import the agent models and training algorithms

In [2]:
from unityagents import UnityEnvironment
from models.ddpg_agent import Agent, SoftAgent
from models.a2c_agent import ACAgent
from training.ddpg import ddpg
from training.a2c import a2c

In [3]:
## Load environment

In [4]:
# Run-wide switches used by the cells below.
seed = 0
train_mode = True

# Start the Reacher build without a graphics window.
env = UnityEnvironment(
    file_name="Reacher_Linux_multi/Reacher_Linux/Reacher.x86_64",
    no_graphics=True,
)

# Use the first (default) brain exposed by the environment.
brain_name = env.brain_names[0]
brain = env.brains[brain_name]
action_size = brain.vector_action_space_size

# Reset once to discover the number of agents and the observation size.
env_info = env.reset(train_mode=train_mode)[brain_name]
num_agents = len(env_info.agents)
state = env_info.vector_observations[0]  # observation of the first agent
state_size = len(state)

INFO:unityagents:
'Academy' started successfully!
Unity Academy name: Academy
        Number of Brains: 1
        Number of External Brains : 1
        Lesson number : 0
        Reset Parameters :
		goal_size -> 5.0
		goal_speed -> 1.0
Unity brain name: ReacherBrain
        Number of Visual Observations (per agent): 0
        Vector Observation space type: continuous
        Vector Observation space size (per agent): 33
        Number of stacked Vector Observation: 1
        Vector Action space type: continuous
        Vector Action space size (per agent): 4
        Vector Action descriptions: , , , 


## Settings and model selection

In [7]:
# Algorithm to train; the training cell handles 'ddpg' and 'a2c'.
model_type = 'ddpg'

# Training budget handed to the training session.
n_episodes = 500  # number of episodes to run
max_t = 1000      # upper bound on environment steps per episode

# NOTE(review): not referenced by any visible cell -- confirm whether it is still needed.
n_training_sessions = 1

## Train and plot the scores

In [None]:
# Train the agent selected by `model_type`, then plot and save its learning curve.
if model_type not in ('ddpg', 'a2c'):
    # Fail before training/plotting: an unknown value would otherwise fall through
    # both branches and save an empty figure under a misleading file name.
    raise ValueError("Unsupported model_type {!r}; expected 'ddpg' or 'a2c'".format(model_type))

fig = plt.figure()
ax = fig.add_subplot(111)
if model_type == 'ddpg':
    ddpg_agent = Agent(state_size, action_size, seed)
    scores_vanilla_x, scores_vanilla_mean, scores_vanilla_std, scores_vanilla_y = training_session_unity(
        ddpg, ddpg_agent, env, brain_name, env_info, get_var_name(ddpg), n_episodes, max_t)
    # Bucketed summary (via plot_multiple_sessions) plus the faint raw per-episode scores.
    plot_multiple_sessions(scores_vanilla_x, scores_vanilla_mean, scores_vanilla_std, label=get_var_name(ddpg),
                           color='gray')
    plt.plot(np.arange(len(scores_vanilla_y)), scores_vanilla_y, alpha=.1)
elif model_type == 'a2c':
    a2c_agent = ACAgent(state_size, action_size, seed, num_agents, rollout_length=5, lr=1e-4, lr_decay=.95,
                        gamma=.95, value_loss_weight=1, gradient_clip=5)
    # The model name must come from `a2c`; it was previously get_var_name(ddpg),
    # a copy-paste slip that labelled A2C runs as DDPG.
    scores_a2c_x, scores_a2c_mean, scores_a2c_std, scores_a2c_y = training_session_unity(
        a2c, a2c_agent, env, brain_name, env_info, get_var_name(a2c), n_episodes, max_t)
    plot_multiple_sessions(scores_a2c_x, scores_a2c_mean, scores_a2c_std, label=get_var_name(a2c), color='blue')
    plt.plot(np.arange(len(scores_a2c_y)), scores_a2c_y, alpha=.1)

plt.legend()
plt.ylabel('Score')
plt.xlabel('Episode #')
plt.savefig(get_root_dir() + '/unity_environment_model_{}_batch_size_128.png'.format(model_type))
plt.show()

Episode 1	Total Average Score: 0.15	Mean: 0.15	Min: 0.00	Max: 0.76	Duration: 5.04
Episode 2	Total Average Score: 0.45	Mean: 0.75	Min: 0.00	Max: 1.97	Duration: 5.08
Episode 3	Total Average Score: 0.51	Mean: 0.64	Min: 0.00	Max: 2.23	Duration: 5.17
Episode 4	Total Average Score: 0.44	Mean: 0.23	Min: 0.00	Max: 1.27	Duration: 5.15
Episode 5	Total Average Score: 0.39	Mean: 0.15	Min: 0.00	Max: 1.37	Duration: 5.31
Episode 6	Total Average Score: 0.42	Mean: 0.61	Min: 0.00	Max: 1.33	Duration: 5.40
Episode 7	Total Average Score: 0.39	Mean: 0.20	Min: 0.00	Max: 0.66	Duration: 5.32
Episode 8	Total Average Score: 0.48	Mean: 1.07	Min: 0.16	Max: 2.44	Duration: 5.91
Episode 9	Total Average Score: 0.54	Mean: 1.05	Min: 0.16	Max: 2.04	Duration: 5.67
Episode 10	Total Average Score: 0.56	Mean: 0.77	Min: 0.00	Max: 1.82	Duration: 6.00
Episode 10	Total Average Score: 0.56
Episode 11	Total Average Score: 0.56	Mean: 0.48	Min: 0.00	Max: 2.07	Duration: 5.84
Episode 12	Total Average Score: 0.61	Mean: 1.19	Min: 0.12	M

## Training session

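We can call `training_session_unity` with the algorithm of our choice. It runs the algorithm and summarises the returned scores into buckets (mean and standard deviation per bucket) for plotting.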

In [None]:
def training_session_unity(algorithm, agent, env, brain_name, env_info, model_name, n_episodes, max_t, buckets=5):
    """Run one training session and summarise its scores into equal-sized buckets.

    Args:
        algorithm: Callable invoked as
            ``algorithm(agent, env, num_agents, True, brain_name, model_name, n_episodes, max_t)``.
            Its return value is converted with ``np.asarray`` and treated as the
            sequence of scores.
        agent: Forwarded unchanged to ``algorithm``.
        env: Forwarded unchanged to ``algorithm``.
        brain_name: Forwarded unchanged to ``algorithm``.
        env_info: Object whose ``agents`` attribute is measured with ``len`` to
            obtain ``num_agents``.
        model_name: Forwarded unchanged to ``algorithm``.
        n_episodes: Forwarded unchanged to ``algorithm``.
        max_t: Forwarded unchanged to ``algorithm``.
        buckets: Number of buckets the scores are split into (must be >= 1).

    Returns:
        Tuple ``(scores_x, scores_mean, scores_std, scores_y)``:
            scores_x: ``buckets + 1`` bucket boundaries ``0, size, 2*size, ...``
                where ``size = len(scores_y) // buckets``.
            scores_mean, scores_std: zero-initialised arrays of length
                ``max(len(scores_y), buckets) + 1``; entry ``b + 1`` holds the
                mean / standard deviation (along axis 0) of bucket ``b``.
                Scores beyond ``buckets * size`` are not part of any bucket.
            scores_y: The scores returned by ``algorithm`` as an array.

    Raises:
        ValueError: If ``buckets`` is smaller than 1.
    """
    if buckets < 1:
        raise ValueError("buckets must be >= 1, got {}".format(buckets))

    num_agents = len(env_info.agents)
    scores_y = np.asarray(algorithm(agent, env, num_agents, True, brain_name, model_name, n_episodes, max_t))

    solved_episodes = len(scores_y)
    bucket_size = solved_episodes // buckets

    # Indices 1..buckets are written below, so the arrays need at least buckets + 1
    # slots even when fewer episodes than buckets were run (this used to raise IndexError).
    summary_len = max(solved_episodes, buckets) + 1
    scores_mean = np.zeros(summary_len)
    scores_std = np.zeros(summary_len)
    scores_x = np.arange(buckets + 1) * bucket_size

    # With bucket_size == 0 every bucket is empty; leave the summaries at zero
    # rather than taking the mean/std of an empty slice (NaN + RuntimeWarning).
    if bucket_size > 0:
        for bucket in range(buckets):
            start = bucket * bucket_size
            window = scores_y[start:start + bucket_size]
            scores_mean[bucket + 1] = window.mean(0)
            scores_std[bucket + 1] = window.std(0)

    return scores_x, scores_mean, scores_std, scores_y

## Algorithms

The currently implemented algorithms are A2C and DDPG. The main difference in the training loop is that A2C also has to store the actor's log-probabilities and the critic's state values for every step.

In [5]:
def a2c(agent, env, num_agents, train_mode, brain_name, model_name, n_episodes=2000, max_t=1000, print_every=10, learn_every=20, num_learn=10, goal_score=30, rollout_length=5):
    """Interaction loop between an A2C agent and a Unity environment.

    For each of ``n_episodes`` episodes the environment is reset, and for up to
    ``max_t`` steps the agent is asked for actions, log-probabilities and state
    values; the actions are sent to the environment and every per-agent
    transition is forwarded to ``agent.step``.

    Args:
        agent: Object providing ``act(states)`` (returning actions, log_probs,
            state_values) and ``step(...)``.
        env: Environment providing ``reset(train_mode=...)`` and ``step(actions)``,
            both indexed by ``brain_name``.
        num_agents: Length of the per-agent ``scores`` array.
        train_mode: Passed to ``env.reset``.
        brain_name: Key used to select the brain's info from ``env`` results.
        n_episodes: Number of episodes to run.
        max_t: Maximum number of steps per episode.
        model_name, print_every, learn_every, num_learn, goal_score, rollout_length:
            Not referenced in the visible part of this cell.

    NOTE(review): this cell looks like a truncated excerpt -- in the visible code
    ``states`` is never advanced to ``next_states``, ``scores`` is never updated,
    and nothing is returned. Confirm against the full implementation in
    ``training.a2c``.
    NOTE(review): ``deque`` and ``time`` are used here but are not imported in any
    visible notebook cell.
    """
    total_scores_deque = deque(maxlen=100)  # bounded window of at most 100 entries
    total_scores = []

    for i_episode in range(1, n_episodes + 1):
        # Reset Env and Agent
        env_info = env.reset(train_mode=train_mode)[brain_name]  # reset the environment
        states = env_info.vector_observations  # get the current state (for each agent)
        scores = np.zeros(num_agents)  # initialize the score (for each agent)
        # The agent is not reset between episodes here (call left disabled):
        #agent.reset()

        start_time = time.time()
        steps_taken = 0
        for t in range(max_t):
            steps_taken +=1
            actions, log_probs, state_values = agent.act(states)      # actions, log-probs and value estimates for the current states
            env_inst = env.step(actions)[brain_name]     # send the actions to the environment
            next_states = env_inst.vector_observations                          # get the next states
            rewards = env_inst.rewards                                          # get the rewards
            dones = env_inst.local_done                                         # see if episode has finished
            not_dones = [1-done for done in dones]        # 1 while an agent's episode continues, 0 once it is done

            # Hand each agent's transition (with its log-prob and value estimate) to the learner.
            for state, action, reward, next_state, not_done, log_prob, state_value in zip(states, actions, rewards, next_states, not_dones, log_probs, state_values):
                agent.step(state, action, reward, next_state, not_done, log_prob, state_value)  # send actions to the agent


In [6]:
def ddpg(agent, env, num_agents, train_mode, brain_name, model_name, n_episodes=2000, max_t=1000, print_every=10, learn_every=20, num_learn=10, goal_score=1):
    """Interaction loop between a DDPG agent and a Unity environment.

    For each of ``n_episodes`` episodes the environment and the agent are reset.
    For up to ``max_t`` steps the agent's actions are sent to the environment,
    every per-agent transition is forwarded to ``agent.step``, the per-agent
    rewards are added to ``scores``, and on every ``learn_every``-th step
    (including step 0) ``agent.start_learn()`` is called ``num_learn`` times.
    The step loop ends early as soon as any agent reports done.

    Args:
        agent: Object providing ``reset()``, ``act(states)``, ``step(...)`` and
            ``start_learn()``.
        env: Environment providing ``reset(train_mode=...)`` and ``step(actions)``,
            both indexed by ``brain_name``.
        num_agents: Length of the per-agent ``scores`` array.
        train_mode: Passed to ``env.reset``.
        brain_name: Key used to select the brain's info from ``env`` results.
        n_episodes: Number of episodes to run.
        max_t: Maximum number of steps per episode.
        learn_every: Step interval between learning phases.
        num_learn: Number of ``agent.start_learn()`` calls per learning phase.
        model_name, print_every, goal_score: Not referenced in the visible part
            of this cell.

    NOTE(review): the visible code neither records/prints the episode scores nor
    returns anything; presumably the cell is a truncated excerpt of
    ``training.ddpg`` -- confirm.
    NOTE(review): ``deque`` and ``time`` are used here but are not imported in any
    visible notebook cell.
    """
    total_scores_deque = deque(maxlen=100)  # bounded window of at most 100 entries
    total_scores = []

    for i_episode in range(1, n_episodes + 1):
        # Reset Env and Agent
        env_info = env.reset(train_mode=train_mode)[brain_name]  # reset the environment
        states = env_info.vector_observations  # get the current state (for each agent)
        scores = np.zeros(num_agents)  # initialize the score (for each agent)
        agent.reset()

        start_time = time.time()

        for t in range(max_t):
            actions = agent.act(states)

            env_info = env.step(actions)[brain_name]  # send all actions to the environment
            next_states = env_info.vector_observations  # get next state (for each agent)
            rewards = env_info.rewards  # get reward (for each agent)

            dones = env_info.local_done  # see if episode finished

            # Hand each agent's transition to the learner individually.
            for state, action, reward, next_state, done in zip(states, actions, rewards, next_states, dones):
                agent.step(state, action, reward, next_state, done)  # send actions to the agent

            scores += env_info.rewards  # update the score (for each agent)
            states = next_states  # roll over states to next time step

            # Learning is decoupled from acting: every `learn_every` steps run `num_learn` updates.
            if t % learn_every == 0:
                for _ in range(num_learn):
                    agent.start_learn()

            if np.any(dones):  # exit loop if episode finished
                break

