# Continuous Control

---




# DDPG Agent 

In [None]:
import numpy as np
import torch
import matplotlib.pyplot as plt
from collections import deque
from statistics import mean, stdev

from ENV import environment_loader

In [None]:
%load_ext autoreload
%autoreload 2
from AGENT import DDPG_Agent

In [None]:
# Start the environment through the project's `environment_loader` helper and unpack
# what it returns: the environment handle, the state and action sizes, the brain name
# used to index the environment's responses, and the number of agents.
# NOTE(review): no_graphics=False presumably keeps the simulator window visible — confirm in ENV.
env, state_size, action_size, brain_name, num_agents = environment_loader("Reacher_Single/Reacher.exe", no_graphics = False)

In [None]:
def DDPG_eval(agent, env, max_t=100):
    """Run a single evaluation episode and print its length and score.

    Relies on the notebook-level globals ``brain_name``, ``num_agents`` and
    ``action_size`` defined when the environment is loaded.

    Args:
        agent: Object exposing ``act(state)``; the first row of the current
            observations is passed to it. When ``None``, actions are drawn
            from a standard normal distribution instead.
        env: Environment whose ``reset(train_mode=False)`` and ``step(actions)``
            results are indexed by ``brain_name``.
        max_t: Upper bound on the number of environment steps.

    Returns:
        None. The episode length and the mean score are printed.
    """
    info = env.reset(train_mode=False)[brain_name]
    states = info.vector_observations
    scores = np.zeros(num_agents)
    episode_length = 0

    for _ in range(max_t):
        if agent is None:
            # No policy supplied: fall back to random actions.
            actions = np.random.randn(num_agents, action_size)
        else:
            actions = agent.act(states[0])

        # The environment is only ever sent actions inside [-1, 1].
        info = env.step(np.clip(actions, -1, 1))[brain_name]
        states = info.vector_observations
        step_rewards = info.rewards
        finished = info.local_done

        scores += step_rewards
        episode_length += 1

        if np.any(finished):
            break

    print('Episode length is {}'.format(episode_length))
    print('Total score (averaged over agents) this episode: {}'.format(np.mean(scores)))

In [None]:
# Smoke test: with agent=None DDPG_eval draws random actions, and max_t=1 limits it to one step.
DDPG_eval(None, env, max_t=1)

In [None]:
def DDPG_train(agent, env, max_t=100, num_episodes = 1, print_every=100, existing_score=None):
    """Train ``agent`` on ``env`` and return the per-episode score history.

    Only the first agent of the environment is driven and scored: the first
    row of the observations is passed to ``agent.act`` and the first reward
    of every step is accumulated into the episode score.

    Relies on the notebook-level global ``brain_name`` to index what
    ``env.reset`` and ``env.step`` return.

    Args:
        agent: Object exposing ``reset()``, ``act(state)`` and
            ``step(state, action, reward, next_state, done)``, plus
            ``actor_local`` / ``critic_local`` attributes with a
            ``state_dict()`` method for checkpointing.
        env: Environment providing ``reset(train_mode=True)`` and
            ``step(actions)``.
        max_t: Maximum number of steps per episode.
        num_episodes: Number of episodes to run.
        print_every: Size of the rolling window whose mean is reported and
            used to decide when to write a checkpoint.
        existing_score: Optional list of scores from an earlier run. It is
            extended in place and returned; ``None`` starts a fresh history.

    Returns:
        The list of episode scores (``existing_score`` itself when given).
    """
    # A literal [] default would be shared by every call and silently
    # accumulate scores across unrelated training runs.
    score_history = [] if existing_score is None else existing_score
    print("Initial Score History: ", score_history)
    scores_deque = deque(maxlen=print_every)
    last_running_mean = float('-inf')

    for episode in range(num_episodes):
        env_info = env.reset(train_mode=True)[brain_name]
        agent.reset()                                   # reset the exploration noise
        states = env_info.vector_observations
        score = 0

        for _ in range(max_t):
            # No clipping here: the original notes the model already bounds its actions.
            actions = agent.act(states[0])
            env_info = env.step(actions)[brain_name]
            next_states = env_info.vector_observations
            rewards = env_info.rewards
            dones = env_info.local_done
            agent.step(states[0], actions, rewards[0], next_states[0], dones[0])

            score += rewards[0]
            states = next_states
            if np.any(dones):
                break

        returns_in_episode = score
        scores_deque.append(returns_in_episode)
        score_history.append(returns_in_episode)
        running_mean = np.mean(scores_deque)

        # Compare only full windows so a handful of lucky early episodes
        # cannot set the bar for the best checkpoint.
        window_full = len(scores_deque) == scores_deque.maxlen
        if window_full and running_mean > last_running_mean:
            print("")
            print('Last {} was better, going to save it'.format(print_every))
            # Save the agent that was actually trained, not a notebook global.
            torch.save(agent.actor_local.state_dict(), 'checkpoint_actor.pth')
            torch.save(agent.critic_local.state_dict(), 'checkpoint_critic.pth')
            last_running_mean = running_mean

        print("\r", 'Total score (averaged over agents) {} episode: {} | \tAvarage in last {} is {}'.format(episode, returns_in_episode, print_every, running_mean), end="")

    return score_history

In [None]:
import os.path

# Name of the actor checkpoint file that load_previous() looks for.
filename="checkpoint_actor.pth"
# Location combined with `filename` for that lookup; empty means the current working directory.
path = ""

def fileAtLocation(filename, path):
    """Return True when ``filename`` exists inside the directory ``path``.

    The two parts are combined with ``os.path.join`` so that ``path`` works
    with or without a trailing separator; an empty ``path`` refers to the
    current working directory.

    Args:
        filename: Name of the file to look for.
        path: Directory to look in.

    Returns:
        bool: whether the combined path exists.
    """
    return os.path.exists(os.path.join(path, filename))

def load_previous(new_agent):
    """Load saved actor and critic weights into ``new_agent`` when both exist.

    The actor checkpoint name and its location come from the module-level
    ``filename`` and ``path``; the critic checkpoint is expected next to it
    as ``checkpoint_critic.pth``. Nothing is loaded unless both files are
    present, so the agent never ends up with only one network restored.

    Args:
        new_agent: Agent exposing ``actor_local`` and ``critic_local``, each
            with a ``load_state_dict`` method.

    Returns:
        The same agent object, with weights loaded when checkpoints were found.
    """
    critic_filename = 'checkpoint_critic.pth'
    has_actor = fileAtLocation(filename, path)
    has_critic = fileAtLocation(critic_filename, path)

    if has_actor and has_critic:
        print("Found previous trained Agent, going to load them!")
        # Load from the same location the existence check looked at.
        new_agent.actor_local.load_state_dict(torch.load(os.path.join(path, filename)))
        new_agent.critic_local.load_state_dict(torch.load(os.path.join(path, critic_filename)))
    elif has_actor or has_critic:
        print("Found only one of the actor/critic checkpoints, leaving the agent untouched")

    return new_agent

In [None]:
# Build the agent to train, passing hidden sizes [128, 128, 128] for both the actor and the critic.
# NOTE(review): the third positional argument (1) is presumably a random seed — confirm against AGENT.DDPG_Agent.
new_agent = DDPG_Agent(state_size, action_size, 1, actor_hidden = [128, 128, 128], critic_hidden=[128, 128, 128])
# Uncomment to resume from the checkpoints on disk instead of starting from scratch.
##new_agent = load_previous(new_agent)

In [None]:
##env.no_graphics = True
# First training run: 150 episodes of at most 1001 steps each; keeps the returned per-episode scores.
score_history = DDPG_train(new_agent, env, max_t = 1001, num_episodes = 150)
#print(score_history)

In [None]:
# Train the same agent for another 150 episodes, appending to the score history collected so far.
score_history = DDPG_train(new_agent, env, max_t = 1001, num_episodes = 150, existing_score=score_history)

In [None]:
# Dump the raw per-episode scores and their count, then plot score against episode number.
print(score_history)
print(len(score_history))

fig = plt.figure()
ax = fig.add_subplot(111)

# Episodes are numbered from 1 on the x axis.
episode_numbers = np.arange(1, len(score_history) + 1)
plt.plot(episode_numbers, score_history)
plt.xlabel('Episode #')
plt.ylabel('Score')
plt.show()

In [None]:
%load_ext autoreload
%autoreload 2
from AGENT import DDPG_Agent

# Build a fresh agent and restore the saved actor/critic weights into it for evaluation.
# NOTE(review): this agent is created without actor_hidden/critic_hidden, whereas the agent that
# was trained passed [128, 128, 128] for both — confirm DDPG_Agent's defaults match, otherwise
# load_state_dict will presumably fail on mismatched layer shapes.
new_agent = DDPG_Agent(state_size, action_size, 5)
new_agent = load_previous(new_agent)

In [None]:
# Evaluate the loaded agent; max_t is so large that in practice the run only stops
# when the environment reports the episode as done.
DDPG_eval(new_agent, env, max_t = 100000000000)

In [None]:
# Close the environment handle once training and evaluation are finished.
env.close()

In [None]:
# Report whether PyTorch can see a CUDA device (shown as the cell's output).
torch.cuda.is_available()