<center><br><br><b><font size=10> NAVIGATION </font></b><br><br>
<font size=4><i>...implementation for Udacity Deep Reinforcement Learning</i></font></center>
<hr>

## Initialize Imports for the notebook
This Notebook uses code from separate python files where most of the implementation is handled

In [None]:
import importlib
import os

import numpy as np
import torch

from agent import DQN_Agent
from environment import Environment
from data_handling import Logger, Saver, gather_args

## Implement MENU.PY
#### <i>This implementation was originally intended to be run from the command line, so let's import the functions from main.py and explore.</i>

Commandline arguments run the entire show, so we'll need to manually declare them...

In [None]:
# Command-line flags, written as they would be typed in a shell; one flag per
# entry so individual values are easy to tweak.
cmd_args = " ".join([
    "--num_episodes 500",
    "--learn_rate 0.0001",
    "--batch_size 64",
    "-C 650",
])

In [None]:
# Split the flag string on whitespace and hand the token list to the
# project's argument parser.
arg_tokens = cmd_args.split()
args = gather_args(arg_tokens)

Let's check out what arguments have been loaded...

In [None]:
# One "name: value" line per attribute stored on the parsed args object.
arg_lines = ["{}: {}".format(name, value) for name, value in vars(args).items()]
print('\n'.join(arg_lines))

## Set up the world
Now that args are loaded, set up the remainder of the groundwork to prepare for running the Agent.

In [None]:
# Load the environment using the args parsed above.
env = Environment(args)

# The agent is sized from the environment's state/action dimensions and
# configured with the same args.
# NOTE(review): a later cell runs `import agent`, which rebinds the name
# `agent` from this instance to the module.
agent = DQN_Agent(env.state_size, env.action_size, args)

# The Saver object will do all the saving and loading for the Agent.
saver = Saver(agent.framework, agent, args.save_dir, args.load_file)

Print some information about the environment.

In [None]:
# Summarize the environment dimensions, one labelled line each.
for label, value in (
    ("State size:", env.state_size),
    ("Action size:", env.action_size),
    ("Num Agents:", env.agent_count),
):
    print(label, value)

## Take random actions in the Environment
* Check that the environment is working
* Test commands and see the results!

While testing out the environment, set training mode to False, and limit max_steps to ensure it doesn't run too long for testing purposes.

In [None]:
# Smoke-test the environment with random actions (training mode off).
env.train = False
env.reset()

# Cap the rollout so the test finishes quickly.
max_steps = 100
scores = np.zeros(env.agent_count)
# Per-step rewards for every agent; every row is written by the loop below,
# so the uninitialized values from np.empty never survive.
arewards = np.empty((max_steps, env.agent_count))
states = env.states

for i in range(max_steps):
    # Gaussian noise per agent/action, clipped to the range [-1, 1].
    actions = np.clip(np.random.randn(env.agent_count, env.action_size), -1, 1)
    next_states, rewards, dones = env.step(actions)
    scores += rewards
    arewards[i] = rewards
    states = next_states

# Smallest and largest single-step reward seen across all agents and steps.
print(arewards.min())
print(arewards.max())


### Force-reload modules as they're updated
<i>This notebook was developed while the code was being written in Atom; the cell below reloads the modules as they are updated.</i>

In [None]:
# Bring the project modules in as module objects so they can be reloaded.
# NOTE(review): `import agent` rebinds the name `agent`, which an earlier
# cell assigned to the DQN_Agent instance.
import models
import agent
import buffers

# Re-execute each module so edits made on disk are picked up by this kernel.
for module in (models, agent, buffers):
    importlib.reload(module)

# Re-import the class so the name refers to the freshly reloaded definition.
from agent import D4PG_Agent

# Switch the environment back to training mode and start from a clean state.
env.train = True
env.reset()

### Load the AGENT!

In [None]:
# Build the D4PG agent from the environment dimensions and the
# hyperparameters carried on args.
# NOTE(review): args.alr / clr / batch / buffer / C are assigned by hand in a
# later cell; confirm gather_args also provides them on a fresh kernel.
d4pg_agent = D4PG_Agent(
    env.state_size,
    env.action_size,
    env.agent_count,
    args.alr,
    args.clr,
    args.batch,
    args.buffer,
    args.C,
)

# Presumably pre-fills the replay memory using args.pretrain and the
# environment -- confirm against agent.py.
d4pg_agent.initialize_memory(args.pretrain, env)

In [None]:
# Show the first-layer weights of the local critic, then of the target critic.
for net in (d4pg_agent.critic, d4pg_agent.critic_target):
    print(net.fc1.weight.data)

## Test out Actor actions without training
<i> Test the <b>Actor</b> network

In [None]:
# Step the environment with actions from the actor for up to 30 steps,
# stopping early if any agent reports done.
env.reset()
scores = np.zeros(env.agent_count)
states = env.states
for step in range(30):
    actions = d4pg_agent.act(states)

    # Print sample actions returned by the ACTOR network (entry at index 1).
    print("ACTIONS:", actions[1])

    next_states, rewards, dones = env.step(actions)
    scores += rewards
    states = next_states
    if np.any(dones):
        break
print('Total score (averaged over agents) this episode: {}'.format(np.mean(scores)))

## Test out Critic scores without training
<i> Test the <b>Critic</b> network

In [None]:
# Step the environment twice and, after each step, print the distributions the
# target and local critics produce for a one-sample batch from replay memory.
env.reset()
scores = np.zeros(env.agent_count)
states = env.states

# Place sampled tensors on the device that holds the critic's weights.
# NOTE(review): assumes d4pg_agent.critic is a torch.nn.Module -- confirm.
device = next(d4pg_agent.critic.parameters()).device

for step in range(2):
    actions = d4pg_agent.act(states)
    env_next_states, rewards, dones = env.step(actions)
    scores += rewards

    # Sample one transition; batch tensors get their own names so the
    # rollout's `states` / `rewards` are not overwritten.
    batch = d4pg_agent.memory.sample(batch_size=1)
    batch_rewards = torch.cat(batch.reward).to(device)
    batch_next_states = torch.cat(batch.next_state).to(device)

    # Print sample distributions returned by the CRITIC
    dist, probs = d4pg_agent.critic_target(batch_next_states, d4pg_agent.actor(batch_next_states))
    # Called only to exercise the target computation; the result is not printed.
    proj_dist = d4pg_agent._get_targets(batch_rewards, batch_next_states)
    ldist, lprobs = d4pg_agent.critic(batch_next_states, d4pg_agent.actor(batch_next_states))

    print("DIST: ", dist)
    print("Local DIST: ", ldist)
    print("\n")

    states = env_next_states
    if np.any(dones):
        break
print('Total score (averaged over agents) this episode: {}'.format(np.mean(scores)))

<hr><hr>

## Test out various Agent functionality

<hr><hr>

In [None]:
# Bounds and atom count consumed by the torch.linspace cell further down.
vmin = 0
vmax = 0.1
num_atoms = 51

# Hand-set the values that are passed to the D4PG_Agent constructor
# (alr, clr, batch, buffer, C).
args.C = 4000
args.buffer = 100000
args.batch = 128
args.clr = 1e-4
args.alr = 1e-4

In [None]:
# NOTE(review): `check_args` is neither defined nor imported anywhere in this
# notebook; confirm where it should come from before running this cell.
check_args()


In [None]:
# Import the project modules as module objects so they can be reloaded.
# NOTE(review): `import agent` rebinds the name `agent` to the module.
import models
import agent
import buffers

# Re-execute each module so edits made on disk are picked up by this kernel.
for module in (models, agent, buffers):
    importlib.reload(module)

# Re-import the class from the reloaded module, then build a fresh agent.
from agent import D4PG_Agent

d4pg_agent = D4PG_Agent(
    env.state_size,
    env.action_size,
    env.agent_count,
    args.alr,
    args.clr,
    args.batch,
    args.buffer,
    args.C,
)

# Training mode on, clean environment.
env.train = True
env.reset()
# The initialize_memory call is left disabled in this cell:
#d4pg_agent.initialize_memory(args.pretrain, env)

In [None]:
# List the entries of the current working directory (handy for locating the
# 'saves/' folder used by the checkpoint-loading cell).
os.listdir()

In [None]:
# Evaluation mode with a clean environment before loading saved weights.
env.train = False
env.reset()

# Path to the saved agent, relative to the notebook's working directory.
file = 'saves/D4PG_20190306_v17_eps100_FINAL.agent'

# Load every stored tensor onto the CPU regardless of the device it was saved from.
# NOTE(review): torch.load unpickles the file -- only load checkpoints you trust.
checkpoint = torch.load(file, map_location='cpu')

In [None]:
# Restore the actor network's parameters from the checkpoint's 'actor_dict' entry.
d4pg_agent.actor.load_state_dict(checkpoint['actor_dict'])

In [None]:
# Upper bound on steps per episode; the training-loop cell below iterates
# over range(args.max_time).
args.max_time = 4

In [None]:
# num_atoms evenly spaced points from vmin to vmax (inclusive), plus their sum.
support = torch.linspace(vmin, vmax, num_atoms)
print(support)
print(support.sum())

In [None]:
# Run a single short training episode and report the losses after each step.
for episode in range(1, 2):
    # Begin each episode with a clean environment
    env.reset()
    # Get initial state
    states = env.states
    scores = np.zeros(env.agent_count)
    # Gather experience for a maximum amount of steps, or until Done,
    # whichever comes first
    for t in range(args.max_time):
        actions = d4pg_agent.act(states)
        next_states, rewards, dones = env.step(actions)
        d4pg_agent.step(states, actions, rewards, next_states)
        states = next_states

        scores += rewards
        if np.any(dones):
            break
        print("A LOSS: ", d4pg_agent.actor_loss)
        print("C LOSS: ", d4pg_agent.critic_loss)
    print("Episode rewards: ", scores.mean())
    # Signal end-of-episode on the D4PG agent being trained. The bare name
    # `agent` refers to the imported module at this point, not to an agent
    # instance, so it must not be used here.
    d4pg_agent.new_episode()

## Learn about how the Categorical Bellman step works

In [None]:
# Random Gaussian actions, clipped to [-1, 1] and stored as float32.
raw_actions = np.random.randn(env.agent_count, env.action_size)
actions = np.clip(raw_actions, -1, 1).astype(np.float32)

# Current environment states as a float tensor.
states = torch.from_numpy(env.states).float()

In [None]:
# Evaluate the agent's local critic on the random state/action pairs.
# The notebook never defines a bare `critic`; the critic networks live on
# d4pg_agent.
# NOTE(review): another cell unpacks the critic's outputs as (dist, probs);
# confirm which order the critic actually returns. Also assumes the critic's
# weights are on the CPU, since `states` / `actions` are CPU tensors.
probs, log_probs = d4pg_agent.critic(states, torch.from_numpy(actions))

In [None]:
# Shapes of the two critic outputs, in the order they were unpacked.
for critic_output in (probs, log_probs):
    print(critic_output.shape)

In [None]:
class Container:
    """Empty attribute holder; values are attached to instances after creation."""


# Shared instance that later cells read attributes from (c.delta_z, c.probs).
c = Container()
    

In [None]:
# Support of the value distribution: natoms evenly spaced atoms on [vmin, vmax].
vmin = -10
vmax = 10
natoms = 51
gamma = .99
atoms = torch.linspace(vmin, vmax, natoms)
# Spacing between neighbouring atoms.
delta_z = (vmax - vmin) / (natoms -1)
# Add a trailing dimension so rewards broadcast against the atoms.
r = torch.tensor(rewards).unsqueeze(-1)

# Cut the critic output out of the autograd graph before doing arithmetic on it.
probs = probs.detach()
q_next = (probs * atoms).sum()

# The projection cells below read these through the shared container `c`
# (c.delta_z, c.probs), so they must be stored on it here.
c.delta_z = delta_z
c.probs = probs


### projected atoms
<html><i>
<b>atoms.view(1,-1)</b> becomes shape [1, num_atoms]
<br>
<b>r</b> is unsqueezed in the last (-1) dimension, so its shape is [20, 1]
<br>
the result is a tensor that holds an offset (projected) version of the atoms for each reward instance

In [None]:
# suppress=False lets NumPy fall back to scientific notation for very small
# floats instead of always printing them in fixed-point form.
np.set_printoptions(suppress=False)

#### tz = projected atoms, atoms (values) projected by scaling and offsetting via the bellman equation


In [None]:
# Scale the atoms by gamma (as a single row) and offset them by each reward;
# broadcasting yields one row of projected atoms per reward entry.
tz = gamma * atoms.unsqueeze(0) + r
# Clamp in place so every projected atom stays inside [vmin, vmax].
tz.clamp_(vmin, vmax)

#### computes "bj" from the pseudo-code

In [None]:
# Position of each projected atom on the support, measured in atom spacings
# from vmin (fractional values fall between two atoms).
shifted = tz - vmin
b = shifted / c.delta_z
b[0].numpy()

#### l/u in the pseudocode are LOWER and UPPER bounds on the supports

In [None]:
# Integer indices of the atoms just below (l) and just above (u) each position in b.
l = torch.floor(b).long()
u = torch.ceil(b).long()
for bound in (l, u):
    print(bound[0])

#### m_l/m_u are computed in the pseudocode under "distribute the probability of tz"; how they should be used is still a bit opaque to me

In [None]:
# Share of each atom's probability assigned to the lower index: the distance
# from b up to u, plus 1 where l == u so an exact hit keeps its full mass.
lower_weight = u.float() + (l == u).float() - b
ml = lower_weight * c.probs
ml[0].numpy()

In [None]:
# Share of each atom's probability assigned to the upper index: the distance
# from l up to b.
upper_weight = b - l.float()
mu = upper_weight * c.probs
mu[0].numpy()

In [None]:
# Start from an all-zero float64 tensor shaped like probs, then accumulate the
# lower/upper masses into it row by row.
target_prob = torch.zeros(probs.size(), dtype=torch.float64)
for i, row in enumerate(target_prob):
    # `row` is a view of target_prob[i], so the in-place adds write through.
    row.index_add_(0, l[i].long(), ml[i].double())
    row.index_add_(0, u[i].long(), mu[i].double())
target_prob[0].numpy()

### Close the environment when finished with the code/training/etc

In [None]:
# Shut the environment down once all experiments in this notebook are done.
env.close()