<br><br><b><font size=10> CONTINUOUS CONTROL</font></b>
#### <i>...implementation for Udacity Deep Reinforcement Learning</i>
<hr><hr>

## Initialize Imports for the notebook
This notebook uses code from separate Python files, where most of the implementation lives.

In [1]:
import environment as E
from buffers import ReplayBuffer
from agent import D4PG_Agent
import models
from logger import Logger

import copy
import importlib
import os.path
import random
import re
import sys
import time

import torch
import matplotlib.pyplot as plt
import numpy as np
from unityagents import UnityEnvironment
from collections import deque
import torchvision.transforms as T
# import multiprocessing as multi
# multi.cpu_count()

## Manually declare an ARGS class
<i>For testing code in the notebook; this takes the place of the argparse arguments normally supplied on the command line.</i>

In [2]:
class Args:
    """Notebook stand-in for the settings normally parsed from the command line.

    Every setting is stored as a plain instance attribute. Insertion order is
    kept so that ``vars(args)`` lists the settings in a stable order.
    """

    def __init__(self):
        # (name, value) pairs, applied in order.
        settings = (
            ("train", True),
            ("nographics", False),
            ("num_eps", 10),
            ("rollout", 5),
            ("batchsize", 64),
            ("pretrain", 1000),
            ("num_episodes", 1),
            ("max_time", 50),
            ("alr", 1e-4),
            ("clr", 1e-4),
            ("batch", 128),
            ("buffer", 100000),
            ("C", 4000),
        )
        for name, value in settings:
            setattr(self, name, value)
        
# Single settings object shared by the rest of the notebook.
args = Args()


def check_args():
    """Print every setting stored on ``args`` as ``NAME: value``.

    An attribute named ``sep`` is skipped if one is present.
    """
    for name, value in vars(args).items():
        if name != "sep":
            print("{}: {}".format(name.upper(), value))


# Use the first CUDA device when available, otherwise fall back to the CPU.
DEVICE = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")

<b>Confirm that the args are all set the way we want them.</b>

In [3]:
# Print every setting currently held by `args` so the configuration can be checked by eye.
check_args()

TRAIN: True
NOGRAPHICS: False
NUM_EPS: 10
ROLLOUT: 5
BATCHSIZE: 64
PRETRAIN: 1000
NUM_EPISODES: 1
MAX_TIME: 50
ALR: 0.0001
CLR: 0.0001
BATCH: 128
BUFFER: 100000
C: 4000


## Load the environment
<i>
...and print a bit of the information exposed by the wrapper class.

Set the training mode to FALSE while interactively learning about the code.
</i>

In [4]:
# Build the environment wrapper from the notebook-level settings.
# NOTE(review): the meaning of `id=1` is defined in environment.py and is not visible here.
env = E.Environment(args, id=1)
# Report the sizes exposed by the wrapper; later cells use them to shape actions and build the agent.
print("State size:", env.state_size)
print("Action size:", env.action_size)
print("Num Agents:", env.agent_count)


INFO:unityagents:
'Academy' started successfully!
Unity Academy name: Academy
        Number of Brains: 1
        Number of External Brains : 1
        Lesson number : 0
        Reset Parameters :
		goal_speed -> 1.0
		goal_size -> 5.0
Unity brain name: ReacherBrain
        Number of Visual Observations (per agent): 0
        Vector Observation space type: continuous
        Vector Observation space size (per agent): 33
        Number of stacked Vector Observation: 1
        Vector Action space type: continuous
        Vector Action space size (per agent): 4
        Vector Action descriptions: , , , 


State size: 33
Action size: 4
Num Agents: 20


<hr>

# Test code as it's developed

<hr>

## Take random actions in the environment below 
<i>
- to check that the code is working<br>
- to get familiar with the environment
</i>

In [None]:
# Drive the environment with random actions for a fixed number of steps and
# report the smallest and largest per-step reward seen by any agent.
env.train = False
env.reset()
max_steps = 100
scores = np.zeros(env.agent_count)
# One row of per-agent rewards per step; every row is written by the loop below.
arewards = np.empty((max_steps, env.agent_count))
states = env.states
for i in range(max_steps):
    # Sample from a standard normal and clip into [-1, 1]. The action array is
    # sized from the environment rather than a hard-coded (20, 4).
    actions = np.random.randn(env.agent_count, env.action_size)
    actions = np.clip(actions, -1, 1)
    next_states, rewards, dones = env.step(actions)
    scores += rewards
    arewards[i] = rewards
    states = next_states
print(arewards.min())
print(arewards.max())


### Force-reload modules as they're updated
<i>This notebook was developed while the code was being written in Atom; the cell below reloads the modules whenever they change.</i>

In [None]:
# Import the project modules by name so importlib.reload() has module objects to work on.
import models
import agent
import buffers
# Re-execute each module so edits made on disk are picked up without restarting the kernel.
importlib.reload(models)
importlib.reload(agent)
importlib.reload(buffers)
# Re-bind the class name: the earlier `from agent import D4PG_Agent` still points at the pre-reload class.
from agent import D4PG_Agent
# Switch the environment wrapper back to training mode and start from a fresh reset.
env.train = True
env.reset()

### Load the AGENT!

In [5]:
# Build the agent from the environment sizes and the hyperparameters stored on `args`.
d4pg_agent = D4PG_Agent(env.state_size, env.action_size, env.agent_count, args.alr, args.clr, args.batch, args.buffer, args.C)

# Fill the agent's memory with `args.pretrain` entries by stepping the environment.
d4pg_agent.initialize_memory(args.pretrain, env)

Initializing memory buffer.
Taking pretrain step 10... memory filled: 120/1000                      
Taking pretrain step 20... memory filled: 320/1000                      
Taking pretrain step 30... memory filled: 520/1000                      
Taking pretrain step 40... memory filled: 720/1000                      
Taking pretrain step 50... memory filled: 920/1000                      
Taking pretrain step 54... memory filled: 1000/1000                      
Done!


In [None]:
# Show the first-layer (fc1) weights of the critic and of the target critic for side-by-side inspection.
print(d4pg_agent.critic.fc1.weight.data)
print(d4pg_agent.critic_target.fc1.weight.data)

## Test out Actor actions without training
<i>Test the <b>Actor</b> network</i>

In [None]:
# Run the untrained actor for up to 30 steps and print a sample of its actions.
env.reset()
scores = np.zeros(env.agent_count)
states = env.states
# `for` already advances the counter, so the manual `i += 1` the loop used to carry was a no-op.
for step in range(30):
    actions = d4pg_agent.act(states)

    # Print the actions the ACTOR network returned for the agent at index 1
    print("ACTIONS:", actions[1])

    next_states, rewards, dones = env.step(actions)
    scores += rewards
    states = next_states
    # Stop as soon as any agent reports done.
    if np.any(dones):
        break
print('Total score (averaged over agents) this episode: {}'.format(np.mean(scores)))

## Test out Critic scores without training
<i>Test the <b>Critic</b> network</i>

In [None]:
# Step the environment twice and, after each step, print the outputs of the
# target and local critics for one sample drawn from the agent's memory.
env.reset()
scores = np.zeros(env.agent_count)
states = env.states
# `for` already advances the counter, so the manual `i += 1` the loop used to carry was a no-op.
for step in range(2):
    actions = d4pg_agent.act(states)
    ns, rewards, dones = env.step(actions)
    scores += rewards

    # Sample one stored transition and move it to DEVICE. The batch tensors get
    # their own names so they no longer overwrite the environment-level
    # `states` / `actions` / `rewards` / `next_states`.
    batch = d4pg_agent.memory.sample(batch_size=1)
    batch_states = torch.cat(batch.state).to(DEVICE)
    batch_actions = torch.cat(batch.action).float().to(DEVICE)
    batch_rewards = torch.cat(batch.reward).to(DEVICE)
    batch_next_states = torch.cat(batch.next_state).to(DEVICE)

    # Target critic, projected targets and local critic, all evaluated on the
    # sampled next states with the actor's action for those states.
    dist, probs = d4pg_agent.critic_target(batch_next_states, d4pg_agent.actor(batch_next_states))
    proj_dist = d4pg_agent._get_targets(batch_rewards, batch_next_states)
    ldist, lprobs = d4pg_agent.critic(batch_next_states, d4pg_agent.actor(batch_next_states))

    print("DIST: ", dist)
    print("Local DIST: ", ldist)
    # Uncomment to also inspect the second critic output and the projected targets.
#     print("PROBS: ", probs)
#     print("PROJECTED DIST: ", proj_dist)
    print("\n")

    states = ns
    # Stop as soon as any agent reports done.
    if np.any(dones):
        break
print('Total score (averaged over agents) this episode: {}'.format(np.mean(scores)))

<hr><hr>

## Test out various Agent functionality

<hr><hr>

In [None]:
# Re-assert the hyperparameters that are passed to D4PG_Agent when it is built
# (same values as the Args defaults).
args.alr = 1e-4
args.clr = 1e-4
args.batch = 128
args.buffer = 100000
args.C = 4000
# Value range and atom count; consumed by the torch.linspace cell further down.
vmin = 0
vmax = 0.1
num_atoms = 51

In [None]:
# Print the settings again to confirm the overrides above took effect.
check_args()


In [6]:
# Import the project modules by name so importlib.reload() has module objects to work on.
import models
import agent
import buffers
# Re-execute each module so edits made on disk are picked up without restarting the kernel.
importlib.reload(models)
importlib.reload(agent)
importlib.reload(buffers)
# Re-bind the class name to the freshly reloaded definition before instantiating it.
from agent import D4PG_Agent
# Build a new agent from the reloaded class; this replaces any previous `d4pg_agent`.
d4pg_agent = D4PG_Agent(env.state_size, env.action_size, env.agent_count, args.alr, args.clr, args.batch, args.buffer, args.C)
env.train = True
env.reset()
# Memory pre-fill is intentionally disabled here; enable the line below to repeat it for the new agent.
#d4pg_agent.initialize_memory(args.pretrain, env)

In [8]:
# List the current working directory (no path argument) to locate the saved-agent folders.
os.listdir()

['.ipynb_checkpoints',
 'agent.py',
 'buffers.py',
 'D4PG.ipynb',
 'environment.py',
 'get_args.py',
 'logger.py',
 'main.py',
 'meta.py',
 'models.py',
 'Notebooks',
 'Reacher_Windows_x86_64',
 'Reacher_Windows_x86_64_1agent',
 'README.md',
 'saves',
 'savesD4PG_20190306_v18',
 'unity-environment.log',
 'utils.py',
 'work.ipynb',
 '_ARCHIVE',
 '__pycache__']

In [9]:
# Switch the environment wrapper out of training mode and reset it before loading saved weights.
env.train = False
env.reset()
# Path is relative to the notebook's working directory.
file = 'saves/D4PG_20190306_v17_eps100_FINAL.agent'
# The identity map_location returns each storage as deserialized, so tensors
# stay on the CPU even if the checkpoint was saved from a GPU.
# NOTE(review): torch.load unpickles the file; only load checkpoints from a trusted source.
checkpoint = torch.load(file, map_location=lambda storage, loc: storage)

In [11]:
# Restore the actor network from the checkpoint's 'actor_dict' entry.
# Only the actor is loaded in this cell; no other network is touched here.
d4pg_agent.actor.load_state_dict(checkpoint['actor_dict'])

In [None]:
# Shorten the per-episode step cap read by the training loop below (`range(args.max_time)`).
args.max_time = 4

In [None]:
# Evenly spaced values from vmin to vmax inclusive, num_atoms of them.
t= torch.linspace(vmin, vmax, num_atoms)
print(t)
# Sum of all the values, printed as a quick sanity check.
print(t.sum())

In [None]:
# Run a single short training episode and print the agent's losses per step.
for episode in range(1, 2):
    # Begin each episode with a clean environment and its initial states.
    env.reset()
    states = env.states
    scores = np.zeros(env.agent_count)
    # Gather experience for at most args.max_time steps, or until any agent
    # reports done, whichever comes first. The counter is called `step` so it
    # does not overwrite the notebook-level tensor `t`.
    for step in range(args.max_time):
        actions = d4pg_agent.act(states)
        next_states, rewards, dones = env.step(actions)
        d4pg_agent.step(states, actions, rewards, next_states)
        states = next_states

        scores += rewards
        if np.any(dones):
            break
        print("A LOSS: ", d4pg_agent.actor_loss)
        print("C LOSS: ", d4pg_agent.critic_loss)
    print("Episode rewards: ", scores.mean())
    # Call the end-of-episode hook on the agent instance: in this notebook the
    # bare name `agent` is the imported module, not the D4PG_Agent object.
    # NOTE(review): assumes D4PG_Agent defines new_episode() - confirm in agent.py.
    d4pg_agent.new_episode()

## Learn about how the Categorical Bellman step works

In [None]:
# Random float32 actions clipped into [-1, 1], one row per agent.
actions = np.clip(
    np.random.randn(env.agent_count, env.action_size), -1, 1
).astype(np.float32)
# Current environment states as a float32 torch tensor.
states = torch.from_numpy(env.states).to(torch.float32)

In [None]:
# No bare `critic` is defined at notebook level; the network lives on the agent.
# Inputs are moved to DEVICE (as the critic test cell does) and the outputs are
# brought back to the CPU so the CPU-side atom arithmetic below can use them.
critic_states = states.to(DEVICE)
critic_actions = torch.from_numpy(actions).to(DEVICE)
probs, log_probs = d4pg_agent.critic(critic_states, critic_actions)
probs, log_probs = probs.cpu(), log_probs.cpu()

In [None]:
# Check the shapes of the two critic outputs before doing arithmetic on them.
print(probs.shape)
print(log_probs.shape)

In [None]:
class Container:
    """Empty namespace object; attributes can be attached to an instance as needed."""


# Notebook-level instance used as a scratch namespace.
c = Container()
    

In [None]:
# Support of the categorical value distribution: natoms evenly spaced values in [vmin, vmax].
# Note this overrides the vmin/vmax set in an earlier cell.
vmin = -10
vmax = 10
natoms = 51
gamma = .99
atoms = torch.linspace(vmin, vmax, natoms)
# Spacing between neighbouring atoms.
delta_z = (vmax - vmin) / (natoms -1)
# Rewards from the most recent env.step, with a trailing axis added so they broadcast against the atoms.
# NOTE(review): relies on `rewards` left in the kernel by an earlier cell - confirm it is the per-agent array.
r = torch.tensor(rewards).unsqueeze(-1)

# Drop the autograd graph; only the values are needed from here on.
probs = probs.detach()
# NOTE(review): .sum() reduces over every element, giving a single scalar; a per-row
# expectation would be .sum(-1) - confirm intent. `q_next` is not used in the cells below.
q_next = (probs * atoms).sum()


### projected atoms
<i>
<b>atoms.view(1,-1)</b> has shape [1, num_atoms]
<br>
<b>r</b> is unsqueezed in the last (-1) dimension, so its shape is [20, 1]
<br>
Broadcasting the two gives a tensor of shape [20, num_atoms] that holds an offset (projected) copy of the atoms for each reward instance.
</i>

In [None]:
# suppress=False lets NumPy fall back to scientific notation for very small values when printing arrays.
np.set_printoptions(suppress=False)

#### tz = projected atoms: the atom values scaled by gamma and offset by the reward, following the Bellman equation


In [None]:
# Bellman-projected atoms: each reward (one per row) plus the discounted support (one atom per column).
tz = r + gamma * atoms.view(1,-1)
# Clamp in place so every projected value stays inside [vmin, vmax].
tz.clamp_(vmin, vmax)

#### computes "bj" from the pseudo-code

In [None]:
# b_j: fractional position of each projected atom on the fixed support.
# Uses the notebook-level `delta_z`; the `c` container never has a `delta_z`
# attribute assigned in the visible cells, so `c.delta_z` raises AttributeError.
b = (tz - vmin) / delta_z
b[0].numpy()

#### l/u in the pseudocode are the LOWER and UPPER bounds on the supports

In [None]:
# Integer indices of the support atoms just below (l) and just above (u) each fractional position in b.
l = b.floor().long()
u = b.ceil().long()
# Show the first row of each for inspection.
print(l[0])
print(u[0])

#### m_l/m_u are computed in the pseudocode under "distribute the probability of tz"; how they should be used is still a bit opaque to me

In [None]:
# Share of each atom's probability assigned to the LOWER neighbour: (u - b) * p.
# The (l == u) term adds 1 when b falls exactly on an atom, so that atom keeps
# its full probability instead of being multiplied by zero.
# Uses the notebook-level `probs`; the `c` container never has a `probs`
# attribute assigned in the visible cells, so `c.probs` raises AttributeError.
ml = (u.float() + (l == u).float() - b) * probs
ml[0].numpy()

In [None]:
# Share of each atom's probability assigned to the UPPER neighbour: (b - l) * p.
# Uses the notebook-level `probs`; the `c` container never has a `probs`
# attribute assigned in the visible cells, so `c.probs` raises AttributeError.
mu = (b - l.float()) * probs
mu[0].numpy()

In [None]:
# Start from an all-zero float64 tensor with the same shape as `probs`.
target_prob = torch.zeros(probs.size(), dtype=torch.float64)
# Row by row, scatter-add the lower and upper probability shares onto the atoms they were assigned to.
for row in range(target_prob.size(0)):
    lower_idx = l[row].long()
    upper_idx = u[row].long()
    target_prob[row].index_add_(0, lower_idx, ml[row].double())
    target_prob[row].index_add_(0, upper_idx, mu[row].double())
target_prob[0].numpy()

### Close the environment when finished with the code/training/etc

In [None]:
# Close the environment once finished with it.
env.close()