<br><br><b><font size=10> CONTINUOUS CONTROL</font></b>
#### <i>...implementation for Udacity Deep Reinforcement Learning 
<hr><hr><hr>

## Initialize Imports for the notebook
This Notebook uses code from separate python files where most of the implementation is handled

In [1]:
import environment as E
from buffers import ReplayBuffer, nStepBuffer
from agent import D4PG_Agent

#from get_args import get_args

import os.path
import time
import re
import sys
import importlib
import copy

import torch
import matplotlib.pyplot as plt
import numpy as np
from unityagents import UnityEnvironment
from collections import deque
import torchvision.transforms as T
import multiprocessing as multi
multi.cpu_count()

## Manually declare an ARGS class
<i> For testing code in the notebook, to take the place of argparser in the command line.

In [3]:
class Args:
    def __init__(self):
        self.train = True
        self.nographics = False
        self.num_eps = 10
        self.rollout = 5
        self.batchsize = 64
        self.pretrain = 1000
        
args = Args()

<b>Confirm that the args are all set the way we want them.

In [4]:
for arg in vars(args):
    if arg == "sep": continue
    print("{}: {}".format(arg.upper(), getattr(args, arg)))

TRAIN: True
NOGRAPHICS: False
NUM_EPS: 10
ROLLOUT: 5
BATCHSIZE: 64
PRETRAIN: 1000


## Load the environment
<i> & print a bit of information contained in the wrapper class

In [5]:
env = E.Environment(args)
print("State size:", env.state_size)
print("Action size:", env.action_size)
print("Num Agents:", env.agent_count)

INFO:unityagents:
'Academy' started successfully!
Unity Academy name: Academy
        Number of Brains: 1
        Number of External Brains : 1
        Lesson number : 0
        Reset Parameters :
		goal_speed -> 1.0
		goal_size -> 5.0
Unity brain name: ReacherBrain
        Number of Visual Observations (per agent): 0
        Vector Observation space type: continuous
        Vector Observation space size (per agent): 33
        Number of stacked Vector Observation: 1
        Vector Action space type: continuous
        Vector Action space size (per agent): 4
        Vector Action descriptions: , , , 


State size: 33
Action size: 4
Num Agents: 20


<hr>

# Test code as it's developed

<hr>

## Take random actions in the environment below 
<i>
-to check that code is working<br>
-to get familiar with the environment

In [6]:
env.train = False
env.reset()

In [8]:
scores = np.zeros(env.agent_count)
states = env.states
for i in range(20):
    actions = np.random.randn(env.agent_count, env.action_size)
    actions = np.clip(actions, -1, 1)
    rewards, next_states, dones = env.step(actions)
    scores += rewards
    states = next_states
    if np.any(dones):
        break
    i += 1
print('Total score (averaged over agents) this episode: {}'.format(np.mean(scores)))    

Total score (averaged over agents) this episode: 0.0


### Force-reload modules as they're updated
<i> This notebook was developed as the code is written in Atom, the below cell reloads the modules as they're needed.

In [None]:
import agent
importlib.reload(agent)
importlib.reload(E)
from agent import D4PG_Agent


In [9]:
d4pg_agent = D4PG_Agent(env.state_size, env.action_size, env.agent_count)
print(d4pg_agent.__class__.__name__)
print(d4pg_agent.memory)
# agent.initialize_memory(10, env)

D4PG_Agent
<buffers.ReplayBuffer object at 0x0000019521037550>


In [None]:
env.reset()
states = env.states

### Test out Actor actions without training
<i> Test the <b>Actor</b> network

In [12]:
env.reset()
scores = np.zeros(env.agent_count)
states = torch.from_numpy(env.states).float()
for i in range(30):
    #actions = agent.actor(torch.from_numpy(states).float()).detach().numpy()
    #actions += agent.gauss_noise(actions.shape)
    #actions = np.clip(actions, -1, 1)
    actions = d4pg_agent.act(states)
    print("ACTIONS:", actions[1])
    rewards, next_states, dones = env.step(actions)
    scores += rewards
    states = next_states
    if np.any(dones):
        break
    i += 1
print('Total score (averaged over agents) this episode: {}'.format(np.mean(scores)))    

ACTIONS: [-0.05064295  0.52460176 -0.11020826 -0.11866675]
ACTIONS: [-0.85577244 -0.50838435  0.47384122 -0.40554383]
ACTIONS: [-0.14419676  0.11097305 -0.38743973  0.21927273]
ACTIONS: [-0.29291332  0.04862425 -0.12056067 -0.251571  ]
ACTIONS: [ 0.36597195  0.03275591 -0.21064252  0.4973052 ]
ACTIONS: [ 0.12144467 -0.4172025   0.00674612 -0.20973124]
ACTIONS: [-0.4237652  -0.495635   -0.2719553   0.48972487]
ACTIONS: [-0.38684046  0.29628918  0.10385685 -0.12236919]
ACTIONS: [ 0.15706265 -0.2275485  -0.07639529 -0.1208619 ]
ACTIONS: [ 0.04466942  0.07305933  0.42201594 -0.48331177]
ACTIONS: [-0.55127746 -0.4606485  -0.01193872  0.0854649 ]
ACTIONS: [ 0.48883307 -0.01707737  0.562292    0.5201416 ]
ACTIONS: [-0.22596851  0.20080428  0.36317518 -0.59994304]
ACTIONS: [ 0.26393005 -0.5978283   0.2588284   0.23891526]
ACTIONS: [-0.5864362   0.11508519  0.05778166  0.29325074]
ACTIONS: [ 0.07371671  0.41658568  0.20192315 -0.43073243]
ACTIONS: [0.31083867 0.12927142 0.16332026 0.3216419 ]
A

### Test out Critic scores without training
<i> Test the <b>Critic</b> network

In [16]:
env.reset()
scores = np.zeros(env.agent_count)
states = torch.from_numpy(env.states).float()
for i in range(100):
    actions = d4pg_agent.act(states) 
    rewards, next_states, dones = env.step(actions)
    scores += rewards
    q, probs, log_probs = d4pg_agent.critic(next_states.float(), torch.from_numpy(actions))
    print(q.shape, q)
    #print(values.sample())
    states = next_states
    if np.any(dones):
        break
    i += 1
print('Total score (averaged over agents) this episode: {}'.format(np.mean(scores)))    

torch.Size([20]) tensor([-4.4316e-05, -4.4147e-05, -4.3584e-05, -4.4554e-05, -4.3107e-05,
        -4.2999e-05, -4.3955e-05, -4.4702e-05, -4.3930e-05, -4.3269e-05,
        -4.2750e-05, -4.2502e-05, -4.3577e-05, -4.4249e-05, -4.3303e-05,
        -4.3212e-05, -4.2714e-05, -4.3852e-05, -4.3575e-05, -4.3618e-05],
       grad_fn=<SumBackward2>)
torch.Size([20]) tensor([-4.5259e-05, -4.2509e-05, -4.3515e-05, -4.3619e-05, -4.3139e-05,
        -4.3213e-05, -4.3444e-05, -4.4258e-05, -4.2930e-05, -4.4217e-05,
        -4.3834e-05, -4.3374e-05, -4.3372e-05, -4.2725e-05, -4.3608e-05,
        -4.3234e-05, -4.3694e-05, -4.3701e-05, -4.3176e-05, -4.3269e-05],
       grad_fn=<SumBackward2>)
torch.Size([20]) tensor([-4.3610e-05, -4.2645e-05, -4.5193e-05, -4.4212e-05, -4.3117e-05,
        -4.5728e-05, -4.4335e-05, -4.3320e-05, -4.3375e-05, -4.3942e-05,
        -4.3657e-05, -4.5020e-05, -4.3610e-05, -4.3198e-05, -4.3282e-05,
        -4.3422e-05, -4.2764e-05, -4.3379e-05, -4.3364e-05, -4.3154e-05],
       g

torch.Size([20]) tensor([-4.2360e-05, -4.4480e-05, -4.4150e-05, -4.3530e-05, -4.3944e-05,
        -4.3411e-05, -4.4657e-05, -4.3413e-05, -4.3508e-05, -4.3128e-05,
        -4.3087e-05, -4.3588e-05, -4.3897e-05, -4.3357e-05, -4.4469e-05,
        -4.3469e-05, -4.4249e-05, -4.3981e-05, -4.3059e-05, -4.4083e-05],
       grad_fn=<SumBackward2>)
torch.Size([20]) tensor([-4.3008e-05, -4.4031e-05, -4.4476e-05, -4.3111e-05, -4.3539e-05,
        -4.3521e-05, -4.3625e-05, -4.3072e-05, -4.4150e-05, -4.3403e-05,
        -4.4778e-05, -4.4208e-05, -4.2861e-05, -4.2915e-05, -4.4094e-05,
        -4.3374e-05, -4.3798e-05, -4.4670e-05, -4.3599e-05, -4.4433e-05],
       grad_fn=<SumBackward2>)
torch.Size([20]) tensor([-4.2913e-05, -4.3472e-05, -4.3962e-05, -4.3452e-05, -4.4053e-05,
        -4.3025e-05, -4.3932e-05, -4.3314e-05, -4.4432e-05, -4.6795e-05,
        -4.2671e-05, -4.4189e-05, -4.3591e-05, -4.3172e-05, -4.3729e-05,
        -4.3519e-05, -4.3370e-05, -4.3422e-05, -4.4368e-05, -4.4005e-05],
       g

torch.Size([20]) tensor([-4.3642e-05, -4.4227e-05, -4.3411e-05, -4.4785e-05, -4.3212e-05,
        -4.3495e-05, -4.3346e-05, -4.2664e-05, -4.6114e-05, -4.2606e-05,
        -4.2608e-05, -4.4106e-05, -4.4506e-05, -4.3698e-05, -4.3273e-05,
        -4.3899e-05, -4.4435e-05, -4.3809e-05, -4.2502e-05, -4.2668e-05],
       grad_fn=<SumBackward2>)
torch.Size([20]) tensor([-4.3485e-05, -4.3500e-05, -4.3459e-05, -4.3247e-05, -4.3625e-05,
        -4.3301e-05, -4.4100e-05, -4.2720e-05, -4.4338e-05, -4.4018e-05,
        -4.2906e-05, -4.3647e-05, -4.3670e-05, -4.3631e-05, -4.4778e-05,
        -4.2435e-05, -4.2865e-05, -4.4404e-05, -4.1816e-05, -4.2964e-05],
       grad_fn=<SumBackward2>)
torch.Size([20]) tensor([-4.3996e-05, -4.4158e-05, -4.3772e-05, -4.3795e-05, -4.3172e-05,
        -4.2457e-05, -4.3323e-05, -4.2470e-05, -4.5117e-05, -4.3882e-05,
        -4.3320e-05, -4.4907e-05, -4.3675e-05, -4.3146e-05, -4.2981e-05,
        -4.3370e-05, -4.3491e-05, -4.4292e-05, -4.3526e-05, -4.3146e-05],
       g

torch.Size([20]) tensor([-4.3536e-05, -4.3470e-05, -4.6043e-05, -4.4081e-05, -4.2854e-05,
        -4.2699e-05, -4.3653e-05, -4.3932e-05, -4.2971e-05, -4.2515e-05,
        -4.2297e-05, -4.3975e-05, -4.2558e-05, -4.3740e-05, -4.2258e-05,
        -4.5434e-05, -4.3113e-05, -4.4961e-05, -4.4970e-05, -4.2886e-05],
       grad_fn=<SumBackward2>)
torch.Size([20]) tensor([-4.3701e-05, -4.3277e-05, -4.2863e-05, -4.4513e-05, -4.3217e-05,
        -4.4122e-05, -4.4171e-05, -4.3089e-05, -4.3552e-05, -4.2172e-05,
        -4.2625e-05, -4.4936e-05, -4.3120e-05, -4.4830e-05, -4.5720e-05,
        -4.3554e-05, -4.4283e-05, -4.3672e-05, -4.4163e-05, -4.2897e-05],
       grad_fn=<SumBackward2>)
torch.Size([20]) tensor([-4.4947e-05, -4.3904e-05, -4.3508e-05, -4.3809e-05, -4.2822e-05,
        -4.2468e-05, -4.3150e-05, -4.2245e-05, -4.2455e-05, -4.3873e-05,
        -4.1667e-05, -4.3685e-05, -4.2893e-05, -4.3197e-05, -4.2310e-05,
        -4.3886e-05, -4.3234e-05, -4.3478e-05, -4.4921e-05, -4.3273e-05],
       g

torch.Size([20]) tensor([-4.4063e-05, -4.4040e-05, -4.3655e-05, -4.3273e-05, -4.3791e-05,
        -4.2276e-05, -4.5286e-05, -4.4020e-05, -4.3308e-05, -4.4759e-05,
        -4.7050e-05, -4.4476e-05, -4.2830e-05, -4.3018e-05, -4.4020e-05,
        -4.3899e-05, -4.3575e-05, -4.3672e-05, -4.2960e-05, -4.5648e-05],
       grad_fn=<SumBackward2>)
torch.Size([20]) tensor([-4.3979e-05, -4.3053e-05, -4.4325e-05, -4.3754e-05, -4.4035e-05,
        -4.2053e-05, -4.5033e-05, -4.3271e-05, -4.3331e-05, -4.4202e-05,
        -4.4804e-05, -4.3808e-05, -4.3284e-05, -4.3467e-05, -4.4230e-05,
        -4.2828e-05, -4.5793e-05, -4.4726e-05, -4.2623e-05, -4.4493e-05],
       grad_fn=<SumBackward2>)
torch.Size([20]) tensor([-4.4085e-05, -4.4003e-05, -4.4504e-05, -4.5337e-05, -4.4303e-05,
        -4.2597e-05, -4.4484e-05, -4.3388e-05, -4.3720e-05, -4.4268e-05,
        -4.4180e-05, -4.4303e-05, -4.2811e-05, -4.3519e-05, -4.3275e-05,
        -4.3636e-05, -4.3234e-05, -4.3461e-05, -4.2977e-05, -4.5178e-05],
       g

In [None]:
actions = np.random.randn(env.agent_count, env.action_size)
actions = np.clip(actions, -1, 1).astype(np.float32)
states = torch.from_numpy(env.states).float()

In [None]:
q, probs, log_probs = critic(states, torch.from_numpy(actions))

In [None]:
print(probs.shape)
print(log_probs.shape)
print(q)

In [23]:
states

tensor([[-3.1357e-02, -3.9640e+00,  5.5490e-01,  9.9760e-01, -3.8834e-03,
          4.1969e-04,  6.9142e-02, -9.8196e-01,  1.7830e-01,  1.5022e+00,
          6.0810e+00,  2.3895e-01,  3.9467e+00, -1.8211e+00, -8.6312e+00,
          2.7140e+00,  8.7632e-01, -2.8530e-01,  1.1828e-01,  3.6970e-01,
          6.0944e-01, -2.3743e+00, -8.5185e-01,  1.9549e+00,  8.3565e-01,
         -4.7310e-01, -7.2433e+00, -1.0000e+00,  3.3964e+00,  0.0000e+00,
          1.0000e+00,  0.0000e+00,  2.1031e-01],
        [-1.2114e+00, -3.6242e+00, -1.1918e+00,  9.7599e-01, -1.5054e-01,
         -2.4045e-02, -1.5556e-01,  1.0558e+00, -1.3634e-01,  4.4779e-01,
          1.7893e+00,  6.1650e-01, -4.0310e+00, -2.5506e+00, -9.2432e+00,
         -1.4284e+00,  9.8884e-01, -8.0499e-02, -6.9630e-02,  1.0422e-01,
         -1.9719e+00,  5.6658e-01, -4.4657e-01,  1.6339e+00,  2.9171e+00,
          1.8775e+00, -7.9724e+00, -1.0000e+00, -6.6438e-01,  0.0000e+00,
          1.0000e+00,  0.0000e+00, -7.4941e-01],
        [-3.28

In [None]:
env.close()