In [1]:
import time
import numpy as np
import torch

from get_args import get_args
from agent import Agent
from utils import load_filepath, load_checkpoint, plot_scores, print_debug_info, print_status, save_checkpoint

In [2]:
from unityagents import UnityEnvironment

In [3]:
# Select the compute device once: the first CUDA GPU if torch can see one, otherwise the CPU.
cuda_available = torch.cuda.is_available()
device = torch.device("cuda:0" if cuda_available else "cpu")

In [4]:
# Obtain the project's option object, then pin the framework to "DDQN" for this session.
# The printed option dump in the next cell reflects this override.
args = get_args()
args.framework = "DDQN"

In [5]:
# Record the full run configuration in the notebook output, one "name: value" line per option.
for arg, value in vars(args).items():
    print("{}: {}".format(arg, value))

framework: DDQN
prioritized_replay: False
batchsize: 64
buffersize: 50000
continue: False
cpu: False
debug: False
dropout: 0.05
epsilon: 1.0
epsilon_decay: 0.999
epsilon_min: 0.075
gamma: 0.99
latest: False
learn_rate: 0.00025
momentum: 0.95
nographics: False
num_episodes: 1500
optimizer: Adam
print_count: 15
tau: 0.001
train: False
update_every: 4
verbose: False


In [6]:
# Launch the Banana Unity environment and read the sizes needed to build the agent.
# start_time and sep are not read by any cell visible in this notebook.
start_time = time.time()
sep = "#"*50
# The executable path is relative to the working directory and points at the Windows build.
# NOTE(review): no_graphics is hard-coded to True although args carries a `nographics`
# option — confirm which one is meant to win.
unity_env = UnityEnvironment(file_name="Banana_Windows_x86_64/Banana.exe", no_graphics=True)
# get the default brain (in this environment there is only one agent/brain)
brain_name = unity_env.brain_names[0]
brain = unity_env.brains[brain_name]
# Reset once with train_mode=True to obtain a first observation for sizing.
env = unity_env.reset(train_mode=True)[brain_name]
# nA: size of the action space (the startup log below reports 4).
nA = brain.vector_action_space_size
# nS: length of a single observation vector (the startup log below reports 37).
nS = len(env.vector_observations[0])

INFO:unityagents:
'Academy' started successfully!
Unity Academy name: Academy
        Number of Brains: 1
        Number of External Brains : 1
        Lesson number : 0
        Reset Parameters :
		
Unity brain name: BananaBrain
        Number of Visual Observations (per agent): 0
        Vector Observation space type: continuous
        Vector Observation space size (per agent): 37
        Number of stacked Vector Observation: 1
        Vector Action space type: discrete
        Vector Action space size (per agent): 4
        Vector Action descriptions: , , , 


In [7]:
# Build the agent from the observation length, action count, torch device and parsed options.
agent = Agent(nS, nA, device, args)

In [8]:
# Begin a new episode in the mode selected by args.train and use the first agent's
# initial observation as the starting state for the rollout.
brain_infos = unity_env.reset(train_mode=args.train)
env = brain_infos[brain_name]
state = env.vector_observations[0]


In [11]:
# Drive the agent through 65 environment steps and store each transition in the replay
# memory. 65 is presumably chosen to exceed the configured batchsize of 64 so that a
# full batch can be sampled afterwards — confirm against the memory's sampling rule.
for i in range(1,66):
    print("Working on step: {}".format(i))
    action = agent.act(state)
    env = unity_env.step(action)[brain_name]

    # collect info about the new state
    reward = env.rewards[0]
    next_state = env.vector_observations[0]
    done = env.local_done[0]

    # NOTE(review): if teststep already stores the transition, the explicit add below
    # records it a second time — confirm in Agent.
    agent.teststep(state, action, reward, next_state, done)
    agent.memory.add(state, action, reward, next_state, done)

    if done:
        # The episode has ended. The Unity API expects a reset before any further
        # step, so start a new episode and continue from its first observation.
        # This matters when the cell is re-run enough times to reach the episode limit.
        env = unity_env.reset(train_mode=args.train)[brain_name]
        state = env.vector_observations[0]
    else:
        state = next_state
    

Working on step: 1
Working on step: 2
Working on step: 3
Working on step: 4
Working on step: 5
Working on step: 6
Working on step: 7
Working on step: 8
Working on step: 9
Working on step: 10
Working on step: 11
Working on step: 12
Working on step: 13
Working on step: 14
Working on step: 15
Working on step: 16
Working on step: 17
Working on step: 18
Working on step: 19
Working on step: 20
Working on step: 21
Working on step: 22
Working on step: 23
Working on step: 24
Working on step: 25
Working on step: 26
Working on step: 27
Working on step: 28
Working on step: 29
Working on step: 30
Working on step: 31
Working on step: 32
Working on step: 33
Working on step: 34
Working on step: 35
Working on step: 36
Working on step: 37
Working on step: 38
Working on step: 39
Working on step: 40
Working on step: 41
Working on step: 42
Working on step: 43
Working on step: 44
Working on step: 45
Working on step: 46
Working on step: 47
Working on step: 48
Working on step: 49
Working on step: 50
Working o

In [13]:
# Draw one batch from the replay memory with per=False
# (presumably "prioritized experience replay" switched off — confirm in the memory class).
batch = agent.memory.sample(per=False)
# batch unpacks into five fields: states, actions, rewards, next_states, dones

In [17]:
# len(batch) counts the fields per transition (states, actions, rewards, next_states,
# dones), not the number of transitions that were sampled.
print(len(batch))

5


In [15]:
# First field of the batch: the sampled states stacked into a single tensor.
print(batch[0])

tensor([[ 0.0000e+00,  0.0000e+00,  1.0000e+00,  ...,  4.7285e-01,
          3.5226e-01,  6.5773e+00],
        [ 0.0000e+00,  1.0000e+00,  0.0000e+00,  ...,  8.5530e-01,
          2.9412e-01, -2.1828e+00],
        [ 1.0000e+00,  0.0000e+00,  0.0000e+00,  ...,  8.1906e-01,
         -1.0368e-01,  9.9903e+00],
        ...,
        [ 0.0000e+00,  1.0000e+00,  0.0000e+00,  ...,  1.6427e-01,
          2.5587e-01,  8.7246e+00],
        [ 0.0000e+00,  1.0000e+00,  0.0000e+00,  ...,  0.0000e+00,
         -2.5226e-01, -8.9614e-01],
        [ 0.0000e+00,  1.0000e+00,  0.0000e+00,  ...,  1.2440e-01,
         -7.9143e-08,  7.1260e-08]], device='cuda:0')


In [16]:
# Walk the batch one transition at a time and print each field under its label.
# The loop variables deliberately keep the names state/action/reward/next_state/done:
# they overwrite the rollout's variables of the same name and stay bound to the last
# transition after the loop, which a later cell prints.
templates = ("STATE:\n{}", "ACTION:\n{}", "REWARD:\n{}", "NEXT STATE:\n{}", "DONE:\n{}")
for state, action, reward, next_state, done in zip(*batch):
    fields = (state, action, reward, next_state, done)
    for template, field in zip(templates, fields):
        print(template.format(field))

STATE:
tensor([0.0000, 0.0000, 1.0000, 0.0000, 0.8735, 0.0000, 1.0000, 0.0000, 0.0000,
        0.4969, 0.0000, 1.0000, 0.0000, 0.0000, 0.1352, 1.0000, 0.0000, 0.0000,
        0.0000, 0.4770, 0.0000, 1.0000, 0.0000, 0.0000, 0.2347, 0.0000, 1.0000,
        0.0000, 0.0000, 0.5997, 0.0000, 1.0000, 0.0000, 0.0000, 0.4728, 0.3523,
        6.5773], device='cuda:0')
ACTION:
tensor([1], device='cuda:0')
REWARD:
tensor([0.], device='cuda:0')
NEXT STATE:
tensor([ 0.0000,  0.0000,  1.0000,  0.0000,  0.8738,  0.0000,  1.0000,  0.0000,
         0.0000,  0.4981,  0.0000,  1.0000,  0.0000,  0.0000,  0.1352,  1.0000,
         0.0000,  0.0000,  0.0000,  0.4810,  0.0000,  1.0000,  0.0000,  0.0000,
         0.2347,  0.0000,  1.0000,  0.0000,  0.0000,  0.6012,  0.0000,  1.0000,
         0.0000,  0.0000,  0.4741,  0.1426, -4.1467], device='cuda:0')
DONE:
tensor([0.], device='cuda:0')
STATE:
tensor([ 0.0000,  1.0000,  0.0000,  0.0000,  0.0934,  1.0000,  0.0000,  0.0000,
         0.0000,  0.3756,  0.0000,  1.

NEXT STATE:
tensor([ 0.0000e+00,  1.0000e+00,  0.0000e+00,  0.0000e+00,  9.6702e-02,
         0.0000e+00,  1.0000e+00,  0.0000e+00,  0.0000e+00,  1.1489e-01,
         0.0000e+00,  0.0000e+00,  1.0000e+00,  0.0000e+00,  8.2266e-01,
         0.0000e+00,  1.0000e+00,  0.0000e+00,  0.0000e+00,  8.5500e-02,
         0.0000e+00,  0.0000e+00,  0.0000e+00,  1.0000e+00,  0.0000e+00,
         0.0000e+00,  1.0000e+00,  0.0000e+00,  0.0000e+00,  9.2088e-02,
         0.0000e+00,  1.0000e+00,  0.0000e+00,  0.0000e+00,  1.8187e-01,
         4.7684e-07, -7.8722e+00], device='cuda:0')
DONE:
tensor([0.], device='cuda:0')
STATE:
tensor([ 0.0000,  1.0000,  0.0000,  0.0000,  0.4528,  0.0000,  1.0000,  0.0000,
         0.0000,  0.2024,  0.0000,  1.0000,  0.0000,  0.0000,  0.3126,  0.0000,
         1.0000,  0.0000,  0.0000,  0.4239,  0.0000,  1.0000,  0.0000,  0.0000,
         0.2160,  0.0000,  1.0000,  0.0000,  0.0000,  0.2442,  0.0000,  1.0000,
         0.0000,  0.0000,  0.1926, -1.1065, -6.0340], device='

ACTION:
tensor([1], device='cuda:0')
REWARD:
tensor([0.], device='cuda:0')
NEXT STATE:
tensor([ 0.0000,  1.0000,  0.0000,  0.0000,  0.0513,  0.0000,  1.0000,  0.0000,
         0.0000,  0.1659,  1.0000,  0.0000,  0.0000,  0.0000,  0.3608,  0.0000,
         1.0000,  0.0000,  0.0000,  0.0575,  0.0000,  0.0000,  1.0000,  0.0000,
         0.8196,  0.0000,  1.0000,  0.0000,  0.0000,  0.0833,  1.0000,  0.0000,
         0.0000,  0.0000,  0.5612,  1.4145, -5.5582], device='cuda:0')
DONE:
tensor([0.], device='cuda:0')
STATE:
tensor([ 0.0000,  1.0000,  0.0000,  0.0000,  0.4473,  0.0000,  1.0000,  0.0000,
         0.0000,  0.1730,  0.0000,  1.0000,  0.0000,  0.0000,  0.2672,  0.0000,
         1.0000,  0.0000,  0.0000,  0.3624,  0.0000,  1.0000,  0.0000,  0.0000,
         0.1847,  0.0000,  1.0000,  0.0000,  0.0000,  0.2088,  0.0000,  1.0000,
         0.0000,  0.0000,  0.1646, -0.0275, 10.2380], device='cuda:0')
ACTION:
tensor([2], device='cuda:0')
REWARD:
tensor([0.], device='cuda:0')
NEXT STATE:
t

DONE:
tensor([0.], device='cuda:0')
STATE:
tensor([0.0000, 1.0000, 0.0000, 0.0000, 0.0946, 0.0000, 1.0000, 0.0000, 0.0000,
        0.3061, 1.0000, 0.0000, 0.0000, 0.0000, 0.9551, 0.0000, 1.0000, 0.0000,
        0.0000, 0.1062, 0.0000, 0.0000, 0.0000, 1.0000, 0.0000, 0.0000, 1.0000,
        0.0000, 0.0000, 0.1536, 1.0000, 0.0000, 0.0000, 0.0000, 0.5153, 1.7979,
        3.1873], device='cuda:0')
ACTION:
tensor([0], device='cuda:0')
REWARD:
tensor([0.], device='cuda:0')
NEXT STATE:
tensor([0.0000, 1.0000, 0.0000, 0.0000, 0.0883, 0.0000, 1.0000, 0.0000, 0.0000,
        0.2857, 1.0000, 0.0000, 0.0000, 0.0000, 0.3208, 0.0000, 1.0000, 0.0000,
        0.0000, 0.0991, 0.0000, 0.0000, 0.0000, 1.0000, 0.0000, 0.0000, 1.0000,
        0.0000, 0.0000, 0.1434, 1.0000, 0.0000, 0.0000, 0.0000, 0.5085, 0.7493,
        7.9681], device='cuda:0')
DONE:
tensor([0.], device='cuda:0')
STATE:
tensor([ 1.0000,  0.0000,  0.0000,  0.0000,  0.3359,  0.0000,  0.0000,  0.0000,
         1.0000,  0.0000,  0.0000,  1.0

In [18]:
# Print the five transition fields currently bound at notebook level. After the
# per-transition loop over `batch` has run, these are the values from its final
# iteration, i.e. the last transition in the batch.
for template, field in (
    ("STATE:\n{}", state),
    ("ACTION:\n{}", action),
    ("REWARD:\n{}", reward),
    ("NEXT STATE:\n{}", next_state),
    ("DONE:\n{}", done),
):
    print(template.format(field))

STATE:
tensor([ 0.0000e+00,  1.0000e+00,  0.0000e+00,  0.0000e+00,  6.6145e-02,
         0.0000e+00,  1.0000e+00,  0.0000e+00,  0.0000e+00,  7.8588e-02,
         1.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  3.5373e-01,
         0.0000e+00,  1.0000e+00,  0.0000e+00,  0.0000e+00,  5.8483e-02,
         0.0000e+00,  0.0000e+00,  0.0000e+00,  1.0000e+00,  0.0000e+00,
         0.0000e+00,  1.0000e+00,  0.0000e+00,  0.0000e+00,  6.2989e-02,
         0.0000e+00,  1.0000e+00,  0.0000e+00,  0.0000e+00,  1.2440e-01,
        -7.9143e-08,  7.1260e-08], device='cuda:0')
ACTION:
tensor([1], device='cuda:0')
REWARD:
tensor([0.], device='cuda:0')
NEXT STATE:
tensor([ 0.0000e+00,  1.0000e+00,  0.0000e+00,  0.0000e+00,  7.4028e-02,
         0.0000e+00,  1.0000e+00,  0.0000e+00,  0.0000e+00,  8.7955e-02,
         1.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  3.6028e-01,
         0.0000e+00,  1.0000e+00,  0.0000e+00,  0.0000e+00,  6.5453e-02,
         0.0000e+00,  0.0000e+00,  0.0000e+00,  1.0

In [19]:
# Split the batch into its five field tensors for the Q-value experiments below.
states, actions, rewards, next_states, dones = batch

In [32]:
# For every row, pick the output of agent.q at the index stored in `actions`, then check
# the resulting shape (one column per row).
# NOTE(review): `actions` were chosen in `states`, yet here they index the outputs for
# `next_states`. That is harmless for a shape check, but confirm the pairing before
# reusing this expression in a loss.
t = agent.q(next_states).gather(1, actions)
t.shape

torch.Size([64, 1])

In [24]:
# Row-wise maximum of agent.q over the next states, detached from the autograd graph.
# max(1) yields (values, indices); only the values are kept and displayed.
best_values, _best_indices = agent.q(next_states).detach().max(1)
t = best_values
t

tensor([ 0.1500,  0.0170, -0.0093,  0.1387,  0.1699,  0.1238,  0.1607,  0.1311,
         0.1311,  0.2318,  0.1880,  0.1776,  0.1825,  0.1310,  0.0787,  0.1519,
         0.1044,  0.0434,  0.1018,  0.1959,  0.1178,  0.1177,  0.1300,  0.1277,
         0.1350,  0.1071,  0.1816,  0.1508,  0.0825,  0.1688,  0.1090,  0.1555,
         0.0932,  0.1683,  0.1060,  0.1321,  0.0303,  0.1063,  0.1073,  0.1411,
         0.2055,  0.0999,  0.1323,  0.1362,  0.1034,  0.1242,  0.2204,  0.0988,
         0.0945,  0.1041,  0.1654,  0.0942,  0.1252,  0.0704,  0.1160,  0.1578,
         0.0819,  0.0568,  0.0478,  0.1043,  0.1385,  0.1438,  0.1589,  0.1844],
       device='cuda:0')

In [None]:
# Evaluate both of the agent's networks on the same next states.
# presumably `q` is the online network and `qhat` the target network — confirm in Agent
v = agent.q(next_states)
v1 = agent.qhat(next_states)

In [None]:
# Row-wise maximum of v, detached from the graph, with a trailing dimension added back
# so the result is a column.
maxv = v.detach().max(1)[0].unsqueeze(1)

In [None]:
# Display the raw output of agent.q for the sampled next states.
v

In [None]:
# Index of the largest entry in each row of v, kept as a column so it can be passed to
# gather along dimension 1.
argv = v.detach().argmax(1).unsqueeze(1)
argv

In [None]:
# Read v1 at the per-row indices that maximise v.
# If `q` is the online network and `qhat` the target network, this is the Double DQN
# next-state value (select with one network, evaluate with the other) — TODO confirm roles.
v1.gather(1,argv)

In [None]:
# Close the Unity environment once the experiments are finished.
unity_env.close()