In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from ddq_network import QNetwork

In [2]:
# Build a Q-network for an 8-dimensional state and 4 discrete actions,
# passing a fixed seed argument.
network = QNetwork(
    state_size=8,
    action_size=4,
    seed=0,
)
# Bare trailing expression: the notebook renders the module's repr.
network

QNetwork(
  (hidden_layers): ModuleList(
    (0): Linear(in_features=8, out_features=256, bias=True)
    (1): Linear(in_features=256, out_features=128, bias=True)
  )
  (output): Linear(in_features=128, out_features=4, bias=True)
)

In [3]:
# Single hand-written state vector. torch.tensor with an explicit dtype
# replaces the legacy torch.Tensor(data) constructor (same float32 values).
sample1 = torch.tensor([1, 2, 3, 4, 5, 6, 7, 8], dtype=torch.float32)

# Add a leading batch dimension -> shape (1, 8), and call the module itself
# rather than .forward() so that nn.Module.__call__ (and any registered
# hooks) is honoured.
logits = network(sample1.view(1, 8))

# Element-wise exponential of the four network outputs, displayed as the
# cell result.
torch.exp(logits)

tensor([[0.9353, 1.1721, 1.0826, 0.9122]], grad_fn=<ExpBackward>)

In [4]:
# Minimal DQN fixture: a "local" network that the optimizer updates and a
# separate target network that is only evaluated. Both are constructed with
# the same seed argument.
network_local = QNetwork(state_size=8, action_size=4, seed=0)
network_target = QNetwork(state_size=8, action_size=4, seed=0)
gamma = 0.995  # discount factor applied to the bootstrapped next-state value
optimizer = optim.Adam(network_local.parameters(), lr=5e-4)  # optimises the local network only

# Hand-written batch of two transitions. torch.tensor with an explicit dtype
# replaces the legacy FloatTensor / LongTensor constructors (same values and
# dtypes: float32 and int64).
states = torch.tensor(
    [[1, 2, 3, 4, 5, 6, 7, 8], [11, 12, 13, 14, 15, 16, 17, 18]],
    dtype=torch.float32,
)
actions = torch.tensor([[0], [1]], dtype=torch.int64)  # one integer action index per row
rewards = torch.tensor([[0.1], [0.2]], dtype=torch.float32)
next_states = torch.tensor(
    [[1, 2, 3, 4, 5, 6, 7, 8], [11, 12, 13, 14, 15, 16, 17, 18]],
    dtype=torch.float32,
)
dones = torch.tensor([[0.0], [1.0]], dtype=torch.float32)  # second transition is flagged done

# Inspect the target network's output for the next states, together with the
# per-row maximum (values and indices) and the per-row argmax.
next_Q_values = network_target(next_states)
print(f"next_Q_values: {next_Q_values}")
print(f"next_Q_values max: {next_Q_values.max(dim=1, keepdim=True)}")
print(f"next_Q_values argmax: {next_Q_values.argmax(dim=1, keepdim=True)}")

next_Q_values: tensor([[-0.0669,  0.1588,  0.0793, -0.0919],
        [-0.3797,  1.3628, -0.1498, -0.9924]], grad_fn=<ThAddmmBackward>)
next_Q_values max: (tensor([[0.1588],
        [1.3628]], grad_fn=<MaxBackward0>), tensor([[1],
        [1]]))
next_Q_values argmax: tensor([[1],
        [1]])


In [5]:
# Plain-Python sanity check: largest of four hand-typed floats.
max(-1.8587, -1.1321, -0.9678, -1.9530)

-0.9678

In [6]:
# Evaluate the target network once and reuse the result for the diagnostic
# prints and for the TD target (previously the forward pass was repeated on
# every line).
target_Q_all = network_target(next_states)
print(f"network_target(next_states): {target_Q_all}")
print(f"network_target(next_states): {target_Q_all.max(dim=1)}")
print(f"network_target(next_states): {target_Q_all.argmax(dim=1)}")

# Per-row maximum over the action dimension, reshaped to a column to line up
# with rewards/dones. detach() makes the bootstrap value a constant so that a
# later loss.backward() does not propagate into network_target's parameters.
next_Q_values = target_Q_all.max(dim=1)[0].unsqueeze(1).detach()

# TD target: reward plus discounted next-state value; the (1.0 - dones) factor
# removes the bootstrap term for rows where dones is 1.0.
targets = rewards + gamma * next_Q_values * (1.0 - dones)

# Local network's estimate for the action stored in each row. The module is
# called directly (not .forward) so nn.Module.__call__ hooks are honoured.
Q_values = network_local(states).gather(dim=1, index=actions)

network_target(next_states): tensor([[-0.0669,  0.1588,  0.0793, -0.0919],
        [-0.3797,  1.3628, -0.1498, -0.9924]], grad_fn=<ThAddmmBackward>)
network_target(next_states): (tensor([0.1588, 1.3628], grad_fn=<MaxBackward0>), tensor([1, 1]))
network_target(next_states): tensor([1, 1])


In [7]:
# One optimisation step for the local network: clear previously accumulated
# gradients, compute the mean-squared error between the predicted Q-values
# and the targets, backpropagate, then apply the optimizer update.
optimizer.zero_grad()
loss = F.mse_loss(Q_values, targets)
loss.backward()
optimizer.step()