In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [2]:
class QNetwork(nn.Module):
    """Fully connected network that maps a state to one log-softmax score per action.

    The model is a stack of ``Linear -> ELU -> Dropout`` hidden layers followed
    by a linear output layer whose result is passed through ``log_softmax``.
    """

    def __init__(self, state_size, action_size, seed,
                 hidden_sizes=(512, 256, 128), drop_p=0.2):
        """Initialize parameters and build model.

        Params
        ======
            state_size (int): Dimension of each state
            action_size (int): Dimension of each action
            seed (int): Random seed
            hidden_sizes (sequence of int): Width of every hidden layer, in
                order. Must contain at least one entry.
            drop_p (float): Dropout probability applied after each hidden layer

        Raises
        ======
            ValueError: if ``hidden_sizes`` is empty
        """
        super().__init__()
        hidden_sizes = list(hidden_sizes)
        if not hidden_sizes:
            raise ValueError("hidden_sizes must contain at least one layer width")

        # Seed the global generator before any layer is created so that two
        # networks built with the same seed start from identical weights.
        self.seed = torch.manual_seed(seed)

        # Input width of hidden layer i is the output width of layer i - 1
        # (or the state dimension for the very first layer).
        input_widths = [state_size] + hidden_sizes[:-1]
        self.hidden_layers = nn.ModuleList(
            [nn.Linear(n_in, n_out) for n_in, n_out in zip(input_widths, hidden_sizes)]
        )

        self.output = nn.Linear(hidden_sizes[-1], action_size)
        self.dropout = nn.Dropout(p=drop_p)

    def forward(self, state):
        """Map a batch of states to per-action log-softmax scores.

        ``state`` must have the action/feature axis at dim 1 (i.e. be batched),
        because the final ``log_softmax`` normalizes over ``dim=1``.
        """
        x = state
        for linear in self.hidden_layers:
            x = self.dropout(F.elu(linear(x)))
        # NOTE(review): log_softmax forces every output to be <= 0 with the
        # exponentials of each row summing to 1. If this network is meant to
        # regress unbounded Q-values, returning the raw `self.output(x)` is
        # probably what is wanted -- confirm before changing, since callers
        # may rely on receiving log-probabilities.
        return F.log_softmax(self.output(x), dim=1)

In [3]:
# Build a network for 8-dimensional states and 4 actions, then let the
# notebook render its layer layout as the cell output.
network = QNetwork(8, 4, seed=0)
network

QNetwork(
  (hidden_layers): ModuleList(
    (0): Linear(in_features=8, out_features=512, bias=True)
    (1): Linear(in_features=512, out_features=256, bias=True)
    (2): Linear(in_features=256, out_features=128, bias=True)
  )
  (output): Linear(in_features=128, out_features=4, bias=True)
  (dropout): Dropout(p=0.2)
)

In [4]:
# A single hand-written 8-feature state, created as float32 explicitly
# instead of through the legacy `torch.Tensor(...)` constructor.
sample1 = torch.tensor([1, 2, 3, 4, 5, 6, 7, 8], dtype=torch.float32)
# Call the module itself rather than `.forward()` so that any registered
# hooks run; `view(1, 8)` adds the batch dimension the network expects.
logits = network(sample1.view(1, 8))
# The network ends in log_softmax, so exponentiating yields per-action
# probabilities that sum to 1 (despite the variable being named `logits`).
torch.exp(logits)

tensor([[0.2052, 0.2963, 0.3308, 0.1678]], grad_fn=<ExpBackward>)

In [14]:
# Two networks built with the same seed, so they start from identical weights.
# Only `network_local`'s parameters are handed to the optimizer.
network_local = QNetwork(state_size=8, action_size=4, seed=0)
network_target = QNetwork(state_size=8, action_size=4, seed=0)
gamma = 0.995  # discount factor applied to the next-state value in the TD target
optimizer = optim.Adam(network_local.parameters(), lr=5e-4)

# Hand-made batch of two transitions; every tensor is (batch=2, features).
states = torch.tensor(
    [[1, 2, 3, 4, 5, 6, 7, 8], [11, 12, 13, 14, 15, 16, 17, 18]],
    dtype=torch.float32,
)
actions = torch.tensor([[0], [1]], dtype=torch.long)
rewards = torch.tensor([[0.1], [0.2]], dtype=torch.float32)
# The toy next states carry the same values as `states`; clone so the two
# tensors do not share storage.
next_states = states.clone()
dones = torch.tensor([[0.0], [1.0]], dtype=torch.float32)

# One forward pass through the target network, inspected three ways:
# the full output, the per-row (max value, index) pair, and the argmax alone.
next_Q_values = network_target(next_states)
print(f"next_Q_values: {next_Q_values}")
print(f"next_Q_values max: {next_Q_values.max(dim=1, keepdim=True)}")
print(f"next_Q_values argmax: {next_Q_values.argmax(dim=1, keepdim=True)}")

next_Q_values: tensor([[-1.5840, -1.2164, -1.1064, -1.7849],
        [-2.1907, -0.6123, -1.2782, -2.6956]], grad_fn=<LogSoftmaxBackward>)
next_Q_values max: (tensor([[-1.1064],
        [-0.6123]], grad_fn=<MaxBackward0>), tensor([[2],
        [1]]))
next_Q_values argmax: tensor([[2],
        [1]])


In [11]:
# Sanity check with plain Python: the largest of four sample output values.
max(-1.8587, -1.1321, -0.9678, -1.9530)

-0.9678

In [None]:
# Run the target network exactly once and reuse the result everywhere below.
# QNetwork applies dropout in its forward pass, so separate calls can each
# draw a different random mask; printing the output, its max and its argmax
# from separate calls would show values that do not correspond to each other
# or to the `next_Q_values` used in the target.
# `torch.no_grad()` keeps the target values out of the autograd graph so that
# a later `loss.backward()` does not backpropagate into `network_target`.
with torch.no_grad():
    target_outputs = network_target(next_states)
next_Q_values = target_outputs.max(dim=1)[0].unsqueeze(1)

print(f"network_target(next_states): {target_outputs}")
print(f"network_target(next_states): {target_outputs.max(dim=1)}")
print(f"network_target(next_states): {target_outputs.argmax(dim=1)}")

# TD target: reward plus the discounted best next-state value; the
# (1 - dones) factor zeroes the bootstrap term for terminal transitions.
targets = rewards + gamma * next_Q_values * (1.0 - dones)

# Value predicted by the local network for the action actually taken.
# Call the module (not `.forward()`) so registered hooks are honoured.
Q_values = network_local(states).gather(dim=1, index=actions)

In [None]:
# One gradient step on the local network: clear stale gradients, measure the
# mean squared error between predicted values and targets, backpropagate, and
# let the optimizer apply the update.
optimizer.zero_grad()
loss = F.mse_loss(Q_values, targets)
loss.backward()
optimizer.step()