In [10]:
import torch
from torch import nn
from torch import optim
import torch.nn.functional as F
from torch.distributions.normal import Normal

from octopus.replay_buffer import ReplayBuffer

In [3]:
class CriticNetwork(nn.Module):
    """Q-network: maps a (state, action) pair to a single scalar value.

    Args:
        n_observations: size of the observation vector, given either as a
            plain integer or as a shape-like sequence whose first entry is
            the size (e.g. ``(8,)``).
        n_actions: size of the action vector.
        fc1_dim: width of the first hidden layer.
        fc2_dim: width of the second hidden layer.
    """

    def __init__(
        self,
        n_observations, n_actions,
        fc1_dim=256, fc2_dim=256
    ):
        super().__init__()

        # The sibling networks take the observation size as an int, while this
        # class historically indexed ``n_observations[0]``. Accept both forms.
        try:
            obs_dim = int(n_observations[0])
        except (TypeError, IndexError):
            obs_dim = int(n_observations)

        self.layers = nn.Sequential(
            nn.Linear(obs_dim + n_actions, fc1_dim),
            nn.ReLU(),
            nn.Linear(fc1_dim, fc2_dim),
            nn.ReLU(),
            nn.Linear(fc2_dim, 1),
        )

    def forward(self, state, action):
        """Concatenate state and action along dim 1 and return the network output."""
        state_action = torch.cat([state, action], dim=1)
        return self.layers(state_action)

In [4]:
class ValueNetwork(nn.Module):
    """State-value network: maps a state to a single scalar value.

    Args:
        n_observations: size of the observation vector fed to the first layer.
        fc1_dim: width of the first hidden layer (defaults to 256, matching
            the actor and critic networks).
        fc2_dim: width of the second hidden layer (defaults to 256).
    """

    def __init__(self, n_observations, fc1_dim=256, fc2_dim=256):
        super().__init__()
        self.layers = nn.Sequential(
            nn.Linear(n_observations, fc1_dim),
            nn.ReLU(),
            nn.Linear(fc1_dim, fc2_dim),
            nn.ReLU(),
            nn.Linear(fc2_dim, 1)
        )

    def forward(self, state):
        """Return the network output for ``state`` (last dimension of size 1)."""
        return self.layers(state)

In [12]:
class ActorNetwork(nn.Module):
    """Gaussian policy network with tanh-squashed, scaled actions.

    Args:
        n_observations: size of the observation vector.
        n_actions: size of the action vector.
        max_action: scale applied to the tanh-squashed sample; may be a
            number, a sequence, a NumPy array or a tensor that broadcasts
            against a ``(batch, n_actions)`` tensor.
        fc1_dim: width of the first hidden layer.
        fc2_dim: width of the second hidden layer.
    """

    def __init__(self, n_observations, n_actions, max_action, fc1_dim=256, fc2_dim=256):
        super().__init__()
        # Small constant used both as the lower bound for sigma and to keep
        # the argument of the log in ``sample_normal`` strictly positive.
        self.reparam_noise = 1e-6
        self.max_action = max_action

        self.layers = nn.Sequential(
            nn.Linear(n_observations, fc1_dim),
            nn.ReLU(),
            nn.Linear(fc1_dim, fc2_dim),
            nn.ReLU(),
        )

        self.mu = nn.Linear(fc2_dim, n_actions)
        self.sigma = nn.Linear(fc2_dim, n_actions)

    def forward(self, state):
        """Return ``(mu, sigma)`` of the action distribution for ``state``."""
        features = self.layers(state)

        # Both heads read the shared hidden features, not the raw state:
        # they are declared with ``fc2_dim`` inputs.
        mu = self.mu(features)

        # Standard deviation, clamped to [reparam_noise, 1].
        sigma = self.sigma(features)
        sigma = torch.clamp(sigma, min=self.reparam_noise, max=1)

        return mu, sigma

    def sample_normal(self, state, reparametrize=True):
        """Sample an action and the summed log-probability of the sample.

        Args:
            state: batch of states, passed to ``forward``.
            reparametrize: when True, sample with ``rsample`` (the
                reparameterization trick, so gradients flow through the
                sample); otherwise use ``sample``.

        Returns:
            ``(actions, log_probs)`` where ``actions`` is
            ``tanh(sample) * max_action`` and ``log_probs`` is summed over
            dim 1 with ``keepdim=True``.
        """
        mu, sigma = self.forward(state)
        probs = Normal(mu, sigma)

        if reparametrize:
            raw_actions = probs.rsample()
        else:
            raw_actions = probs.sample()

        # Squash to [-1, 1], then scale to [-max_action, max_action].
        squashed = torch.tanh(raw_actions)
        max_action = torch.as_tensor(
            self.max_action, dtype=squashed.dtype, device=squashed.device
        )
        actions = squashed * max_action

        # The Gaussian density must be evaluated at the value that was
        # actually drawn from it, i.e. the pre-tanh sample.
        log_probs = probs.log_prob(raw_actions)

        # Change-of-variables correction from the SAC paper, appendix C
        # ("Enforcing Action Bounds"). It uses the unscaled tanh output so the
        # log argument stays positive even when max_action > 1. The constant
        # contribution of the max_action scale is not included.
        log_probs = log_probs - torch.log(1 - squashed.pow(2) + self.reparam_noise)
        log_probs = log_probs.sum(1, keepdim=True)

        return actions, log_probs

In [2]:
class Agent:
    """Soft Actor-Critic style agent (work in progress).

    Owns an actor, two critics, a value network, a target value network and a
    replay buffer. ``learn`` currently performs only the value-network update
    followed by a soft update of the target value network; the critic and
    actor updates are not implemented yet.

    Args:
        env: environment; only ``env.action_space.high`` is read.
        n_observations: size of the observation vector.
        n_actions: size of the action vector.
        mem_size: capacity passed to ``ReplayBuffer``.
        scale_entropy: default interpolation factor (tau) used by
            ``update_network_parameters``.
        discount_factor: stored as gamma (not used by the code written so far).
        reward_scale: stored (not used by the code written so far).
        batch_size: number of transitions sampled per ``learn`` call.
        fc1_dim: width of the first hidden layer of every network.
        fc2_dim: width of the second hidden layer of every network.
        value_learning_rate: learning rate of the value network's Adam optimizer.
    """

    def __init__(
        self, env, n_observations: int, n_actions: int, mem_size: int=1000000,
        scale_entropy: float = 0.99, discount_factor: float=0.99, reward_scale=2,
        batch_size: int=256,
        fc1_dim: int = 256, fc2_dim: int = 256,
        value_learning_rate: float = 1e-3,
    ):

        self.n_actions = n_actions
        self.n_observations = n_observations

        self.scale_entropy = scale_entropy # tau
        self.discount_factor = discount_factor # gamma
        self.reward_scale = reward_scale

        self.memory = ReplayBuffer(mem_size, n_observations, n_actions)
        self.batch_size = batch_size

        # Convert once so the actor multiplies tensors by a tensor.
        max_action = torch.as_tensor(env.action_space.high, dtype=torch.float)

        self.actor_network = ActorNetwork(
            n_observations, n_actions, max_action,
            fc1_dim, fc2_dim
        )

        # The critics are given the observation shape as a 1-tuple; the other
        # networks take the plain integer size.
        critic_obs_shape = (n_observations,)
        self.critic_1_network = CriticNetwork(critic_obs_shape, n_actions, fc1_dim, fc2_dim)
        self.critic_2_network = CriticNetwork(critic_obs_shape, n_actions, fc1_dim, fc2_dim)

        self.value_network = ValueNetwork(n_observations, fc1_dim, fc2_dim)
        self.target_value_network = ValueNetwork(n_observations, fc1_dim, fc2_dim)

        # The networks do not carry their own optimizer, so the agent owns it.
        self.value_optimizer = optim.Adam(
            self.value_network.parameters(), lr=value_learning_rate
        )

        # Start with the target network as an exact copy of the value network.
        self.update_network_parameters(tau=1)

    def choose_action(self, state):
        """Sample an action for a single (unbatched) state.

        Returns the first row of the actor's sampled action batch as a tensor.
        """
        state = torch.as_tensor(state, dtype=torch.float).unsqueeze(0)
        # Acting does not need gradients.
        with torch.no_grad():
            action, _ = self.actor_network.sample_normal(state, reparametrize=False)

        return action[0]

    def remember(self, state, action, reward, new_state, done):
        """Store one transition in the replay buffer."""
        self.memory.store_transition(state, action, reward, new_state, done)

    def update_network_parameters(self, tau = None):
        """Blend the value network into the target value network.

        Each target parameter becomes ``tau * value + (1 - tau) * target``.
        ``tau=1`` makes the target an exact copy (used at construction time);
        when ``tau`` is None, ``self.scale_entropy`` is used.
        """
        if tau is None:
            tau = self.scale_entropy

        with torch.no_grad():
            param_pairs = zip(
                self.target_value_network.parameters(),
                self.value_network.parameters(),
            )
            for target_param, param in param_pairs:
                target_param.copy_(tau * param + (1 - tau) * target_param)

    def store_transition(self):
        """Placeholder with no behavior; ``remember`` stores transitions."""
        pass

    def learn(self):
        """Run one value-network update from a sampled batch.

        Does nothing until the replay buffer holds at least ``batch_size``
        transitions.
        """
        if self.memory.mem_counter < self.batch_size:
            # only learn once at least one full batch is available
            return

        state, action, reward, next_state, done = self.memory.sample_buffer(self.batch_size)

        state = torch.tensor(state, dtype=torch.float)
        action = torch.tensor(action, dtype=torch.float)
        next_state = torch.tensor(next_state, dtype=torch.float)
        reward = torch.tensor(reward, dtype=torch.float)
        # Boolean dtype is required to use ``done`` as an index mask below.
        done = torch.tensor(done, dtype=torch.bool).view(-1)

        value = self.value_network(state).view(-1)

        # Next-state value comes from the target network and carries no
        # gradient; terminal transitions have no future value.
        with torch.no_grad():
            value_of_next_state = self.target_value_network(next_state).view(-1)
            value_of_next_state[done] = 0.0

        self.value_optimizer.zero_grad()
        value_loss = self._get_value_loss(state, value)
        value_loss.backward()
        self.value_optimizer.step()

        # TODO: critic update (will use action, reward, value_of_next_state)
        # TODO: actor update (see _get_actor_loss)

        # Let the target value network track the freshly updated value network.
        self.update_network_parameters()

    def _get_critic_prediction(self, state: torch.Tensor, actions: torch.Tensor):
        """Return the element-wise minimum of both critics, flattened to 1-D."""
        q1_new_policy = self.critic_1_network(state, actions)
        q2_new_policy = self.critic_2_network(state, actions)

        critic_value = torch.min(q1_new_policy, q2_new_policy)

        return critic_value.view(-1)

    def _get_value_loss(self, state, value=None):
        """Return ``0.5 * MSE(V(state), min Q(state, a) - log pi(a | state))``.

        Args:
            state: batch of states.
            value: optional precomputed, flattened value-network output for
                ``state``; computed here when omitted.
        """
        if value is None:
            value = self.value_network(state).view(-1)

        # The regression target is treated as a constant, so no gradient is
        # tracked and the sample does not need to be reparameterized.
        with torch.no_grad():
            actions, log_probs = self.actor_network.sample_normal(state, reparametrize=False)
            target_value = self._get_critic_prediction(state, actions) - log_probs.view(-1)

        value_loss = 0.5 * F.mse_loss(value, target_value)

        return value_loss

    def _get_actor_loss(self, state):
        """Return ``mean(log pi(a | state) - min Q(state, a))``.

        The action is sampled with the reparameterization trick so the loss
        is differentiable with respect to the actor's parameters.
        """
        actions, log_probs = self.actor_network.sample_normal(state, reparametrize=True)
        log_probs = log_probs.view(-1)

        critic_value = self._get_critic_prediction(state, actions)

        return torch.mean(log_probs - critic_value)

In [6]:
# Learning rates for the per-network Adam optimizers built further down.
CRITIC_LEARNING_RATE = 1e-3
VALUE_LEARNING_RATE = 1e-3
ACTOR_LEARNING_RATE = 1e-3

In [7]:
# NOTE(review): these constructors are called without arguments, but each
# class declares required positional parameters (e.g. n_observations), so
# this cell raises TypeError as written. Pass the environment's observation
# and action sizes (and max_action for the actor) here — TODO confirm values.
critic_network = CriticNetwork()
value_network = ValueNetwork()
actor_network = ActorNetwork()

TypeError: __init__() missing 2 required positional arguments: 'n_observations' and 'n_actions'

In [8]:
# One Adam optimizer per network, each with its own learning-rate constant.
# Depends on the network instances created in the previous cell.
critic_optimizier = optim.Adam(critic_network.parameters(), lr=CRITIC_LEARNING_RATE)
value_optimizier = optim.Adam(value_network.parameters(), lr=VALUE_LEARNING_RATE)
actor_optimizier = optim.Adam(actor_network.parameters(), lr=ACTOR_LEARNING_RATE)

NameError: name 'critic_network' is not defined

In [9]:
#https://youtu.be/ioidsRlf79o?t=1230