In [1]:
import os

import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch import Tensor, device

import utils

from agent import AgentBase

from numpy._typing import _ShapeLike

from typing import Type, Any

from tqdm.notebook import tqdm_notebook as tqdm

In [2]:
# class Actor(nn.Module):
#     def __init__(self, inp_channels: int, n_actions: int, features = [16, 32, 64, 128]):
#         super().__init__()
#         self.inp_channels = inp_channels
#         self.n_actions = n_actions
#         self.features = features

#         self.initial = nn.Sequential(
#             nn.Conv2d(inp_channels, features[0], 3, 1, 1),
#             nn.ReLU(True)
#         )        

#         self.net = nn.Sequential(
#             *[utils.ConvBn(features[i], features[i+1], pool = (i!=(len(features)-2))) for i in range(len(features)-1)]
#         )

#         self.net.append(nn.AdaptiveMaxPool2d((1, 1)))
#         self.net.append(nn.Flatten())

#         self.mu = nn.Sequential(
#             nn.Linear(features[-1], 64),
#             nn.ReLU(True),
#             nn.Linear(64, n_actions),
#             nn.Tanh()
#         )

#     def forward(self, states: Tensor) -> Tensor:
#         states_encoder = self.net(self.initial(states))

#         mu = self.mu(states_encoder)

#         return mu
    
class Actor(nn.Module):
    """Deterministic policy network: a ReLU MLP followed by a tanh-squashed linear head.

    Args:
        n_inp: Size of the flat observation vector fed to the first layer.
        n_action: Number of action components produced by the output layer.
        features: Hidden layer widths, in order. Any sequence (list or tuple)
            is accepted; an empty sequence yields a single linear layer.
    """

    def __init__(self, n_inp: int, n_action, features = (64, 64)):
        super().__init__()

        # Unpack into a fresh list so tuples work and no default object is shared.
        layer_sizes = [n_inp, *features]

        hidden = []
        for fan_in, fan_out in zip(layer_sizes[:-1], layer_sizes[1:]):
            hidden.append(nn.Linear(fan_in, fan_out))
            hidden.append(nn.ReLU(inplace=True))

        # Attribute names and Sequential indices determine the state_dict keys
        # (encoded.0, encoded.2, ..., actor), so they are kept stable.
        self.encoded = nn.Sequential(*hidden)
        self.actor = nn.Linear(layer_sizes[-1], n_action)

    def forward(self, x):
        """Map a batch of observations to actions with every component in [-1, 1]."""
        return torch.tanh(self.actor(self.encoded(x)))

In [3]:
# class Critic(nn.Module):
#     def __init__(self, inp_channels: int, n_actions: int, features = [16, 32, 64, 128]):
#         super().__init__()
#         self.inp_channels = inp_channels
#         self.n_actions = n_actions
#         self.features = features

#         self.initial = nn.Sequential(
#             nn.Conv2d(inp_channels, features[0], 3, 1, 1),
#             nn.ReLU(True)
#         )        

#         self.net = nn.Sequential(
#             *[utils.ConvBn(features[i], features[i+1], pool = (i!=(len(features)-2))) for i in range(len(features)-1)]
#         )

#         self.net.append(nn.AdaptiveMaxPool2d((1, 1)))
#         self.net.append(nn.Flatten())

#         self.q = nn.Sequential(
#             nn.Linear(features[-1] + n_actions, 64),
#             nn.ReLU(True),
#             nn.Linear(64, 32),
#             nn.ReLU(True),
#             nn.Linear(32, 1)
#         )

#     def forward(self, states: Tensor, action: Tensor) -> Tensor:
#         states_encoder = self.net(self.initial(states))

#         action_value = self.q(torch.concat([states_encoder, action], 1))

#         return action_value
    

class Critic(nn.Module):
    """Action-value network: a ReLU MLP over the concatenated (state, action) vector.

    Args:
        n_inp: Size of the flat observation vector.
        n_action: Number of action components appended to the observation.
        features: Hidden layer widths, in order. Any sequence (list or tuple)
            is accepted; an empty sequence yields a single linear layer.
    """

    def __init__(self, n_inp, n_action, features = (64, 64)):
        super(Critic, self).__init__()

        # Unpack into a fresh list so tuples work and no default object is shared.
        layer_sizes = [n_inp + n_action, *features]

        hidden = []
        for fan_in, fan_out in zip(layer_sizes[:-1], layer_sizes[1:]):
            hidden.append(nn.Linear(fan_in, fan_out))
            hidden.append(nn.ReLU(inplace=True))

        # Attribute names and Sequential indices determine the state_dict keys
        # (encoded.0, encoded.2, ..., critic), so they are kept stable.
        self.encoded = nn.Sequential(*hidden)
        self.critic = nn.Linear(layer_sizes[-1], 1)

    def forward(self, states, actions):
        """Return one Q-value per row, shape (batch, 1), for the given state/action batches."""
        state_action = torch.cat([states, actions], dim=1)
        return self.critic(self.encoded(state_action))

In [4]:
class DDPG(AgentBase):
    """Deep Deterministic Policy Gradient agent with fully connected actor/critic networks.

    Holds an online actor and critic, a target copy of each, and one optimizer
    per online network. Replay storage and the training loop are not defined
    in this class; they are presumably supplied by ``AgentBase`` -- TODO confirm.
    """

    def __init__(
            self, 
            state_shape: _ShapeLike, 
            action_shape: _ShapeLike, 
            batch_size: int, 
            actor_lr = 1e-3,
            critic_lr = 2e-3,
            actor_optim: Type[optim.Optimizer] = optim.Adam,
            critic_optim: Type[optim.Optimizer] = optim.Adam,
            gamma = 0.99,
            tau = 5e-3,
            noise = 0.1,
            buffer_size: int = int(1e5), 
            update_every: int = 1, 
            device: str = 'cuda' if torch.cuda.is_available() else 'cpu', 
            seed = 0, 
            **kwargs
        ) -> None:
        """Build the four networks, their optimizers, and synchronise the targets.

        Args:
            state_shape: Observation shape; only ``state_shape[0]`` is used, as the
                input width of the networks.
            action_shape: Action shape; only ``action_shape[0]`` is used, as the
                number of action components.
            batch_size: Forwarded to ``AgentBase``.
            actor_lr: Learning rate handed to the actor optimizer.
            critic_lr: Learning rate handed to the critic optimizer.
            actor_optim: Optimizer class instantiated for the actor parameters.
            critic_optim: Optimizer class instantiated for the critic parameters.
            gamma: Discount factor applied to the bootstrapped next-state value.
            tau: Interpolation factor for the soft target-network update.
            noise: Standard deviation of the Gaussian exploration noise in ``act``.
            buffer_size: Forwarded to ``AgentBase``.
            update_every: Forwarded to ``AgentBase``.
            device: Device the networks are moved to; also forwarded to ``AgentBase``.
            seed: Forwarded to ``AgentBase``.
            **kwargs: Forwarded to ``AgentBase``.
        """
        # NOTE(review): the meaning of the positional ``False`` is defined by
        # AgentBase.__init__, which is not visible here -- confirm.
        super().__init__(state_shape, action_shape, batch_size, False, update_every, buffer_size, device, seed, **kwargs)

        self.gamma = gamma
        self.tau = tau
        self.noise = noise

        n_state, n_action = state_shape[0], action_shape[0]

        self.actor = Actor(n_state, n_action=n_action).to(device)
        self.critic = Critic(n_state, n_action).to(device)
        self.target_actor = Actor(n_state, n_action=n_action).to(device)
        self.target_critic = Critic(n_state, n_action).to(device)

        self.actor_optim = actor_optim(self.actor.parameters(), actor_lr)
        self.critic_optim = critic_optim(self.critic.parameters(), critic_lr)

        # `apply` is inherited (not defined in this class); it runs the project's
        # weight initialiser before the targets are synchronised below.
        self.apply(utils.init_weights)

        # tau = 1 makes the update a hard copy: targets start identical to the online nets.
        self.update_network_parameters(tau = 1)

    def update_network_parameters(self, tau = None):
        """Move each target network towards its online network.

        Every target parameter becomes ``tau * online + (1 - tau) * target``.

        Args:
            tau: Interpolation factor; ``None`` falls back to ``self.tau`` and
                ``1`` performs a full copy.
        """
        if tau is None:
            tau = self.tau

        network_pairs = (
            (self.target_actor, self.actor),
            (self.target_critic, self.critic),
        )

        # In-place writes to parameters must happen outside autograd tracking.
        with torch.no_grad():
            for target_net, local_net in network_pairs:
                for target_param, local_param in zip(target_net.parameters(), local_net.parameters()):
                    target_param.copy_(tau * local_param + (1.0 - tau) * target_param)

    @torch.no_grad()
    def act(self, state: np.ndarray) -> Any:
        """Choose an action for a single observation.

        Args:
            state: One un-batched observation as a NumPy array.

        Returns:
            The actor's output as a NumPy array. Unless ``self.eval`` is truthy,
            Gaussian noise with standard deviation ``self.noise`` is added and
            the result is clipped to [-1, 1], the range of the actor's tanh output.
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(self.device)

        self.actor.eval()
        action: np.ndarray = self.actor(state)[0].cpu().numpy()
        self.actor.train()

        if not self.eval:
            # Clip the *noisy action*, not the noise sample, so the explored
            # action stays inside the range the actor itself can produce.
            exploration = np.random.normal(0, self.noise, action.shape).astype(action.dtype)
            action = (action + exploration).clip(-1.0, 1.0)

        return action

        # return np.array([action[0], action[1], 0]) if action[1]>0 else np.array([action[0], 0, -action[1]])

    def learn(self, states: Tensor, actions: Tensor, rewards: Tensor, next_states: Tensor, terminals: Tensor):
        """Run one DDPG update: critic regression, actor ascent, soft target update.

        Args:
            states: Batch of observations.
            actions: Batch of actions taken in ``states``.
            rewards: Batch of rewards, one per transition.
            next_states: Batch of successor observations.
            terminals: Batch of episode-termination flags, one per transition.
        """
        # NOTE(review): `self.eval` is read as the "no exploration noise" flag in
        # `act`; resetting it here presumably re-enables exploration -- confirm
        # against how AgentBase uses it.
        self.eval = False

        # Force column shape so these broadcast row-wise against the critic's
        # (batch, 1) output; a no-op when they are already columns.
        rewards = rewards.reshape(-1, 1)
        # Same mask as `~terminals` for bool input, and also valid for 0/1 numeric flags.
        not_terminal = terminals.reshape(-1, 1).logical_not()

        # The TD target is a constant for the regression: build it without a graph
        # so no gradients are computed for (or accumulated in) the target networks.
        with torch.no_grad():
            target_actions = self.target_actor(next_states)
            next_q = self.target_critic(next_states, target_actions)
            target_q_value = rewards + self.gamma * next_q * not_terminal

        q_value = self.critic(states, actions)
        critic_loss = F.mse_loss(q_value, target_q_value)

        self.critic_optim.zero_grad()
        critic_loss.backward()
        self.critic_optim.step()

        # Policy improvement: maximise the critic's value of the actor's own actions.
        new_policy_actions = self.actor(states)
        actor_loss = -self.critic(states, new_policy_actions).mean()

        self.actor_optim.zero_grad()
        actor_loss.backward()
        self.actor_optim.step()

        self.update_network_parameters()

In [5]:
# Create the training environment through the project's `utils.make` helper
# (presumably a wrapper around the Gym environment factory -- TODO confirm).
env = utils.make('LunarLanderContinuous-v2')

In [6]:
# Instantiate the agent, sizing the networks from the environment's
# observation and action spaces. The two learning rates equal the DDPG
# constructor defaults and are spelled out here for visibility.
agent = DDPG(
    state_shape=env.observation_space.shape,
    action_shape=env.action_space.shape,
    batch_size=64,
    actor_lr=0.001,
    critic_lr=0.002,
)

In [7]:
# Train the agent; `fit` is not defined on DDPG itself, so it comes from AgentBase.
# NOTE(review): the two positional 1000s are presumably the number of episodes
# and the per-episode step limit -- confirm against AgentBase.fit.
scores = agent.fit(env, 1000, 1000, save_best=True, save_dir='checkpoint/', progress_bar=tqdm)

  0%|          | 0/1000 [00:00<?, ?it/s]

In [9]:
# Re-create the environment with on-screen rendering and let the agent play in it.
# NOTE: this rebinds `env`, so re-running the training cell after this one
# would train against the rendering environment.
env = utils.make('LunarLanderContinuous-v2', render_mode='human')

agent.play(env)

314.00363530131136