In [10]:
import torch
from torch import nn
from torch import optim

import torch.nn.functional as F
import gymnasium as gym

In [49]:
class Actor(nn.Module):
    def __init__(self, n_observations, n_actions):
        super().__init__()
        self.layers = nn.Sequential(
            nn.Linear(n_observations, 32),
            nn.ReLU(),
            nn.Linear(32, n_actions),
            nn.Softmax(dim=-1),
        )
    
    def forward(self, state):
        return self.layers(state)

In [50]:
class Critic(nn.Module):
    def __init__(self, n_observations, n_actions):
        super().__init__()
        self.layers = nn.Sequential(
            nn.Linear(n_observations + n_actions, 32),
            nn.ReLU(),
            nn.Linear(32, n_actions)
        )
    
    def forward(self, state, action):
        x = torch.cat([state, action], dim=1)
        return self.layers(x)

In [59]:
env = gym.make('CartPole-v1')
n_observations = env.observation_space.shape[0]
n_actions = env.action_space.n

In [60]:
ACTOR_LR = 1e-3
CRITIC_LR = 1e-3

DISCOUNT_FACTOR = 0.99
NOISE = 1.0

MAX_EPISODES = 100
MAX_STEPS = 100000

In [61]:
actor_network = Actor(n_observations, n_actions)
critic_network = Critic(n_observations, n_actions)

actor_loss = nn.MSELoss()
critic_loss = nn.MSELoss()

actor_optim = optim.Adam(actor_network.parameters(), lr=ACTOR_LR)
critic_optim = optim.Adam(critic_network.parameters(), lr=CRITIC_LR)

$\text { target } \mathrm{Q} \text {-values }=\text { reward }+\gamma \cdot \max _{a^{\prime}} Q\left(s^{\prime}, a^{\prime}\right)$

where
- target Q-value: the target Q-value for the current state-action pair
- reward: the reward received for taking the action in the current state
- $\gamma$: the discount factor, which determines the importance of future rewards
- $Q\left(s^{\prime}, a^{\prime}\right)$: s the Q-value of the action $a^{\prime}$ in the next state $s^{\prime}$
- $\max _{a^{\prime}} Q\left(s^{\prime}, a^{\prime}\right)$: the maximum Q-value of all actions in the next state s′, which is predicted by the critic network.

In [63]:
for episode in range(MAX_EPISODES):
    state, _ = env.reset()
    state = torch.from_numpy(state)
    episode_reward = 0
    
    for step in range(MAX_STEPS):
        predicted_action = actor_network(state)
        action = torch.argmax(predicted_action, dim=-1).item()
        
        next_state, reward, done, truncated, info = env.step(action)
        next_state = torch.from_numpy(next_state)
        
        q_values = critic_network(state, torch.tensor(action))
        
        # if done = True => 1 - done = 1 - 1 = 0 => there's no next state
        # if done = False => 1 - done = 1- 0 = 1 => there's a next state
        predicted_next_action = actor_network(next_state)
        next_action = torch.argmax(predicted_next_action, dim=-1).item()
        predicted_next_value = critic_network(next_state, next_action)
        max_predicted_next_value = torch.max(predicted_next_value, dim=-1)
        
        target_q_values = reward + (1 - done) * DISCOUNT_FACTOR * max_predicted_next_value
        
        critic_loss = critic_loss(q_values, target_q_values)
        critic_optim.zero_grad()
        critic_loss.backward()
        critic_optim.step()
        
        actor_loss = actor_loss(action, q_values)
        actor_optim.zero_grad()
        actor_loss.backward()
        actor_optim.step()
        
        state = next_state
        episode_reward += reward
        
        if done:
            break

    print(f'Episode {episode}: {episode_reward}')

RuntimeError: zero-dimensional tensor (at position 1) cannot be concatenated