In [33]:
import torch
from torch import nn
import numpy as np
import gymnasium as gym
from torch.distributions import Normal

# Prefer a CUDA GPU when PyTorch can see one; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [34]:
#Actor network
class ActorNeuralNetwork(nn.Module):
    """Gaussian policy: maps a state to the mean and std of a Normal over actions.

    The mean comes from a two-hidden-layer tanh MLP followed by a tanh-squashed
    linear head, so it always lies in [-1, 1]. The standard deviation does not
    depend on the state: it is ``exp`` of a learned per-action-dimension
    parameter that is initialised to zero.
    """

    def __init__(self, state_dim, action_dim, hidden_dim=256):
        super().__init__()
        trunk_layers = [
            nn.Linear(state_dim, hidden_dim),
            nn.Tanh(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.Tanh(),
        ]
        self.net = nn.Sequential(*trunk_layers)
        # Head producing the (pre-squash) distribution mean.
        self.mu = nn.Linear(hidden_dim, action_dim)
        # State-independent log standard deviation, one entry per action dimension.
        self.log_std = nn.Parameter(torch.zeros(action_dim))

    def forward(self, state):
        """Return ``(mean, std)`` of the action distribution for ``state``."""
        features = self.net(state)
        mean = self.mu(features).tanh()
        # Clamp the log-std before exponentiating so std stays in a safe range.
        std = self.log_std.clamp(-10, 10).exp()
        return mean, std

    def get_dist(self, state):
        """Return the ``Normal`` action distribution for ``state``."""
        return Normal(*self(state))

In [35]:
#Critic network

class CriticNeuralNetwork(nn.Module):
    """State-value estimator: a two-hidden-layer ReLU MLP with a single scalar output."""

    def __init__(self, state_dim, hidden_dim=256):
        super().__init__()
        value_layers = [
            nn.Linear(state_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, 1),
        ]
        self.net = nn.Sequential(*value_layers)

    def forward(self, state):
        """Return the value estimate for ``state``; the last dimension has size 1."""
        value = self.net(state)
        return value

In [36]:
# Build a throwaway environment only to read the observation and action sizes.
env = gym.make("InvertedDoublePendulum-v5")

state_dim = env.observation_space.shape[0]
action_dim = env.action_space.shape[0]

env.close()

# The training and testing loops create observation tensors with
# `device=device`, so the networks must live on that same device; otherwise
# the first forward pass fails with a device mismatch on CUDA machines.
actor = ActorNeuralNetwork(state_dim, action_dim).to(device)
critic = CriticNeuralNetwork(state_dim).to(device)

# No torch.optim optimizer is created here: the training loop updates the
# parameters by hand from eligibility traces.

In [37]:
#Actor-Critic w/ Eligibility Traces

def training_loop_ACWET(epochs, lr_theta, lr_weight, discount, lambda_et, discount_lr):
    """Train the module-level ``actor`` and ``critic`` online with eligibility traces.

    Every episode starts with zeroed traces. At each environment step an action
    is sampled from the actor's Normal distribution, the TD error is computed
    from the critic, and the parameters of both networks are moved along their
    accumulated gradient traces scaled by that TD error. No optimizer is used;
    the networks are modified in place.

    Args:
        epochs: Number of episodes to run.
        lr_theta: Initial step size for the actor parameters.
        lr_weight: Initial step size for the critic parameters.
        discount: Reward discount factor.
        lambda_et: Trace-decay rate; traces are scaled by
            ``lambda_et * discount`` at every step.
        discount_lr: Factor both step sizes are multiplied by after each episode.

    Returns:
        None. One summary line is printed per episode.
    """
    env = gym.make("InvertedDoublePendulum-v5")
    # Create tensors on the device the actor actually lives on, so the loop
    # works regardless of where the networks were placed.
    # Assumes the critic sits on the same device as the actor.
    model_device = next(actor.parameters()).device
    action_low = float(env.action_space.low[0])
    action_high = float(env.action_space.high[0])
    trace_decay = lambda_et * discount

    try:
        for ep in range(epochs):
            actor_traces = [torch.zeros_like(p) for p in actor.parameters()]
            critic_traces = [torch.zeros_like(p) for p in critic.parameters()]
            terminated = truncated = False
            obs, _ = env.reset()
            obs = torch.as_tensor(obs, dtype=torch.float32, device=model_device)
            total_reward = 0

            while not (terminated or truncated):
                actor_dist = actor.get_dist(obs)
                raw_action = actor_dist.sample()
                # Score the action that was actually drawn from the policy.
                # Clamping is applied only to what the environment receives,
                # so the log-probability gradient is not distorted by it.
                log_prob = actor_dist.log_prob(raw_action).sum()
                env_action = torch.clamp(raw_action, action_low, action_high)

                old_obs = obs
                # Hand the environment a CPU NumPy array rather than a torch
                # tensor, so stepping also works when the policy is on a GPU.
                obs, reward, terminated, truncated, _ = env.step(
                    env_action.detach().cpu().numpy()
                )
                obs = torch.as_tensor(obs, dtype=torch.float32, device=model_device)
                reward = float(reward)
                total_reward += reward

                with torch.no_grad():
                    # Only a true terminal state has zero future value. A
                    # time-limit truncation still bootstraps from the critic.
                    if terminated:
                        next_value = torch.zeros(1, device=model_device)
                    else:
                        next_value = critic(obs)

                # One critic forward pass serves both the TD error and the
                # value gradient that feeds the critic trace.
                value = critic(old_obs)
                td_error = (reward + discount * next_value - value).detach()
                td_error = torch.clamp(td_error, -5, 5)

                critic.zero_grad()
                value.sum().backward()

                with torch.no_grad():
                    for trace, param in zip(critic_traces, critic.parameters()):
                        trace.mul_(trace_decay).add_(param.grad)
                        param.add_(lr_weight * td_error * trace)

                actor.zero_grad()
                log_prob.backward()

                with torch.no_grad():
                    for trace, param in zip(actor_traces, actor.parameters()):
                        trace.mul_(trace_decay).add_(param.grad)
                        param.add_(lr_theta * td_error * trace)

            lr_theta *= discount_lr
            lr_weight *= discount_lr

            print(f"Epoch: {ep} | Total reward: {total_reward} | LR: {lr_theta} | Terminated: {terminated} | Truncated: {truncated}")
    finally:
        # Release the simulator even if training is interrupted or raises.
        env.close()

def testing_loop_ACWET(epochs):
    """Run the module-level ``actor`` in a rendered environment and print episode returns.

    Actions are sampled from the actor's distribution, as during training, and
    clamped to the environment's action bounds. No parameters are updated.

    Args:
        epochs: Number of evaluation episodes to run.

    Returns:
        None. One summary line is printed per episode.
    """
    env = gym.make("InvertedDoublePendulum-v5", render_mode="human")
    # Create tensors on the device the actor actually lives on.
    model_device = next(actor.parameters()).device
    action_low = float(env.action_space.low[0])
    action_high = float(env.action_space.high[0])

    try:
        for ep in range(epochs):
            terminated = truncated = False
            obs, _ = env.reset()
            obs = torch.as_tensor(obs, dtype=torch.float32, device=model_device)
            total_reward = 0

            while not (terminated or truncated):
                # Evaluation needs no gradients, so skip building the graph.
                with torch.no_grad():
                    action = actor.get_dist(obs).sample()
                action = torch.clamp(action, action_low, action_high)

                # Hand the environment a CPU NumPy array rather than a torch
                # tensor, so stepping also works when the policy is on a GPU.
                obs, reward, terminated, truncated, _ = env.step(action.cpu().numpy())
                total_reward += reward
                obs = torch.as_tensor(obs, dtype=torch.float32, device=model_device)

            print(f"Epoch: {ep} | Total reward: {total_reward} | Terminated: {terminated} | Truncated: {truncated}")
    finally:
        # Close the render window and simulator explicitly instead of leaving
        # cleanup to garbage collection.
        env.close()

In [None]:
# Hyperparameters are passed by keyword so each value is self-describing.
training_loop_ACWET(
    epochs=10_000,
    lr_theta=1e-4,
    lr_weight=1e-4,
    discount=0.95,
    lambda_et=0.85,
    discount_lr=0.9994,
)

In [39]:
# Watch the current policy for ten rendered episodes.
testing_loop_ACWET(epochs=10)

Epoch: 0 | Total reward: 9349.855866087582 | Terminated: False | Truncated: True
Epoch: 1 | Total reward: 9348.039792073263 | Terminated: False | Truncated: True
Epoch: 2 | Total reward: 9223.858498845268 | Terminated: True | Truncated: False


/home/vitor/Documentos/reinforcement-learning/venv/lib/python3.13/site-packages/glfw/__init__.py:917: GLFWError: (65537) b'The GLFW library is not initialized'


Epoch: 3 | Total reward: 8933.315757387216 | Terminated: True | Truncated: False
Epoch: 4 | Total reward: 9348.827164725093 | Terminated: False | Truncated: True
Epoch: 5 | Total reward: 8146.833796840885 | Terminated: True | Truncated: False
Epoch: 6 | Total reward: 4925.604865934828 | Terminated: True | Truncated: False
Epoch: 7 | Total reward: 5934.471574451457 | Terminated: True | Truncated: False
Epoch: 8 | Total reward: 9351.579689106346 | Terminated: False | Truncated: True
Epoch: 9 | Total reward: 866.4769396873893 | Terminated: True | Truncated: False
