In [1]:
from typing import List

import torch
from torch import nn
from torch import optim
from torch.distributions import Categorical
import gym

from octopus.policy.reward import calculate_discounted_return_each_timestep, calculate_advantages

In [2]:
class ActorNet(nn.Module):
    def __init__(self, n_observations, n_actions, hidden_size):
        super().__init__()
        self.layers = nn.Sequential(
            nn.Linear(n_observations, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, n_actions),
            nn.Softmax(dim=-1)
        )
    
    def forward(self, state):
        return self.layers(state)

In [3]:
class CriticNet(nn.Module):
    def __init__(self, n_observations, n_actions, hidden_size):
        super().__init__()
        self.layers = nn.Sequential(
            nn.Linear(n_observations, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, 1)
        )
    
    def forward(self, state):
        return self.layers(state)

In [4]:
# class A2C(nn.Module):
#     def __init__(self, n_observations, n_actions, hidden_size):
#         super().__init__()
#         self.actor = nn.Sequential(
#             nn.Linear(n_observations, hidden_size),
#             nn.ReLU(),
#             nn.Linear(hidden_size, n_actions),
#             nn.Softmax(dim=-1)
#         )
        
#         self.critic = nn.Sequential(
#             nn.Linear(n_observations, hidden_size),
#             nn.ReLU(),
#             nn.Linear(hidden_size, 1)
#         )
    
#     def forward(self, state):
#         pred_action = self.actor(state)
#         pred_value = self.critic(state)
        
#         return pred_action, pred_value

In [5]:
env = gym.make('CartPole-v1')

In [6]:
n_observations = env.observation_space.shape[0]
n_actions = env.action_space.n

actor = ActorNet(n_observations, n_actions, hidden_size=128)
critic = CriticNet(n_observations, n_actions, hidden_size=128)

LEARNING_RATE = 0.001

actor_optim = optim.Adam(actor.parameters(), lr=LEARNING_RATE)
critic_optim = optim.Adam(critic.parameters(), lr=LEARNING_RATE)
    
critic_loss_func = nn.MSELoss()

In [7]:
def compute_actor_loss(log_probs: List[torch.Tensor], advantages: List[torch.Tensor]):
    """Calculate the advantage loss for actor

    Args:
        log_probs (_type_): a list of 
        advantages (_type_): _description_

    Returns:
        _type_: _description_
    """
    assert len(log_probs) == len(advantages)
    
    total_loss = 0
    for log_prob, advantage in zip(log_probs, advantages):
        total_loss = log_prob * advantage
    
    # return -(torch.sum(log_probs * advantages))
    return -total_loss

In [8]:
N_EPISODES = 10000
DISCOUNT_FACTOR = 0.99

episode_rewards = []
episode_actor_losses = []
episode_critic_losses = []

for episode in range(N_EPISODES):
    
    pred_log_probs = []
    pred_q_values = []
    rewards = []
    total_reward = 0

    observation, _ = env.reset()
    observation = torch.from_numpy(observation)
    in_progress = True
    
    while in_progress:
        pred_action = actor(observation)
        pred_value = critic(observation)

        action_idx = Categorical(pred_action).sample().item()
                        
        new_observation, reward, done, truncated, info = env.step(action_idx)
        reward = torch.tensor(reward)
        
        pred_log_probs.append(pred_action[action_idx].log())
        pred_q_values.append(pred_value)
        rewards.append(reward)
        total_reward += reward
        
        if done: break
    
    episode_rewards.append(total_reward)
    
    # TODO: this one make it loss grad
    # pred_q_values = torch.tensor(pred_q_values)
    
    discounted_returns = calculate_discounted_return_each_timestep(rewards, DISCOUNT_FACTOR)
    advantages = calculate_advantages(discounted_returns, torch.tensor(pred_q_values))
    loss_actor = compute_actor_loss(pred_log_probs, advantages)
    
    _pred_q_values = torch.tensor(pred_q_values)
    _pred_q_values.requires_grad = True
    loss_critic = critic_loss_func(discounted_returns, _pred_q_values)
    
    episode_actor_losses.append(loss_actor.detach().numpy())
    episode_critic_losses.append(loss_critic.detach().numpy())
    
    actor_optim.zero_grad()
    critic_optim.zero_grad()
    
    loss_actor.backward()
    loss_critic.backward()
    
    actor_optim.step()
    critic_optim.step()
    
    print(f"episode={episode} loss_actor={loss_actor}, loss_critic={loss_critic}")

In [9]:
import matplotlib.pyplot as plt
plt.plot(episode_rewards)

[<matplotlib.lines.Line2D at 0x1335c4dc0>]

In [10]:
episode_rewards

[tensor(27.),
 tensor(23.),
 tensor(23.),
 tensor(48.),
 tensor(11.),
 tensor(26.),
 tensor(12.),
 tensor(37.),
 tensor(11.),
 tensor(30.),
 tensor(14.),
 tensor(33.),
 tensor(19.),
 tensor(26.),
 tensor(16.),
 tensor(14.),
 tensor(29.),
 tensor(16.),
 tensor(37.),
 tensor(14.),
 tensor(20.),
 tensor(14.),
 tensor(23.),
 tensor(32.),
 tensor(15.),
 tensor(30.),
 tensor(8.),
 tensor(30.),
 tensor(16.),
 tensor(21.),
 tensor(15.),
 tensor(9.),
 tensor(14.),
 tensor(36.),
 tensor(10.),
 tensor(33.),
 tensor(43.),
 tensor(21.),
 tensor(19.),
 tensor(37.),
 tensor(14.),
 tensor(57.),
 tensor(20.),
 tensor(12.),
 tensor(10.),
 tensor(34.),
 tensor(19.),
 tensor(13.),
 tensor(15.),
 tensor(14.),
 tensor(21.),
 tensor(12.),
 tensor(16.),
 tensor(17.),
 tensor(21.),
 tensor(15.),
 tensor(22.),
 tensor(16.),
 tensor(10.),
 tensor(13.),
 tensor(52.),
 tensor(10.),
 tensor(12.),
 tensor(19.),
 tensor(20.),
 tensor(13.),
 tensor(11.),
 tensor(32.),
 tensor(10.),
 tensor(20.),
 tensor(14.),
 tensor(

In [11]:
len(episode_rewards)

10000

In [12]:
episode_rewards

[tensor(27.),
 tensor(23.),
 tensor(23.),
 tensor(48.),
 tensor(11.),
 tensor(26.),
 tensor(12.),
 tensor(37.),
 tensor(11.),
 tensor(30.),
 tensor(14.),
 tensor(33.),
 tensor(19.),
 tensor(26.),
 tensor(16.),
 tensor(14.),
 tensor(29.),
 tensor(16.),
 tensor(37.),
 tensor(14.),
 tensor(20.),
 tensor(14.),
 tensor(23.),
 tensor(32.),
 tensor(15.),
 tensor(30.),
 tensor(8.),
 tensor(30.),
 tensor(16.),
 tensor(21.),
 tensor(15.),
 tensor(9.),
 tensor(14.),
 tensor(36.),
 tensor(10.),
 tensor(33.),
 tensor(43.),
 tensor(21.),
 tensor(19.),
 tensor(37.),
 tensor(14.),
 tensor(57.),
 tensor(20.),
 tensor(12.),
 tensor(10.),
 tensor(34.),
 tensor(19.),
 tensor(13.),
 tensor(15.),
 tensor(14.),
 tensor(21.),
 tensor(12.),
 tensor(16.),
 tensor(17.),
 tensor(21.),
 tensor(15.),
 tensor(22.),
 tensor(16.),
 tensor(10.),
 tensor(13.),
 tensor(52.),
 tensor(10.),
 tensor(12.),
 tensor(19.),
 tensor(20.),
 tensor(13.),
 tensor(11.),
 tensor(32.),
 tensor(10.),
 tensor(20.),
 tensor(14.),
 tensor(

In [13]:
import numpy as np

np.array(episode_rewards).mean()

14.4351

In [14]:
from typing import List

import torch
from torch import nn
from torch import optim
from torch.distributions import Categorical
import gym

from octopus.policy.reward import calculate_discounted_return_each_timestep, calculate_advantages

In [15]:
import wandb

In [16]:
import wandb
import time

run_name = f"{gym_id}__{exp_name}__{seed}__{int(time.time())}"

In [17]:
gym_id = "CartPole-v1"
exp_name = "A2C"
seed = 42

In [18]:
import wandb
import time

run_name = f"{gym_id}__{exp_name}__{seed}__{int(time.time())}"

In [19]:
run_name

'CartPole-v1__A2C__42__1671857230'

In [20]:
import wandb
import time

run_name = f"{gym_id}__{exp_name}__{seed}__{int(time.time())}"

In [21]:
run_name

'CartPole-v1__A2C__42__1671857240'

In [22]:
import wandb
import time

run_name = f"{gym_id}__{exp_name}__{seed}__{int(time.time())}"

In [23]:
run_name

'CartPole-v1__A2C__42__1671857243'

In [24]:
import wandb
import time

run_name = f"{gym_id}__{exp_name}__{seed}__{int(time.time())}"

wandb.init(
    # project=args.wandb_project_name,
    # entity=args.wandb_entity,
    sync_tensorboard=True,
    # config=vars(args),
    name=run_name,
    monitor_gym=True,
    save_code=True,
)

In [25]:
from typing import List

import torch
from torch import nn
from torch import optim
from torch.distributions import Categorical
import gym

from octopus.policy.reward import calculate_discounted_return_each_timestep, calculate_advantages

In [26]:
class ActorNet(nn.Module):
    def __init__(self, n_observations, n_actions, hidden_size):
        super().__init__()
        self.layers = nn.Sequential(
            nn.Linear(n_observations, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, n_actions),
            nn.Softmax(dim=-1)
        )
    
    def forward(self, state):
        return self.layers(state)

In [27]:
class CriticNet(nn.Module):
    def __init__(self, n_observations, n_actions, hidden_size):
        super().__init__()
        self.layers = nn.Sequential(
            nn.Linear(n_observations, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, 1)
        )
    
    def forward(self, state):
        return self.layers(state)

In [28]:
# class A2C(nn.Module):
#     def __init__(self, n_observations, n_actions, hidden_size):
#         super().__init__()
#         self.actor = nn.Sequential(
#             nn.Linear(n_observations, hidden_size),
#             nn.ReLU(),
#             nn.Linear(hidden_size, n_actions),
#             nn.Softmax(dim=-1)
#         )
        
#         self.critic = nn.Sequential(
#             nn.Linear(n_observations, hidden_size),
#             nn.ReLU(),
#             nn.Linear(hidden_size, 1)
#         )
    
#     def forward(self, state):
#         pred_action = self.actor(state)
#         pred_value = self.critic(state)
        
#         return pred_action, pred_value

In [29]:
gym_id = "CartPole-v1"
exp_name = "A2C"
seed = 42

In [30]:
import wandb
import time

run_name = f"{gym_id}__{exp_name}__{seed}__{int(time.time())}"

wandb.init(
    # project=args.wandb_project_name,
    # entity=args.wandb_entity,
    sync_tensorboard=True,
    # config=vars(args),
    name=run_name,
    monitor_gym=True,
    save_code=True,
)

<wandb.sdk.wandb_run.Run at 0x17fd8d760>

VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.01674309514999853, max=1.0)…

In [31]:
from typing import List

import torch
from torch import nn
from torch import optim
from torch.distributions import Categorical
import gym

from octopus.policy.reward import calculate_discounted_return_each_timestep, calculate_advantages

In [32]:
class ActorNet(nn.Module):
    def __init__(self, n_observations, n_actions, hidden_size):
        super().__init__()
        self.layers = nn.Sequential(
            nn.Linear(n_observations, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, n_actions),
            nn.Softmax(dim=-1)
        )
    
    def forward(self, state):
        return self.layers(state)

In [33]:
class CriticNet(nn.Module):
    def __init__(self, n_observations, n_actions, hidden_size):
        super().__init__()
        self.layers = nn.Sequential(
            nn.Linear(n_observations, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, 1)
        )
    
    def forward(self, state):
        return self.layers(state)

In [34]:
gym_id = "CartPole-v1"
exp_name = "A2C"
seed = 42

In [35]:
import wandb
import time

run_name = f"{gym_id}__{exp_name}__{seed}__{int(time.time())}"

wandb.init(
    # project=args.wandb_project_name,
    # entity=args.wandb_entity,
    sync_tensorboard=True,
    # config=vars(args),
    name=run_name,
    monitor_gym=True,
    save_code=True,
)