# REINFORCE

Notebook that runs REINFORCE-based RL methods - REINFORCE, Actor-Critic, A2C, and A3C.
The agent has been trained and tested on gymnasium environments. 

## REINFORCE-based methods
- model-free
- policy-based
- on-policy (A3C: off-policy)

||REINFORCE|Actor-Critic|A2C|A3C|
|-|-|-|-|-|
|bias|low|high|higher|high|
|variance|high|low|lower|low|

## Environment

In this notebook, we use the Gymnasium `CarRacing-v2` environment in one of two action-space modes: 

- CarRacing-v2 (discrete action space)
- CarRacing-v2 (continuous action space)

In [None]:
%pip install -q numpy
%pip install -q tqdm
%pip install -q matplotlib
%pip install -q torch
%pip install -q swig
%pip install -q gymnasium
%pip install -q gymnasium[box2d]

In [19]:
import gymnasium as gym
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import os
import json
from gymnasium.spaces import Discrete, Box
from gymnasium.wrappers.gray_scale_observation import GrayScaleObservation
from gymnasium.wrappers.frame_stack import FrameStack
from tqdm import tqdm
from collections import namedtuple
from datetime import datetime
from itertools import count

import torch
import torch.nn as nn
import torch.optim as optim
from torch.nn import HuberLoss
from torch.distributions import Categorical, Normal

def preprocess_env(env: gym.Env):
  """
  Wrap an environment so that each observation is a stack of grayscale frames.

  Args:
      env (gym.Env): environment to wrap

  Returns:
      the environment wrapped with GrayScaleObservation (keep_dim=False) and then FrameStack (num_stack=4)
  """
  grayscale_env = GrayScaleObservation(env, keep_dim=False)
  return FrameStack(grayscale_env, num_stack=4)

# environment id passed to gym.make
envname = "CarRacing-v2"

# action space mode: True for continuous, False for discrete (editable)
continuous = False
# create the environment and apply the grayscale + frame-stack wrappers
env = preprocess_env(gym.make(envname, continuous=continuous))

# plot settings: the IPython display helpers are only imported for an inline backend
is_ipython = "inline" in matplotlib.get_backend()
if is_ipython:
  from IPython import display

# device setting: use cuda when it is available, otherwise fall back to cpu
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device", device)

Device cuda


## Hyperparameters

- `ALGORITHM`: one of `["REINFORCE", "AC", "A2C", "A3C"]`.
- `NUM_EPISODES`: number of training episodes. 
- `TEST_EPISODES`: number of episodes played in each test run. 
- `TEST_FREQ`: testing frequency; a test run is performed every `TEST_FREQ` training episodes. 
- `GAMMA`: discount factor used when computing the return $ G_t $. 
- `LR`: learning rate. 

In [20]:
# immutable record that groups every tunable training setting
HyperParameter = namedtuple(
  "HyperParameter",
  ["ALGORITHM", "NUM_EPISODES", "TEST_EPISODES", "TEST_FREQ", "GAMMA", "LR"]
)

# editable
hp = HyperParameter(
  ALGORITHM="REINFORCE",  # algorithm name
  NUM_EPISODES=2000,      # number of training episodes
  TEST_EPISODES=5,        # episodes played per test run
  TEST_FREQ=100,          # a test run happens every TEST_FREQ training episodes
  GAMMA=0.99,             # discount factor
  LR=0.0005               # learning rate
)

## Actor & Critic Network

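The network receives a stack of 96x96 grayscale frames and returns a policy distribution: (1) a categorical distribution over the discrete actions, or (2) a normal distribution over the continuous action vector. Optionally, it also returns an estimated state value and/or action values. 
We use a simple network consisting of CNN and fully-connected layers. 

The value networks are used by Actor-Critic (action value) and by A2C and A3C (state value), so the value heads are part of the same network. 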

In [21]:
class ActorCriticNetwork(nn.Module):
  """
  Convolutional feature extractor shared by an actor head and optional critic heads.
  """

  def __init__(
    self,
    dim_observation: tuple,
    action_space: Discrete | Box,
    use_state_value_network: bool=False,
    use_action_value_network: bool=False
  ):
    """
    Build the feature extractor, the actor head, and the requested critic heads.

    Args:
        dim_observation (tuple): environment's observation dimension as (C, H, W); H and W must both be 96
        action_space (Discrete | Box): environment's action space; a Box must be one-dimensional
        use_state_value_network (bool, optional): whether to build the state value head. True for A2C and A3C.
        use_action_value_network (bool, optional): whether to build the action value head (discrete actions only). True for AC.
    """
    super().__init__()
    C, H, W = dim_observation
    assert H == 96
    assert W == 96

    # feature extraction: three conv layers, flattened to 4096 features, projected to 512
    feature_layers = [
      nn.Conv2d(C, 32, kernel_size=8, stride=4),
      nn.ReLU(),
      nn.Conv2d(32, 64, kernel_size=5, stride=2),
      nn.ReLU(),
      nn.Conv2d(64, 64, kernel_size=3, stride=1),
      nn.ReLU(),
      nn.Flatten(),
      nn.Linear(4096, 512),
      nn.ReLU()
    ]
    self.seq_model = nn.Sequential(*feature_layers)

    # actor head: its output form depends on the kind of action space
    self.is_discrete_action = isinstance(action_space, Discrete)
    if self.is_discrete_action:
      # discrete action: softmax probabilities over action_space.n actions
      self.policy_prob_model = nn.Sequential(
        *self._head_layers(action_space.n),
        nn.Softmax(dim=-1)
      )
    else:
      # continuous action: one mean and one log standard deviation per action dimension
      assert len(action_space.shape) == 1
      action_dim = action_space.shape[0]
      self.policy_mean_model = nn.Sequential(*self._head_layers(action_dim))
      self.policy_log_std_model = nn.Sequential(*self._head_layers(action_dim))

    # critic head - state value (single scalar output)
    self.use_state_value_network = use_state_value_network
    if self.use_state_value_network:
      self.state_value_model = nn.Sequential(*self._head_layers(1))

    # critic head - action value (one output per discrete action)
    self.use_action_value_network = use_action_value_network
    if self.use_action_value_network:
      # action value only available for discrete action
      assert self.is_discrete_action
      self.action_value_model = nn.Sequential(*self._head_layers(action_space.n))

  @staticmethod
  def _head_layers(out_features: int):
    """
    Create the layers of one head: Linear(512, 128), ReLU, Linear(128, out_features).
    """
    return [nn.Linear(512, 128), nn.ReLU(), nn.Linear(128, out_features)]

  def forward(self, x: torch.Tensor):
    """
    Run the feature extractor and every enabled head.

    Args:
        x (torch.Tensor): input tensor

    Returns:
        state_value (torch.Tensor): state value, or None when the state value head is disabled
        action_value (torch.Tensor): state-action values, or None when the action value head is disabled
        policy (Categorical | Normal): action distribution
    """
    features = self.seq_model(x)

    # actor head
    if self.is_discrete_action:
      policy = Categorical(self.policy_prob_model(features))
    else:
      mean = self.policy_mean_model(features)
      # keep the log standard deviation inside [-20, 2] before exponentiating
      log_std = torch.clamp(self.policy_log_std_model(features), min=-20, max=2)
      policy = Normal(mean, torch.exp(log_std))

    # critic heads: disabled heads yield None
    state_value = self.state_value_model(features) if self.use_state_value_network else None
    action_value = self.action_value_model(features) if self.use_action_value_network else None

    return state_value, action_value, policy

## Training: Utility Functions

- `select_action`: select agent's action using policy network. 
- `save_plot`: plot objective func, average frames, and score. 
- `save_model`: store model, hyperparameters, and training info. 

In [27]:
# only these algorithm names are accepted
assert hp.ALGORITHM in ["REINFORCE", "AC", "A2C", "A3C"]

# observation and action description taken from the environment
dim_observation = env.observation_space.shape
action_space = env.action_space
is_discrete_action = isinstance(action_space, Discrete)
if not is_discrete_action:
  # action bounds with a leading batch dimension, used to rescale sampled continuous actions
  action_low = torch.tensor(action_space.low, device=device, dtype=torch.float32).unsqueeze(0)
  action_high = torch.tensor(action_space.high, device=device, dtype=torch.float32).unsqueeze(0)

# policy network: the state value head is built for A2C/A3C, the action value head for AC
policy_net = ActorCriticNetwork(
  dim_observation,
  action_space,
  use_state_value_network=hp.ALGORITHM in ["A2C", "A3C"],
  use_action_value_network=hp.ALGORITHM == "AC"
).to(device)

# adamw optimizer over every parameter of the network
optimizer = optim.AdamW(policy_net.parameters(), lr=hp.LR, amsgrad=True)

# save directory, named after the time this cell was run
start_datetime = datetime.now()
dirname = start_datetime.strftime("%Y%m%d-%H%M%S")
path = os.path.join(os.getcwd(), "reinforce_based", dirname)

# training variables
train_objectives = []     # (episode, discounted return from the first step)
train_actor_losses = []   # (step, actor loss)
train_critic_losses = []  # (step, critic loss)
train_frames = []         # (episode, last frame index)
train_scores = []         # (episode, score)
test_frames = []          # (episode, mean last frame index over a test run)
test_scores = []          # (episode, mean score over a test run)
steps = 0                 # total number of environment steps taken while training

In [28]:
def select_action(state: torch.Tensor):
  """
  Sample an action for a single state from the policy network.

  Gradient tracking is not disabled here because the caller backpropagates through the
  returned log probability and critic outputs. Callers that only need the action wrap
  the call in torch.no_grad().

  Args:
      state (torch.Tensor): batch holding exactly one observation (first dimension must be 1)

  Returns:
      state_value: state value from the network (None when that head is disabled)
      action_value: action values from the network (None when that head is disabled)
      action: int for a discrete action space; otherwise a numpy array rescaled into [action_low, action_high]
      log_prob (torch.Tensor): log probability of the sampled value, summed over the action dimensions
          for a continuous action space (it is the log probability of the sample before tanh)
  """
  assert state.shape[0] == 1

  # forward pass, then draw a sample and evaluate its log probability
  state_value, action_value, policy = policy_net(state)
  sample = policy.sample()
  log_prob = policy.log_prob(sample)

  if not is_discrete_action:
    # squash the sample into [-1, 1], then map it linearly onto [low, high]:
    # low * 0.5 * (1 - val) + high * 0.5 * (1 + val)
    squashed = torch.tanh(sample)
    scaled = action_low * 0.5 * (1.0 - squashed) + action_high * 0.5 * (1.0 + squashed)
    joint_log_prob = torch.sum(log_prob, dim=-1, keepdim=False)
    return state_value, action_value, scaled[0].cpu().numpy(), joint_log_prob

  # discrete action: the sampled index is the action itself
  return state_value, action_value, sample.item(), log_prob

In [29]:
def _plot_moving_average(series: list, label: str, window: int=100):
  """
  Plot the moving average of a list of (x, y) pairs on the current axes.

  Nothing is drawn until the series holds at least `window` points.
  """
  if len(series) < window:
    return
  x, y = zip(*series)
  y = torch.mean(torch.tensor(y).unfold(0, window, 1), dim=1)
  # the first averaged value belongs to the window-th point
  plt.plot(x[window - 1:], y, label=label)


def save_plot():
  """
  Plot objective func, losses, average frames, and scores and save the figures. 

  The figure is written to "plot.png" inside `path` and, with an inline backend, shown in
  the notebook. It is always closed afterwards: this function is called repeatedly during
  training, and figures that stay open accumulate in memory.
  """
  fig = plt.figure(figsize=(16, 12))
  plt.ion()

  try:
    plt.subplot(2, 2, 1)
    plt.title("J(θ)")
    plt.xlabel("Episode")
    plt.ylabel("J(θ)")
    plt.plot(*zip(*train_objectives), label="train")
    _plot_moving_average(train_objectives, "train-avg100")
    plt.legend()
    plt.grid()

    # losses are only shown as 100-step moving averages
    plt.subplot(2, 2, 2)
    plt.title("Loss")
    plt.xlabel("Learning Step")
    plt.ylabel("Loss")
    _plot_moving_average(train_actor_losses, "train-actor-avg100")
    _plot_moving_average(train_critic_losses, "train-critic-avg100")
    plt.legend()
    plt.grid()

    plt.subplot(2, 2, 3)
    plt.title("# of Frames")
    plt.xlabel("Episode")
    plt.ylabel("# of Frames")
    plt.plot(*zip(*train_frames), label="train")
    plt.plot(*zip(*test_frames), label="test")
    plt.legend()
    plt.grid()

    plt.subplot(2, 2, 4)
    plt.title("Score")
    plt.xlabel("Episode")
    plt.ylabel("Score")
    plt.plot(*zip(*train_scores), label="train")
    plt.plot(*zip(*test_scores), label="test")
    plt.legend()
    plt.grid()

    plt.savefig(os.path.join(path, "plot.png"))

    if is_ipython:
      display.clear_output(wait=True)
      display.display(fig)
  finally:
    plt.ioff()
    # release the figure so repeated calls do not pile up open figures
    plt.close(fig)

In [30]:
def save_model():
  """
  Save model, hyperparameters, and training info.

  Three files are written into `path`:
  - "model.pt": state dict of the policy network
  - "hparam.json": the hyperparameters
  - "info.json": environment name, action type, test results, step count, and elapsed training time in seconds
  """
  # save model
  torch.save({
    "policy_net": policy_net.state_dict()
  }, os.path.join(path, "model.pt"))
  
  # save hyperparameters
  with open(os.path.join(path, "hparam.json"), "w") as w:
    json.dump(hp._asdict(), w, indent=2)
  
  # timedelta.seconds only holds the sub-day remainder, so it wraps around after 24 hours;
  # total_seconds() covers the whole elapsed time
  elapsed = datetime.now() - start_datetime
  
  # save training info
  info = {
    "env": envname,
    "action_type": "discrete" if is_discrete_action else "continuous",
    "test_frames": test_frames,
    "test_scores": test_scores,
    "steps": steps,
    "training_time": int(elapsed.total_seconds())
  }
  with open(os.path.join(path, "info.json"), "w") as w:
    json.dump(info, w, indent=2)

## Training: Testing Function

- `test_model`: run policy model in the environment with trained policy network. 

In [31]:
def test_model():
  """
  Play hp.TEST_EPISODES episodes with the current policy network, without gradient tracking.

  Returns:
      mean index of the last frame over the episodes, mean score over the episodes
  """
  episode_frames = []
  episode_scores = []

  for _ in range(hp.TEST_EPISODES):
    # start a new episode
    observation, _ = env.reset()
    score = 0
    frame = 0

    while True:
      # turn the latest observation into a batch of one state
      state = torch.tensor(np.array(observation), device=device, dtype=torch.float32).unsqueeze(0)

      # sample an action; no gradients are needed while testing
      with torch.no_grad():
        _, _, action, _ = select_action(state)

      # act to next state
      observation, reward, terminated, truncated, _ = env.step(action)
      score += reward

      # the episode ends on termination or truncation
      if terminated or truncated:
        break
      frame += 1

    episode_frames.append(frame)
    episode_scores.append(score)

  # average the per-episode results
  return np.mean(np.array(episode_frames)), np.mean(np.array(episode_scores))

## Training

### REINFORCE

During training, the agent is simulated in the environment to collect a trajectory, which is then used to train the policy network. 

The objective of the REINFORCE algorithm is to maximize the objective function defined below. 

$$
\begin{gather}
J(\theta) = E_{s_0 \sim p_0} \left[ v_{\pi_\theta}(s_0) \right] = E_{\tau \sim \pi_\theta} \left[ G(\tau) \right] \notag \\
\text{where } \tau = S_0, A_0, R_0, S_1, ..., S_{T-1}, A_{T-1}, R_{T-1}, S_T \notag \\
\text{where } G(\tau) = R_0 + \gamma R_1 + ... + \gamma^{T-1} R_{T-1} \notag
\end{gather}
$$

We can derive the gradient of the objective function using the log-derivative trick. 

$$
\begin{aligned}
\nabla_\theta J(\theta) &= \nabla_\theta E_{\tau \sim \pi_\theta} \left[ G(\tau) \right] \\ 
&= E_{\tau \sim \pi_\theta} \left[ \nabla_\theta \ln p(\tau|\pi_\theta) G(\tau) \right] \\ 
&= E_{\tau \sim \pi_\theta} \left[ G(\tau) \sum_{t=0}^T \nabla_\theta \ln \pi(A_t|S_t;\pi_\theta) \right] \\ 
&= E_{\tau \sim \pi_\theta} \left[ \sum_{k=0}^{T-1} \gamma^k R_k \sum_{t=0}^T \nabla_\theta \ln \pi(A_t|S_t;\pi_\theta) \right] \\
&= E_{\tau \sim \pi_\theta} \left[ \sum_{t=0}^T \left( \left( \sum_{k=t}^{T-1} \gamma^k R_k \right) \nabla_\theta \ln \pi(A_t|S_t;\pi_\theta) \right) \right] 
\end{aligned}
$$

The last equation makes sense because the following can be shown: $ E_{\tau \sim \pi_\theta} \left[\gamma^k R_k \nabla_\theta \ln \pi(A_t | S_t ; \pi_\theta) \right] = 0 \text{ for } k < t $

At this point, we drop the leading $ \gamma^t $ factor (an approximation) so that the contribution of late time steps does not vanish. 

$$
\begin{gather}
\sum_{k=t}^{T-1} \gamma^k R_k 
= \gamma^t \sum_{k=t}^{T-1}  \gamma^{k-t} R_k 
= \gamma^t G_t
\cong G_t \notag \\
\nabla_\theta J(\theta) 
\cong E_{\tau \sim \pi_\theta} \left[ \sum_{t=0}^T G_t \nabla_\theta \ln \pi(A_t|S_t;\pi_\theta) \right] \notag
\end{gather}
$$

So, we update parameters as below. 

$$
\begin{aligned}
\theta &\leftarrow \theta + \alpha \sum_{t=0}^T G_t \nabla_\theta \ln \pi(A_t|S_t;\pi_\theta)
\end{aligned}
$$

### Actor-Critic

In REINFORCE algorithm, $ G_t $ is required for training. 
Thus, training can be done only after the end of an episode, which causes high variance. 

So, Actor-Critic tries to estimate $ G_t $ using neural network $ Q_w $, or $ V_w $. 

$$
\begin{aligned}
\nabla_\theta J(\theta) 
&\cong E_{\tau \sim \pi_\theta} \left[ \sum_{t=0}^T G_t \nabla_\theta \ln \pi(A_t|S_t; \pi_\theta) \right] \\
&= \sum_{t=0}^T E_{S_0,...,A_t} \left[ \nabla_\theta \ln \pi(A_t|S_t;\pi_\theta) E_{S_{t+1},...,S_T} \left[ G_t | S_0,..., A_t \right] \right] \\
&=\sum_{t=0}^T E_{S_0,...,A_t} \left[ \nabla_\theta \ln \pi(A_t|S_t;\pi_\theta) Q(S_t, A_t) \right] \\
&=\sum_{t=0}^T E_{S_t,A_t} \left[ \nabla_\theta \ln \pi(A_t|S_t;\pi_\theta) Q(S_t, A_t) \right]
\end{aligned}
$$

First, the expectation of a sum is rewritten as a sum of expectations. 
Then, the expectation over the whole trajectory is split into the part up to $ t $ and the part after $ t $. 
Finally, we define the action-value function $ Q(S_t, A_t) = E_{S_{t+1},...,S_T} \left[ G_t | S_0,..., A_t \right] $

The update formulas for the actor $ \theta $ and the critic $ w $ are shown below. 
The objective of the actor is to increase the objective function $ J(\theta) $. 
The objective of the critic is to decrease the TD error $ \delta = R_t + \gamma \max_A Q_w(S_{t+1}, A) - Q_w(S_t, A_t) $

$$
\begin{aligned}
\theta &\leftarrow \theta + \alpha  \nabla_\theta \ln \pi(A_t|S_t;\pi_\theta) ( R_t + \gamma \max_A Q_w(S_{t+1}, A) - Q_w(S_t, A_t) ) \\
w &\leftarrow w - \beta \nabla_w (R_t + \gamma \max_A Q_w(S_{t+1}, A) - Q_w(S_t, A_t))^2
\end{aligned}
$$

In this code, I used Actor-Critic using $ Q(S_t, A_t) $, so it can't deal with continuous action space. 

In [34]:
# the loop below only implements the REINFORCE and Actor-Critic updates;
# fail early instead of running every episode without ever updating the network
if hp.ALGORITHM not in ("REINFORCE", "AC"):
  raise NotImplementedError(f"training for {hp.ALGORITHM} is not implemented in this notebook")

# create training directory (exist_ok lets this cell be re-run without re-running the setup cell)
os.makedirs(path, exist_ok=True)

# critic loss; for the single-element tensors used here the default mean reduction yields the same value as the element itself
huber_loss = HuberLoss()
# keeps the return normalization finite when every return of an episode is identical
RETURN_EPS = 1e-8

for episode in tqdm(range(1, hp.NUM_EPISODES + 1)):
  # initialize environment and state
  state, _ = env.reset()
  state = torch.tensor(np.array(state), device=device, dtype=torch.float32).unsqueeze(0)
  score = 0
  if hp.ALGORITHM == "REINFORCE":
    reward_history = [] # list of scalars
    log_prob_history = [] # list of (1,) tensors
  
  # start an episode
  for frame in count():
    # select action (with gradients: log_prob and action_value are backpropagated below)
    state_value, action_value, action, log_prob = select_action(state)
    steps += 1
    
    # act to next state
    observation, reward, terminated, truncated, _ = env.step(action)
    score += reward
    done = terminated or truncated
    if hp.ALGORITHM == "REINFORCE":
      reward_history.append(reward)
      log_prob_history.append(log_prob)
    
    # update state to next state
    state = torch.tensor(np.array(observation), device=device, dtype=torch.float32).unsqueeze(0)
    
    # Actor-Critic training
    if hp.ALGORITHM == "AC":
      # Q(S_t, A_t) of the action that was actually taken
      current_action_value = action_value.gather(1, torch.tensor([[action]], device=device, dtype=torch.int64))
      
      # TD target R_t + gamma * max_A Q(S_{t+1}, A), computed without gradients so that
      # the critic is regressed towards a fixed target. Only a real terminal state stops
      # the bootstrap; a truncated episode still has a meaningful next-state value.
      with torch.no_grad():
        _, next_action_value, _ = policy_net(state)
        next_action_value = next_action_value.max(1, keepdim=True).values
        bootstrap = 0.0 if terminated else 1.0
        expected_action_value = float(reward) + hp.GAMMA * bootstrap * next_action_value
      
      # TD error, detached: it only weights the actor's log probability and must not
      # send actor gradients into the critic head
      td_error = (expected_action_value - current_action_value).detach()
      
      # actor loss
      actor_loss = -torch.sum(torch.mul(td_error, log_prob.unsqueeze(1)))
      train_actor_losses.append((steps, actor_loss.item()))
      
      # critic loss
      critic_loss = huber_loss(current_action_value, expected_action_value)
      train_critic_losses.append((steps, critic_loss.item()))
      
      # gradient ascent/descent
      loss = actor_loss + critic_loss
      optimizer.zero_grad()
      loss.backward()
      optimizer.step()
    
    # check end condition
    if done:
      train_scores.append((episode, score))
      train_frames.append((episode, frame))
      break
  
  # REINFORCE training
  if hp.ALGORITHM == "REINFORCE":
    # get list of G_t by accumulating the discounted rewards from the last step backwards
    returns = []
    running_return = 0.0
    for step_reward in reversed(reward_history):
      running_return = step_reward + hp.GAMMA * running_return
      returns.append(running_return)
    returns.reverse()
    g_history = torch.tensor(returns, device=device, dtype=torch.float32)
    
    # record G_0 before normalization
    train_objectives.append((episode, g_history[0].item()))
    
    # normalize g_history; the standard deviation is undefined for a single step and
    # zero for constant returns, so skip the former and add a small epsilon for the latter
    if g_history.numel() > 1:
      g_history = (g_history - torch.mean(g_history)) / (torch.std(g_history) + RETURN_EPS)
    
    # get list of log probs of actions
    log_prob_history = torch.cat(log_prob_history).to(device=device)
    
    # gradient ascent
    loss = -torch.sum(torch.mul(g_history, log_prob_history))
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
  
  # periodic test run, plot, and checkpoint
  if episode % hp.TEST_FREQ == 0:
    mean_frame, mean_score = test_model()
    test_frames.append((episode, mean_frame))
    test_scores.append((episode, mean_score))
    save_plot()
    save_model()

save_plot()
save_model()

  0%|          | 0/2000 [00:00<?, ?it/s]

  0%|          | 0/2000 [00:00<?, ?it/s]

(96, 96)





TypeError: list indices must be integers or slices, not tuple

## Test

In this block, trained agent plays in the environment. We can see rendered environment played by the agent. 

In [None]:
# environment with on-screen rendering, wrapped the same way as the training environment
env = gym.make(envname, render_mode="human", continuous=continuous)
env = preprocess_env(env)

scores = []

# if you want to load from trained model, edit this (editable)
load_dirname = None

try:
  if load_dirname is not None:
    # load models
    path = os.path.join(os.getcwd(), "reinforce_based", load_dirname)
    # NOTE(review): torch.load unpickles the file, so only load checkpoints you trust
    checkpoint = torch.load(os.path.join(path, "model.pt"), map_location=device)
    
    policy_net.load_state_dict(checkpoint["policy_net"])

  # repeat for TEST_EPISODES episodes
  for episode in range(1, hp.TEST_EPISODES + 1):
    # initialize environment and state
    state, _ = env.reset()
    state = torch.tensor(np.array(state), device=device, dtype=torch.float32).unsqueeze(0)
    score = 0
    
    # start an episode
    for _ in count():
      # sample an action from the policy (select_action samples; it does not act greedily)
      with torch.no_grad():
        _, _, action, _ = select_action(state)
      
      # act to next state
      observation, reward, terminated, truncated, _ = env.step(action)
      score += reward
      done = terminated or truncated
      
      # update state
      state = torch.tensor(np.array(observation), device=device, dtype=torch.float32).unsqueeze(0)

      # check end condition
      if done:
        print(f"Episode {episode}: {score}")
        scores.append(score)
        break
finally:
  # always close the environment, even when loading fails or the run is interrupted
  env.close()

# summary statistics; skipped when no episode was played so that they cannot fail on an empty list
if scores:
  print(f"Average: {sum(scores) / len(scores)}")
  print(f"Max: {max(scores)}")
  print(f"Min: {min(scores)}")
else:
  print("No test episodes were run.")