# REINFORCE

Notebook that runs REINFORCE-based RL methods - REINFORCE, Actor-Critic, A2C, and A3C.
The agent has been trained and tested on gymnasium environments. 

## REINFORCE-based methods
- model-free
- policy-based
- on-policy (A3C: off-policy)

||REINFORCE|Actor-Critic|A2C|A3C|
|-|-|-|-|-|
|bias|low|high|higher|high|
|variance|high|low|lower|low|

## Environment

In this notebook, we use the following Gymnasium environments. 

- CarRacing-v2 (discrete action space)
- CarRacing-v2 (continuous action space)

In [None]:
import gymnasium as gym
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import os
import json
from gymnasium.spaces import Discrete, Box
from gymnasium.wrappers.gray_scale_observation import GrayScaleObservation
from gymnasium.wrappers.frame_stack import FrameStack
from tqdm import tqdm
from collections import namedtuple
from datetime import datetime
from itertools import count

import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions import Categorical, Normal

def preprocess_env(env: gym.Env):
  """
  Wrap the environment so that each observation is a stack of 4 grayscale frames. 
  """
  # drop the color channel first, then stack the most recent frames
  grayscale_env = GrayScaleObservation(env, keep_dim=False)
  return FrameStack(grayscale_env, num_stack=4)

# environment id
envname = "CarRacing-v2"

# action space type: True for continuous, False for discrete (editable)
continuous = False
env = preprocess_env(gym.make(envname, continuous=continuous))

# when matplotlib runs on an inline backend, IPython's display is used to show figures
is_ipython = "inline" in matplotlib.get_backend()
if is_ipython:
  from IPython import display

# run on cuda when it is available, otherwise on cpu
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print("Device", device)

## Hyperparameters

- `ALGORITHM`: one of `["REINFORCE", "AC", "A2C", "A3C"]`.
- `NUM_EPISODES`: number of training episodes. 
- `TEST_EPISODES`: number of test episodes during training. 
- `TEST_FREQ`: testing frequency. 
- `GAMMA`: discount factor used when computing the return $G_t$. 
- `LR`: learning rate. 

In [None]:
# immutable record holding the training configuration
HyperParameter = namedtuple(
  "HyperParameter",
  ["ALGORITHM", "NUM_EPISODES", "TEST_EPISODES", "TEST_FREQ", "GAMMA", "LR"]
)

# editable (values are given in field order)
hp = HyperParameter(
  "REINFORCE",  # ALGORITHM
  2000,         # NUM_EPISODES
  5,            # TEST_EPISODES
  100,          # TEST_FREQ
  0.99,         # GAMMA
  0.0005        # LR
)

## Actor & Critic Network

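A network that receives a stack of 96x96 grayscale frames and returns a policy - (1) a categorical distribution over the discrete actions or (2) a normal distribution over the continuous action vector - together with an optional state value. 
We use a simple network made of convolutional and fully-connected layers. 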

The value network is used by Actor-Critic, A2C, and A3C, so an optional value head is also included in the network. 

In [None]:
class ActorCriticNetwork(nn.Module):
  """
  Shared CNN feature extractor followed by an actor head and an optional critic head. 
  """
  
  def __init__(
    self, 
    dim_observation: tuple, 
    action_space: Discrete | Box,
    use_value_network: bool=False
  ):
    """
    Build the network. 

    dim_observation: (C, H, W) of the observation; H and W must both be 96. 
    action_space: Discrete gives a softmax actor head, otherwise mean / log-std heads are built. 
    use_value_network: create the critic head only when True. 
    """
    super().__init__()
    n_channels, height, width = dim_observation
    assert height == 96
    assert width == 96
    
    # shared trunk: three conv layers, then one fully-connected layer
    # 4096 is the flattened size of the last conv output for a 96x96 input
    trunk = [
      nn.Conv2d(in_channels=n_channels, out_channels=32, kernel_size=8, stride=4),
      nn.ReLU(),
      nn.Conv2d(in_channels=32, out_channels=64, kernel_size=5, stride=2),
      nn.ReLU(),
      nn.Conv2d(in_channels=64, out_channels=64, kernel_size=3, stride=1),
      nn.ReLU(),
      nn.Flatten(),
      nn.Linear(4096, 512),
      nn.ReLU(),
    ]
    self.seqmodel = nn.Sequential(*trunk)
    
    # critic head (optional): scalar state value
    self.use_value_network = use_value_network
    if use_value_network:
      self.value = self._make_head(1)
    
    # actor head: depends on the kind of action space
    self.discrete_action = isinstance(action_space, Discrete)
    if self.discrete_action:
      # probabilities over the discrete actions
      self.action_prob = self._make_head(action_space.n, nn.Softmax(dim=-1))
      return
    
    # continuous action: per-dimension mean and log standard deviation
    assert len(action_space.shape) == 1
    n_action_dims = action_space.shape[0]
    self.mean = self._make_head(n_action_dims)
    self.log_std = self._make_head(n_action_dims)

  @staticmethod
  def _make_head(out_features: int, *extra_layers: nn.Module):
    """
    Two-layer MLP head on top of the 512-d features, optionally followed by extra layers. 
    """
    return nn.Sequential(
      nn.Linear(512, 128),
      nn.ReLU(),
      nn.Linear(128, out_features),
      *extra_layers
    )

  def forward(self, x: torch.Tensor):
    """
    Return (value, policy). 
    value is None when the critic head is disabled; 
    policy is a Categorical (discrete) or Normal (continuous) distribution. 
    """
    features = self.seqmodel(x)
    
    # critic head
    value = self.value(features) if self.use_value_network else None
    
    # actor head
    if self.discrete_action:
      return value, Categorical(self.action_prob(features))
    
    mean = self.mean(features)
    # clamp the log-std before exponentiating to keep the std in a bounded range
    log_std = torch.clamp(self.log_std(features), min=-20, max=2)
    return value, Normal(mean, torch.exp(log_std))

## Training: Utility Functions

- `select_action`: select agent's action using policy network. 
- `save_plot`: plot objective func, average frames, and score. 
- `save_model`: store model, hyperparameters, and training info. 

In [None]:
# the configured algorithm must be one of the known names
assert hp.ALGORITHM in ("REINFORCE", "AC", "A2C", "A3C")

# observation / action description taken from the environment
dim_observation = env.observation_space.shape
action_space = env.action_space
discrete_action = isinstance(action_space, Discrete)
if not discrete_action:
  # action bounds as float tensors with a leading batch dimension
  action_low, action_high = (
    torch.tensor(bound, device=device, dtype=torch.float32).unsqueeze(0)
    for bound in (action_space.low, action_space.high)
  )

# policy network (built without the value head)
policy_net = ActorCriticNetwork(dim_observation, action_space, use_value_network=False).to(device)

# adamw optimizer
optimizer = optim.AdamW(policy_net.parameters(), lr=hp.LR, amsgrad=True)

# output directory, named after the start time of the run
start_datetime = datetime.now()
dirname = start_datetime.strftime("%Y%m%d-%H%M%S")
path = os.path.join(os.getcwd(), "reinforce_based", dirname)

# training history: lists of (x, value) pairs consumed by save_plot / save_model
train_objectives, train_losses, train_frames, train_scores = [], [], [], []
test_frames, test_scores = [], []
# total number of environment steps taken during training
steps = 0

In [None]:
def select_action(state: torch.Tensor):
  """
  Sample an action for a single state from the policy network. 
  Gradient tracking is not disabled here, so the returned log probability can be used 
  for the policy update; evaluation callers wrap the call in torch.no_grad() themselves. 

  Returns (action, log_prob): 
  - discrete: action is a Python scalar, log_prob is the log probability of the sampled action. 
  - continuous: action is a numpy array rescaled into [low, high], 
    log_prob is the log probability of the raw (unsquashed) sample summed over action dimensions. 
  """
  assert state.shape[0] == 1
  
  # only the policy is needed; the value output is ignored
  _, policy = policy_net(state)
  raw_action = policy.sample()
  log_prob = policy.log_prob(raw_action)
  
  if not discrete_action:
    # squash into [-1, 1], then map linearly onto [low, high]:
    # low * 0.5 * (1 - val) + high * 0.5 * (1 + val)
    squashed = torch.tanh(raw_action)
    scaled = action_low * 0.5 * (1.0 - squashed) + action_high * 0.5 * (1.0 + squashed)
    return scaled[0].cpu().numpy(), torch.sum(log_prob, dim=-1, keepdim=False)
  
  # discrete: the sample itself is the action index
  return raw_action.item(), log_prob

In [None]:
def _plot_panel(position: int, title: str, xlabel: str, series: dict, window: int=0):
  """
  Draw one panel of the 2x2 summary figure. 

  position: 1-based subplot index. 
  title: used as both the panel title and the y-axis label. 
  xlabel: x-axis label. 
  series: maps a legend label to a list of (x, y) pairs; empty lists are skipped. 
  window: if positive, also draw the moving average over `window` points 
          for every series that has at least that many points. 
  """
  plt.subplot(2, 2, position)
  plt.title(title)
  plt.xlabel(xlabel)
  plt.ylabel(title)
  
  has_artist = False
  for label, points in series.items():
    if not points:
      continue
    xs, ys = zip(*points)
    plt.plot(xs, ys, label=label)
    has_artist = True
    if window > 0 and len(points) >= window:
      # mean of every sliding window; the first mean belongs to x index window - 1
      averaged = torch.mean(torch.tensor(ys).unfold(0, window, 1), dim=1)
      plt.plot(xs[window - 1:], averaged, label=f"{label}-avg{window}")
  
  # a legend without any labelled line only produces a warning
  if has_artist:
    plt.legend()
  plt.grid()


def save_plot():
  """
  Plot objective func, losses, number of frames, and scores, and save the figure 
  as plot.png in the training directory. 
  On an inline backend the figure is also displayed in place of the previous output. 
  """
  fig = plt.figure(figsize=(16, 12))
  try:
    plt.ion()
    
    _plot_panel(1, "J(θ)", "Episode", {"train": train_objectives}, window=100)
    _plot_panel(2, "Loss", "Learning Step", {"train": train_losses}, window=100)
    _plot_panel(3, "# of Frames", "Episode", {"train": train_frames, "test": test_frames})
    _plot_panel(4, "Score", "Episode", {"train": train_scores, "test": test_scores})
    
    plt.ioff()
    fig.savefig(os.path.join(path, "plot.png"))
    
    if is_ipython:
      display.clear_output(wait=True)
      display.display(fig)
  finally:
    # this function is called repeatedly during training; 
    # without closing, every call would leave one more open figure behind
    plt.close(fig)

In [None]:
def save_model():
  """
  Save model, hyperparameters, and training info into the training directory. 

  Files written: 
  - model.pt: state dict of the policy network under the key "policy_net". 
  - hparam.json: hyperparameters. 
  - info.json: environment name, test history, step count, and elapsed training time in seconds. 
  """
  # save model
  torch.save({
    "policy_net": policy_net.state_dict()
  }, os.path.join(path, "model.pt"))
  
  # save hyperparameters
  with open(os.path.join(path, "hparam.json"), "w") as w:
    json.dump(hp._asdict(), w, indent=2)
  
  # total_seconds() covers the whole duration; 
  # timedelta.seconds would only hold the part below one day
  elapsed = datetime.now() - start_datetime
  
  # save training info
  info = {
    "env": envname,
    "test_frames": test_frames,
    "test_scores": test_scores,
    "steps": steps,
    "training_time": int(elapsed.total_seconds()),
  }
  with open(os.path.join(path, "info.json"), "w") as w:
    json.dump(info, w, indent=2)

## Training: Testing Function

- `test_model`: run policy model in the environment with trained policy network. 

In [None]:
def test_model():
  """
  Run TEST_EPISODES episodes with the current policy network, without gradient tracking. 
  Actions are sampled through select_action. 

  Returns (mean of the last frame index of each episode, mean episode score). 
  """
  frames, scores = [], []
  
  for _ in range(hp.TEST_EPISODES):
    # fresh episode
    state, _ = env.reset()
    state = torch.tensor(np.array(state), device=device, dtype=torch.float32).unsqueeze(0)
    score = 0
    frame = 0
    
    while True:
      # no gradient is needed while evaluating
      with torch.no_grad():
        action, _ = select_action(state)
      
      # advance the environment and accumulate the reward
      observation, reward, terminated, truncated, _ = env.step(action)
      score += reward
      state = torch.tensor(np.array(observation), device=device, dtype=torch.float32).unsqueeze(0)
      
      if terminated or truncated:
        break
      frame += 1
    
    # frame is the zero-based index of the step that ended the episode
    frames.append(frame)
    scores.append(score)
  
  return np.mean(frames), np.mean(scores)

## Training

During training, we simulate the agent in the environment to collect a trajectory, and then train the policy network on it. 

The goal of the REINFORCE algorithm is to maximize the objective function, which is defined below. 

$$
\begin{gather}
J(\theta) = E_{s_0 \sim p_0} \left[ v_{\pi_\theta}(s_0) \right] = E_{\tau \sim \pi_\theta} \left[ G(\tau) \right] \notag \\
\text{where } \tau = S_0, A_0, R_0, S_1, ..., S_{T-1}, A_{T-1}, R_{T-1}, S_T \notag \\
\text{where } G(\tau) = R_0 + \gamma R_1 + ... + \gamma^{T-1} R_{T-1} \notag
\end{gather}
$$

We can derive the gradient of the objective function using the log-derivative trick. 

$$
\begin{aligned}
\nabla_\theta J(\theta) &= \nabla_\theta E_{\tau \sim \pi_\theta} \left[ G(\tau) \right] \\ 
&= E_{\tau \sim \pi_\theta} \left[ \nabla_\theta \ln p(\tau|\pi_\theta) G(\tau) \right] \\ 
&= E_{\tau \sim \pi_\theta} \left[ G(\tau) \sum_{t=0}^{T-1} \nabla_\theta \ln \pi(A_t|S_t;\pi_\theta) \right] \\ 
&= E_{\tau \sim \pi_\theta} \left[ \sum_{k=0}^{T-1} \gamma^k R_k \sum_{t=0}^{T-1} \nabla_\theta \ln \pi(A_t|S_t;\pi_\theta) \right] \\
&= E_{\tau \sim \pi_\theta} \left[ \sum_{t=0}^{T-1} \left( \left( \sum_{k=t}^{T-1} \gamma^k R_k \right) \nabla_\theta \ln \pi(A_t|S_t;\pi_\theta) \right) \right] 
\end{aligned}
$$

The last equality holds because a reward received before step $t$ does not depend on the action $A_t$, so the following can be shown: $E_{\tau \sim \pi_\theta} \left[\gamma^k R_k \nabla_\theta \ln \pi(A_t | S_t ; \pi_\theta) \right] = 0 \text{ for } k < t$

At this point, we drop the leading factor $\gamma^t$ (an approximation) so that the contribution of later time steps does not vanish. 

$$
\begin{gather}
\sum_{k=t}^{T-1} \gamma^k R_k 
= \gamma^t \sum_{k=t}^{T-1}  \gamma^{k-t} R_k 
= \gamma^t G_t
\cong G_t \notag \\
\nabla_\theta J(\theta) 
\cong E_{\tau \sim \pi_\theta} \left[ \sum_{t=0}^{T-1} G_t \nabla_\theta \ln \pi(A_t|S_t;\pi_\theta) \right] \notag
\end{gather}
$$

So, we update parameters as below. 

$$
\theta \leftarrow \theta + \alpha \sum_{t=0}^{T-1} G_t \nabla_\theta \ln \pi(A_t|S_t;\pi_\theta)
$$

In [None]:
# create training directory (exist_ok so that re-running this cell does not fail)
os.makedirs(path, exist_ok=True)

# lower bound for the std used to normalize the returns; avoids division by zero
RETURN_STD_EPS = 1e-8

for episode in tqdm(range(1, hp.NUM_EPISODES + 1)):
  # initialize environment and state
  state, _ = env.reset()
  state = torch.tensor(np.array(state), device=device, dtype=torch.float32).unsqueeze(0)
  score = 0
  reward_history = [] # list of scalars
  log_prob_history = [] # list of (1,) tensors
  
  # roll out one episode with the current policy
  for frame in count():
    # select action (gradient is kept for the policy update)
    action, log_prob = select_action(state)
    steps += 1
    
    # act to next state
    observation, reward, terminated, truncated, _ = env.step(action)
    score += reward
    reward_history.append(reward)
    log_prob_history.append(log_prob)
    
    # update state to next state
    state = torch.tensor(np.array(observation), device=device, dtype=torch.float32).unsqueeze(0)
    
    # check end condition
    if terminated or truncated:
      train_scores.append((episode, score))
      train_frames.append((episode, frame))
      break
  
  # get list of G_t: accumulate from the last step backwards, then restore time order
  g_history = []
  discounted = 0.0
  for step_reward in reversed(reward_history):
    discounted = step_reward + discounted * hp.GAMMA
    g_history.append(discounted)
  g_history.reverse()
  # float32 like every other tensor fed to / produced by the network
  g_history = torch.tensor(g_history, device=device, dtype=torch.float32)
  
  # G_0 is recorded as the objective estimate before normalization
  train_objectives.append((episode, g_history[0].item()))
  
  # normalize g_history
  # std of a single element is NaN and std of constant returns is 0; 
  # both would put NaN/inf into the gradient, so they are guarded here
  g_history = g_history - torch.mean(g_history)
  if g_history.numel() > 1:
    g_history = g_history / torch.std(g_history).clamp_min(RETURN_STD_EPS)
  
  # get list of log probs of actions
  log_prob_history = torch.cat(log_prob_history).to(device=device)
  
  # gradient ascent on sum(G_t * log pi) == gradient descent on its negative
  loss = -torch.sum(torch.mul(g_history, log_prob_history))
  optimizer.zero_grad()
  loss.backward()
  optimizer.step()
  
  # one learning step per episode; recorded for the "Loss" panel of save_plot
  train_losses.append((episode, loss.item()))
  
  # periodic evaluation and checkpoint
  if episode % hp.TEST_FREQ == 0:
    mean_frame, mean_score = test_model()
    test_frames.append((episode, mean_frame))
    test_scores.append((episode, mean_score))
    save_plot()
    save_model()

# final plot and checkpoint
save_plot()
save_model()

## Test

In this block, the trained agent plays in the environment with rendering enabled, so we can watch how it behaves. 

In [None]:
# same environment as in training, but rendered on screen
env = preprocess_env(gym.make(envname, render_mode="human", continuous=continuous))

scores = []

# name of a saved run directory to load the model from; None keeps the model in memory (editable)
load_dirname = None

if load_dirname is not None:
  # restore the policy network from the saved checkpoint
  path = os.path.join(os.getcwd(), "reinforce_based", load_dirname)
  checkpoint = torch.load(os.path.join(path, "model.pt"), map_location=device)
  policy_net.load_state_dict(checkpoint["policy_net"])

# repeat for TEST_EPISODES episodes
for episode in range(1, hp.TEST_EPISODES + 1):
  # initialize environment and state
  state, _ = env.reset()
  state = torch.tensor(np.array(state), device=device, dtype=torch.float32).unsqueeze(0)
  score = 0
  
  # play until the episode terminates or is truncated
  done = False
  while not done:
    # sample an action from the policy (no gradient needed)
    with torch.no_grad():
      action, _ = select_action(state)
    
    # act to next state
    observation, reward, terminated, truncated, _ = env.step(action)
    score += reward
    done = terminated or truncated
    
    # update state
    state = torch.tensor(np.array(observation), device=device, dtype=torch.float32).unsqueeze(0)
  
  print(f"Episode {episode}: {score}")
  scores.append(score)

env.close()

# summary over all test episodes
print(f"Average: {sum(scores) / hp.TEST_EPISODES}")
print(f"Max: {max(scores)}")
print(f"Min: {min(scores)}")