<a href="https://colab.research.google.com/github/gitHubAndyLee2020/OpenAI_Gym_RL_Algorithms_Database/blob/main/ActorCritic_Module.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Actor-Critic

> About

- Given some state, the Actor-Critic model will output the probability of taking each possible action, and expected reward for the state
- The difference between the actual reward from taking the selected action and the expected reward is used to calculate the loss value, which trains the neural network to maximize the probability of the action that produces the maximum reward
- Same concept as Policy Gradient, except the expected reward stabilizes the training by reducing the variance of the loss value

> Pro

- Variance Reduction; more stable training than Policy Gradient

> Con

- Sensitivity to Hyperparameters; more sensitive to hyperparameter choices than Policy Gradient

```
class Policy(nn.Module):
  def __init__(self):
    - Initialize neural network that has two mappings, one that maps state space to action space and another that maps state space to state value
    - The state value represents the estimated reward given the current state
    - Initialize storage for saving actions and rewards

  def forward(self, x):
    - Feed the state input through both state-to-action-probability mapping and state-to-state-value mapping
    - Apply softmax to generated action probability and return the action probability and state value
```

```
def select_action(state):
  - Feed the state to the Policy network to get action probability and state value
  - Select an action from the action probability; if an action has probability p in the action probability distribution, then it has a p chance of being selected
  - Store the selected action and state value pair in the storage
  - Return the selected action
```

```
def finish_episode():
  - Calculate the accumulated reward with formula R = r0 + gamma * r1 + gamma^2 * r2 + ...
  - Normalize the reward
  - Calculate the policy loss using the following steps:
    1. Compute the difference between actual reward and state value, a.k.a. predicted reward. The difference represents the advantage of the action compared to the average reward expected
    2. Loss is calculated by the negated log probability multiplied by the advantage. Higher advantage results in higher loss, which means the optimizer will adjust the neural network more towards the direction of producing higher probability for high-reward actions
  - Backpropagation is applied
```

```
def main():
  - During the data collection loop, selected actions and rewards are stored
  - The data collection loop runs until the agent fails in the environment
  - Afterwards, the collected data is used by the finish_episode function defined above, where the model is trained
```

In [None]:
# Import the necessary libraries
import gym, os
import numpy as np
import matplotlib.pyplot as plt
from itertools import count
from collections import namedtuple

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.distributions import Categorical

# Create a CartPole-v0 environment object from the OpenAI Gym library
env = gym.make('CartPole-v0')
# Remove any wrappers from the environment; the training loop applies its own
# per-episode step cap
env = env.unwrapped

# Seed the environment and PyTorch random number generator for reproducibility.
# Gym releases before 0.26 expose env.seed(); later releases removed it and
# accept the seed through env.reset(seed=...) instead, so support both.
if hasattr(env, 'seed'):
    env.seed(1)
else:
    env.reset(seed=1)
torch.manual_seed(1)

# Get the size of the state and action spaces
state_space = env.observation_space.shape[0]
action_space = env.action_space.n

# Set hyperparameters
learning_rate = 0.01  # Adam step size
gamma = 0.99          # Discount factor applied to future rewards
episodes = 20000      # Number of training episodes
render = False        # Draw the environment on every step when True
# Smallest float32 increment; added to the std to avoid division by zero
eps = np.finfo(np.float32).eps.item()

# Define a Named Tuple to store log probabilities and values for the actions
SavedAction = namedtuple('SavedAction', ['log_prob', 'value'])

# Define the neural network model for the policy
class Policy(nn.Module):
    """Actor-critic network with one shared hidden layer and two heads.

    The action head yields a probability distribution over the actions and
    the value head yields a single state-value estimate. The instance also
    holds the per-episode buffers (`save_actions`, `rewards`) that the
    training code appends to and clears.
    """

    def __init__(self):
        super().__init__()

        # Make sure the output directory used for saved plots exists
        os.makedirs('./AC_CartPole-v0', exist_ok=True)

        # Per-episode storage, filled while acting and emptied after each update
        self.save_actions = []
        self.rewards = []

        # Shared hidden layer followed by the actor and critic heads.
        # The layers are created in this order so that seeded weight
        # initialization stays the same.
        self.fc1 = nn.Linear(state_space, 32)
        self.action_head = nn.Linear(32, action_space)
        self.value_head = nn.Linear(32, 1)  # Scalar Value

    def forward(self, x):
        """Return `(action_probabilities, state_value)` for input `x`."""
        features = F.relu(self.fc1(x))
        action_probs = F.softmax(self.action_head(features), dim=-1)
        return action_probs, self.value_head(features)

# Initialize the policy model and the optimizer.
# Constructing Policy() also creates the './AC_CartPole-v0' output directory.
# A single Adam optimizer covers every parameter of the model, so the shared
# layer and both heads are updated together.
model = Policy()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Function to plot training progress
def plot(steps):
    """Redraw the training curve and save a snapshot every 200 entries.

    Args:
        steps: Sequence of per-episode values to plot, one entry per episode.
    """
    episode_count = len(steps)

    # Reuse the single subplot and wipe it before drawing the updated curve
    axes = plt.subplot(111)
    axes.cla()
    axes.grid()
    axes.set_title('Training')
    axes.set_xlabel('Episode')
    axes.set_ylabel('Run Time')
    axes.plot(steps)

    # Write the figure to disk once every 200 recorded episodes
    if episode_count % 200 == 0:
        plt.savefig(f'./AC_CartPole-v0/RunTime{episode_count}.jpg')

    # A tiny pause lets the GUI event loop refresh the figure
    plt.pause(1e-7)

# Function to select an action based on the current state
def select_action(state):
    """Sample an action for `state` from the current policy.

    The log-probability of the sampled action and the network's state-value
    output are appended to `model.save_actions` as a `SavedAction`.

    Args:
        state: NumPy array observation (converted with `torch.from_numpy`).

    Returns:
        The sampled action as a plain Python number.
    """
    observation = torch.from_numpy(state).float()
    action_probs, value_estimate = model(observation)

    # Draw the action at random in proportion to the predicted probabilities
    distribution = Categorical(action_probs)
    chosen = distribution.sample()

    record = SavedAction(log_prob=distribution.log_prob(chosen), value=value_estimate)
    model.save_actions.append(record)

    return chosen.item()

# Function to optimize the policy network
def finish_episode():
    """Run one actor-critic update from the episode buffered on `model`.

    Steps:
      1. Turn `model.rewards` into discounted returns
         (R_t = r_t + gamma * R_{t+1}).
      2. Normalize the returns to zero mean / unit standard deviation.
      3. For every saved action, use `return - value` as the advantage for
         the policy loss and a smooth-L1 loss between the value estimate and
         the return for the value loss.
      4. Backpropagate the summed loss, step the optimizer, and clear the
         episode buffers.

    If no rewards or actions were recorded, the buffers are cleared and no
    update is performed.
    """
    saved_actions = model.save_actions

    # Nothing to learn from: stacking empty loss lists would raise
    if not model.rewards or not saved_actions:
        del model.rewards[:]
        del model.save_actions[:]
        return

    # Accumulate returns from the last step backwards, then flip into
    # chronological order (avoids repeated O(n) inserts at the list head)
    returns = []
    R = 0
    for r in reversed(model.rewards):
        R = r + gamma * R
        returns.append(R)
    returns.reverse()

    returns = torch.tensor(returns)
    if returns.numel() > 1:
        returns = (returns - returns.mean()) / (returns.std() + eps)
    else:
        # The sample std of a single element is NaN, which would propagate
        # into the loss and the weights; only center the value instead
        returns = returns - returns.mean()

    policy_losses = []
    value_losses = []
    for (log_prob, value), ret in zip(saved_actions, returns):
        # value.item() detaches the estimate so the policy term does not
        # backpropagate through the value head
        advantage = ret - value.item()
        policy_losses.append(-log_prob * advantage)
        # Reshape the scalar return to shape (1,) as the loss target
        value_losses.append(F.smooth_l1_loss(value, ret.reshape(1)))

    optimizer.zero_grad()
    loss = torch.stack(policy_losses).sum() + torch.stack(value_losses).sum()
    loss.backward()
    optimizer.step()

    # Reset the buffers for the next episode
    del model.rewards[:]
    del model.save_actions[:]

# Main function where the training loop exists
def main():
    """Train the policy for `episodes` episodes.

    Each episode steps the environment with actions from `select_action`
    until the episode ends or the step cap is reached, records the episode
    length, redraws the progress plot, saves the model every 100 episodes,
    and then calls `finish_episode` to update the network.

    Both Gym API generations are supported: `reset()` returning either an
    observation or an `(observation, info)` tuple, and `step()` returning
    either 4 values (`done`) or 5 values (`terminated`, `truncated`).
    """
    live_time = []

    # torch.save does not create missing parent directories
    os.makedirs('./AC_CartPole_Model', exist_ok=True)

    # Loop through episodes (range, not count: count(episodes) would start
    # at `episodes` and never terminate)
    for i_episode in range(episodes):
        reset_result = env.reset()
        # Newer Gym returns (observation, info); older Gym returns the observation
        state = reset_result[0] if isinstance(reset_result, tuple) else reset_result

        # Loop through time steps in each episode
        for t in count():
            action = select_action(state)
            step_result = env.step(action)
            if len(step_result) == 5:
                state, reward, terminated, truncated, _ = step_result
                # Either flag ends the episode; ignoring `terminated` would
                # keep stepping after the agent has failed
                done = terminated or truncated
            else:
                state, reward, done, _ = step_result
            if render: env.render()
            model.rewards.append(reward)

            if done or t >= 1000:
                break

        # Record the episode length and plot the progress
        live_time.append(t)
        plot(live_time)

        # Save the model every 100 episodes
        if i_episode % 100 == 0:
            modelPath = './AC_CartPole_Model/ModelTraing'+str(i_episode)+'Times.pkl'
            torch.save(model, modelPath)

        # Optimize the policy
        finish_episode()

# Entry point of the script: run training only when this file is executed
# directly, not when it is imported as a module
if __name__ == '__main__':
    main()