<a href="https://colab.research.google.com/github/gitHubAndyLee2020/OpenAI_Gym_RL_Algorithms_Database/blob/main/PolicyGradient_Module.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### PolicyGradient

> About

- Given some state, the Policy Gradient algorithm outputs the probability of taking each possible action
- The neural network is trained to increase the probability of the actions that lead to the highest reward

> Pro

- Better for Continuous and High-Dimensional Spaces

> Con

- Sample Inefficiency; requires many samples to train the agent


```
class Policy(nn.Module):
  def __init__(self):
    - Initialize a neural network that maps from input: observation length -> hidden layer: usually larger, something like 128 -> output: number of actions nodes
    - Initialize storage for log probabilities and rewards

  def forward(self, x):
    - Feed the observation through the neural network and softmax to get action probabilities
```

```
def select_action(state):
  - Feed the state to the Policy network to get action probability
  - Sample an action from the action probabilities; if an action has probability p, then it has a p chance of being selected
  - Store the log of the probability of the action selected as well
  - Return the selected action
```

```
def finish_episode():
  - Loop over the collected rewards in reverse and calculate the discounted return for each time step: R_t = r_t + gamma * r_(t+1) + gamma^2 * r_(t+2) + ..., i.e. the accumulated (discounted) reward obtained from time step t until the end of the episode
  - Normalize the reward history
  - Calculate the policy loss by multiplying the normalized return by the negated log probability of the action taken at each time step, then summing over the episode. A higher return combined with a higher action log probability results in a lower loss value, so minimizing the loss during backpropagation adjusts the neural network towards taking those actions
  - With the calculated policy loss, backpropagation is applied to the Policy network
```

```
def main():
  - For each episode, collect the action log probability and the reward at every time step t until the episode ends
  - Update the running reward from the number of time steps the agent was able to survive in the environment, which is tracked by the t from above
  - If the running reward is higher than the environment's reward threshold, it means the agent is trained well. The training loop is terminated
```

In [None]:
# Import the necessary libraries for environment handling and neural networks
import gym
import numpy as np
from itertools import count

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Categorical

# Module-level hyperparameters and settings read by the functions below
gamma = 0.99  # Discount factor applied to future rewards in finish_episode()
seed = 543  # Seed shared by the environment and PyTorch for reproducibility
render = False  # When True, main() calls env.render() after every step
log_interval = 10  # main() prints statistics every `log_interval` episodes

# Create an environment object for the CartPole-v0 problem
env = gym.make('CartPole-v0')
# Seed the environment
# NOTE(review): newer gym releases dropped Env.seed() in favour of
# env.reset(seed=...) -- confirm against the installed gym version.
env.seed(seed)
# Seed PyTorch so weight initialisation and action sampling are reproducible
torch.manual_seed(seed)

# Define the neural network architecture for the policy
class Policy(nn.Module):
    """Two-layer softmax policy network for a discrete action space.

    The network maps an observation vector to a probability for each
    action: ``Linear -> ReLU -> Linear -> softmax``.  It also carries the
    per-episode buffers (``saved_log_probs`` and ``rewards``) that the
    training code fills while acting and consumes when updating.

    Args:
        obs_dim: Length of the observation vector (default 4).
        hidden_dim: Width of the hidden layer (default 128).
        n_actions: Number of discrete actions (default 2).

    The defaults reproduce the original hard-coded 4 -> 128 -> 2 network.
    """

    def __init__(self, obs_dim=4, hidden_dim=128, n_actions=2):
        super().__init__()
        # Layers are created in the same order as before so that a fixed
        # torch seed still yields the same initial weights.
        self.affine1 = nn.Linear(obs_dim, hidden_dim)
        self.affine2 = nn.Linear(hidden_dim, n_actions)

        # Per-episode buffers: log-probabilities of the sampled actions and
        # the rewards received for them.
        self.saved_log_probs = []
        self.rewards = []

    def forward(self, x):
        """Return action probabilities for observation(s) ``x``.

        Args:
            x: Float tensor whose last dimension has size ``obs_dim``; it
                may be a single observation or a batch of them.

        Returns:
            Tensor with the same leading shape as ``x`` whose last dimension
            has size ``n_actions`` and sums to 1.
        """
        hidden = F.relu(self.affine1(x))
        action_scores = self.affine2(hidden)
        # Normalise over the action axis. Using the last axis (instead of
        # axis 1) is identical for batched input and also accepts a single
        # unbatched observation.
        return F.softmax(action_scores, dim=-1)

# Instantiate the policy neural network (shared by select_action and
# finish_episode through this module-level name)
policy = Policy()
# Adam optimizer over all policy parameters with learning rate 0.01
optimizer = optim.Adam(policy.parameters(), lr=1e-2)
# float32 machine epsilon; added to the standard deviation in
# finish_episode() so the normalisation never divides by zero
eps = np.finfo(np.float32).eps.item()

# Function to select an action given a state
def select_action(state):
    """Sample an action for ``state`` from the current policy.

    The observation (a NumPy array) is turned into a float tensor with a
    leading batch dimension, passed through the module-level ``policy``,
    and an action is drawn from the resulting categorical distribution.
    The log-probability of the drawn action is appended to
    ``policy.saved_log_probs`` for the later policy-gradient update.

    Returns:
        The sampled action as a plain Python int.
    """
    batched_obs = torch.from_numpy(state).float().unsqueeze(0)
    dist = Categorical(policy(batched_obs))
    chosen = dist.sample()
    # Remember log pi(a|s); finish_episode() weights it by the return.
    policy.saved_log_probs.append(dist.log_prob(chosen))
    return chosen.item()

# Function to update the policy network
def finish_episode():
    """Run one REINFORCE update from the episode stored on ``policy``.

    Reads ``policy.rewards`` and ``policy.saved_log_probs``, computes the
    discounted return for every time step, normalises the returns, and
    takes one optimizer step on the loss ``sum(-log_prob * return)``.
    Both buffers are cleared afterwards so the next episode starts empty.

    If the episode recorded no steps, no update is performed and the
    buffers are simply cleared.
    """
    # Discounted return-to-go, built back to front:
    #   R_t = r_t + gamma * R_(t+1)
    # Appending and reversing once keeps this linear in episode length
    # (inserting at the front of a list is quadratic).
    returns = []
    running_return = 0.0
    for r in reversed(policy.rewards):
        running_return = r + gamma * running_return
        returns.append(running_return)
    returns.reverse()

    if returns and policy.saved_log_probs:
        returns = torch.tensor(returns)
        # The sample standard deviation of a single value is NaN, which
        # would propagate into the loss and corrupt the weights; treat the
        # spread of a one-step episode as zero instead.
        spread = returns.std() if returns.numel() > 1 else 0.0
        returns = (returns - returns.mean()) / (spread + eps)

        # Each saved log-probability is weighted by the (normalised) return
        # that followed the corresponding action.
        step_losses = [
            -log_prob * ret
            for log_prob, ret in zip(policy.saved_log_probs, returns)
        ]

        optimizer.zero_grad()
        policy_loss = torch.cat(step_losses).sum()
        policy_loss.backward()
        optimizer.step()

    # Clear rewards and log probabilities for the next episode
    del policy.rewards[:]
    del policy.saved_log_probs[:]

# Main training loop
def main(max_steps=10000):
    """Train the policy until the running episode length beats the threshold.

    Each iteration plays one episode with ``select_action``, stores the
    rewards on ``policy``, updates an exponential moving average of the
    episode length, and calls ``finish_episode`` to update the network.
    Statistics are printed every ``log_interval`` episodes.  Training stops
    once the running average exceeds ``env.spec.reward_threshold``.

    Args:
        max_steps: Upper bound on the number of steps per episode, so a
            policy that never fails cannot loop forever (default 10000).

    Raises:
        ValueError: If ``max_steps`` is smaller than 1.
    """
    if max_steps < 1:
        raise ValueError("max_steps must be at least 1")

    # Exponential moving average of the episode length
    running_reward = 10
    for i_episode in count(1):
        # Reset the environment for the new episode
        state = env.reset()
        # Count steps explicitly: the loop index is zero-based, so using it
        # directly under-reports every episode by one step.
        episode_length = 0
        for _step in range(max_steps):
            action = select_action(state)
            state, reward, done, _info = env.step(action)
            if render:
                env.render()
            policy.rewards.append(reward)
            episode_length += 1
            if done:
                break

        # Update running average, then the policy
        running_reward = running_reward * 0.99 + episode_length * 0.01
        finish_episode()

        # Log performance metrics
        if i_episode % log_interval == 0:
            print('Episode {}\tLast length: {:5d}\tAverage length: {:.2f}'.format(
                i_episode, episode_length, running_reward))
        # Check if problem is solved
        if running_reward > env.spec.reward_threshold:
            print("Solved! Running reward is now {} and "
                  "the last episode runs to {} time steps!".format(
                      running_reward, episode_length))
            break

# Start training when this file is executed directly; importing it as a
# module only defines the names above without running main()
if __name__ == '__main__':
    main()