## REINFORCE algorithm <br/>
It is also known as a **"Policy Gradient Algorithm"**. <br/>
This algorithm is a policy-based reinforcement learning method.

Three components are needed:
1. Parameterized Policy
2. Objective to maximize (Gradient Ascent)
3. Method for updating the policy parameters

### Policy
A policy is a function that maps a state to action probabilities. <br/>
An action is sampled as a ∼ π(s). In REINFORCE, the agent learns a policy and uses that policy to act in the environment.

In [2]:
import numpy as np
import torch
import gym

from torch.distributions import Categorical
import torch.nn as nn
import torch.optim as optim

gamma = 0.99 # Discount factor: future return is multiplied by gamma once per time step

### Agent (Simple Neural Network)

In [11]:
# Policy network
class Pi(nn.Module):
    """Policy network for REINFORCE: maps a state vector to action logits.

    The module doubles as the on-policy episode memory: ``log_probs`` is
    filled by :meth:`act`, while ``rewards`` is left for the caller to fill.

    Args:
        in_dim: Size of the state (observation) vector.
        out_dim: Number of discrete actions (one logit per action).
        hidden_dim: Width of the single hidden layer. Defaults to 64.
    """

    def __init__(self, in_dim, out_dim, hidden_dim=64):
        super().__init__()

        # Learnable parameters: one hidden layer with a ReLU non-linearity
        layers = [
            nn.Linear(in_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, out_dim),
        ]

        self.model = nn.Sequential(*layers)
        self.onpolicy_reset()
        self.train()  # put the module in training mode

    def onpolicy_reset(self):
        """Clear the per-episode memory (log-probabilities and rewards)."""
        self.log_probs = []
        self.rewards = []

    def forward(self, x):
        """Return the action logits produced by the network for ``x``."""
        pdparam = self.model(x)
        return pdparam

    def act(self, state):
        """Sample an action for ``state`` and remember its log-probability.

        Args:
            state: A single observation, given as a NumPy array or any
                sequence of numbers convertible to a float32 array.

        Returns:
            The sampled action as a plain Python number.
        """
        # np.array (unlike ndarray.astype) also accepts lists and tuples,
        # and always copies, so the caller's array is never shared.
        x = torch.from_numpy(np.array(state, dtype=np.float32))
        pdparam = self.forward(x)

        # The network outputs are treated as logits of a categorical
        # distribution over the discrete actions.
        pd = Categorical(logits=pdparam)
        action = pd.sample()
        log_prob = pd.log_prob(action)
        self.log_probs.append(log_prob)  # consumed later by the update step

        return action.item()

### Train function

In [12]:
def train(pi, optimizer):
    """Run one REINFORCE policy update from the episode stored in ``pi``.

    Args:
        pi: Policy whose ``rewards`` and ``log_probs`` lists hold one entry
            per step of the finished episode.
        optimizer: Optimizer whose ``zero_grad`` and ``step`` are called
            around the backward pass.

    Returns:
        The loss tensor, i.e. the sum over steps of ``-log_prob_t * return_t``.
    """
    n_steps = len(pi.rewards)
    discounted = np.empty(n_steps, dtype=np.float32)

    # Walk the episode backwards so each return reuses the following one:
    # R_t = r_t + gamma * R_{t+1}
    running = 0.0
    for step in range(n_steps - 1, -1, -1):
        running = pi.rewards[step] + gamma * running
        discounted[step] = running

    returns = torch.tensor(discounted)
    stacked_log_probs = torch.stack(pi.log_probs)

    # Negated so that minimizing the loss performs gradient ascent on the
    # return-weighted log-probabilities.
    loss = torch.sum(-stacked_log_probs * returns)

    optimizer.zero_grad()
    loss.backward()   # backpropagate to compute gradients
    optimizer.step()  # apply the parameter update

    return loss

In [13]:
def _reset_env(env):
    """Reset ``env`` and return only the initial observation."""
    # gym >= 0.26 returns ``(observation, info)`` from reset(); older
    # releases return the observation alone.
    result = env.reset()
    return result[0] if isinstance(result, tuple) else result


def _step_env(env, action):
    """Step ``env`` and return ``(state, reward, done)`` for either gym API."""
    result = env.step(action)
    if len(result) == 5:
        # gym >= 0.26: (obs, reward, terminated, truncated, info)
        state, reward, terminated, truncated, _ = result
        return state, reward, terminated or truncated
    # Older gym: (obs, reward, done, info)
    state, reward, done, _ = result
    return state, reward, done


def main():
    """Train a ``Pi`` policy with REINFORCE on CartPole for 300 episodes.

    Each episode runs for at most 200 steps, is rendered step by step, and is
    followed by one call to ``train``. A summary line is printed per episode.
    The environment is always closed on exit, even if training raises.
    """
    env = gym.make('CartPole-v0')
    try:
        in_dim = env.observation_space.shape[0]
        out_dim = env.action_space.n

        pi = Pi(in_dim, out_dim)
        optimizer = optim.Adam(pi.parameters(), lr = 0.01)

        for epi in range(300): # number of total episodes
            state = _reset_env(env)
            for t in range(200): # maximum number of steps within an episode
                action = pi.act(state)
                state, reward, done = _step_env(env, action)
                pi.rewards.append(reward)

                env.render()
                if done:
                    break

            # After the episode ends: one policy update from its trajectory
            loss = train(pi, optimizer)
            total_reward = sum(pi.rewards)
            solved = total_reward > 195.0
            pi.onpolicy_reset() # on-policy: clear memory after training
            print(f'Episode {epi}, loss: {loss}, total_reward : {total_reward}, solved : {solved}')
    finally:
        # Release the environment (and its render window) on every exit path
        env.close()

In [14]:
# Entry point: run the REINFORCE training loop on the CartPole environment.
if __name__ == '__main__':
    main()

Episode 0, loss: 88.60798645019531,                 total_reward : 16.0, solved : False
Episode 1, loss: 132.59632873535156,                 total_reward : 20.0, solved : False
Episode 2, loss: 102.2104263305664,                 total_reward : 16.0, solved : False
Episode 3, loss: 221.6617431640625,                 total_reward : 26.0, solved : False
Episode 4, loss: 49.43059158325195,                 total_reward : 12.0, solved : False
Episode 5, loss: 786.698974609375,                 total_reward : 50.0, solved : False
Episode 6, loss: 87.26021575927734,                 total_reward : 16.0, solved : False
Episode 7, loss: 95.16365814208984,                 total_reward : 16.0, solved : False
Episode 8, loss: 87.89383697509766,                 total_reward : 16.0, solved : False
Episode 9, loss: 192.78077697753906,                 total_reward : 24.0, solved : False
Episode 10, loss: 274.04876708984375,                 total_reward : 29.0, solved : False
Episode 11, loss: 588.9201660

: 

### Policy Construction

In [14]:
from torch.distributions import Normal, Categorical
import torch
import math
import numpy as np

# Assume the policy network outputs one value per action for 2 actions (Left, Right).
# The values are used as logits; here they are log-probabilities,
# since exp(-1.6094) ≈ 0.2 and exp(-0.2231) ≈ 0.8.
policy_net_output = torch.tensor([-1.6094, -0.2231]) # equivalent to probs = [0.2, 0.8]
pd_params = policy_net_output
# Build a discrete (categorical) distribution over the actions from the logits
pd = Categorical(logits = pd_params)

# Sample an action according to the probability of each action
action = pd.sample()

# Compute the log-probability of the sampled action
pd.log_prob(action)

tensor(-0.2231)

In [7]:
# Check: the natural log of 0.2 matches the first logit used above (-1.6094)
np.log(0.2)

-1.6094379124341003