## REINFORCE algorithm <br/>
It is also known as the **"Policy Gradient Algorithm"**. <br/>
This algorithm is a policy-based reinforcement learning method.

Three components are needed:
1. Parameterized Policy
2. Objective to maximize
3. Method for updating the policy parameters

### Policy
A policy is a function that maps a state to action probabilities. <br/>
a ∼ π(s): in REINFORCE, the agent learns a policy and uses that policy to act in the environment.

In [5]:
import numpy as np
import torch
import gym

from torch.distributions import Categorical
import torch.nn as nn
import torch.optim as optim

# Discount factor: multiplies the accumulated future return at each step
# when train() computes the per-timestep returns.
gamma = 0.99

### Agent (Simple Neural Network)

In [None]:
class Pi(nn.Module):
    """Policy network that maps a state vector to action logits.

    The network is a two-layer MLP (``in_dim -> 64 -> out_dim``) with a ReLU
    in between. Besides the model itself, the instance keeps two per-episode
    buffers that the training step consumes:

    * ``log_probs`` - log-probabilities of the actions sampled by ``act``.
    * ``rewards``   - rewards, appended by the caller after each step.
    """

    def __init__(self, in_dim, out_dim):
        """Build the network.

        Args:
            in_dim: Size of the state vector fed to the first linear layer.
            out_dim: Number of discrete actions (size of the logits vector).
        """
        super().__init__()
        layers = [
            nn.Linear(in_dim, 64),
            nn.ReLU(),  # the activation class is ``ReLU``; ``nn.Relu`` does not exist
            nn.Linear(64, out_dim),
        ]

        self.model = nn.Sequential(*layers)
        self.onpolicy_reset()
        self.train()  # nn.Module.train(): put the module in training mode

    def onpolicy_reset(self):
        """Clear the per-episode buffers of log-probabilities and rewards."""
        self.log_probs = []
        self.rewards = []

    def forward(self, x):
        """Return the action logits produced by the model for input ``x``."""
        pdparam = self.model(x)
        return pdparam

    def act(self, state):
        """Sample an action for ``state`` and record its log-probability.

        Args:
            state: NumPy array holding the current state; it is cast to
                float32 before being passed to the network.

        Returns:
            The sampled action as a plain Python number (``Tensor.item()``).
        """
        x = torch.from_numpy(state.astype(np.float32))
        pdparam = self.forward(x)

        # Network outputs are treated as unnormalized logits of a
        # categorical distribution over actions.
        pd = Categorical(logits=pdparam)
        action = pd.sample()
        log_prob = pd.log_prob(action)
        # Stored with its graph attached so the loss can backpropagate later.
        self.log_probs.append(log_prob)

        return action.item()

### Train function

In [20]:
def train(pi, optimizer):
    """Run one REINFORCE policy-gradient update from the stored episode.

    Reads ``pi.rewards`` and ``pi.log_probs``, computes the discounted return
    for every timestep using the module-level ``gamma``, and minimizes
    ``-sum(log_prob_t * return_t)``.

    Args:
        pi: Policy object exposing ``rewards`` (sequence of numbers) and
            ``log_probs`` (sequence of tensors), one entry per timestep.
        optimizer: Optimizer over the policy parameters; it is zeroed,
            then stepped once.

    Returns:
        The scalar loss tensor of this update.
    """
    T = len(pi.rewards)
    rets = np.empty(T, dtype=np.float32)
    # Accumulator for the discounted return; it must exist before the loop
    # because each iteration reads its previous value.
    future_ret = 0.0

    # Walk the episode backwards so each return reuses the one after it:
    # R_t = r_t + gamma * R_{t+1}
    for t in reversed(range(T)):
        future_ret = pi.rewards[t] + gamma * future_ret
        rets[t] = future_ret

    rets = torch.tensor(rets)
    log_probs = torch.stack(pi.log_probs)

    # Negated because the optimizer minimizes, while the objective
    # (return-weighted log-probability) is to be maximized.
    loss = -log_probs * rets
    loss = torch.sum(loss)

    optimizer.zero_grad()
    loss.backward()   # Backpropagate, compute gradients
    optimizer.step()  # Apply the gradient update to the weights

    return loss