# **一个Policy Gradient算法的简单实现**
## **算法概述**
- 一个较为基础的policy-based算法
- 提出该类算法主要是为了解决value-based算法的三个问题：（1）以DQN为代表的众多value-based算法对连续动作的处理能力不足；（2）受限状态下的问题处理能力不足：由于观测的限制或建模的局限，真实环境下两个不同的状态可能具有相同的特征表示，进而可能导致value-based方法无法得到最优解（这一点还不是特别理解，policy-based算法是否也有这样的问题？）；（3）随机策略问题：value-based方法得到的最优策略通常是确定性的，而有些问题的最优策略是随机的，因此value-based方法无法得到最优解。
- 算法网络直接输出动作（包括离散动作的softmax或者直接输出连续动作）
- on-policy算法，policy-based算法

论文链接：*https://docs.popo.netease.com/docs/e54e7b5d00a44f52b7edc926efe4e829*

In [1]:
import torch
import numpy as np
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.distributions import Categorical

In [2]:
# Discount factor applied to future rewards when train_loop accumulates
# the per-step returns of an episode.
GAMMA = 0.95
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
class policy_gradient(nn.Module):
    """REINFORCE-style policy-gradient agent for a discrete action space.

    A two-layer MLP maps an observation to one logit per action. The
    transitions of the current episode are buffered via ``store_transition``
    and consumed by ``train_loop``, which performs a single gradient step and
    then empties the buffer.

    Args:
        env: Environment exposing ``observation_space.shape[0]`` (network
            input size) and ``action_space.n`` (number of discrete actions).
    """

    def __init__(self, env):
        super(policy_gradient, self).__init__()
        self.state_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.n
        # Per-episode buffers: observations, chosen actions, rewards.
        self.ep_obs, self.ep_as, self.ep_rs = [], [], []
        self.create_training_network()
        self.create_training_method()
        self.to(device)
        if not self.training:
            self.train()

    def forward(self, states):
        """Return the unnormalised action logits for ``states``."""
        x = F.relu(self.fc1(states))
        x = self.fc2(x)
        return x

    def create_training_network(self):
        """Create the two linear layers of the policy network."""
        self.fc1 = nn.Linear(self.state_dim, 20)
        self.fc2 = nn.Linear(20, self.action_dim)

    def create_training_method(self):
        """Create the Adam optimizer over the network parameters."""
        self.optimizer = optim.Adam(self.parameters(), lr=0.001)

    def _action_probs(self, observation):
        """Return the action probabilities for a single observation."""
        # Cast explicitly: the linear layers hold float32 weights, so a
        # float64 observation would otherwise raise a dtype mismatch.
        state = torch.as_tensor(observation, dtype=torch.float32, device=device)
        return F.softmax(self(state), dim=-1)

    def choose_action(self, observation):
        """Sample an action index from the current policy.

        Args:
            observation: A single observation of length ``state_dim``.

        Returns:
            The sampled action as a Python int.
        """
        with torch.no_grad():
            prob_weight = self._action_probs(observation)
            action = torch.multinomial(prob_weight, 1)
            return action.item()

    def choose_max_action(self, observation):
        """Return the most probable action index (greedy choice).

        Args:
            observation: A single observation of length ``state_dim``.

        Returns:
            The arg-max action as a Python int.
        """
        with torch.no_grad():
            prob_weight = self._action_probs(observation)
            action = torch.argmax(prob_weight)
            return action.item()

    def store_transition(self, s, a, r):
        """Append one (observation, action, reward) step to the episode buffer."""
        self.ep_obs.append(s)
        self.ep_as.append(a)
        self.ep_rs.append(r)

    def clear_transition(self):
        """Empty the episode buffer."""
        self.ep_obs.clear()
        self.ep_as.clear()
        self.ep_rs.clear()

    @staticmethod
    def _discounted_returns(rewards, gamma):
        """Return the discounted return G_t = r_t + gamma * G_{t+1} per step."""
        returns = np.zeros(len(rewards), dtype=np.float64)
        running = 0.0
        for t in reversed(range(len(rewards))):
            # Plain assignment: accumulating with ``+=`` here would count the
            # running return twice at every step.
            running = rewards[t] + gamma * running
            returns[t] = running
        return returns

    @staticmethod
    def _normalize(values, eps=1e-8):
        """Return ``values`` shifted to zero mean and scaled to unit std.

        ``eps`` keeps the division finite when every entry is identical
        (for example a one-step episode), where the std is exactly zero.
        """
        centered = np.asarray(values, dtype=np.float64) - np.mean(values)
        return centered / (np.std(centered) + eps)

    @staticmethod
    def _policy_loss(action_logits, action_labels, ep_value):
        """Return the REINFORCE loss ``-mean(log pi(a_t | s_t) * G_t)``.

        Args:
            action_logits: Tensor of shape (steps, action_dim).
            action_labels: int64 tensor of shape (steps,) with taken actions.
            ep_value: Tensor of shape (steps,) with the (normalised) returns.
        """
        # log_softmax is the numerically stable form of log(softmax(x)).
        log_probs = F.log_softmax(action_logits, dim=1)
        chosen = log_probs.gather(1, action_labels.unsqueeze(-1)).squeeze(-1)
        # The optimizer minimises, so negate to ascend the expected return.
        return -(chosen * ep_value).mean()

    def train_loop(self):
        """Run one policy-gradient update on the buffered episode.

        Does nothing when the buffer is empty. The buffer is cleared after
        the update.
        """
        if not self.ep_rs:
            return

        discounted_ep_rs = self._normalize(
            self._discounted_returns(self.ep_rs, GAMMA)
        )

        states = torch.as_tensor(
            np.stack(self.ep_obs), dtype=torch.float32, device=device
        )
        # gather() requires int64 indices regardless of NumPy's default int.
        action_labels = torch.as_tensor(
            np.asarray(self.ep_as), dtype=torch.int64, device=device
        )
        ep_value = torch.as_tensor(
            discounted_ep_rs, dtype=torch.float32, device=device
        )

        loss = self._policy_loss(self(states), action_labels, ep_value)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        self.clear_transition()

In [4]:
import gym
# Gym environment id handed to gym.make below.
env_name = "CartPole-v1"
env = gym.make(env_name)

# Module-level agent used by main(); its layer sizes are read from env.
agent = policy_gradient(env)

In [5]:
def main():
    """Train the module-level ``agent`` on the module-level ``env``.

    Runs 30000 training episodes of at most 3000 steps each. After every
    episode the agent performs one update on the collected transitions.
    Every 100 episodes the current (sampling) policy is rolled out for 10
    evaluation episodes of at most 300 steps, and the average environment
    reward is printed.
    """
    for episode in range(30000):
        state, _ = env.reset()
        for _ in range(3000):
            action = agent.choose_action(state)
            next_state, _, terminated, truncated, _ = env.step(action)
            # Shaped reward: penalise only a real failure. A time-limit
            # truncation is not the agent's fault, so it keeps the small bonus.
            reward = -1 if terminated else 0.01
            agent.store_transition(state, action, reward)
            state = next_state
            if terminated or truncated:
                break
        # Train at the end of every episode, including ones that were
        # truncated or hit the step cap; otherwise their transitions would
        # stay in the buffer and be mixed into the next episode.
        agent.train_loop()

        if episode % 100 == 0:
            total_reward = 0
            for _ in range(10):
                state, _ = env.reset()
                for _ in range(300):
                    # Evaluation samples from the policy, as in training.
                    action = agent.choose_action(state)
                    state, reward, terminated, truncated, _ = env.step(action)
                    total_reward += reward
                    if terminated or truncated:
                        break
            print(f"episode {episode} total reward is {total_reward/10}")

In [None]:
# Start training only when this file is executed as a script.
if __name__ == "__main__":
    main()

  if not isinstance(terminated, (bool, np.bool8)):
