In [None]:
"""
@ Author: Zachary Deng
@ Date: 2021/2/15
@ Brief: Policy Gradient practice in Reinforcement learning
"""

import gym
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim

# Build the CartPole environment and show its action and observation spaces,
# one per line.
env = gym.make('CartPole-v1')
print(env.action_space, env.observation_space, sep='\n')

In [None]:
class PGN(nn.Module):
    """Small fully connected policy network.

    The layer stack is Linear(4, 24) -> ReLU -> Linear(24, 36) -> ReLU ->
    Linear(36, 1) -> Sigmoid: the last dimension of the input must have
    size 4 and the output has a single sigmoid-squashed value per input row.
    """

    def __init__(self):
        super().__init__()
        layers = [
            nn.Linear(4, 24),
            nn.ReLU(inplace=True),  # in-place: overwrites its input tensor
            nn.Linear(24, 36),
            nn.ReLU(inplace=True),
            nn.Linear(36, 1),
            nn.Sigmoid(),
        ]
        self.model = nn.Sequential(*layers)

    def forward(self, x):
        """Pass ``x`` through the layer stack and return the result."""
        return self.model(x)
    
class CartAgent(object):
    """Policy-gradient (REINFORCE-style) agent with a Bernoulli policy.

    The agent buffers (state, action, reward) transitions via ``memorize``
    and performs one optimizer step over the whole buffer in ``learn``.

    Args:
        learning_rate: Learning rate passed to the Adam optimizer.
        gamma: Discount factor used when turning rewards into returns.
    """

    def __init__(self, learning_rate, gamma):
        self.pgn = PGN()
        self.gamma = gamma

        self._init_memory()
        self.optimizer = torch.optim.Adam(self.pgn.parameters(), lr=learning_rate)

    def _init_memory(self):
        """Reset the transition buffers and the step counter."""
        self.state_pool = []
        self.action_pool = []
        self.reward_pool = []
        self.steps = 0

    def memorize(self, state, action, reward):
        """Append one transition to the buffers used by the next ``learn`` call.

        Args:
            state: Observation the action was chosen from (tensor or array-like).
            action: The action taken, 0 or 1.
            reward: Reward received; 0 is treated as an episode boundary by
                ``_adjust_reward``.
        """
        self.state_pool.append(state)
        self.action_pool.append(action)
        self.reward_pool.append(reward)
        self.steps += 1

    def learn(self):
        """Run one policy-gradient update over the buffered transitions.

        Rewards are first converted to discounted returns in place, then the
        loss ``-sum(log pi(a_t | s_t) * G_t)`` is minimized with a single
        optimizer step. The buffers are cleared afterwards. Calling this with
        an empty buffer is a no-op.
        """
        if self.steps == 0:
            return

        self._adjust_reward()

        # Batch the buffer so one forward/backward pass covers every step;
        # the gradient equals the sum of the per-step gradients.
        states = torch.stack(
            [torch.as_tensor(s, dtype=torch.float32) for s in self.state_pool]
        )
        actions = torch.tensor(self.action_pool, dtype=torch.float32)
        returns = torch.tensor(self.reward_pool, dtype=torch.float32)

        self.optimizer.zero_grad()
        probs = self.act(states).reshape(-1)
        # torch.bernoulli() only draws samples; log_prob needs the
        # distribution object.
        dist = torch.distributions.Bernoulli(probs)
        loss = -(dist.log_prob(actions) * returns).sum()
        loss.backward()

        self.optimizer.step()
        self._init_memory()

    def act(self, state):
        """Return the policy network's output for ``state``.

        The output is used as the Bernoulli parameter, i.e. the probability
        of choosing action 1.
        """
        return self.pgn(state)

    def _adjust_reward(self):
        """Replace buffered rewards with discounted returns, in place.

        Walks the buffer backwards accumulating
        ``running = running * gamma + reward``. A reward of exactly 0 resets
        the accumulator and is left unchanged, so returns never leak across
        the zero-reward entries that mark episode ends.
        """
        running_add = 0
        for i in reversed(range(self.steps)):
            if self.reward_pool[i] == 0:
                running_add = 0
            else:
                running_add = running_add * self.gamma + self.reward_pool[i]
                self.reward_pool[i] = running_add

In [None]:
import time

# Hyper-parameters
BATCH_SIZE = 5        # number of episodes collected between policy updates
LEARNING_RATE = 0.01
GAMMA = 0.99          # discount factor
NUM_EPISODES = 500
RENDER = False        # set True to call env.render(mode='rgb_array') each step
                      # (that call uses the pre-0.26 gym render signature)

env = gym.make('CartPole-v1')
cart_agent = CartAgent(learning_rate=LEARNING_RATE, gamma=GAMMA)

for i_episode in range(NUM_EPISODES):
    time_start = time.time()

    # gym >= 0.26 returns (observation, info); older versions return only
    # the observation.
    reset_out = env.reset()
    next_state = reset_out[0] if isinstance(reset_out, tuple) else reset_out
    done = False

    # Roll out one full episode, one environment step per iteration.
    while not done:
        if RENDER:
            env.render(mode='rgb_array')

        state = torch.from_numpy(next_state).float()

        # Sample the action from the Bernoulli policy. No gradient is needed
        # here because learn() recomputes the probabilities from the stored
        # states.
        with torch.no_grad():
            probs = cart_agent.act(state)
        action = int(torch.distributions.Bernoulli(probs).sample().item())

        # gym >= 0.26 returns a 5-tuple with separate terminated/truncated
        # flags; older versions return a 4-tuple with a single done flag.
        step_out = env.step(action)
        if len(step_out) == 5:
            next_state, reward, terminated, truncated, _ = step_out
            done = terminated or truncated
        else:
            next_state, reward, done, _ = step_out

        # The final step's reward is stored as 0; the agent uses a zero
        # reward as the episode boundary when discounting.
        if done:
            reward = 0

        cart_agent.memorize(state, action, reward)

    t = time.time() - time_start
    print('Episode {}: durations {}'.format(i_episode, t))

    # Update the policy once every BATCH_SIZE completed episodes.
    if (i_episode + 1) % BATCH_SIZE == 0:
        cart_agent.learn()

env.close()