Policy Gradient實現

In [1]:
import itertools
import time

import gym
import numpy as np
import tensorflow as tf

In [2]:
# Create the Acrobot environment and show its observation and action spaces.
env = gym.make('Acrobot-v1')

print(env.observation_space, env.action_space, sep='\n')

Box(6,)
Discrete(3)


In [132]:
class Agent:
    """Policy-gradient (REINFORCE-style) agent for a discrete-action task.

    The policy is a small Keras network mapping a 6-dimensional observation
    to a softmax distribution over 3 actions. Transitions of one episode are
    buffered in ``self.memory`` and consumed by :meth:`train`, which fits the
    network with categorical cross-entropy where every sample is weighted by
    the standardized discounted return that followed it.
    """

    N_OBSERVATIONS = 6  # length of the observation vector fed to the network
    N_ACTIONS = 3       # actions are the indices 0 .. N_ACTIONS - 1
    _EPS = 1e-8         # keeps the standardization finite when std == 0

    def __init__(self, gamma=.95):
        """Build the policy network and an empty episode buffer.

        Args:
            gamma: Discount factor applied to future rewards.
        """
        self.memory = {'observation': [], 'action': [], 'reward': []}
        self.gamma = gamma  # discount factor for future rewards

        self.model = tf.keras.models.Sequential([
            tf.keras.layers.Dense(6, activation='relu',
                                  input_shape=(self.N_OBSERVATIONS, )),
            tf.keras.layers.BatchNormalization(),

            tf.keras.layers.Dense(64, activation='relu'),
            tf.keras.layers.BatchNormalization(),
            tf.keras.layers.Dropout(.1),

            tf.keras.layers.Dense(self.N_ACTIONS, activation='softmax'),
        ])
        self.model.compile(optimizer='adam',
                           loss='CategoricalCrossentropy')

    def get_action(self, observation):
        """Sample an action index from the current policy.

        Args:
            observation: Array-like with ``N_OBSERVATIONS`` elements.

        Returns:
            int: An action index in ``range(N_ACTIONS)``.
        """
        batch = np.asarray(observation).reshape(1, self.N_OBSERVATIONS)
        prob = self.model.predict(batch, verbose=0)
        # The network emits float32; np.random.choice checks the sum with a
        # tight tolerance, so promote to float64 and renormalize first.
        prob = np.asarray(prob, dtype=np.float64).reshape(self.N_ACTIONS)
        prob /= prob.sum()
        return int(np.random.choice(self.N_ACTIONS, p=prob))

    def remember(self, observation, action, reward):
        """Append one transition of the current episode to the buffer."""
        self.memory['observation'].append(observation)
        self.memory['action'].append(action)
        self.memory['reward'].append(reward)

    def _encode_onehot(self, action):
        """Return the one-hot tuple for an action index.

        Raises:
            ValueError: If ``action`` is not in ``range(N_ACTIONS)``.
        """
        index = int(action)
        if not 0 <= index < self.N_ACTIONS:
            raise ValueError(
                f'action must be in range({self.N_ACTIONS}), got {action!r}')
        return tuple(int(i == index) for i in range(self.N_ACTIONS))

    def _discounted_returns(self, rewards):
        """Return G_t = r_t + gamma * G_{t+1} for every step, in time order."""
        returns = np.zeros(len(rewards), dtype=np.float64)
        running = 0.0
        # Walk backwards so each step can reuse the return of its successor,
        # but write into position t so the result stays aligned with the
        # buffered observations and actions.
        for t in reversed(range(len(rewards))):
            running = rewards[t] + self.gamma * running
            returns[t] = running
        return returns

    def _preprocess_train_data(self):
        """Convert the buffered episode into arrays ready for ``fit``.

        After the call ``self.memory`` holds the stacked observations, the
        one-hot encoded actions and the standardized discounted returns,
        all indexed by timestep.
        """
        observations = np.array(self.memory['observation'])

        # One-hot encode the chosen actions so they can serve as targets.
        actions = np.array([self._encode_onehot(a)
                            for a in self.memory['action']])

        # Discounted cumulative return of every step, then standardized.
        returns = self._discounted_returns(self.memory['reward'])
        returns = (returns - returns.mean()) / (returns.std() + self._EPS)

        self.memory['observation'] = observations
        self.memory['action'] = actions
        self.memory['reward'] = returns

    def train(self):
        """Fit the policy on the buffered episode, then clear the buffer.

        Does nothing when no transition has been remembered.
        """
        if len(self.memory['reward']) == 0:
            return

        self._preprocess_train_data()
        self.model.fit(self.memory['observation'],
                       self.memory['action'],
                       # Whole episode in a single batch.
                       batch_size=len(self.memory['observation']),
                       sample_weight=self.memory['reward'],
                       verbose=0)

        self.memory['observation'] = []
        self.memory['action'] = []
        self.memory['reward'] = []

In [133]:
# Number of training episodes
episodes = 50

agent = Agent()

for episode in range(episodes):
    # Play one full episode, storing every transition for the update.
    observation = env.reset()
    score = 0
    done = False
    while not done:
        action = agent.get_action(observation)
        next_observation, reward, done, _ = env.step(action)
        agent.remember(observation, action, reward)
        observation = next_observation
        score += reward

    # The episode is over: update the policy once and report the score.
    agent.train()
    print(f'episode: {episode}, score: {score}')

episode: 0, score: -500.0
episode: 1, score: -431.0
episode: 2, score: -500.0
episode: 3, score: -500.0
episode: 4, score: -500.0
episode: 5, score: -500.0
episode: 6, score: -500.0
episode: 7, score: -500.0
episode: 8, score: -500.0
episode: 9, score: -500.0
episode: 10, score: -483.0
episode: 11, score: -443.0
episode: 12, score: -500.0
episode: 13, score: -489.0
episode: 14, score: -500.0
episode: 15, score: -317.0
episode: 16, score: -484.0
episode: 17, score: -250.0
episode: 18, score: -370.0
episode: 19, score: -348.0
episode: 20, score: -364.0
episode: 21, score: -290.0
episode: 22, score: -500.0
episode: 23, score: -403.0
episode: 24, score: -343.0
episode: 25, score: -294.0
episode: 26, score: -299.0
episode: 27, score: -390.0
episode: 28, score: -313.0
episode: 29, score: -332.0
episode: 30, score: -264.0
episode: 31, score: -278.0
episode: 32, score: -389.0
episode: 33, score: -350.0
episode: 34, score: -460.0
episode: 35, score: -429.0
episode: 36, score: -323.0
episode: 37

演示學習成果

每個回合(episode)上限為500步

分數-100代表約在第100步完成

分數-150代表約在第150步完成

分數-500代表在500步內未完成

In [141]:
# Roll out one rendered episode with the trained agent and report its score.
observation = env.reset()
score = 0
try:
    while True:
        env.render()
        action = agent.get_action(observation)
        observation, reward, done, _ = env.step(action)
        score += reward
        time.sleep(0.02)  # pause between frames so the animation is watchable

        if done:
            print(f'score: {score}')
            break
finally:
    # Close the environment even when the rollout is interrupted or raises,
    # otherwise the render window is left open.
    env.close()

score: -349.0
