In [1]:
import gym
import tensorflow as tf
import numpy as np
import random
from collections import deque
import os
# Expose only the GPU with index 4 to this process. The index is specific to
# the author's machine; it must be set before TensorFlow creates a session.
os.environ["CUDA_VISIBLE_DEVICES"] = "4"
# Limit this process to 20% of the visible GPU's memory.
config = tf.ConfigProto() 
config.gpu_options.per_process_gpu_memory_fraction = 0.2
# NOTE(review): `session` is not referenced again in the visible cells; the
# agent opens its own tf.InteractiveSession() without passing `config`.
# Presumably this session is created first so the memory cap is in effect for
# the process before the agent's session starts — TODO confirm.
session = tf.Session(config=config)

In [2]:
# Hyperparameters of the policy-gradient agent
GAMMA = 0.95          # discount factor applied to future rewards
LEARNING_RATE = 0.01  # step size handed to the Adam optimizer

### 来源

https://www.cnblogs.com/pinard/p/10137696.html

value based方法的不足:
* 对连续动作处理能力不足
* 使用特征来描述某一状态时，有可能因为个体观测的限制或者建模的局限，导致真实环境下本来不同的两个状态却在我们建模后拥有相同的特征描述，进而很有可能导致我们的Value Based方法无法得到最优解。此时使用Policy Based强化学习方法也很有效。
* 无法解决随机策略问题.

Policy Based方法的优点:
* 具有更好的收敛性
* 对于高维空间或者是连续空间更加的有效
* 能够对随机策略进行学习

缺点:
* 很容易在局部最优解上面收敛而得不到全局最优
* 对策略的估计通常具有很大的方差，求解的过程较低效
---
 输入：N个蒙特卡罗完整序列,训练步长$\alpha$
 
 输出：策略函数的参数$\theta$
 1. for 每个蒙特卡罗序列:
 
    a. 用蒙特卡罗法计算序列每个时间位置t的状态价值$v_t$
    
    b. 对序列每个时间位置t，使用梯度上升法，更新策略函数的参数$\theta$:
 $$
 \theta=\theta+\alpha\nabla_{\theta}\log\pi_{\theta}(s_t,a_t)v_t
 $$
 
2. 返回策略函数的参数$\theta$

这里的策略函数可以是softmax策略(离散策略)，高斯策略(连续策略)或者其他策略.

In [3]:
class Policy_Gradient():
    """Monte-Carlo policy-gradient agent with a one-hidden-layer softmax policy.

    The agent records one episode of (state, action, reward) transitions via
    `store_transition`, and `learn` then performs a single optimizer step on
    the return-weighted negative log-likelihood of the actions taken.
    """

    def __init__(self, env, gamma=None, learning_rate=None):
        """Build the policy network and open a TensorFlow session.

        Args:
            env: Environment exposing `observation_space.shape[0]` and
                `action_space.n` (a discrete action space).
            gamma: Discount factor. Defaults to the module-level `GAMMA`.
            learning_rate: Adam step size. Defaults to the module-level
                `LEARNING_RATE`.
        """
        # init some parameters
        self.time_step = 0
        self.state_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.n
        self.gamma = GAMMA if gamma is None else gamma
        self.learning_rate = LEARNING_RATE if learning_rate is None else learning_rate
        # Per-episode buffers: observations, actions and rewards.
        self.ep_obs, self.ep_as, self.ep_rs = [], [], []
        self.create_softmax_network()

        # Init session
        self.session = tf.InteractiveSession()
        self.session.run(tf.global_variables_initializer())

    def create_softmax_network(self):
        """Create placeholders, the policy network, the loss and the train op."""
        # network weights
        W1 = self.weight_variable([self.state_dim, 20])
        b1 = self.bias_variable([20])
        W2 = self.weight_variable([20, self.action_dim])
        b2 = self.bias_variable([self.action_dim])
        # input layer
        self.state_input = tf.placeholder("float", [None, self.state_dim])
        self.tf_acts = tf.placeholder(tf.int32, [None, ], name="actions_num")
        # state value v_t for every time step t of the episode
        self.tf_vt = tf.placeholder(tf.float32, [None, ], name="actions_value")
        # hidden layers
        h_layer = tf.nn.relu(tf.matmul(self.state_input, W1) + b1)
        # softmax layer (logits)
        self.softmax_input = tf.matmul(h_layer, W2) + b2
        # softmax output: probability of each action
        self.all_act_prob = tf.nn.softmax(self.softmax_input, name='act_prob')
        # sparse_softmax_cross_entropy_with_logits takes integer class labels
        # (int32/int64), whereas softmax_cross_entropy_with_logits takes
        # one-hot labels; the sparse variant effectively does the one-hot
        # encoding for us. The result is -log pi(a_t | s_t) per time step.
        self.neg_log_prob = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=self.softmax_input,
                                                                      labels=self.tf_acts)
        self.loss = tf.reduce_mean(self.neg_log_prob * self.tf_vt)  # reward guided loss

        self.train_op = tf.train.AdamOptimizer(self.learning_rate).minimize(self.loss)

    def weight_variable(self, shape):
        """Return a variable of `shape` initialised from a truncated normal."""
        initial = tf.truncated_normal(shape)
        return tf.Variable(initial)

    def bias_variable(self, shape):
        """Return a variable of `shape` initialised to the constant 0.01."""
        initial = tf.constant(0.01, shape=shape)
        return tf.Variable(initial)

    def choose_action(self, observation):
        """Sample an action index from the policy's distribution for `observation`."""
        prob_weights = self.session.run(self.all_act_prob, feed_dict={self.state_input: observation[np.newaxis, :]})
        action = np.random.choice(range(prob_weights.shape[1]), p=prob_weights.ravel())  # select action w.r.t the actions prob
        return action

    def store_transition(self, s, a, r):
        """Append one (state, action, reward) step to the episode buffers."""
        self.ep_obs.append(s)
        self.ep_as.append(a)
        self.ep_rs.append(r)

    def learn(self):
        """Run one policy-gradient update on the buffered episode, then clear it.

        Computes the discounted return for every step (back to front),
        standardises the returns, and feeds them with the stored states and
        actions to the train op. Does nothing if no transition was stored.
        """
        if not self.ep_rs:
            # Nothing recorded: np.vstack([]) below would raise.
            return

        # Work in float64 regardless of the reward type; an integer buffer
        # would truncate the returns and break the in-place normalisation.
        rewards = np.asarray(self.ep_rs, dtype=np.float64)
        discounted_ep_rs = np.zeros_like(rewards)
        running_add = 0.0
        # accumulate from the last step backwards
        for t in reversed(range(len(rewards))):
            running_add = running_add * self.gamma + rewards[t]
            discounted_ep_rs[t] = running_add

        discounted_ep_rs -= np.mean(discounted_ep_rs)
        std = np.std(discounted_ep_rs)
        # A one-step episode (or constant returns) has zero spread; dividing
        # would produce NaN and corrupt the network weights.
        if std > 1e-8:
            discounted_ep_rs /= std

        # train on episode
        self.session.run(self.train_op, feed_dict={
             self.state_input: np.vstack(self.ep_obs), # [batch, state_dim]
             self.tf_acts: np.array(self.ep_as),
             self.tf_vt: discounted_ep_rs,
        })

        self.ep_obs, self.ep_as, self.ep_rs = [], [], []    # empty episode data

In [6]:
# Run configuration for training and evaluation
EPISODE = 1000  # number of training episodes
STEP = 1000     # maximum number of environment steps per episode
TEST = 10       # evaluation episodes averaged at each checkpoint (every 100 training episodes)

## Training

In [None]:
# Create the Gym environment and the policy-gradient agent.
env = gym.make('CartPole-v0')
agent = Policy_Gradient(env)

for episode in range(EPISODE):
    # Start a new training episode.
    state = env.reset()  # the state is a 4-dimensional vector
    for step in range(STEP):
        # Sample an action (0 or 1) from the current softmax policy.
        action = agent.choose_action(state)
        next_state, reward, done, _ = env.step(action)
        agent.store_transition(state, action, reward)
        state = next_state
        if done:
            break

    # Update once per episode, whether it ended with `done` or was cut off by
    # the STEP limit. Otherwise the leftover transitions would be merged into
    # the next episode's return computation.
    if agent.ep_rs:
        agent.learn()

    # Evaluate every 100 episodes.
    if episode % 100 == 0:
        total_reward = 0
        for i in range(TEST):
            state = env.reset()
            for j in range(STEP):
                # env.render()
                # Actions are still sampled from the policy here (not argmax);
                # nothing is stored, so evaluation does not affect training.
                action = agent.choose_action(state)
                state, reward, done, _ = env.step(action)
                total_reward += reward
                if done:
                    break
        ave_reward = total_reward/TEST
        print ('episode: ',episode,'Evaluation Average Reward:',ave_reward)



episode:  0 Evaluation Average Reward: 45.1
episode:  100 Evaluation Average Reward: 53.4
episode:  200 Evaluation Average Reward: 56.2
episode:  300 Evaluation Average Reward: 155.2
episode:  400 Evaluation Average Reward: 200.0
episode:  500 Evaluation Average Reward: 196.8
episode:  600 Evaluation Average Reward: 193.5
episode:  700 Evaluation Average Reward: 68.7
episode:  800 Evaluation Average Reward: 134.9
episode:  900 Evaluation Average Reward: 111.4
