# In [38]:
import numpy as np
from itertools import accumulate
import gym
import tensorflow as tf

class VPGBuffer:
    """Fixed-size rollout buffer for vanilla policy gradient (VPG).

    Stores one epoch of agent-environment interaction and, when a
    trajectory ends, computes GAE-Lambda advantages and discounted
    rewards-to-go for the steps of that trajectory.

    Args:
        obs_dim: Length of a single (flat) observation vector.
        act_dim: Accepted for interface compatibility; currently unused
            because one scalar action is stored per step.
        size: Number of steps the buffer holds (steps per epoch).
        gamma: Discount factor.
        lam: GAE-Lambda smoothing factor.
    """

    # Keeps advantage normalization finite when every advantage is equal.
    _ADV_EPS = 1e-8

    def __init__(self, obs_dim, act_dim, size, gamma=0.99, lam=0.95):
        # `act_dim` must not carry a default here: a required parameter
        # (`size`) cannot follow a defaulted one in a Python signature.
        self.obs_buf = np.zeros(shape=(size, obs_dim), dtype=np.float32)
        # Actions (rather than log-prob gradients) are stored so the policy
        # update can be evaluated as one batched gradient step.
        self.act_buf = np.zeros(shape=(size,), dtype=np.float32)
        self.rew_buf = np.zeros(shape=(size,), dtype=np.float32)
        self.val_buf = np.zeros(shape=(size,), dtype=np.float32)
        self.logp_buf = np.zeros(shape=(size,), dtype=np.float32)
        self.adv_buf = np.zeros(shape=(size,), dtype=np.float32)
        self.ret_buf = np.zeros(shape=(size,), dtype=np.float32)
        self.ptr, self.path_start_idx = 0, 0
        self.gamma, self.lam, self.size = gamma, lam, size

    @staticmethod
    def _discount_cumsum(values, discount):
        """Return a list with out[t] = sum_k discount**k * values[t + k]."""
        running = accumulate(values[::-1], lambda acc, x: acc * discount + x)
        return list(running)[::-1]

    def store(self, obs, act, rew, val, logp=0):
        """Append one step of interaction at the current write position.

        Raises:
            AssertionError: If the buffer is already full.
        """
        assert self.ptr < self.size, \
            "buffer overflowed: size= " + str(self.ptr) + " max_size= " + str(self.size)
        self.obs_buf[self.ptr] = obs
        self.act_buf[self.ptr] = act
        self.rew_buf[self.ptr] = rew
        self.logp_buf[self.ptr] = logp
        self.val_buf[self.ptr] = val
        self.ptr += 1

    def finish_path(self, last_val=0):
        """Close the current trajectory and fill its advantages and returns.

        Args:
            last_val: Value used to bootstrap beyond the last stored step
                (0 by default).
        """
        path_slice = slice(self.path_start_idx, self.ptr)
        rews = np.append(self.rew_buf[path_slice], last_val)
        vals = np.append(self.val_buf[path_slice], last_val)

        # GAE-Lambda: discounted sum of one-step TD residuals.
        deltas = rews[:-1] + self.gamma * vals[1:] - vals[:-1]
        self.adv_buf[path_slice] = self._discount_cumsum(
            deltas, self.gamma * self.lam)

        # Rewards-to-go, used as regression targets for the value function.
        # The trailing entry belongs to `last_val` itself and is dropped.
        self.ret_buf[path_slice] = self._discount_cumsum(rews, self.gamma)[:-1]
        self.path_start_idx = self.ptr

    def get(self):
        """Return ``[obs, act, adv, ret]`` for a full buffer and reset it.

        Advantages are normalized to zero mean and unit standard deviation.

        Raises:
            AssertionError: If the buffer is not full.
        """
        assert self.ptr == self.size, \
            "buffer is not full"
        self.ptr, self.path_start_idx = 0, 0

        # Advantage normalization trick; the epsilon avoids a division by
        # zero when all advantages are identical.
        adv_mean, adv_std = np.mean(self.adv_buf), np.std(self.adv_buf)
        self.adv_buf = (self.adv_buf - adv_mean) / (adv_std + self._ADV_EPS)
        return [self.obs_buf, self.act_buf, self.adv_buf, self.ret_buf]
    
    

def mlp(x, hidden_sizes=(64, 64), activation=tf.tanh, output_activation=None):
    """Stack dense layers on top of ``x`` and return the final layer's output.

    Every entry of ``hidden_sizes`` except the last becomes a dense layer
    using ``activation``; the last entry is the width of the output layer,
    which uses ``output_activation``.
    """
    layer_input = x
    for width in hidden_sizes[:-1]:
        layer_input = tf.layers.dense(
            layer_input, units=width, activation=activation)
    output_width = hidden_sizes[-1]
    return tf.layers.dense(
        layer_input, units=output_width, activation=output_activation)


def vpg(env_name, epoch=10, local_steps_per_epoch=4000, seed=0,
        pi_lr=3e-4, vf_lr=1e-3, gamma=0.99, lam=0.97,
        max_ep_len=1000, train_v_iters=80):
    """Train a vanilla policy gradient agent with a learned value baseline.

    The environment must have a discrete action space (``action_space.n`` is
    read) and a 1-D observation space (``observation_space.shape[0]`` is
    read). The classic Gym API is assumed: ``reset()`` returns the
    observation and ``step()`` returns a 4-tuple.

    Args:
        env_name: Gym environment id passed to ``gym.make``.
        epoch: Number of collect-then-update epochs to run.
        local_steps_per_epoch: Environment steps collected per epoch.
        seed: Seed for TensorFlow's graph-level random generator.
        pi_lr: Adam learning rate for the policy network.
        vf_lr: Adam learning rate for the value network.
        gamma: Discount factor.
        lam: GAE-Lambda smoothing factor.
        max_ep_len: Maximum number of steps before a trajectory is cut off.
        train_v_iters: Value-function gradient steps per epoch.
    """
    env = gym.make(env_name)
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.n
    buf = VPGBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam)

    # define graph
    tf.set_random_seed(seed)
    # Observation input; ideally this should adapt to different environments.
    x_ph = tf.placeholder(shape=(None, obs_dim), dtype=tf.float32)
    # Actions read back from the buffer, used to evaluate log p(a|s) during
    # training (should eventually support continuous actions as well).
    a_ph = tf.placeholder(shape=(None,), dtype=tf.int32)
    adv_ph = tf.placeholder(shape=(None,), dtype=tf.float32)
    ret_ph = tf.placeholder(shape=(None,), dtype=tf.float32)
    all_phs = [x_ph, a_ph, adv_ph, ret_ph]

    # value-function net (s -> v)
    v = tf.squeeze(mlp(x_ph, hidden_sizes=(64, 64, 1)), axis=1)

    # stochastic policy net (s -> a)
    logits = mlp(x_ph, hidden_sizes=(64, 64, act_dim))
    logp_all = tf.nn.log_softmax(logits)
    # sampling
    pi = tf.squeeze(tf.random.categorical(logp_all, 1), axis=1)
    # log-likelihood of the actions that were actually taken
    logp = tf.reduce_sum(tf.one_hot(a_ph, depth=act_dim) * logp_all, axis=1)

    # VPG objectives
    pi_loss = -tf.reduce_mean(logp * adv_ph)
    v_loss = tf.reduce_mean((ret_ph - v) ** 2)

    # optimizers
    train_pi = tf.train.AdamOptimizer(learning_rate=pi_lr).minimize(pi_loss)
    train_v = tf.train.AdamOptimizer(learning_rate=vf_lr).minimize(v_loss)

    try:
        # The context manager closes the session even if training raises.
        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())

            for _ in range(epoch):
                o, ep_len = env.reset(), 0
                for t in range(local_steps_per_epoch):
                    a, v_t = sess.run(
                        [pi, v], feed_dict={x_ph: o.reshape(1, -1)})
                    next_o, r, d, _ = env.step(a[0])
                    # Store the reward produced by *this* action together
                    # with the observation it was chosen from.
                    buf.store(o, a[0], r, v_t[0])
                    o = next_o
                    ep_len += 1

                    terminal = d or (ep_len == max_ep_len)
                    epoch_ended = t == local_steps_per_epoch - 1
                    if terminal or epoch_ended:
                        # A true terminal state has value 0; a trajectory
                        # cut off by a step limit is bootstrapped from the
                        # value estimate of the state it stopped in.
                        if d:
                            last_val = 0
                        else:
                            last_val = sess.run(
                                v, feed_dict={x_ph: o.reshape(1, -1)})[0]
                        buf.finish_path(last_val)
                        o, ep_len = env.reset(), 0

                inputs = dict(zip(all_phs, buf.get()))

                # one policy gradient step per epoch
                sess.run(train_pi, feed_dict=inputs)

                # several value-function regression steps per epoch
                for _ in range(train_v_iters):
                    sess.run(train_v, feed_dict=inputs)
    finally:
        env.close()


            
            
                  
# Run training only when executed directly (script or notebook cell), so that
# importing this module does not start a training run as a side effect.
if __name__ == "__main__":
    vpg("CartPole-v0")