In [1]:
%matplotlib inline
import gym
import math
import random
import numpy as np
from torch.distributions import Categorical

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.autograd import Variable
import torchvision.transforms as T


# Device selection: pick the CUDA tensor constructors when a GPU is available,
# otherwise fall back to the CPU ones. `Tensor` is an alias for `FloatTensor`.
use_cuda = torch.cuda.is_available()
if use_cuda:
    FloatTensor, LongTensor, ByteTensor = (
        torch.cuda.FloatTensor,
        torch.cuda.LongTensor,
        torch.cuda.ByteTensor,
    )
else:
    FloatTensor, LongTensor, ByteTensor = (
        torch.FloatTensor,
        torch.LongTensor,
        torch.ByteTensor,
    )
Tensor = FloatTensor

### 环境

In [2]:
# Create the CartPole-v1 environment and keep its `.unwrapped` object
# (presumably to bypass gym's wrappers such as the episode step limit -- TODO confirm).
env = gym.make('CartPole-v1').unwrapped
# Bare expression: reads the number of discrete actions but discards the value
# (it is not the last line of the cell, so nothing is displayed for it).
env.action_space.n
# Reset the environment; as the last expression of the cell its return value
# (the initial observation) is what the notebook displays.
env.reset()

array([-0.04174217, -0.00824811, -0.02843118, -0.04435518])

### Actor网络

In [3]:
def ini_net(md):
    """Re-initialise every ``nn.Linear`` found inside ``md`` (including ``md`` itself).

    Weights are drawn with Xavier-normal initialisation and biases from a
    standard normal distribution. Modules of any other type are left untouched.
    The tensors are modified in place; nothing is returned.

    NOTE(review): the non-underscore init function names used here are the
    old-style spelling; newer PyTorch releases appear to prefer
    ``xavier_normal_`` / ``normal_`` -- confirm before upgrading.
    """
    linear_layers = (layer for layer in md.modules() if isinstance(layer, nn.Linear))
    for layer in linear_layers:
        # Order matters for reproducibility under a fixed seed: weight first, then bias.
        torch.nn.init.xavier_normal(layer.weight.data)
        torch.nn.init.normal(layer.bias.data)

class ActorNet(nn.Module):
    """Policy network: a 4 -> 100 -> 100 -> 100 -> 2 MLP with ReLU activations.

    The final ``Softmax`` over dimension 1 turns the two outputs of each row
    into action probabilities, so the input is expected to be 2-D
    (one row per state).
    """

    def __init__(self):
        super(ActorNet, self).__init__()
        width = 100
        # Build the stack in the same order as a literal Sequential(...) call so
        # that sub-module indices (and therefore state_dict keys) are unchanged.
        stack = [nn.Linear(4, width), nn.ReLU()]
        for _ in range(2):
            stack.extend([nn.Linear(width, width), nn.ReLU()])
        stack.extend([nn.Linear(width, 2), nn.Softmax(1)])
        self.MLP = nn.Sequential(*stack)

    def forward(self, x):
        """Return the action probabilities for the batch of states ``x``."""
        return self.MLP(x)
    
# Instantiate the actor. Only move it to the GPU when one is actually available
# (`use_cuda` is computed in the setup cell); an unconditional .cuda() fails on
# CPU-only machines.
actor_net = ActorNet()
if use_cuda:
    actor_net = actor_net.cuda()
actor_net.apply(ini_net)
# Smoke test: one random state, converted to the device-appropriate tensor type.
testx = Variable(torch.randn(1, 4).type(FloatTensor))
actor_net(testx)

Variable containing:
 0.4827  0.5173
[torch.cuda.FloatTensor of size 1x2 (GPU 0)]

### Critic网络

In [4]:
class CriticNet(nn.Module):
    """Value network: a 4 -> 100 -> 100 -> 100 -> 1 MLP with ReLU activations.

    Produces one unbounded scalar per input row (no activation on the output).
    """

    def __init__(self):
        super(CriticNet, self).__init__()
        sizes = [4, 100, 100, 100]
        stack = []
        # Hidden layers: Linear followed by ReLU, created in order so that the
        # sub-module indices (and state_dict keys) match a literal Sequential.
        for n_in, n_out in zip(sizes[:-1], sizes[1:]):
            stack.append(nn.Linear(n_in, n_out))
            stack.append(nn.ReLU())
        stack.append(nn.Linear(sizes[-1], 1))
        self.MLP = nn.Sequential(*stack)

    def forward(self, x):
        """Return the value estimate for each row of ``x``."""
        return self.MLP(x)
       
# Instantiate the critic. Only move it to the GPU when one is actually available
# (`use_cuda` is computed in the setup cell); an unconditional .cuda() fails on
# CPU-only machines.
critic_net = CriticNet()
if use_cuda:
    critic_net = critic_net.cuda()
critic_net.apply(ini_net)
# Smoke test: one random state, converted to the device-appropriate tensor type.
testx = Variable(torch.randn(1, 4).type(FloatTensor))
critic_net(testx)

Variable containing:
-0.9699
[torch.cuda.FloatTensor of size 1x1 (GPU 0)]

### 动作策略

In [5]:
def select_action(state):
    """Sample an action from the actor's policy for ``state``.

    Feeds ``state`` through the global ``actor_net``, treats the output as the
    probabilities of a categorical distribution and draws one sample.

    Returns a tuple ``(action, log_prob)``: ``action`` is the first element of
    the sample converted to a NumPy scalar on the CPU, and ``log_prob`` is the
    log-probability of the sampled action (still attached to the graph so it
    can be used in the actor loss).
    """
    policy = Categorical(actor_net(state))
    sampled = policy.sample()
    chosen = sampled.data.cpu().numpy()[0]
    return chosen, policy.log_prob(sampled)

# Smoke test with one random state. Use the device-appropriate tensor type
# instead of an unconditional .cuda(), which fails on CPU-only machines.
testx = Variable(torch.randn(1, 4).type(FloatTensor))
select_action(testx)

(1, Variable containing:
 -0.5598
 [torch.cuda.FloatTensor of size 1 (GPU 0)])

### Loss

In [None]:
class Episode:
    """Trajectory buffer for one episode plus the actor/critic loss computation.

    During a rollout the caller appends, per step, the log-probability of the
    chosen action, the reward, and the state the action was taken in. After
    the episode ends, ``optimize()`` turns them into the two losses.

    Args:
        gamma: discount factor applied when accumulating returns
            (defaults to 0.99, the value that was previously hard-coded).
    """

    def __init__(self, gamma=0.99):
        self.log_probs = []
        self.rewards = []
        self.actions = []   # kept for compatibility; nothing in this class fills it
        self.clips = []
        self.R = 0          # kept for compatibility; not used by this class
        self.gamma = gamma

    def save_log_probs(self, log_prob):
        """Record the log-probability of the action taken at this step."""
        self.log_probs.append(log_prob)

    def save_rewards(self, r):
        """Record the reward received at this step."""
        self.rewards.append(r)

    def save_clip(self, s):
        """Record the state in which this step's action was taken."""
        self.clips.append(s)

    def _reward(self):
        """Return the discounted return of every step as a ``FloatTensor``.

        Element ``t`` is ``rewards[t] + gamma * rewards[t+1] + gamma**2 * ...``,
        computed with a single backward pass over the stored rewards.
        """
        n = len(self.rewards)
        ds = np.zeros(n)
        running_add = 0
        for t in reversed(range(n)):
            running_add = running_add * self.gamma + self.rewards[t]
            ds[t] = running_add
        return FloatTensor(ds)

    def critic_loss(self):
        """Compare the observed returns with the critic's estimates.

        Returns:
            ``(As, closs)`` where ``As`` is a list with one detached residual
            (observed return minus the global ``critic_net``'s estimate) per
            step, and ``closs`` is the sum of the norms of those residuals
            (not yet averaged).
        """
        vt = self._reward()
        As = []
        closs = 0
        for t in range(len(vt)):
            s = self.clips[t]
            v = vt[t]            # observed discounted return
            Qs = critic_net(s)   # critic's estimate for this state
            A = v - Qs           # observed minus estimated
            # Detach so the actor loss does not back-propagate into the critic.
            As.append(A.detach())
            closs += A.norm()
        return As, closs

    def optimize(self):
        """Build the episode's actor and critic losses.

        The actor loss is the mean over steps of ``-log_prob * A``; the critic
        loss is the mean residual norm from ``critic_loss()``. The caller is
        responsible for calling ``backward()`` and stepping the optimizers.

        Returns:
            ``(aloss, closs)``.
        """
        n = len(self.rewards)
        aloss = 0
        As, closs = self.critic_loss()
        for prob, A in zip(self.log_probs, As):
            aloss += - prob * A   # policy-gradient term for this step
        aloss /= n
        closs /= n
        return aloss, closs

### 训练

In [None]:
# TensorBoard logging through tensorboardX; the writer is created with default arguments.
from tensorboardX import SummaryWriter
writer = SummaryWriter()
# Number of episodes the training loop in this cell runs.
num_episodes = 20000
# optimizer1 updates the critic and optimizer2 the actor; the actor's learning
# rate is ten times smaller than the critic's.
optimizer1 = optim.Adam(critic_net.parameters(),lr=1e-4)
optimizer2 = optim.Adam(actor_net.parameters(),lr=1e-5)

def get_state():
    """Return the environment's current state as a one-row ``Variable``.

    Reads ``env.state`` from the global environment and reshapes it to
    ``(1, -1)`` so it can be fed to the networks as a batch of one.

    ``FloatTensor`` is already the CUDA tensor type when a GPU is available
    (see the setup cell), so no explicit ``.cuda()`` is needed; calling it
    unconditionally would fail on CPU-only machines.
    """
    state = FloatTensor(env.state).view(1, -1)
    return Variable(state)

for i_episode in range(num_episodes):
    # Start a fresh episode and read its initial state.
    env.reset()
    s = get_state()
    step = 0
    ept = Episode()
    while True:  # roll out until the environment reports termination
        #env.render()

        # Sample an action for the current state, then advance the environment.
        a, log_prob = select_action(s)
        _, r, done, _ = env.step(a)

        # Extra penalty on the terminal step (the notes after this cell say this is important).
        if done: r -= 20

        # Store this step's data; `s` is the state the action was taken in.
        ept.save_log_probs(log_prob)
        ept.save_rewards(r)
        ept.save_clip(s)

        s = get_state()
        step += 1
        if done:
            writer.add_scalar('step', step, i_episode)
            break

    # Episode finished: compute both losses and update the networks.
    aloss, closs = ept.optimize()
    # zero_grad must be *called*; the bare attribute access is a no-op and
    # lets gradients from earlier episodes accumulate in every update.
    optimizer1.zero_grad()
    optimizer2.zero_grad()
    aloss.backward()
    closs.backward()
    optimizer2.step()
    optimizer1.step()

    writer.add_scalar('aloss', aloss.data.cpu().numpy(), i_episode)
    writer.add_scalar('closs', closs.data.cpu().numpy(), i_episode)

print('Complete')
writer.close()
#env.render(close=True)
#env.close()

if done: r-=20 很重要   

不是特别容易收敛：主要原因是closs还没有达到较好的状态（critic的价值估计甚至可能还是错误的），aloss就已经收敛了。所以应尽量让critic（closs）先训练到一定程度，再让actor（aloss）跟上。

学习率太高可能无法收敛   

critic网络对每个状态只输出一个标量值，为什么不是为每个动作各输出一个值？
因为critic网络估计的是状态价值，即在某个状态下期望得到多少分数，而不是某个动作能得多少分；用实际得到的回报与它比较，以此来评价实际动作的效果。

dropout可以用

loss加均值？