In [1]:
%matplotlib inline
import gym
import math
import random
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from collections import namedtuple
from itertools import count
from copy import deepcopy
from PIL import Image
from torch.distributions import Categorical

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.autograd import Variable
import torchvision.transforms as T


# Device selection: use the CUDA tensor types when a GPU is available,
# otherwise fall back to the CPU tensor types.
use_cuda = torch.cuda.is_available()
if use_cuda:
    FloatTensor = torch.cuda.FloatTensor
    LongTensor = torch.cuda.LongTensor
    ByteTensor = torch.cuda.ByteTensor
else:
    FloatTensor = torch.FloatTensor
    LongTensor = torch.LongTensor
    ByteTensor = torch.ByteTensor
Tensor = FloatTensor  # plain alias for the selected float tensor type

### 环境

In [2]:
# Create the CartPole-v1 environment and keep its `.unwrapped` form
# (presumably so that `env.state` can be read directly — confirm).
env = gym.make('CartPole-v1').unwrapped
env.action_space.n  # evaluated but not displayed: only the cell's last expression is shown
env.reset()  # start a new episode before inspecting the state
env.state  # last expression of the cell: the current state is rendered as the cell output

array([ 0.02722231, -0.02951582,  0.04844879, -0.00138314])

### P网络

In [3]:
def ini_net(md):
    """Re-initialise every ``nn.Linear`` layer reachable from ``md``.

    For each linear layer (visited in ``md.modules()`` order) the weight is
    filled with ``torch.nn.init.xavier_normal`` and the bias with
    ``torch.nn.init.normal`` using that function's default parameters.
    Modules of any other type are left untouched. Returns ``None``.
    """
    # Lazy generator: layers are initialised one by one, weight before bias,
    # so random numbers are consumed in the same order as a plain loop.
    linear_layers = (layer for layer in md.modules() if isinstance(layer, nn.Linear))
    for layer in linear_layers:
        torch.nn.init.xavier_normal(layer.weight.data)
        torch.nn.init.normal(layer.bias.data)

class PN(nn.Module):
    """Policy network: a three-layer MLP from 4 input features to 2 outputs.

    The final ``nn.Softmax(1)`` normalises the two outputs along dimension 1,
    so each row of the result sums to one.
    """

    def __init__(self):
        super(PN, self).__init__()
        hidden = 100
        layers = [
            nn.Linear(4, hidden),
            nn.ReLU(),
            nn.Linear(hidden, hidden),
            nn.ReLU(),
            nn.Linear(hidden, 2),
            nn.Softmax(1),
        ]
        # Attribute name ``MLP`` is kept so parameter names stay ``MLP.<i>.*``.
        self.MLP = nn.Sequential(*layers)

    def forward(self, x):
        """Run ``x`` through the MLP and return its softmax output."""
        return self.MLP(x)
    
# Build the policy network and move it to the GPU only when one is available;
# an unconditional .cuda() fails on CPU-only machines even though `use_cuda`
# is already computed at the top of the notebook.
pnet = PN()
if use_cuda:
    pnet = pnet.cuda()
pnet.apply(ini_net)
# Smoke test: push one random 1x4 input through the network and display the
# resulting action probabilities as the cell output.
testx = Variable(torch.randn(1,4))
if use_cuda:
    testx = testx.cuda()
pnet(testx)

Variable containing:
 0.6463  0.3537
[torch.cuda.FloatTensor of size 1x2 (GPU 0)]

### 动作策略

In [4]:
def select_action(state):
    """Sample one action from the policy for ``state``.

    The global ``pnet`` is evaluated on ``state`` and its output is used as
    the probabilities of a ``Categorical`` distribution, from which a single
    sample is drawn.

    Returns:
        tuple: ``(action, log_prob)`` where ``action`` is the first element of
        the sample converted through numpy, and ``log_prob`` is the tensor
        returned by ``Categorical.log_prob`` for that sample.
    """
    policy = Categorical(pnet(state))
    sampled = policy.sample()
    log_prob = policy.log_prob(sampled)
    # Pull the sampled index back to the host as a numpy scalar.
    return sampled.data.cpu().numpy()[0], log_prob

# Quick check of select_action on a random 1x4 state. The input is moved to
# the GPU only when CUDA is available, so the cell also runs on CPU-only hosts.
testx = Variable(torch.randn(1,4))
if use_cuda:
    testx = testx.cuda()
print(select_action(testx))

(1, Variable containing:
-0.9122
[torch.cuda.FloatTensor of size 1 (GPU 0)]
)


### Episode 回合片段

In [5]:
class Episode:
    """Buffer for one episode of policy-gradient training.

    Stores the per-step log-probabilities and rewards, and converts them into
    a scalar loss (negative log-probability weighted by the standardised
    discounted return) once the episode is over.
    """

    # Small constant added to the standard deviation so that standardising the
    # returns never divides by zero.
    _EPS = float(np.finfo(np.float32).eps)

    def __init__(self, gamma=0.99):
        """Create an empty episode.

        Args:
            gamma: discount factor used when accumulating returns. Defaults to
                0.99, the value that used to be hard-coded in ``_reward``.
        """
        self.log_probs = []
        self.rewards = []
        self.R = 0
        self.gamma = gamma

    def __len__(self):
        """Return the number of steps recorded so far."""
        # The class has no ``frames`` attribute; one reward is stored per step.
        return len(self.rewards)

    def save_log_probs(self, log_prob):
        """Append the log-probability of the action taken at the current step."""
        self.log_probs.append(log_prob)

    def save_rewards(self, r):
        """Append the reward received at the current step."""
        self.rewards.append(r)

    def _reward(self):
        """Return the standardised discounted return for every step.

        Returns are accumulated backwards through the episode with
        ``self.gamma``, then shifted to zero mean and scaled by their standard
        deviation. The result is wrapped with the module-level ``FloatTensor``.
        """
        n = len(self.rewards)
        ds = np.zeros(n)
        running_add = 0
        for t in reversed(range(n)):
            running_add = running_add * self.gamma + self.rewards[t]
            ds[t] = running_add
        if n > 0:
            # A single-step episode (or constant returns) has std == 0; the
            # epsilon keeps the result finite instead of producing NaN.
            ds = (ds - np.mean(ds)) / (np.std(ds) + self._EPS)
        return FloatTensor(ds)

    def optimize(self):
        """Build the policy-gradient loss for the stored episode.

        Returns:
            The sum over steps of ``-log_prob * standardised_return``. If no
            steps were recorded the plain integer ``0`` is returned, which
            cannot be back-propagated.
        """
        vt = self._reward()
        loss = 0
        for prob, r in zip(self.log_probs, vt):
            loss += - prob * r  # loss contribution of this step
        return loss

#ept = Episode(net, optimizer, gamma=1)

### 训练

In [None]:
from tensorboardX import SummaryWriter
writer = SummaryWriter()  # logger used by the training loop below for the 't' and 'loss' scalars
# NOTE(review): GAMMA is not referenced anywhere else in the visible notebook;
# the Episode returns use a discount of 0.99 — confirm which value is intended.
GAMMA = 1
num_episodes = 5000  # number of episodes the training loop runs
optimizer = optim.Adam(pnet.parameters(),lr=1e-4)  # Adam over the policy network's parameters

def get_state():
    """Return the environment's current state as a single-row float Variable.

    Reads the global ``env.state``, wraps it with the module-level
    ``FloatTensor`` and reshapes it to one row. ``FloatTensor`` is already the
    CUDA type when a GPU is in use, so no extra ``.cuda()`` call is made; the
    unconditional call that used to be here failed on CPU-only machines.
    """
    state = FloatTensor(env.state).view(1,-1)
    return Variable(state)

# Main training loop. It is wrapped in try/finally so the TensorBoard writer
# is closed even when training is interrupted or raises part-way through.
try:
    for i_episode in range(num_episodes):
        # Reset the environment and start a fresh trajectory buffer
        env.reset()
        state = get_state()
        episode = Episode()
        for t in count():  # loops until the environment reports `done`
            #if i_episode>3500:
            env.render()

            action, log_prob = select_action(state)  # sample an action from the policy
            _, reward, done, _ = env.step(action)  # apply it; get the reward and the done flag

            episode.save_log_probs(log_prob)
            episode.save_rewards(reward)

            state = get_state()  # re-read the current state after the step

            if done:
                writer.add_scalar('t', t, i_episode)
                #print(i_episode, t)
                break
        # Episode finished: perform one optimisation step on its loss
        optimizer.zero_grad()
        loss = episode.optimize()
        loss.backward()
        optimizer.step()
        writer.add_scalar('loss', loss, i_episode)
        #print("loss:",loss.data.cpu().numpy()[0])
    print('Complete')
finally:
    writer.close()
#env.render(close=True)
#env.close()

In [None]:
### 主要错误还是在loss函数的计算和r的引导上
### 增加网络的泛化能力