In [1]:
%matplotlib inline
import gym
import math
import random
import numpy as np
from torch.distributions import Categorical
from collections import namedtuple
from copy import deepcopy

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.autograd import Variable
import torchvision.transforms as T


#GPU
use_cuda = torch.cuda.is_available()
FloatTensor = torch.cuda.FloatTensor if use_cuda else torch.FloatTensor
LongTensor = torch.cuda.LongTensor if use_cuda else torch.LongTensor
ByteTensor = torch.cuda.ByteTensor if use_cuda else torch.ByteTensor
Tensor = FloatTensor

#超参数
BATCH_SIZE = 64
GAMMA = 0.99

### 环境

In [2]:
env = gym.make('CartPole-v1').unwrapped
env.action_space.n
env.reset()
env.state

array([-0.03499601, -0.02108763, -0.02357108, -0.00750943])

### 经验重放

In [3]:
Clip = namedtuple('Clip',('state', 'action', 'next_state', 'reward')) #命名元组

class ReplayMemory(object):

    def __init__(self, capacity): #初始化
        self.capacity = capacity
        self.memory = []
        self.position = 0

    def push(self, *args): #添加transition
        """Saves a transition."""
        if len(self.memory) < self.capacity:
            self.memory.append(None)
        self.memory[self.position] = Clip(*args)
        self.position = (self.position + 1) % self.capacity # %取余数，可以循环

    def sample(self, batch_size):
        return random.sample(self.memory, batch_size)

    def __len__(self):
        return len(self.memory)

### Actor网络

In [4]:
def ini_net(md):
    for m in md.modules():
        if isinstance(m, nn.Linear):
            torch.nn.init.xavier_normal(m.weight.data)
            torch.nn.init.normal(m.bias.data)

class ActorNet(nn.Module):

    def __init__(self):
        super(ActorNet, self).__init__()
        self.MLP = nn.Sequential(
            nn.Linear(4,100),
            nn.ReLU(),
            nn.Linear(100,100),
            nn.ReLU(),
            nn.Linear(100,2),
            nn.Softmax(1)
        )
    def forward(self, x):
        x = self.MLP(x)
        return x
    
actor_net = ActorNet().cuda()
actor_net.apply(ini_net)
testx = Variable(torch.randn(1,4)).cuda()
actor_net(testx)

Variable containing:
 0.2774  0.7226
[torch.cuda.FloatTensor of size 1x2 (GPU 0)]

### Critic网络

In [5]:
class CriticNet(nn.Module):

    def __init__(self):
        super(CriticNet, self).__init__()
        self.MLP = nn.Sequential(
            nn.Linear(4,100),
            nn.ReLU(),
            nn.Linear(100,100),
            nn.ReLU(),
            nn.Linear(100,1),
        )
    def forward(self, x):
        x = self.MLP(x)
        return x
       
critic_net = CriticNet().cuda()
critic_net.apply(ini_net)

critic_net2 = deepcopy(critic_net) #延迟网络

testx = Variable(torch.randn(1,4)).cuda()
critic_net(testx)

Variable containing:
 3.8090
[torch.cuda.FloatTensor of size 1x1 (GPU 0)]

### 动作策略

In [6]:
def select_action(state):
    state = Variable(state)
    prob = actor_net(state)
    m = Categorical(prob)
    action = m.sample()
    log_prob = m.log_prob(action)
    action = action.data.cpu().numpy()[0]
    return action, log_prob

testx = torch.randn(1,4).cuda()
select_action(testx)

(1, Variable containing:
 -0.3461
 [torch.cuda.FloatTensor of size 1 (GPU 0)])

In [7]:
memory = ReplayMemory(100000) #10000个重放记忆

### Loss

In [8]:
def critic_optimize():
    if len(memory) < BATCH_SIZE: #如果记忆长度小于batch_size就弹出
        return
    
    #取出一个批次的片段
    clips = memory.sample(BATCH_SIZE) #取一个batch_size长度的transitions
    batch = Clip(*zip(*clips)) #打一下包
    s2_batch = Variable(torch.cat(batch.next_state),volatile=True)
    s_batch = Variable(torch.cat(batch.state))
    a_batch = Variable(torch.cat(batch.action))
    r_batch = Variable(torch.cat(batch.reward))
    
    #计算Q(s,a)
    Qsa = critic_net(s_batch) 

    #计算Q(s_,a_) 
    Qs2a = critic_net2(s2_batch) #取最大的一个，动作
    Qs2a = Qs2a.view(-1)
    
    # 计算y = r + Q(s_,a_)
    y = (Qs2a * GAMMA) + r_batch
    

    # Huber loss
    #loss = F.smooth_l1_loss(state_action_values, expected_state_action_values)
    loss = torch.nn.MSELoss()(Qsa, y)
    # 优化model
    optimizer1.zero_grad()
    loss.backward()
    #为了防止值过大，可以裁剪梯度
    for param in critic_net.parameters():
        param.grad.data.clamp_(-1, 1)
    optimizer1.step()
    return loss

def A_error(s, s_, r):
    s = Variable(s)
    s_ = Variable(s_)    
    Qs = critic_net(s)
    y = critic_net2(s_).detach() + r
    A = y - Qs #实际奖励-估计的奖励
    A = A.detach()
    return A

def actor_loss(log_prob, A):
    aloss = -log_prob * A
    return aloss    

### 训练

In [19]:
from tensorboardX import SummaryWriter
writer = SummaryWriter()
num_episodes = 1
optimizer1 = optim.Adam(critic_net.parameters(),lr=1e-4)
optimizer2 = optim.Adam(actor_net.parameters(),lr=1e-4)

def get_state():
    state = FloatTensor(env.state).cuda().view(1,-1)
    #state = Variable(state)
    return state

for i_episode in range(num_episodes):
    # 初始化环境
    env.reset()
    s = get_state()
    step = 0
    
    closs_sum = 0
    aloss_sum = 0
    while True: #无限循环
        #if i_episode>3500: 
        #env.render()
        
        a, log_prob = select_action(s) #根据状态选择一个动作，得到a和log_prob
        s_, r, done, _ = env.step(a) #计算该动作的下一个状态，奖励，done   
        s_ = get_state()
        if done: r -= 10
        
        #计算该状态的Q(s，a),（输入s,a,s_,r，输出A，critic loss）,然后更新网络
        A = A_error(s, s_, r)
        
        print(A)
        
        memory.push(s, LongTensor([int(a)]), s_, Tensor([r]))
        
        #计算该动作的价值，（输入logprob, A，输出actor loss），然后更新网络
        aloss = actor_loss(log_prob, A)
        optimizer2.zero_grad
        aloss.backward()
        optimizer2.step()
        aloss_sum += aloss.data.cpu().numpy()[0]
        
        s = s_ 
        step += 1
        
        critic_optimize()
        
        if done:
            writer.add_scalar('step', step, i_episode)
            writer.add_scalar('aloss', aloss_sum/step, i_episode)
            writer.add_scalar('closs', closs_sum/step, i_episode)
            #print(i_episode, step)
            break
    
   
    if i_episode % 20 == 0:
        critic_net2 = deepcopy(critic_net) 
    
print('Complete')
writer.close()

Variable containing:
-231.7013
[torch.cuda.FloatTensor of size 1x1 (GPU 0)]

Variable containing:
-235.8995
[torch.cuda.FloatTensor of size 1x1 (GPU 0)]

Variable containing:
-240.8292
[torch.cuda.FloatTensor of size 1x1 (GPU 0)]

Variable containing:
-246.4740
[torch.cuda.FloatTensor of size 1x1 (GPU 0)]

Variable containing:
-252.7981
[torch.cuda.FloatTensor of size 1x1 (GPU 0)]

Variable containing:
-259.7388
[torch.cuda.FloatTensor of size 1x1 (GPU 0)]

Variable containing:
-267.2031
[torch.cuda.FloatTensor of size 1x1 (GPU 0)]

Variable containing:
-275.0610
[torch.cuda.FloatTensor of size 1x1 (GPU 0)]

Variable containing:
-283.1384
[torch.cuda.FloatTensor of size 1x1 (GPU 0)]

Variable containing:
-301.2078
[torch.cuda.FloatTensor of size 1x1 (GPU 0)]

Complete
