In [1]:
%matplotlib inline
import gym
import math
import random
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from collections import namedtuple
from itertools import count
from copy import deepcopy
from PIL import Image

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.autograd import Variable
import torchvision.transforms as T

# Set up matplotlib: check whether the active backend name contains 'inline'.
is_ipython = 'inline' in matplotlib.get_backend()
if is_ipython:
    # Only imported when the backend name contains 'inline'.
    from IPython import display


# Switch pyplot to interactive mode.
plt.ion()

# Device selection: pick the CUDA tensor constructors when a GPU is available,
# otherwise fall back to the CPU ones. `Tensor` is an alias for the float type.
use_cuda = torch.cuda.is_available()
if use_cuda:
    FloatTensor = torch.cuda.FloatTensor
    LongTensor = torch.cuda.LongTensor
    ByteTensor = torch.cuda.ByteTensor
else:
    FloatTensor = torch.FloatTensor
    LongTensor = torch.LongTensor
    ByteTensor = torch.ByteTensor
Tensor = FloatTensor

### 重放记忆

In [2]:
# One environment step: (state, action, next_state, reward).
Transition = namedtuple('Transition', ('state', 'action', 'next_state', 'reward'))


class ReplayMemory(object):
    """Fixed-capacity ring buffer of `Transition` records.

    Once `capacity` records are stored, each new record overwrites the oldest
    one.
    """

    def __init__(self, capacity):
        """Create an empty buffer that holds at most `capacity` transitions."""
        self.capacity = capacity
        self.memory = []    # stored Transition records
        self.position = 0   # index the next push writes to

    def push(self, *args):
        """Save a transition.

        `args` are forwarded to `Transition`, i.e. (state, action, next_state,
        reward). Raises TypeError (from the namedtuple constructor) when the
        number of arguments is wrong; the buffer is left untouched in that case.
        """
        # Build the record before growing the list: if the arguments are
        # invalid, no `None` placeholder is left behind for `sample` to return.
        transition = Transition(*args)
        if len(self.memory) < self.capacity:
            self.memory.append(None)
        self.memory[self.position] = transition
        # Modulo wraps the write index so old entries get overwritten.
        self.position = (self.position + 1) % self.capacity

    def sample(self, batch_size):
        """Return `batch_size` distinct stored transitions chosen at random.

        Raises ValueError (from `random.sample`) if fewer than `batch_size`
        transitions are stored.
        """
        return random.sample(self.memory, batch_size)

    def __len__(self):
        """Number of transitions currently stored."""
        return len(self.memory)

### Q网络

In [3]:
def ini_net(md):
    """Re-initialise the weights of every `nn.Linear` inside `md`.

    Weights are drawn from a normal distribution using the initializer's
    default arguments; biases are left untouched. The function walks
    `md.modules()` itself, so one direct call covers the whole module tree.
    """
    # Newer PyTorch releases name the in-place initializer `normal_` and keep
    # `normal` only as a deprecated alias; older releases only have `normal`.
    init_normal = getattr(torch.nn.init, 'normal_', None) or torch.nn.init.normal
    for m in md.modules():
        if isinstance(m, nn.Linear):
            init_normal(m.weight.data)

class DQN(nn.Module):
    """Fully connected Q-network: two hidden ReLU layers and a linear output.

    Args:
        n_observations: size of the input state vector (default 4).
        hidden_size: width of both hidden layers (default 500).
        n_actions: number of Q-values produced, one per action (default 2).

    The defaults reproduce the original fixed 4 -> 500 -> 500 -> 2 network.
    """

    def __init__(self, n_observations=4, hidden_size=500, n_actions=2):
        super(DQN, self).__init__()
        # The attribute name `MLP` is part of the state_dict keys; keep it.
        self.MLP = nn.Sequential(
            nn.Linear(n_observations, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, n_actions),
        )

    def forward(self, x):
        """Map a batch of states (batch, n_observations) to Q-values (batch, n_actions)."""
        return self.MLP(x)
    
# Smoke test: build a network, re-initialise it and run one random input.
net = DQN()
# Module.apply calls ini_net on every submodule, and ini_net itself iterates
# md.modules(), so each Linear layer is re-drawn more than once here; the
# final weights are still plain normal draws.
net.apply(ini_net)
testx = Variable(torch.randn(1,4))
# Last expression of the cell: displays the two Q-values for the test input.
# NOTE(review): the recorded output below shows magnitudes around 1e2-1e3,
# which is what unscaled normal weights on 500-wide layers would produce --
# confirm this scale is intended.
net(testx)

Variable containing:
-1043.0883   138.2366
[torch.FloatTensor of size 1x2]

### 动作策略

In [4]:
def select_action(state):
    """Choose an action for `state` with an epsilon-greedy policy.

    Epsilon decays exponentially from EPS_START towards EPS_END as the global
    `steps_done` counter grows (time constant EPS_DECAY). With probability
    1 - epsilon the action with the highest Q-value from `model` is returned,
    otherwise a uniformly random action in {0, 1}.

    Returns a 1x1 tensor holding the action index. Increments the global
    `steps_done` on every call.
    """
    global steps_done
    sample = random.random()
    # The decayed threshold used to be overwritten with EPS_END right after
    # being computed, which silently disabled EPS_START and EPS_DECAY.
    eps_threshold = EPS_END + (EPS_START - EPS_END) * math.exp(-1. * steps_done / EPS_DECAY)
    steps_done += 1
    if sample > eps_threshold:
        # Greedy branch: index of the larger Q-value, reshaped to 1x1.
        q_values = model(Variable(state, volatile=True).type(FloatTensor))
        return q_values.data.max(1)[1].view(1, 1)
    # Exploration branch: random action 0 or 1.
    return LongTensor([[random.randrange(2)]])

### 值迭代

In [5]:
last_sync = 0  # not used by optimize_model; kept as a module-level name


def optimize_model():
    """Run one Q-learning gradient step on a batch sampled from `memory`.

    Reads the globals `memory`, `BATCH_SIZE`, `GAMMA`, `model` and `optimizer`.
    Returns None without doing anything while `memory` holds fewer than
    BATCH_SIZE transitions; otherwise returns the MSE loss between Q(s, a) and
    reward + GAMMA * max_a' Q(s', a') (the max term is 0 for terminal
    transitions, i.e. those whose next_state is None).
    """
    if len(memory) < BATCH_SIZE:
        return
    transitions = memory.sample(BATCH_SIZE)
    # Transpose a list of Transitions into one Transition of tuples.
    batch = Transition(*zip(*transitions))

    # True for transitions that did not end the episode.
    non_final_flags = tuple(s is not None for s in batch.next_state)
    non_final_mask = ByteTensor(non_final_flags)

    state_batch = Variable(torch.cat(batch.state))
    action_batch = Variable(torch.cat(batch.action))
    reward_batch = Variable(torch.cat(batch.reward))

    # Q(s, a) for the actions actually taken; shape (BATCH_SIZE, 1).
    state_action_values = model(state_batch).gather(1, action_batch)

    # V(s') = max_a' Q(s', a'), left at 0 for terminal transitions.
    next_state_values = Variable(torch.zeros(BATCH_SIZE).type(Tensor))
    if any(non_final_flags):
        # Skipped when every sampled transition is terminal: torch.cat would
        # fail on an empty list.
        non_final_next_states = Variable(
            torch.cat([s for s in batch.next_state if s is not None]),
            volatile=True)
        # The target must not receive gradients. `volatile` only guarantees
        # that on old PyTorch releases, so detach explicitly as well.
        next_state_values[non_final_mask] = (
            model(non_final_next_states).max(1)[0].detach())
    next_state_values.volatile = False

    # Expected Q-values, reshaped to (BATCH_SIZE, 1) to match
    # state_action_values; a (B, 1) vs (B,) pair would otherwise be broadcast
    # to (B, B) by the loss on newer PyTorch releases.
    expected_state_action_values = (next_state_values * GAMMA) + reward_batch
    expected_state_action_values = expected_state_action_values.unsqueeze(1)

    # Alternative loss kept for reference:
    #loss = F.smooth_l1_loss(state_action_values, expected_state_action_values)
    loss = torch.nn.MSELoss()(state_action_values, expected_state_action_values)

    # Optimize the model.
    optimizer.zero_grad()
    loss.backward()
    # Optional gradient clipping, disabled:
    #for param in model.parameters():
    #    param.grad.data.clamp_(-1, 1)
    optimizer.step()
    return loss

### 超参数

In [None]:
BATCH_SIZE = 640     # transitions per optimisation step
GAMMA = 0.999        # discount factor
EPS_START = 0.9      # initial exploration rate
EPS_END = 0.01       # final exploration rate
EPS_DECAY = 200      # larger value = slower epsilon decay
num_episodes = 5000

# Only move the network to the GPU when one is available, consistent with the
# `use_cuda` flag that selects the tensor types.
model = DQN()
if use_cuda:
    model = model.cuda()
model.apply(ini_net)
#optimizer = optim.RMSprop(model.parameters())
optimizer = optim.Adam(model.parameters(),lr=1e-2)
memory = ReplayMemory(100000)  # replay memory holding up to 100000 transitions
pool = ReplayMemory(640)       # staging buffer flushed into `memory` after each episode

# `.unwrapped` strips the wrappers that gym.make puts around the raw environment.
env = gym.make('CartPole-v0').unwrapped
env.reset()

steps_done = 0  # global step counter used by select_action

### 训练

In [None]:
from tensorboardX import SummaryWriter
# Event writer used by the training loop to log the per-episode duration;
# created with default arguments.
writer = SummaryWriter()

def get_screen():
    """Return the environment's current state as a tensor with one row.

    Despite the name, no image is involved: the value is built directly from
    `env.state`. The tensor is created with the global `FloatTensor` type,
    which already lives on the GPU when CUDA is in use, so no explicit
    `.cuda()` call is needed (and none is made, so CPU-only hosts work too).
    """
    return FloatTensor(env.state).view(1, -1)

try:
    for i_episode in range(num_episodes):
        # Start a fresh episode.
        env.reset()
        # get_screen() returns the raw state vector; the network input is the
        # difference between two consecutive states (all zeros right after reset).
        last_screen = get_screen()
        current_screen = get_screen()
        state = current_screen - last_screen
        for t in count():  # runs until the environment reports `done`
            env.render()
            action = select_action(state)
            # Work with a plain int: it is what env.step expects and it makes
            # the comparisons below ordinary integer comparisons.
            action_id = int(action[0, 0])
            _, reward, done, _ = env.step(action_id)

            # Reward shaping against drifting to the edge: penalise pushing the
            # cart further away from the centre. Actions are 0 (left) and
            # 1 (right); the left-hand case used to test for -1, which never
            # occurs, so that penalty was never applied.
            #reward += 0.5 * np.abs(1-env.state[0])
            if action_id == 1 and env.state[0] > 0:
                reward -= 1
            if action_id == 0 and env.state[0] < 0:
                reward -= 1

            reward = Tensor([reward])

            # Observe the new state.
            last_screen = current_screen
            current_screen = get_screen()
            if not done:
                next_state = current_screen - last_screen
            else:
                next_state = None  # marks a terminal transition

            # Stage the transition; it reaches the replay memory at episode end.
            pool.push(state, action, next_state, reward)
            state = next_state

            # One optimisation step per environment step.
            loss = optimize_model()
            if done:
                # Log the episode length (index of the last step).
                writer.add_scalar('t', t, i_episode)
                #print(i_episode, t)
                break
        # Flush the staged transitions into the replay memory, then empty the
        # staging buffer so earlier episodes are not pushed again next time.
        for transition in pool.memory:
            memory.push(*transition)
        del pool.memory[:]
        pool.position = 0

    print('Complete')
finally:
    # Close the event writer even when training is interrupted.
    writer.close()
# Viewer teardown is left disabled, as before:
#env.render(close=True)
#env.close()

两个问题：一、如何防止溜边；二、如何保证战果。

增大批次、增大记忆库，简单粗暴地提升效果。
溜边和直立之间存在矛盾约束，要减少溜边可能需要多步策略。

现在判定改变reward有无效果