# Using DQN for Crowdsourcing Distribution

PyTorch version

In [1]:
%matplotlib inline

import copy
import numpy as np
np.random.seed(0)
import pandas as pd
import gym
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
torch.manual_seed(0)


<torch._C.Generator at 0x1aab43673d0>


## 创建环境并实例化环境对象

In [2]:
class Myenv:
    """Skeleton of the custom environment used by the DQN agent.

    Only the space sizes and the per-episode step limit are defined here;
    ``step`` and ``reset`` are empty placeholders that return ``None``.
    """

    def __init__(self):
        # Dimension of the state space.
        self.state_dim = 40
        # Dimension of the action space.
        self.action_dim = 30
        # Maximum number of steps in one episode.
        self._max_episode_steps = 1000

    def step(self, action):
        """Placeholder for the state transition; does nothing, returns None."""

    def reset(self):
        """Placeholder for the environment reset; does nothing, returns None."""

    # Optional hooks, currently disabled:
    #
    # def render(self):
    #     # draw the current frame
    #     pass
    #
    # def close(self):
    #     # close the window
    #     pass


# Module-level environment instance shared by the cells below.
env = Myenv()


### 定义经验回放

In [3]:
class DQNReplayer:
    """Fixed-capacity ring buffer of transitions for experience replay.

    Transitions are kept in a pandas ``DataFrame`` with one row per slot and
    the columns ``state``, ``action``, ``reward``, ``next_state`` and ``done``.
    Once all ``capacity`` slots are filled, new transitions overwrite the
    oldest ones.
    """

    def __init__(self, capacity):
        """Allocate an empty buffer with ``capacity`` slots."""
        self.memory = pd.DataFrame(index=range(capacity),
                columns=['state', 'action', 'reward', 'next_state', 'done'])
        self.i = 0                      # slot the next transition is written to
        self.count = 0                  # number of slots that hold a transition
        self.capacity = capacity        # total number of slots

    def store(self, *args):
        """Write one transition into the next slot.

        Args:
            *args: the field values in column order
                (state, action, reward, next_state, done).
        """
        # Build the row as an explicit 1-D object array, one cell per field.
        # Passing the raw tuple leaves it to pandas/numpy to coerce a mix of
        # arrays and scalars, which is fragile across library versions.
        row = np.empty(len(args), dtype=object)
        for position, value in enumerate(args):
            row[position] = value
        self.memory.loc[self.i] = row
        self.i = (self.i + 1) % self.capacity       # wrap around when full
        self.count = min(self.count + 1, self.capacity)

    def sample(self, size):
        """Draw ``size`` stored transitions uniformly at random.

        Indices come from ``np.random.choice`` with its default settings, so
        the same transition may appear more than once in a batch.

        Returns:
            A generator yielding one stacked ``numpy`` array per column, in
            column order (states, actions, rewards, next_states, dones).

        Raises:
            ValueError: if no transition has been stored yet.
        """
        if self.count == 0:
            raise ValueError('cannot sample from an empty replay buffer')
        indices = np.random.choice(self.count, size=size)
        return (np.stack(self.memory.loc[indices, field]) for field in
                self.memory.columns)

### 定义DQN代理

In [None]:
class DQNAgent:
    """DQN agent with epsilon-greedy exploration, replay and a target network.

    The evaluation network is the one being trained and used to pick actions.
    The target network is a frozen copy used to compute TD targets; it is
    refreshed from the evaluation network at the start of every training
    episode (see ``reset``).
    """

    # Learning starts once the replay buffer is at least this full.
    LEARN_START_FRACTION = 0.95

    def __init__(self, env, gamma=0.99, epsilon=0.001, learning_rate=0.001,
                 replay_capacity=10000, batch_size=1024):
        """Build the evaluation network, its optimizer and the replay buffer.

        Args:
            env: environment exposing ``state_dim`` and ``action_dim``.
            gamma: discount factor used in the TD target.
            epsilon: probability of taking a random action in train mode.
            learning_rate: learning rate of the Adam optimizer.
            replay_capacity: number of transitions the replay buffer holds.
            batch_size: number of transitions sampled per learning step.
        """
        self.action_n = env.action_dim          # number of discrete actions
        self.gamma = gamma
        self.epsilon = epsilon
        self.batch_size = batch_size

        self.replayer = DQNReplayer(replay_capacity)

        # Evaluation network: maps a state to one Q-value per action.
        self.evaluate_net = self.build_net(
                input_size=env.state_dim,
                hidden_sizes=[64, 64], output_size=self.action_n)
        self.optimizer = optim.Adam(self.evaluate_net.parameters(),
                lr=learning_rate)
        self.loss = nn.MSELoss()

        # Defined up front so that ``step`` works (greedily) even when it is
        # called before ``reset``; ``reset('train')`` refreshes the last two.
        self.mode = None
        self.trajectory = []
        self.target_net = copy.deepcopy(self.evaluate_net)

    def build_net(self, input_size, hidden_sizes, output_size):
        """Return an MLP with ReLU between layers and a linear output layer.

        Args:
            input_size: number of input features.
            hidden_sizes: widths of the hidden layers, in order.
            output_size: number of outputs.
        """
        sizes = [input_size, *hidden_sizes, output_size]
        layers = []
        for in_features, out_features in zip(sizes[:-1], sizes[1:]):
            layers.append(nn.Linear(in_features, out_features))
            layers.append(nn.ReLU())
        # Drop the trailing ReLU so the Q-values are unconstrained.
        return nn.Sequential(*layers[:-1])

    def reset(self, mode=None):
        """Prepare the agent for a new episode.

        Args:
            mode: ``'train'`` enables exploration, replay storage and
                learning; any other value makes the agent act greedily.
        """
        self.mode = mode
        if self.mode == 'train':
            self.trajectory = []
            # Refresh the target network once per training episode.
            self.target_net = copy.deepcopy(self.evaluate_net)

    def step(self, observation, reward, done):
        """Choose an action and, in train mode, record the transition.

        Args:
            observation: current observation of the environment.
            reward: reward received for the previous action.
            done: whether ``observation`` ends the episode.

        Returns:
            The index of the chosen action as a Python ``int``.
        """
        if self.mode == 'train' and np.random.rand() < self.epsilon:
            # Exploration branch of the epsilon-greedy policy.
            action = np.random.randint(self.action_n)
        else:
            state_tensor = torch.as_tensor(observation,
                    dtype=torch.float).squeeze(0)
            # Action selection needs no gradients.
            with torch.no_grad():
                q_tensor = self.evaluate_net(state_tensor)
            action = torch.argmax(q_tensor).item()
        if self.mode == 'train':
            # Each step appends (observation, reward, done, action).
            self.trajectory += [observation, reward, done, action]
            if len(self.trajectory) >= 8:
                # The last two steps form one transition: state and action
                # come from the previous step; next state, reward and done
                # come from the current one.
                state, _, _, act, next_state, reward, done, _ = \
                        self.trajectory[-8:]
                self.replayer.store(state, act, reward, next_state, done)
            if self.replayer.count >= (self.replayer.capacity
                    * self.LEARN_START_FRACTION):
                # Learning is skipped until the buffer is nearly full.
                self.learn()
        return action

    def close(self):
        """Release resources; nothing to do for this agent."""
        pass

    def learn(self):
        """Sample one batch from the replay buffer and train on it."""
        states, actions, rewards, next_states, dones = \
                self.replayer.sample(self.batch_size)
        self._train_on_batch(states, actions, rewards, next_states, dones)

    def _train_on_batch(self, states, actions, rewards, next_states, dones):
        """Run one gradient step of the evaluation network on a batch.

        Each argument is an array-like with one entry per transition.
        """
        state_tensor = torch.as_tensor(states, dtype=torch.float)
        action_tensor = torch.as_tensor(actions, dtype=torch.long)
        reward_tensor = torch.as_tensor(rewards, dtype=torch.float)
        next_state_tensor = torch.as_tensor(next_states, dtype=torch.float)
        done_tensor = torch.as_tensor(dones, dtype=torch.float)

        # The TD target is a constant for this update: compute it without
        # autograd so nothing is back-propagated into the target network.
        with torch.no_grad():
            next_q_tensor = self.target_net(next_state_tensor)
            next_max_q_tensor, _ = next_q_tensor.max(dim=-1)
            target_tensor = reward_tensor + \
                    self.gamma * (1. - done_tensor) * next_max_q_tensor

        # Q-value of the action that was actually taken in each transition.
        pred_tensor = self.evaluate_net(state_tensor)
        q_tensor = pred_tensor.gather(1, action_tensor.unsqueeze(1)).squeeze(1)

        loss_tensor = self.loss(q_tensor, target_tensor)
        self.optimizer.zero_grad()
        loss_tensor.backward()
        self.optimizer.step()


# Module-level agent built from the environment's state and action sizes.
agent = DQNAgent(env)


### 模拟一个回合

In [6]:
def play_episode(env, agent, max_episode_steps=None, mode=None, render=False):
    """Run a single episode of ``agent`` interacting with ``env``.

    Args:
        env: object providing ``reset()`` and ``step(action)`` (and
            ``render()`` when ``render`` is true).
        agent: object providing ``reset(mode=...)``,
            ``step(observation, reward, done)`` and ``close()``.
        max_episode_steps: optional cap on environment steps; a falsy value
            means no cap.
        mode: forwarded to ``agent.reset``.
        render: when true, ``env.render()`` is called after every agent step.

    Returns:
        A ``(total reward, number of environment steps)`` tuple.
    """
    obs = env.reset()
    last_reward, finished = 0., False
    agent.reset(mode=mode)

    total_reward = 0.
    step_count = 0
    truncated = False
    while not truncated:
        # The agent is also shown the terminal observation before we stop.
        chosen_action = agent.step(obs, last_reward, finished)
        if render:
            env.render()
        if finished:
            break
        obs, last_reward, finished, _ = env.step(chosen_action)
        total_reward += last_reward
        step_count += 1
        # Leave the loop once the optional step cap has been reached.
        truncated = bool(max_episode_steps) and step_count >= max_episode_steps

    agent.close()
    return total_reward, step_count


18:56:31 [INFO] ==== train ====


### 查看网络结构

In [None]:
# Visualize the autograd graph of the evaluation network with torchviz.
from torchviz import make_dot
model=agent.evaluate_net
# Forward one random state vector through the network to obtain a graph.
# NOTE(review): this draws from numpy's global RNG, so running this cell
# before training changes the random sequence seen by later cells.
y=model(torch.Tensor(np.random.random(env.state_dim)))
g = make_dot(y)
# Write the graph to a file named 'espnet_model' and open it in a viewer.
g.render('espnet_model', view=True)

### 强化学习训练Q网络

In [None]:
print('============ train ============')
# Total reward of each finished training episode, in order.
episode_rewards = []
for episode in range(10000):
    # One episode in train mode, capped at the environment's own step limit.
    episode_reward, elapsed_steps = play_episode(env, agent,
            max_episode_steps=env._max_episode_steps, mode='train')
    episode_rewards.append(episode_reward)
    print('train episode %d: reward = %.2f, steps = %d'%
            (episode, episode_reward, elapsed_steps))
    # Stop early once the mean reward of the latest (at most 10) episodes
    # exceeds -110.
    # NOTE(review): the mean is also taken when fewer than 10 episodes exist,
    # so one good early episode ends training; the -110 threshold is not
    # derived from anything in this file -- confirm it suits this environment.
    if np.mean(episode_rewards[-10:]) > -110:
        break
# Learning curve: episode reward against episode index.
plt.plot(episode_rewards)



### 回合测试

In [None]:

print('============ test ============')
# Total reward of each evaluation episode, in order.
episode_rewards = []
for episode in range(100):
    # Default mode (None): the agent acts greedily and does not learn.
    # NOTE(review): no step cap is passed here, so an episode only ends when
    # the environment reports done -- confirm the environment guarantees that.
    episode_reward, elapsed_steps = play_episode(env, agent)
    episode_rewards.append(episode_reward)
    print('test episode %d: reward = %.2f, steps = %d'%
            (episode, episode_reward, elapsed_steps))
# Mean and standard deviation of the reward over all evaluation episodes.
print('average episode reward = %.2f ± %.2f'%
        (np.mean(episode_rewards), np.std(episode_rewards)))

In [14]:
# env.close()