# Nature DQN

In [14]:
import gym
import numpy as np
import tensorflow as tf
import os
import sys

# Put the parent directory on the module search path exactly once,
# so re-running this cell does not append duplicate entries.
_PARENT_DIR = "../"
if _PARENT_DIR not in sys.path:
    sys.path.append(_PARENT_DIR)

In [11]:
# Breakout environment, looked up by its registered gym id.
env = gym.make("ALE/Breakout-v5")

# Integer ids of the actions the agent may pick; the Q-network built below
# sizes its output layer with len(VALID_ACTIONS).
VALID_ACTIONS = list(range(4))

<!-- <link rel="stylesheet" type="text/css" href="../../style.css"> -->
## 0. 前置知识

### DP, MC, TD
强化学习中策略可以分为:
- 目标策略(target policy): 智能体要学习的策略.
- 行为策略(behavior policy): 智能体与环境交互的策略,即用于生成行为的策略.

广义策略迭代:
- 策略评估(policy evaluation): estimate the value function $v_{\pi}$ for a given policy $\pi$.
- 策略改进(policy improvement): 根据当前价值函数 $v_{\pi}$ 贪心地改进策略, 得到不劣于 $\pi$ 的新策略 $\pi'$; 策略评估与策略改进交替进行, 直至收敛到最优策略.

on-policy指目标和行为策略相同, off-policy指目标和行为策略不同.  

TD方法介于Monte Carlo(MC)方法和Dynamic Programming(DP)方法之间, 既有MC方法直接从原始经验中学习的优点, 也有DP方法自举(bootstrap, 即用一个状态价值估计另一个状态价值)的优点.

DP更新:
$$V\left(S_{t}\right) \leftarrow E_{\pi}\left[R_{t+1}+\gamma V\left(S_{t+1}\right)\right]$$

MC更新:
$$V\left(S_{t}\right) \leftarrow V\left(S_{t}\right)+\alpha\left[G_{t}-V\left(S_{t}\right)\right]$$

TD更新: 
$$V\left(S_{t}\right) \leftarrow V\left(S_{t}\right)+\alpha\left[R_{t+1}+\gamma V\left(S_{t+1}\right)-V\left(S_{t}\right)\right]$$


### SARSA, Q-learning
SARSA 是一种 on-policy TD 方法, 其更新目标使用行为策略实际选出的下一个动作 $A_{t+1}$; Q-learning 是一种 off-policy TD 方法, 其更新目标对下一状态的所有动作取最大值. Q-learning 用 Q-Table 来存储每个状态动作对的 Q 值，若状态和动作空间是离散且维度不高时，是比较有效的；若状态和动作空间是高维连续时，就会出现“维数灾难 (curse of dimensionality)”，即随着维数的增加，计算量呈指数倍增长。

既然我们无法用一个表格来精确地存储与表示 $Q$ 值，我们可以用一个参数化 (parameterized) 函数来近似地表示动作值函数 $Q(s,a)$. 

SARSA update:
$$Q\left(S_{t}, A_{t}\right) \leftarrow Q\left(S_{t}, A_{t}\right)+\alpha\left[R_{t+1}+\gamma Q\left(S_{t+1}, A_{t+1}\right)-Q\left(S_{t}, A_{t}\right)\right]$$

Q-learning update:
$$Q\left(S_{t}, A_{t}\right) \leftarrow Q\left(S_{t}, A_{t}\right)+\alpha\left[R_{t+1}+\gamma \max _{a} Q\left(S_{t+1}, a\right)-Q\left(S_{t}, A_{t}\right)\right]$$


## 1. 经验回放(Experience Replay)

DQN 面临着几个挑战：

- 深度学习需要大量带标签的训练数据；
- 强化学习从 scalar reward 进行学习，但是 reward 经常是 sparse, noisy, delayed；
- 深度学习假设样本数据是独立同分布的，但是强化学习中采样的数据是强相关的

因此，DQN 采用经验回放（Experience Replay）机制，将智能体与环境交互得到的经验存储到 Replay Buffer 中：在每个时间步 t，DQN 先将智能体获得的经验 $(S_t, A_t, R_{t+1}, S_{t+1})$ 存入回放缓存，然后从该缓存中均匀采样小批量样本用于 Q-Learning 更新。好处就是：
1. 数据利用率高；
2. 减少连续样本的相关性，从而减小方差（variance）。

## 2. 目标网络
它作为一个独立的网络，用来代替所需的 Q 网络来生成 Q-Learning 的目标，进一步提高神经网络的稳定性。此外，目标网络每 C 步将通过直接复制（硬更新）或者指数衰减平均（软更新）的方式与主 Q 网络同步。目标网络通过使用旧参数生成 Q-Learning 目标，使目标值的产生不受最新参数的影响，从而大大减少发散和震荡的情况。

例如，在状态-动作对 $(S_t, A_t)$ 上的更新使得 Q 值增加，此时 $S_t$ 和 $S_{t+1}$ 的相似性可能会导致所有动作 a 的 $Q(S_{t+1}, a)$ 值增加，从而使得由 Q 网络产生的训练目标值被过估计。如果改用参数被暂时冻结的目标网络产生训练目标，目标值就不会随每一次参数更新立即变化，从而缓解这一问题。


> Trick --- By using a Convolutional Neural Network as the function approximator on raw pixels of Atari games where the score is the reward we can learn to play many of those games at human-like performance.

数据预处理

In [15]:
class StateProcessor():
    """
    Turns a raw Atari RGB frame into a small grayscale image.

    The TensorFlow graph is built once in the constructor and chains four
    steps: RGB -> grayscale, crop to a 160x160 window, nearest-neighbour
    resize to 84x84, and removal of all size-1 dimensions.
    """
    def __init__(self):
        with tf.variable_scope("state_processor"):
            # A placeholder only reserves a slot in the graph; the actual
            # frame is supplied through feed_dict when a session runs it.
            frame_in = tf.placeholder(shape=[210, 160, 3], dtype=tf.uint8)
            gray = tf.image.rgb_to_grayscale(frame_in)
            # Keep the 160x160 rectangle whose top-left corner sits at
            # row 34, column 0 of the grayscale frame.
            cropped = tf.image.crop_to_bounding_box(gray, 34, 0, 160, 160)
            resized = tf.image.resize_images(
                cropped, [84, 84], method=tf.image.ResizeMethod.NEAREST_NEIGHBOR)
            squeezed = tf.squeeze(resized)

        # Expose every stage under the same attribute names as before.
        self.input_state = frame_in
        self.output1 = gray
        self.output2 = cropped
        self.output3 = resized
        self.output = squeezed

    def process(self, sess, state):
        """
        Run the preprocessing graph on a single frame.

        Args:
            sess: A Tensorflow session object
            state: A [210, 160, 3] Atari RGB State

        Returns:
            A processed [84, 84] state representing grayscale values.
        """
        feeds = {self.input_state: state}
        return sess.run(self.output, feed_dict=feeds)

In [None]:
class Estimator():
    """
    Q-value Estimator neural network.

    This network is used for both the Q-network and the target network;
    the two instances are kept apart by building each under its own
    variable ``scope``.
    """
    def __init__(self, scope="estimator", summaries_dir=None):
        """
        Args:
          scope: Name of the variable scope the graph is built under.
          summaries_dir: Optional base directory. When given, Tensorboard
            summaries are written to ``<summaries_dir>/summaries_<scope>``.
        """
        self.scope = scope
        # Writes Tensorboard summaries to disk
        self.summary_writer = None
        with tf.variable_scope(scope):
            # Build the graph
            self._build_model()
            if summaries_dir:
                summary_dir = os.path.join(summaries_dir, "summaries_{}".format(scope))
                # exist_ok avoids the check-then-create race of exists() + makedirs().
                os.makedirs(summary_dir, exist_ok=True)
                self.summary_writer = tf.summary.FileWriter(summary_dir)

    def _build_model(self):
        """
        Builds the Tensorflow graph.
        """

        # Placeholders for our input
        # Our input are 4 grayscale frames of shape 84, 84 each
        self.X_pl = tf.placeholder(shape=[None, 84, 84, 4], dtype=tf.uint8, name="X")
        # The TD target value
        self.y_pl = tf.placeholder(shape=[None], dtype=tf.float32, name="y")
        # Integer id of which action was selected
        self.actions_pl = tf.placeholder(shape=[None], dtype=tf.int32, name="actions")

        # Scale uint8 pixel values into [0, 1].
        X = tf.to_float(self.X_pl) / 255.0
        batch_size = tf.shape(self.X_pl)[0]

        # Three convolutional layers
        conv1 = tf.contrib.layers.conv2d(
            X, 32, 8, 4, activation_fn=tf.nn.relu)
        conv2 = tf.contrib.layers.conv2d(
            conv1, 64, 4, 2, activation_fn=tf.nn.relu)
        conv3 = tf.contrib.layers.conv2d(
            conv2, 64, 3, 1, activation_fn=tf.nn.relu)

        # Fully connected layers
        flattened = tf.contrib.layers.flatten(conv3)
        fc1 = tf.contrib.layers.fully_connected(flattened, 512)
        # The output layer must be linear: fully_connected applies ReLU by
        # default, which would clamp every Q-value to be non-negative.
        self.predictions = tf.contrib.layers.fully_connected(
            fc1, len(VALID_ACTIONS), activation_fn=None)

        # Get the predictions for the chosen actions only.
        # predictions is flattened row by row, so the entry for
        # (row i, action a) sits at index i * num_actions + a.
        gather_indices = tf.range(batch_size) * tf.shape(self.predictions)[1] + self.actions_pl
        self.action_predictions = tf.gather(tf.reshape(self.predictions, [-1]), gather_indices)

        # Calculate the loss: mean squared error between target and chosen-action value
        self.losses = tf.squared_difference(self.y_pl, self.action_predictions)
        self.loss = tf.reduce_mean(self.losses)

        # Optimizer Parameters from original paper
        self.optimizer = tf.train.RMSPropOptimizer(
            learning_rate=0.00025, decay=0.99, momentum=0.0, epsilon=1e-6)
        self.train_op = self.optimizer.minimize(self.loss, global_step=tf.contrib.framework.get_global_step())

        # Summaries for Tensorboard
        self.summaries = tf.summary.merge([
            tf.summary.scalar("loss", self.loss),
            tf.summary.histogram("loss_hist", self.losses),
            tf.summary.histogram("q_values_hist", self.predictions),
            tf.summary.scalar("max_q_value", tf.reduce_max(self.predictions))
        ])

    def predict(self, sess, s):
        """
        Predicts action values.

        Args:
          sess: Tensorflow session
          s: State input of shape [batch_size, 84, 84, 4]

        Returns:
          Tensor of shape [batch_size, NUM_VALID_ACTIONS] containing the estimated
          action values.
        """
        return sess.run(self.predictions, { self.X_pl: s })

    def update(self, sess, s, a, y):
        """
        Updates the estimator towards the given targets.

        NOTE: the global step is fetched alongside the train op, so a global
        step variable must already exist in the graph; this class does not
        create one.

        Args:
          sess: Tensorflow session object
          s: State input of shape [batch_size, 84, 84, 4]
          a: Chosen actions of shape [batch_size]
          y: Targets of shape [batch_size]

        Returns:
          The calculated loss on the batch.
        """
        feed_dict = { self.X_pl: s, self.y_pl: y, self.actions_pl: a }
        summaries, global_step, _, loss = sess.run(
            [self.summaries, tf.contrib.framework.get_global_step(), self.train_op, self.loss],
            feed_dict)
        if self.summary_writer:
            self.summary_writer.add_summary(summaries, global_step)
        return loss

## 3. DQN算法

[DQN流程图](./images/dqn.png)

[DQN算法](./images/dqn_alogrithm.png)

# Reference