# {class}`drlhp.reward_predictor.RewardPredictorNetwork` 测试

In [1]:
import logging
import sys
from pathlib import Path
from d2py.utils.log_config import config_logging

# Resolve everything relative to the directory the notebook is run from.
root_dir = Path(".").resolve()
# Scratch directory; created up front (nothing else in this cell writes to it).
(root_dir/".temp").mkdir(parents=True, exist_ok=True)
# Expose `tests/gym-multigrid`, located three levels above `root_dir`, on the import path.
sys.path.extend([str(root_dir.parents[2]/"tests/gym-multigrid")])
# `temp_dir` finally points at `images`, and the log file path below is built
# from it. Previously only `.temp` was created, so `images` might not exist
# when the log file is opened; create it explicitly.
temp_dir = root_dir/"images"
temp_dir.mkdir(parents=True, exist_ok=True)

logger_name = "drlhp"
logger = logging.getLogger(logger_name)
# NOTE(review): presumably this installs a size-rotated file handler
# (maxBytes / backupCount) on the "drlhp" logger — confirm against d2py.
config_logging(f'{temp_dir}/{logger_name}.log', logger_name, maxBytes=50000, backupCount=2)

In [2]:
from dataclasses import dataclass
from typing import Any
import tensorflow as tf

# import os.path as osp
import time
import logging
import numpy as np
from numpy.testing import assert_equal

from drlhp.utils import RunningStat, batch_iter
from drlhp.reward_predictor_core_network import net_cnn

2024-10-23 20:20:16.549806: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-10-23 20:20:16.568366: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-10-23 20:20:16.574070: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-10-23 20:20:16.587941: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
@dataclass
class RewardPredictorNetwork:
    """用于预测人类对输入轨迹中每一帧的奖励。该模型通过训练来学习人类在成对轨迹之间的偏好。

    - 网络输入：
        1. `s1`/`s2`: 成对的轨迹。这些是两个不同的轨迹，模型需要比较它们并预测人类的偏好。
        2. `pref`: 每对轨迹之间的偏好。这是人类对这两个轨迹的偏好评分，通常是标量值，表示人类更倾向于选择哪个轨迹。

    - 网络输出：
        1. `r1`/`r2`: 每个轨迹中每一帧的预测奖励。模型会为每个轨迹中的每个帧生成奖励值，这些值表示人类对该帧的奖励程度。
        2. `rs1`/`rs2`: 每个轨迹在所有帧上的奖励总和。这是通过对每个轨迹中所有帧的奖励进行求和得到的，表示整个轨迹的总奖励。
        3. `pred`: 预测的偏好。模型输出值，表示它预测人类更倾向于选择哪个轨迹（`s1` 或 `s2`）。
    """
    core_network: Any
    dropout: Any
    batchnorm: Any
    lr: Any
    obs_shape: Any

    def __post_init__(self):
        training = tf.placeholder(tf.bool)
        # Each element of the batch is one trajectory segment.
        # (Dimensions are n segments x n frames per segment x ...)
        h, w, c = self.obs_shape
        s1 = tf.placeholder(tf.float32, shape=(None, None, h, w, c))
        s2 = tf.placeholder(tf.float32, shape=(None, None, h, w, c))
        # For each trajectory segment, there is one human judgement.
        pref = tf.placeholder(tf.float32, shape=(None, 2))

        # Concatenate trajectory segments so that the first dimension is just
        # frames
        # (necessary because of conv layer's requirements on input shape)
        s1_unrolled = tf.reshape(s1, [-1, h, w, c])
        s2_unrolled = tf.reshape(s2, [-1, h, w, c])

        # Predict rewards for each frame in the unrolled batch
        _r1 = self.core_network(
            s=s1_unrolled,
            dropout=self.dropout,
            batchnorm=self.batchnorm,
            reuse=False,
            training=training)
        _r2 = self.core_network(
            s=s2_unrolled,
            dropout=self.dropout,
            batchnorm=self.batchnorm,
            reuse=True,
            training=training)

        # Shape should be 'unrolled batch size'
        # where 'unrolled batch size' is 'batch size' x 'n frames per segment'
        c1 = tf.assert_rank(_r1, 1)
        c2 = tf.assert_rank(_r2, 1)
        with tf.control_dependencies([c1, c2]):
            # Re-roll to 'batch size' x 'n frames per segment'
            __r1 = tf.reshape(_r1, tf.shape(s1)[0:2])
            __r2 = tf.reshape(_r2, tf.shape(s2)[0:2])
        # Shape should be 'batch size' x 'n frames per segment'
        c1 = tf.assert_rank(__r1, 2)
        c2 = tf.assert_rank(__r2, 2)
        with tf.control_dependencies([c1, c2]):
            r1 = __r1
            r2 = __r2

        # Sum rewards over all frames in each segment
        _rs1 = tf.reduce_sum(r1, axis=1)
        _rs2 = tf.reduce_sum(r2, axis=1)
        # Shape should be 'batch size'
        c1 = tf.assert_rank(_rs1, 1)
        c2 = tf.assert_rank(_rs2, 1)
        with tf.control_dependencies([c1, c2]):
            rs1 = _rs1
            rs2 = _rs2

        # Predict preferences for each segment
        _rs = tf.stack([rs1, rs2], axis=1)
        # Shape should be 'batch size' x 2
        c1 = tf.assert_rank(_rs, 2)
        with tf.control_dependencies([c1]):
            rs = _rs
        _pred = tf.nn.softmax(rs)
        # Shape should be 'batch_size' x 2
        c1 = tf.assert_rank(_pred, 2)
        with tf.control_dependencies([c1]):
            pred = _pred

        preds_correct = tf.equal(tf.argmax(pref, 1), tf.argmax(pred, 1))
        accuracy = tf.reduce_mean(tf.cast(preds_correct, tf.float32))

        _loss = tf.nn.softmax_cross_entropy_with_logits_v2(labels=pref,
                                                           logits=rs)
        
        # Shape should be 'batch size'
        c1 = tf.assert_rank(_loss, 1)
        with tf.control_dependencies([c1]):
            loss = tf.reduce_sum(_loss)

        # Make sure that batch normalization ops are updated
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)

        with tf.control_dependencies(update_ops):
            train = tf.train.AdamOptimizer(learning_rate=self.lr).minimize(loss)

        # Inputs
        self.training = training
        self.s1 = s1
        self.s2 = s2
        self.pref = pref

        # Outputs
        self.r1 = r1
        self.r2 = r2
        self.rs1 = rs1
        self.rs2 = rs2
        self.pred = pred

        self.accuracy = accuracy
        self.loss = loss
        self.train = train


In [4]:
# Instantiate the predictor; keyword arguments follow the field order of the
# dataclass. `obs_shape` is unpacked by the class as (height, width, channels).
rpn = RewardPredictorNetwork(
    core_network=net_cnn,
    dropout=0.5,
    batchnorm=True,
    lr=1e-3,
    obs_shape=(84, 84, 4),
)

AttributeError: module 'tensorflow' has no attribute 'placeholder'