In [1]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import Input, Dense, BatchNormalization, ReLU, Dropout
from tensorflow.keras.models import Model
from itertools import product

# 单步奖励收集

这是一个用于练习策略梯度法 (policy gradient method) 的玩具题。  
8个格子首尾相连，其中随机3个格子奖励，奖励数值为0~1的均匀随机数。  
动作：选择一个格子。
计分规则：  
1. 如果格子有奖励（大于0），获得对应的奖励。
2. 如果格子是空的（等于0），且两侧的格子都有奖励，会获得两侧格子的总奖励。
3. 否则奖励为0。

## 相关函数
创建环境，计算理想得分，模拟多次计算期望

In [2]:
def make_slots():
    slots = np.zeros(8)
    slots[np.random.choice(8, size=3, replace=False)] = np.random.uniform(size=3)
    return slots

def run(slots, action):
    slots_cyclic = np.concatenate((slots, slots[0:2]))
    if slots_cyclic[action] != 0:
        return slots_cyclic[action]
    elif slots_cyclic[action-1] != 0 and slots_cyclic[action+1] != 0:
        return slots_cyclic[action-1] + slots_cyclic[action+1]
    else:
        return 0

def get_optimal_value(slots):
    value = slots.max()

    slots_cyclic = np.concatenate((slots, slots[0:2]))
    for i in np.arange(1, 9):
        if slots_cyclic[i] == 0 and slots_cyclic[i-1] != 0 and slots_cyclic[i+1] != 0:
            value2 = slots_cyclic[i-1] + slots_cyclic[i+1]
            if value2 > value:
                value = value2
    return value

def get_optimal_statics(n_rounds):
    values = []
    for i in np.arange(n_rounds):
        slots = make_slots()
        values.append(get_optimal_value(slots))
    return np.mean(values), np.std(values)

def get_baseline_value(n_rounds):
    value = 0
    for i in np.arange(n_rounds):
        slots = make_slots()
        action = np.random.choice(8)
        value += run(slots, action)
    return value/n_rounds
    

## 创建模型

In [3]:
def create_model(n_hidden_layers, n_dense_units, ratio_dropout, optimizer):
    input_shape = (8,) 
    inputs = Input(shape=input_shape)

    x = inputs
    for i in np.arange(n_hidden_layers):
        x = Dense(n_dense_units)(x) 
        x = BatchNormalization()(x)
        x = ReLU()(x)
        x = Dropout(ratio_dropout)(x)
    
    outputs = Dense(8, activation='softmax')(x)
    model = Model(inputs, outputs)
    model.compile(optimizer=optimizer, 
                  loss='categorical_crossentropy')
    return model

def test_model(model, slots):
    prob = model(np.array([slots])).numpy()
    action = prob.argmax()
    return run(slots, action)

def test_model_n_rounds(model, n_rounds):
    rewards = 0
    slots_list = np.array([make_slots() for i in np.arange(n_rounds)])
    prob_list = model(slots_list).numpy()
    for i, prob in enumerate(prob_list):
        action = prob.argmax()
        rewards += run(slots_list[i], action)
    return rewards/n_rounds

def one_epoch(model, batch_size, n_test_rounds=0):
    slots_list = []
    y_target_list = []
    # rewards = 0
    for i in np.arange(batch_size):
        slots = make_slots()
        slots_list.append(slots)
    prob_list = model(np.array(slots_list)).numpy()
    for i, prob in enumerate(prob_list):
        y_target = np.zeros(8)
        slots = slots_list[i]
        # prob[7] = 1 - np.sum(prob[0:7])
        action = np.random.choice(8, p=prob)
        reward = run(slots, action)
        # rewards += reward
        y_target[action] = reward
        y_target_list.append(y_target)
    model.fit(np.array(slots_list), np.array(y_target_list), verbose = 0)
    if n_test_rounds > 0:
        return (test_model_n_rounds(model, n_test_rounds))
    # return rewards/n_rounds


## 测试初始值

In [4]:
def init_test(n_tests):
    optimilizer = 'adam'
    n_hidden_layers = 2
    n_dense_units = 64
    ratio_dropout = 0.5
    res = []
    for i in np.arange(n_tests):
        model = create_model(n_hidden_layers, n_dense_units, ratio_dropout, optimilizer)
        scores = np.array([one_epoch(model, batch_size=128, n_test_rounds=10000) for i in np.arange(200)])
        argmax = scores.argmax()
        res.append((argmax, scores[argmax]))
        print(argmax, scores[argmax])
    return res

In [5]:
res = init_test(100)

140 0.7713881173943503
113 0.7835118355870212
122 0.8253750977022765
103 0.7748781818047275
143 0.7889368697398494
117 0.7887397550577548
131 0.789229170882854
91 0.777897915034964
150 0.8134063388221322
90 0.7750306950875012
123 0.81677637218345
131 0.8232512956023175
88 0.8016659332961854
119 0.8135112316031453
111 0.8143305969825757
96 0.7897723789092175
147 0.8024294515099405
118 0.8230995054257084
110 0.7946154881451905
81 0.7129443580902933
106 0.7963776661118621
135 0.8127652945909305
92 0.7836183679447718
123 0.7979029640578997
126 0.8091455659757768
117 0.8039397060637763
131 0.8038789055373614
124 0.8297481187969533
133 0.7836414092166974
117 0.7781721553383599
148 0.8054197433076951
138 0.805087095048984
112 0.7984693696089614
99 0.7832608466982857
104 0.8152274294909233
104 0.7740647903854213
112 0.7966500613234644
112 0.7497995435804522
110 0.7866082784026085
108 0.8131877697005063
102 0.8044192506299691
151 0.8084891269691725
140 0.7809752008746872
121 0.7479881866542702
