# 问题描述

有8个格子围成一圈，其中3个格子里加入0.0 - 1.0的均匀随机数，剩下5个为0。

可以采取的动作是选择一个格子。

计分规则：
* 如果所选格子中的数字大于0：得分为格子中的数字。否则，
* 如果所选格子左右两侧格子中的数字都大于0：得分为两侧格子中的数字相加。否则
* 得分为0。

In [97]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import Input, Dense, BatchNormalization, ReLU, Dropout
from tensorflow.keras.models import Model, clone_model
from itertools import product

def make_states(n):
    states = np.zeros((n,8))
    for i in np.arange(n):
        idx = np.random.choice(8, size=3, replace=False)
        states[i, idx] = np.random.uniform(0.0, 1.0, size=3)
    return states
    
def run(state, action):
    if state[action] != 0:
        return state[action]
    else:
        before, after = state[(action - 1) % 8], state[(action + 1) % 8]
        if before != 0 and after != 0:
            return before + after
        else:
            return 0

def get_optimal_action_and_value(state):
    values = np.array([run(state, action) for action in np.arange(8)])
    optimal_action = values.argmax()
    return optimal_action, values[optimal_action]

def get_optimal_actions_and_values(state_list):
    res = [get_optimal_action_and_value(state) for state in state_list]
    actions = np.array([pairs[0] for pairs in res])
    values = np.array([pairs[1] for pairs in res])
    return actions, values
    
def get_optimal_statics(n_rounds):
    state_list = make_states(n_rounds)
    _, values = get_optimal_actions_and_values(state_list)
    return np.mean(values), np.std(values)

def get_random_policy_statics(n_rounds):
    values = [run(state, np.random.choice(8)) for state in make_states(n_rounds)]
    return np.mean(values), np.std(values)

def get_model_actions(model, state_list):
    probs, predicted_values = model(np.array(state_list))
    return probs.numpy().argmax(axis = 1), predicted_values

def test_model(model, n_test_rounds):
    state_list = make_states(n_test_rounds)
    actions, predicted_values = get_model_actions(model, state_list)
    rewards = [run(state, action) for state, action in zip(state_list, actions)]
    optimal_actions, optimal_values = get_optimal_actions_and_values(state_list)
    accuracy = np.mean(actions == optimal_actions)
    values_error_rewards = np.mean(np.square(rewards - predicted_values))
    values_error_optimal = np.mean(np.square(optimal_values - predicted_values))
    return np.mean(rewards), np.std(rewards), accuracy, values_error_rewards, values_error_optimal

def one_batch(model, optimizer, batch_size=128, n_test_rounds=10000):
    states = make_states(batch_size)
    # _, optimal_values = get_optimal_actions_and_values(states)
    with tf.GradientTape() as tape:
        probs, predicted_values = model(states)

        actions = np.array([np.random.choice(8, p=prob) for prob in probs.numpy()])
        rewards = np.array([run(state, action) for state, action in zip(states, actions)])

        advantages = rewards - tf.stop_gradient(predicted_values)
        # advantages = rewards - optimal_values
        advantages = (advantages - np.mean(advantages)) / (np.std(advantages) + 1e-8)

        action_probs = tf.gather_nd(probs, [[i, a] for i, a in enumerate(actions)])
        log_probs = tf.math.log(action_probs)
        actor_loss = -tf.reduce_mean(log_probs * advantages)

        values_loss = 1 * tf.reduce_mean(tf.square(rewards - predicted_values))

        total_loss = actor_loss + values_loss
    
    gradients = tape.gradient(total_loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))
    return actor_loss, values_loss, total_loss

def train(model, optimizer, one_batch, max_batch=200, batch_size=32, test_lapse = 10, n_test_rounds=10000, verbose = 0):
    best_weights = []
    best_idx = 0
    best_score = 0
    for i in np.arange(max_batch):
        actor_loss, values_loss, total_loss = one_batch(model, optimizer, batch_size)
        if verbose & 1:
            print(i+1, "actor_loss=", actor_loss.numpy(), "values_loss=", values_loss.numpy(), "total_loss=", total_loss.numpy())
        if (i+1) % test_lapse == 0 or i + 1 == max_batch:
            score, _, accuracy, err1, err2 = test_model(model, n_test_rounds)
            if best_score < score:
                best_score = score
                best_idx = i
                best_weights = model.get_weights()
            if verbose & 2:
                print(i+1, "score=", score, "accuracy=", accuracy, "err1=", err1, "err2=", err2)
    return best_idx, best_score, best_weights
    
def create_model(n_hidden_layers, n_dense_units, ratio_dropout):
    input_shape = (8,) 
    inputs = Input(shape=input_shape)

    x = inputs
    for i in np.arange(n_hidden_layers):
        x = Dense(n_dense_units)(x) 
        x = BatchNormalization()(x)
        x = ReLU()(x)
        x = Dropout(ratio_dropout)(x)

    x = Dense(64)(x) 
    x = BatchNormalization()(x)
    x = ReLU()(x)
    x = Dropout(ratio_dropout)(x)
    
    probs = Dense(8, activation='softmax')(x)
    values = Dense(1)(x)
    model = Model(inputs, [probs, values])
    return model




# 实验场

In [104]:
model = create_model(n_hidden_layers=2, n_dense_units=128, ratio_dropout=0.2)
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)

In [105]:
best_idx, best_score, best_weights = train(model, optimizer, one_batch, max_batch=1000, batch_size=32, test_lapse = 50, n_test_rounds=10000, verbose = 1|2)

1 actor_loss= 0.0013009161 values_loss= 0.41810775 total_loss= 0.41940868
2 actor_loss= 0.005267076 values_loss= 0.3083955 total_loss= 0.3136626
3 actor_loss= -0.023667186 values_loss= 0.076057404 total_loss= 0.052390218
4 actor_loss= 0.0015249252 values_loss= 0.18431833 total_loss= 0.18584326
5 actor_loss= -0.013403527 values_loss= 0.1268548 total_loss= 0.11345128
6 actor_loss= 0.005255133 values_loss= 0.13206208 total_loss= 0.13731721
7 actor_loss= -0.020725131 values_loss= 0.09013246 total_loss= 0.06940733
8 actor_loss= 0.01888851 values_loss= 0.25606805 total_loss= 0.27495655
9 actor_loss= -0.024948508 values_loss= 0.124883726 total_loss= 0.09993522
10 actor_loss= -0.009337902 values_loss= 0.22799933 total_loss= 0.21866143
11 actor_loss= 0.0054150224 values_loss= 0.12573469 total_loss= 0.13114971
12 actor_loss= -0.011135027 values_loss= 0.16733696 total_loss= 0.15620193
13 actor_loss= -0.008783624 values_loss= 0.15976119 total_loss= 0.15097757
14 actor_loss= -0.011653431 values_los

In [100]:
test_model(model, 10000)

(np.float64(0.8630944939125264),
 np.float64(0.3682051278824428),
 np.float64(0.6608),
 np.float32(0.15495317),
 np.float32(0.1622129))

# 总结

```
advantages = rewards - tf.stop_gradient(predicted_values)
```
计算 advantages 时，要把 predicted_values 放进 tf.stop_gradient 里，计算导数时忽略它。这样 actor_loss 只和 probs 有关，critic_loss 只和 predicted_values 有关。

目前看来，policy 一直在进步，但 value 很快就不进步了。由于实验问题过于简单，看不出来所采取的算法是否有效。

考虑设计一个复杂一点的问题：
* 多步
* 对抗性

5*5格子，随机1格 为bot出生点、5格为奖励积分、1格为电池。bot 能量上限为5，初始为5，每走1步消耗1能量，电池增加3能量。能量耗尽游戏结束。游戏目标是获取尽可能多的积分。

