In [33]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import Input, Dense, BatchNormalization, ReLU, Dropout
from tensorflow.keras.models import Model, clone_model
from itertools import product

LOW, HIGH = 0.0, 1.0

def make_states(n):
    states = np.zeros((n,8))
    for i in np.arange(n):
        idx = np.random.choice(8, size=3, replace=False)
        states[i, idx] = np.random.uniform(LOW, HIGH, size=3)
    return states
    
def run(state, action):
    if state[action] != 0:
        return state[action]
    else:
        before, after = state[(action - 1) % 8], state[(action + 1) % 8]
        if before != 0 and after != 0:
            return before + after
        else:
            return 0

def get_optimal_action_and_value(state):
    values = np.array([run(state, action) for action in np.arange(8)])
    optimal_action = values.argmax()
    return optimal_action, values[optimal_action]

def get_optimal_actions_and_values(state_list):
    res = [get_optimal_action_and_value(state) for state in state_list]
    actions = np.array([pairs[0] for pairs in res])
    values = np.array([pairs[1] for pairs in res])
    return actions, values
    
def get_optimal_statics(n_rounds):
    state_list = make_states(n_rounds)
    _, values = get_optimal_actions_and_values(state_list)
    return np.mean(values), np.std(values)

def get_random_policy_statics(n_rounds):
    values = [run(state, np.random.choice(8)) for state in make_states(n_rounds)]
    return np.mean(values), np.std(values)

def get_model_actions(model, state_list):
    return model(np.array(state_list)).numpy().argmax(axis = 1)

def test_model(model, n_test_rounds):
    state_list = make_states(n_test_rounds)
    actions = get_model_actions(model, state_list)
    values = [run(state, action) for state, action in zip(state_list, actions)]
    optimal_actions, _ = get_optimal_actions_and_values(state_list)
    accuracy = np.mean(actions == optimal_actions)
    return np.mean(values), np.std(values), accuracy

def one_batch(model, batch_size, n_test_rounds=0):
    y_target_list = np.zeros((batch_size, 8))
    state_list = make_states(batch_size)
    prob_list = model(np.array(state_list)).numpy()
    actions = np.array([np.random.choice(8, p=prob) for prob in prob_list])
    values = np.array([run(state, action) for state, action in zip(state_list, actions)])
    _, optimal_values = get_optimal_actions_and_values(state_list)
    values = values - optimal_values
    values = (values - np.mean(values)) / (np.std(values) + 1e-8)
    for i in np.arange(batch_size):
        y_target_list[i, actions[i]] = values[i]
    model.fit(state_list, y_target_list, epochs=1, batch_size=batch_size, verbose = 0)
    if n_test_rounds > 0:
        return test_model(model, n_test_rounds)

def one_batch_supervised(model, batch_size, n_test_rounds=0):
    state_list = make_states(batch_size)
    y_target_list = np.zeros((batch_size, 8))
    optimal_actions, _ = get_optimal_actions_and_values(state_list)
    for i in np.arange(batch_size):
        y_target_list[i, optimal_actions[i]] = 1
    model.fit(np.array(state_list), np.array(y_target_list), epochs=1, batch_size=batch_size, verbose=0)
    if n_test_rounds > 0:
        return test_model(model, n_test_rounds)

def train(one_batch, model, max_batch=200, batch_size=128, n_test_rounds=10000, verbose = 0):
    best_weights = []
    best_idx = 0
    best_score = 0
    for i in np.arange(max_batch):
        score, std, accuracy = one_batch(model, batch_size, n_test_rounds)
        if best_score < score:
            best_score = score
            best_idx = i
            best_weights = model.get_weights()
        if verbose == 1:
            print(i, score, accuracy)
    return best_idx, best_score, best_weights

def train_supervised(one_batch_supervised, model, max_batch=200, batch_size=128, n_test_rounds=10000, verbose = 0):
    best_weights = []
    best_idx = 0
    best_score = 0
    for i in np.arange(max_batch):
        score, std, accuracy = one_batch_supervised(model, batch_size, n_test_rounds)
        if best_score < score:
            best_score = score
            best_idx = i
            best_weights = model.get_weights()
        if verbose == 1:
            print(i, score)
    return best_idx, best_score, best_weights


def create_model(n_hidden_layers, n_dense_units, ratio_dropout, optimizer):
    input_shape = (8,) 
    inputs = Input(shape=input_shape)

    x = inputs
    for i in np.arange(n_hidden_layers):
        x = Dense(n_dense_units)(x) 
        x = BatchNormalization()(x)
        x = ReLU()(x)
        x = Dropout(ratio_dropout)(x)
    
    outputs = Dense(8, activation='softmax')(x)
    model = Model(inputs, outputs)
    model.compile(optimizer=optimizer, 
                  loss='categorical_crossentropy')
    return model


# 单步奖励收集

这是一个用于练习策略梯度法 (policy gradient method) 的玩具题。  
8个格子首尾相连，其中随机3个格子奖励，奖励数值为0~1的均匀随机数。  
动作：选择一个格子。
计分规则：  
1. 如果格子有奖励（大于0），获得对应的奖励。
2. 如果格子是空的（等于0），且两侧的格子都有奖励，会获得两侧格子的总奖励。
3. 否则奖励为0。

## 试验场

In [34]:
model = create_model(n_hidden_layers=1, n_dense_units=512, ratio_dropout=0.5, optimizer='adam')

In [42]:
best_idx, best_score, best_weights = train(one_batch, model, max_batch=200, verbose = 1)
model.set_weights(best_weights)

0 0.8478646175654474 0.6338
1 0.8509879983317603 0.6326
2 0.8455333533438745 0.6284
3 0.837055844077785 0.6247
4 0.8510514737780546 0.6315
5 0.8446183210212119 0.6298
6 0.8461430372056797 0.6281
7 0.8394324125476497 0.6237
8 0.8417414600394233 0.6256
9 0.8399797613033337 0.6217
10 0.8417830714649859 0.6273
11 0.8309298049873165 0.6215
12 0.8456453375117766 0.6335
13 0.8439592889357922 0.6417
14 0.8427427111284628 0.6362
15 0.848011827368914 0.6456
16 0.8483297548688772 0.6448
17 0.8553990913506945 0.6386
18 0.8451176372540078 0.6343
19 0.8424569040099532 0.6312
20 0.8485820348358897 0.6309
21 0.847524332661327 0.6278
22 0.8369223338586891 0.6226
23 0.843021981654488 0.6235
24 0.8506109757835364 0.6418
25 0.8452753676050353 0.6367
26 0.837357587590032 0.6332
27 0.847893282123941 0.6331
28 0.8400311792379714 0.6323
29 0.8435311737716055 0.6389
30 0.837961941798909 0.6288
31 0.8363933636968283 0.6265
32 0.8354882268798094 0.6339
33 0.8309822348322418 0.6385
34 0.8341297535177641 0.6258
35

In [43]:
best_idx, *test_model(model, 10000)

(np.int64(72),
 np.float64(0.8502460178006955),
 np.float64(0.3824965504654445),
 np.float64(0.6578))

In [11]:
model = create_model(n_hidden_layers=1, n_dense_units=512, ratio_dropout=0.5, optimizer='adam')

In [12]:
best_idx, best_score, best_weights = train(model, 200, verbose = 1)
model.set_weights(best_weights)

0 0.40155409005897713
1 0.43835942170670744
2 0.47521455094780124
3 0.5233936808251424
4 0.5457541841190825
5 0.60900033488207
6 0.6395265413305641
7 0.6578620451271016
8 0.6911916340208485
9 0.7070164593537791
10 0.7151154144860967
11 0.7321296068984608
12 0.7251182328173948
13 0.7309836764964616
14 0.7589333375036049
15 0.7682199268337767
16 0.7675557165837719
17 0.7607641742508313
18 0.7641782073134503
19 0.7755925855301341
20 0.7770031560281603
21 0.7829396475602843
22 0.7956468143766171
23 0.7972822214302536
24 0.797411493945989
25 0.7803849629307795
26 0.7759102454787973
27 0.7636392430081078
28 0.770455245657486
29 0.7799667565660786
30 0.790556762839348
31 0.7928761250297651
32 0.7965587006606923
33 0.8100902418202696
34 0.8095332245864129
35 0.8120959575341389
36 0.8060285753678174
37 0.8054377181837991
38 0.7873472926820616
39 0.8020201904664144
40 0.813652638932629
41 0.8246549172733206
42 0.8207024059061694
43 0.8216845070555435
44 0.813161396662448
45 0.8044061940287046
46

In [13]:
best_idx, *test_model(model, 10000)

(np.int64(69),
 np.float64(0.8608465176411966),
 np.float64(0.38633711000818116),
 np.float64(0.6628))

## 一些测试结论

先做了一些模拟测试（10万次）确定评价模型的基准。如果采用最优解，单局游戏奖励的平均值为0.933，标准差0.347。采用随机策略的平均奖励是0.297。

首先初步对比了一下有无 BatchNormalization 和 Dropout 的情况，差距颇大。无 BatchNormalization 和 Dropout 时表现很差。观察 weights，应该是遇到了梯度消失和梯度爆炸。

之后的测试都用了 BatchNormalization 和 Dropout。


### 超参数

几个结论：
1. 对比强化学习和监督学习，强化学习算法的实现方式是正确的。
2. adam 比 sgd 更快，且更稳定（没有体现在上面的数据里），为什么？sgd模型的参数绝对值会越来越大，adam似乎没这个问题
3. 对于两个隐藏层的模型，sgd 比 adam 表现更好
4. 两个隐藏层的模型比一个隐藏层的模型效果差。为什么？
5. 为什么增大模型规模能提升模型效果和加快学习速度？看到过一个理论，说模型中大部分参数是没用的，只有部分参数起决定性作用，因为这部分参数的初始值很好。大的模型有更多的参数，有更大机会命中较好的初始参数。

之后调整测试 policy gradient method 采用超参数：optimalizer, n_hidden_layers, n_dense_units, ratio_dropout = 'adam', 1, 512, 0.5


观察了两个强化学习和监督学习模型，各自的得分是0.89和0.9，差距很小，最优解的命中率却是0.49和0.77。考虑：
1. 用 baseline 方法重新训练策略模型试试。
2. 训练过程中逐步降温。


### 降温

理论上是错误做法，实际效果也不好。


### 奖励 normalization

考虑以下行为
* 修改初始化时格子中随机数的取值区间
* 在获得的奖励上减去常数
* 在获得的奖励上乘以常数

理论上它们不应该影响策略，实际中它们对模型表现有重大影响。

所以，是否存在某个理想的适于模型学习的 G_t 的分布？

尝试在一个batch内对奖励作 normalization，效果较好。


* 有没有比 normalization 更好的办法？
* 用一个batch内的奖励的平均值和标准差做 normalization 合适吗？如果 batch_size 很小，感觉似乎不合适
* 用test_model返回值做normalization？实际效果不如在batch内做normalization

### baseline

以最优得分为baseline，在奖励上减去 baseline 再做 normalization。对于初始化时随机数区间较大的问题，表现有较大提升，且较为稳定。


### win_or_lose

命中最优解奖励1，否则-1。

加上归一化后，命中最优解的几率达到0.7左右，不过平均每局得分不见优势。

## 最大问题

不收敛。模型表现一开始变好，然后越来越差。为什么？假如说我现在正处于不断变差的阶段，继续训练，表现为什么不会提升？奖励信号为什么失效了？

尝试对每个state进行多次action抽样，无效。生成了一组样本，反复训练，在同组样本上测试，模型表现仍然会变差。

on-polity 测试模型表现？表现同样会逐渐变差

## 学习 gemini 

gemini 的训练过程很稳定，似乎没有越练越差。

## tf.GradientTape vs model.fit

看来 model.fit 和 tf.GradientTape 的差异是关键所在。网上搜了一下，似乎挺多人遇到过同样的问题，但有一些可能是代码写错了导致的。

model.fit 默认 batch_size=32，奇特的是，不手动指定batch_size 和指定 batch_size=32，模型的训练表现大不相同。

使用model.fit
* 不指定batch_size时模型在训练过程中非常不稳定，越训练越差。
* 指定batch_size=32，模型在训练过程中较为稳定，在较长的训练之后似乎有变差的倾向
* 指定batch_size=128，比设为32更加稳定，在较长的训练之后表现出变差的倾向

使用 tf.GradientTape，一次使用128个样本，比model.fit(batch_size=128)表现更稳定，训练速度稍慢，但后期表现更好，也没有显出变差的倾向



## gemini给的代码

出于效率考虑 evaluate_policy 及训练中调用model计算policy几率的部分要改一下

```python
import tensorflow as tf
from tensorflow.keras.layers import Dense, BatchNormalization, ReLU
from tensorflow.keras.models import Sequential
import numpy as np
import matplotlib.pyplot as plt

# Game parameters
num_cells = 8
num_reward_cells = 3
reward_range = (5.0, 10.0)

# Neural network parameters
hidden_layer_size = 64
batch_size = 128
num_batches = 200
learning_rate = 0.001

# Create the policy network
model = Sequential([
    Dense(hidden_layer_size, input_shape=(num_cells,), kernel_initializer='he_normal'),
    BatchNormalization(),
    ReLU(),
    Dense(num_cells, activation='softmax')
])

optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)

# Function to generate a game
def generate_game():
    game_state = np.zeros(num_cells)
    reward_indices = np.random.choice(num_cells, num_reward_cells, replace=False)
    game_state[reward_indices] = np.random.uniform(reward_range[0], reward_range[1], num_reward_cells)
    return game_state

# Function to calculate reward
def calculate_reward(game_state, action):
    if game_state[action] > 0:
        return game_state[action]
    else:
        left_index = (action - 1) % num_cells
        right_index = (action + 1) % num_cells
        if game_state[left_index] > 0 and game_state[right_index] > 0:
            return game_state[left_index] + game_state[right_index]
        else:
            return 0

# Function to evaluate the policy
def evaluate_policy(model, num_episodes=10000):
    total_reward = 0
    for _ in range(num_episodes):
        game_state = generate_game()
        probs = model(np.expand_dims(game_state, axis=0)).numpy()[0]
        action = np.random.choice(num_cells, p=probs)
        total_reward += calculate_reward(game_state, action)
    return total_reward / num_episodes

# Training loop
rewards_history = []
for batch in range(num_batches):
    states = []
    actions = []
    rewards = []

    for _ in range(batch_size):
        game_state = generate_game()
        states.append(game_state)
        probs = model(np.expand_dims(game_state, axis=0)).numpy()[0]
        action = np.random.choice(num_cells, p=probs)
        actions.append(action)
        reward = calculate_reward(game_state, action)
        rewards.append(reward)

    # Normalize rewards
    rewards = (rewards - np.mean(rewards)) / (np.std(rewards) + 1e-8)

    with tf.GradientTape() as tape:
        probs = model(np.array(states))
        action_probs = tf.gather_nd(probs, [[i, a] for i, a in enumerate(actions)])
        log_probs = tf.math.log(action_probs)
        loss = -tf.reduce_mean(log_probs * rewards)

    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))

    # Evaluate the policy
    average_reward = evaluate_policy(model)
    rewards_history.append(average_reward)

    if (batch + 1) % 10 == 0:
        print(f"Batch: {batch + 1}/{num_batches}, Average Reward: {average_reward:.3f}")

# Plot the average reward over batches
plt.plot(range(num_batches), rewards_history)
plt.xlabel("Batch")
plt.ylabel("Average Reward")
plt.title("Policy Gradient Training")
plt.show()
```
## 修改后的
```python
import tensorflow as tf
from tensorflow.keras.layers import Dense, BatchNormalization, ReLU, Dropout
from tensorflow.keras.models import Sequential
import numpy as np
import matplotlib.pyplot as plt

# Game parameters
num_cells = 8
num_reward_cells = 3
reward_range = (5.0, 10.0)

# Neural network parameters
hidden_layer_size = 512
batch_size = 128
num_batches = 10000
learning_rate = 0.001

# Create the policy network
model = Sequential([
    Dense(hidden_layer_size, input_shape=(num_cells,), kernel_initializer='he_normal'),
    BatchNormalization(),
    ReLU(),
    Dropout(0.5),
    Dense(num_cells, activation='softmax')
])

optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)

# Function to generate a game
def generate_game():
    game_state = np.zeros(num_cells)
    reward_indices = np.random.choice(num_cells, num_reward_cells, replace=False)
    game_state[reward_indices] = np.random.uniform(reward_range[0], reward_range[1], num_reward_cells)
    return game_state

# Function to calculate reward
def calculate_reward(game_state, action):
    if game_state[action] > 0:
        return game_state[action]
    else:
        left_index = (action - 1) % num_cells
        right_index = (action + 1) % num_cells
        if game_state[left_index] > 0 and game_state[right_index] > 0:
            return game_state[left_index] + game_state[right_index]
        else:
            return 0

# # Function to evaluate the policy
# def evaluate_policy(model, num_episodes=10000):
#     total_reward = 0
#     for _ in range(num_episodes):
#         game_state = generate_game()
#         probs = model(np.expand_dims(game_state, axis=0)).numpy()[0]
#         action = np.random.choice(num_cells, p=probs)
#         total_reward += calculate_reward(game_state, action)
#     return total_reward / num_episodes


# Function to evaluate the policy
def evaluate_policy(model, num_episodes=10000):
    total_reward = 0
    states = np.array([generate_game() for i in np.arange(num_episodes)])
    actions = model(states).numpy().argmax(axis = 1)
    for game_state, action in zip(states, actions):
        total_reward += calculate_reward(game_state, action)
    return total_reward / num_episodes


# Training loop
rewards_history = []
for batch in range(num_batches):
    states = np.array([generate_game() for i in np.arange(batch_size)])
    actions = [np.random.choice(num_cells, p=probs) for probs in model(states).numpy()]
    rewards = [calculate_reward(game_state, action) for game_state, action in zip(states, actions)]

    # Normalize rewards
    rewards = (rewards - np.mean(rewards)) / (np.std(rewards) + 1e-8)

    with tf.GradientTape() as tape:
        probs = model(np.array(states))
        action_probs = tf.gather_nd(probs, [[i, a] for i, a in enumerate(actions)])
        log_probs = tf.math.log(action_probs)
        loss = -tf.reduce_mean(log_probs * rewards)

    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))

    # Evaluate the policy
    average_reward = evaluate_policy(model)
    rewards_history.append(average_reward)

    if (batch + 1) % 10 == 0:
        print(f"Batch: {batch + 1}/{num_batches}, Average Reward: {average_reward:.3f}")

# Plot the average reward over batches
plt.plot(range(num_batches), rewards_history)
plt.xlabel("Batch")
plt.ylabel("Average Reward")
plt.title("Policy Gradient Training")
plt.show()
```