In [10]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import Input, Dense, BatchNormalization, ReLU, Dropout
from tensorflow.keras.models import Model, clone_model
from itertools import product

UNIFORM_LOW, UNIFORM_HIGH = 0.0, 1.0

def make_state():
    state = np.zeros(8)
    state[np.random.choice(8, size=3, replace=False)] = np.random.uniform(UNIFORM_LOW, UNIFORM_HIGH,size=3)
    return state

def run(state, action):
    state_cyclic = np.concatenate((state, state[0:2]))
    if state_cyclic[action] != 0:
        return state_cyclic[action]
    elif state_cyclic[action-1] != 0 and state_cyclic[action+1] != 0:
        return state_cyclic[action-1] + state_cyclic[action+1]
    else:
        return 0

def get_optimal_value(state):
    value = state.max()

    state_cyclic = np.concatenate((state, state[0:2]))
    for i in np.arange(1, 9):
        if state_cyclic[i] == 0 and state_cyclic[i-1] != 0 and state_cyclic[i+1] != 0:
            value2 = state_cyclic[i-1] + state_cyclic[i+1]
            if value2 > value:
                value = value2
    return value

def get_optimal_statics(n_rounds):
    values = np.array([get_optimal_value(make_state()) for i in np.arange(n_rounds)])
    return np.mean(values), np.std(values)

def get_baseline_value(n_rounds):
    values = [run(make_state(), np.random.choice(8)) for i in np.arange(n_rounds)]
    return np.mean(values)
    

def get_optimal_action(state):
    action = state.argmax()
    value = state[action]
    state_cyclic = np.concatenate((state, state[0:2]))
    for i in np.arange(1, 9):
        if state_cyclic[i] == 0 and state_cyclic[i-1] != 0 and state_cyclic[i+1] != 0:
            value2 = state_cyclic[i-1] + state_cyclic[i+1]
            if value2 > value:
                value = value2
                action = i % 8
    return action

def get_optimal_actions(state_list):
    return np.array([get_optimal_action(state) for state in state_list])
    

def get_model_actions(model, state_list):
    return model(np.array(state_list)).numpy().argmax(axis = 1)

def test_model(model, n_test_rounds):
    state_list = np.array([make_state() for i in np.arange(n_test_rounds)])
    actions = get_model_actions(model, state_list)
    values = [run(state, action) for state, action in zip(state_list, actions)]
    optimal_values = np.array([get_optimal_value(state) for state in state_list])
    accuracy = np.sum(np.abs(optimal_values - values) < 1e-6)/n_test_rounds
    return np.mean(values), np.std(values), accuracy

def train(model, max_batch=200, batch_size=128, n_test_rounds=10000, verbose = 0):
    best_weights = []
    best_idx = 0
    best_score = 0
    for i in np.arange(max_batch):
        score, std, accuracy = one_batch(model, batch_size, n_test_rounds)
        if best_score < score:
            best_score = score
            best_idx = i
            best_weights = model.get_weights()
        if verbose == 1:
            print(i, score)
    return best_idx, best_score, best_weights

def create_model(n_hidden_layers, n_dense_units, ratio_dropout, optimizer):
    input_shape = (8,) 
    inputs = Input(shape=input_shape)

    x = inputs
    for i in np.arange(n_hidden_layers):
        x = Dense(n_dense_units)(x) 
        x = BatchNormalization()(x)
        x = ReLU()(x)
        x = Dropout(ratio_dropout)(x)
    
    outputs = Dense(8, activation='softmax')(x)
    model = Model(inputs, outputs)
    model.compile(optimizer=optimizer, 
                  loss='categorical_crossentropy')
    return model


## 试验场

## 不同奖励随机数区间

In [85]:
def one_batch(model, batch_size, n_test_rounds=0):
    y_target_list = np.zeros((batch_size, 8))
    state_list = np.array([make_state() for i in np.arange(batch_size)])
    prob_list = model(np.array(state_list)).numpy()
    actions = np.array([np.random.choice(8, p=prob) for prob in prob_list])
    values = np.array([run(state, action) for state, action in zip(state_list, actions)])
    mean, std = np.mean(values), np.std(values)
    for i in np.arange(batch_size):
        y_target_list[i, actions[i]] = values[i]
    model.fit(state_list, y_target_list, verbose = 0)
    if n_test_rounds > 0:
        return test_model(model, n_test_rounds)


In [86]:
REWARD_OFFSET, REWARD_SCALE = 0, 1
model = create_model(n_hidden_layers=1, n_dense_units=512, ratio_dropout=0.5, optimizer='adam')

In [87]:
UNIFORM_LOW, UNIFORM_HIGH = 0.0, 1.0
best_idx, best_score, best_weights = train(model, 200, verbose = 0)
best_model = clone_model(model)
best_model.set_weights(best_weights)
best_idx, *test_model(best_model, 10000)

(np.int64(75),
 np.float64(0.8915266636809518),
 np.float64(0.4496365138743235),
 np.float64(0.4589))

In [88]:
model = create_model(n_hidden_layers=1, n_dense_units=512, ratio_dropout=0.5, optimizer='adam')
UNIFORM_LOW, UNIFORM_HIGH = 1.0, 2.0
best_idx, best_score, best_weights = train(model, 200, verbose = 0)
best_model = clone_model(model)
best_model.set_weights(best_weights)
best_idx, *test_model(best_model, 10000)

(np.int64(50),
 np.float64(2.1845612182347165),
 np.float64(0.8181453849324378),
 np.float64(0.3295))

In [89]:
model = create_model(n_hidden_layers=1, n_dense_units=512, ratio_dropout=0.5, optimizer='adam')
UNIFORM_LOW, UNIFORM_HIGH = 5.0, 10.0
best_idx, best_score, best_weights = train(model, 200, verbose = 0)
best_model = clone_model(model)
best_model.set_weights(best_weights)
best_idx, *test_model(best_model, 10000)

(np.int64(55),
 np.float64(7.661252113309711),
 np.float64(4.264092118060707),
 np.float64(0.2442))

## 奖励归一化

In [71]:
def one_batch(model, batch_size, n_test_rounds=0):
    y_target_list = np.zeros((batch_size, 8))
    state_list = np.array([make_state() for i in np.arange(batch_size)])
    prob_list = model(np.array(state_list)).numpy()
    actions = np.array([np.random.choice(8, p=prob) for prob in prob_list])
    values = np.array([run(state, action) for state, action in zip(state_list, actions)])
    mean, std = np.mean(values), np.std(values)
    values = (values - mean) / std
    for i in np.arange(batch_size):
        y_target_list[i, actions[i]] = values[i]
    model.fit(state_list, y_target_list, verbose = 0)
    if n_test_rounds > 0:
        return test_model(model, n_test_rounds)


In [72]:
UNIFORM_LOW, UNIFORM_HIGH = 0.0, 1.0

In [73]:
model = create_model(n_hidden_layers=1, n_dense_units=512, ratio_dropout=0.5, optimizer='adam')

In [74]:
best_idx, best_score, best_weights = train(model, 200, verbose = 1)
best_model = clone_model(model)
best_model.set_weights(best_weights)
best_idx, *test_model(best_model, 10000)

0 0.48450028870495615
1 0.567199260589272
2 0.649784180656774
3 0.6995362880177849
4 0.7143633744852105
5 0.7396556508091479
6 0.7658560720357701
7 0.7766654873607491
8 0.7745581523378092
9 0.7726639541447
10 0.777553533052179
11 0.7623883616783955
12 0.7595480762735588
13 0.7693552628589417
14 0.7685875796718504
15 0.8048357282045357
16 0.7971057692158275
17 0.8024816325197645
18 0.7918795924860157
19 0.7764481658791296
20 0.7908871671370273
21 0.7718546038373605
22 0.7575453774001877
23 0.7453947360624991
24 0.7306656740756865
25 0.713326680075315
26 0.7142487416314653
27 0.7348942441620832
28 0.7635444483806129
29 0.7919744525909784
30 0.8193827415827855
31 0.8168599539448765
32 0.8165965765409152
33 0.8321362600663319
34 0.8569491116862826
35 0.8531276780503234
36 0.8518449061079763
37 0.8570951380555298
38 0.8561199138925832
39 0.8490418487551975
40 0.8322851151952738
41 0.838230074920663
42 0.8430196474609253
43 0.8614930343027548
44 0.8581553552216319
45 0.8512203732409495
46 0.

(np.int64(86),
 np.float64(0.8976678007662203),
 np.float64(0.4564339516322845),
 np.float64(0.5143))

In [75]:
UNIFORM_LOW, UNIFORM_HIGH = 5.0, 10.0

model = create_model(n_hidden_layers=1, n_dense_units=512, ratio_dropout=0.5, optimizer='adam')

best_idx, best_score, best_weights = train(model, 200, verbose = 0)
best_model = clone_model(model)
best_model.set_weights(best_weights)
best_idx, *test_model(best_model, 10000)

(np.int64(63),
 np.float64(12.283681408652946),
 np.float64(4.528869290742579),
 np.float64(0.4593))

In [80]:
get_optimal_statics(10000)

(np.float64(12.413667127536385), np.float64(3.621633191384911))

## 奖励归一化-不减去平均值

In [76]:
def one_batch(model, batch_size, n_test_rounds=0):
    y_target_list = np.zeros((batch_size, 8))
    state_list = np.array([make_state() for i in np.arange(batch_size)])
    prob_list = model(np.array(state_list)).numpy()
    actions = np.array([np.random.choice(8, p=prob) for prob in prob_list])
    values = np.array([run(state, action) for state, action in zip(state_list, actions)])
    mean, std = np.mean(values), np.std(values)
    values = values / std
    for i in np.arange(batch_size):
        y_target_list[i, actions[i]] = values[i]
    model.fit(state_list, y_target_list, verbose = 0)
    if n_test_rounds > 0:
        return test_model(model, n_test_rounds)


In [77]:
UNIFORM_LOW, UNIFORM_HIGH = 5.0, 10.0

In [78]:
model = create_model(n_hidden_layers=1, n_dense_units=512, ratio_dropout=0.5, optimizer='adam')

In [79]:
best_idx, best_score, best_weights = train(model, 200, verbose = 0)
best_model = clone_model(model)
best_model.set_weights(best_weights)
best_idx, *test_model(best_model, 10000)

(np.int64(25),
 np.float64(9.92929386952984),
 np.float64(5.078171729283151),
 np.float64(0.2092))

## 奖励归一化-不除以标准差

In [81]:
def one_batch(model, batch_size, n_test_rounds=0):
    y_target_list = np.zeros((batch_size, 8))
    state_list = np.array([make_state() for i in np.arange(batch_size)])
    prob_list = model(np.array(state_list)).numpy()
    actions = np.array([np.random.choice(8, p=prob) for prob in prob_list])
    values = np.array([run(state, action) for state, action in zip(state_list, actions)])
    mean, std = np.mean(values), np.std(values)
    values = values - mean
    for i in np.arange(batch_size):
        y_target_list[i, actions[i]] = values[i]
    model.fit(state_list, y_target_list, verbose = 0)
    if n_test_rounds > 0:
        return test_model(model, n_test_rounds)


In [82]:
UNIFORM_LOW, UNIFORM_HIGH = 5.0, 10.0

In [83]:
model = create_model(n_hidden_layers=1, n_dense_units=512, ratio_dropout=0.5, optimizer='adam')

In [84]:
best_idx, best_score, best_weights = train(model, 200, verbose = 0)
best_model = clone_model(model)
best_model.set_weights(best_weights)
best_idx, *test_model(best_model, 10000)

(np.int64(150),
 np.float64(11.112288744503003),
 np.float64(5.462234826955575),
 np.float64(0.3508))

# 用test_model 返回值做normalization

# 结论

以下行为理论上不会影响模型表现，但实际上会

* 修改初始化时格子中随机数的取值区间
* 在获得的奖励上减去常数
* 在获得的奖励上乘以常数

所以，是否存在某个理想的适于模型学习的 G_t 的分布？似乎对奖励作 normalization 效果较好。

怎么结合 normalization 和 baseline 呢？

In [57]:
tmp = np.array([[1,2,3],[4,5,6]])

In [60]:
tmp[[0,1], [1,2]] = [9,9]

In [61]:
tmp

array([[1, 9, 9],
       [4, 9, 9]])

In [67]:
1, *(2,3,4)

(1, 2, 3, 4)