In [1]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import Input, Dense, BatchNormalization, ReLU, Dropout
from tensorflow.keras.models import Model, clone_model
from itertools import product

def make_states(n):
    """Sample ``n`` random 8-slot states.

    Every state starts as eight zeros; three distinct, randomly chosen slots
    are then overwritten with independent draws from
    ``np.random.uniform(0.0, 1.0)``.

    Args:
        n: Number of states to generate.

    Returns:
        ``np.ndarray`` of shape ``(n, 8)``.
    """
    board = np.zeros((n, 8))
    for row in board:
        # ``row`` is a view into ``board``, so the assignment writes through.
        # Slots are drawn before values, once per row, which keeps the
        # consumption order of the global NumPy RNG fixed.
        filled = np.random.choice(8, size=3, replace=False)
        row[filled] = np.random.uniform(0.0, 1.0, size=3)
    return board
    
def run(state, action):
    """Return the reward for choosing slot ``action`` in ``state``.

    The state is treated as a ring of slots (slot 0 follows the last slot):

    * a non-zero slot pays out its own value;
    * a zero slot pays the sum of its two ring neighbours, but only when
      both neighbours are non-zero;
    * otherwise the reward is 0.

    Args:
        state: 1-D sequence of slot values; zero marks an empty slot.
        action: Integer index of the chosen slot.

    Returns:
        The reward as described above.
    """
    own = state[action]
    if own != 0:
        return own

    # Wrap around using the actual ring size instead of a hard-coded 8, so
    # the same rule applies to states of any length.
    size = len(state)
    before = state[(action - 1) % size]
    after = state[(action + 1) % size]
    if before != 0 and after != 0:
        return before + after
    return 0

def get_optimal_action_and_value(state):
    """Find the best action for one state by exhaustive evaluation.

    Scores each of the 8 actions with ``run`` and keeps the highest reward;
    ties go to the lowest action index (``argmax`` semantics).

    Args:
        state: A single 8-slot state.

    Returns:
        Tuple ``(optimal_action, optimal_value)``.
    """
    rewards = np.array([run(state, slot) for slot in range(8)])
    best = rewards.argmax()
    return best, rewards[best]

def get_optimal_actions_and_values(state_list):
    """Compute the optimal action and its reward for every state.

    Args:
        state_list: Iterable of states, each accepted by
            ``get_optimal_action_and_value``.

    Returns:
        Tuple ``(actions, values)`` of NumPy arrays, one entry per state.
    """
    actions, values = [], []
    for state in state_list:
        action, value = get_optimal_action_and_value(state)
        actions.append(action)
        values.append(value)
    return np.array(actions), np.array(values)
    
def get_optimal_statics(n_rounds):
    """Estimate the reward statistics of the optimal policy.

    Generates ``n_rounds`` fresh random states and evaluates the optimal
    reward of each.

    Args:
        n_rounds: Number of random states to sample.

    Returns:
        Tuple ``(mean, std)`` of the optimal rewards.
    """
    optimal_values = get_optimal_actions_and_values(make_states(n_rounds))[1]
    return np.mean(optimal_values), np.std(optimal_values)

def get_random_policy_statics(n_rounds):
    """Estimate the reward statistics of a uniformly random policy.

    Args:
        n_rounds: Number of random states to sample; one uniformly random
            action out of 8 is played on each.

    Returns:
        Tuple ``(mean, std)`` of the obtained rewards.
    """
    # All states are generated before any action is drawn, which fixes the
    # order in which the global NumPy RNG is consumed.
    states = make_states(n_rounds)
    rewards = []
    for state in states:
        rewards.append(run(state, np.random.choice(8)))
    return np.mean(rewards), np.std(rewards)

def get_model_actions(model, state_list):
    """Pick, for every state, the action with the highest model output.

    Args:
        model: Callable applied to the stacked states; its result must expose
            ``.numpy()`` returning a 2-D array with one row per state.
        state_list: Sequence of states to evaluate.

    Returns:
        1-D array with the arg-max column index of each row.
    """
    batch = np.array(state_list)
    scores = model(batch).numpy()
    return scores.argmax(axis=1)

def test_model(model, n_test_rounds):
    """Evaluate the greedy policy of ``model`` on fresh random states.

    Args:
        model: Model accepted by ``get_model_actions``.
        n_test_rounds: Number of random states to evaluate on.

    Returns:
        Tuple ``(mean_reward, std_reward, accuracy)`` where ``accuracy`` is
        the fraction of states on which the model's action equals the action
        returned by ``get_optimal_actions_and_values``.
    """
    states = make_states(n_test_rounds)
    chosen = get_model_actions(model, states)

    rewards = []
    for state, action in zip(states, chosen):
        rewards.append(run(state, action))

    best_actions = get_optimal_actions_and_values(states)[0]
    hit_rate = np.mean(chosen == best_actions)
    return np.mean(rewards), np.std(rewards), hit_rate

def one_batch(model, optimizer, batch_size=128, n_test_rounds=10000):
    """Apply one policy-gradient step computed on a freshly sampled batch.

    One action per state is sampled from the model's output distribution.
    The reward of each sampled action, minus the optimal reward for that
    state, is standardised across the batch and used to weight the
    log-probability of the sampled action in the loss.

    Args:
        model: Policy network whose call returns per-action probabilities
            (one row of 8 per state).
        optimizer: Optimizer used to apply the gradients.
        batch_size: Number of random states in the batch.
        n_test_rounds: Unused here; kept in the signature.

    NOTE(review): the model is called without an explicit ``training``
    argument, so Dropout/BatchNormalization presumably run in inference mode
    during this update -- confirm that this is intended.
    """
    batch_states = make_states(batch_size)
    best_values = get_optimal_actions_and_values(batch_states)[1]

    with tf.GradientTape() as tape:
        policy = model(batch_states)

        # Sampling and reward evaluation happen on the NumPy side and are
        # therefore not differentiated.
        sampled = np.array([np.random.choice(8, p=row) for row in policy.numpy()])
        payoff = np.array([run(s, a) for s, a in zip(batch_states, sampled)])

        # Shortfall relative to the optimum, standardised over the batch;
        # the small epsilon guards against a zero standard deviation.
        advantage = payoff - best_values
        advantage = (advantage - np.mean(advantage)) / (np.std(advantage) + 1e-8)

        # Probability the policy assigned to each sampled action.
        picked = tf.gather_nd(policy, [[row, act] for row, act in enumerate(sampled)])
        loss = -tf.reduce_mean(tf.math.log(picked) * advantage)

    grads = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(grads, model.trainable_variables))

def train(model, optimizer, one_batch, max_batch=200, batch_size=32, test_lapse = 10, n_test_rounds=10000, verbose = 0):
    """Train ``model`` batch by batch and keep the best-scoring weights.

    After every ``test_lapse`` batches, and after the final batch, the model
    is evaluated with ``test_model``. Whenever the mean reward exceeds the
    best seen so far, a copy of the weights is stored.

    Args:
        model: Keras model to train (must support ``get_weights``).
        optimizer: Optimizer forwarded to ``one_batch``.
        one_batch: Callable ``one_batch(model, optimizer, batch_size)``
            performing one update step.
        max_batch: Total number of update steps.
        batch_size: Batch size forwarded to ``one_batch``.
        test_lapse: Evaluate every this many batches.
        n_test_rounds: Number of random states used per evaluation.
        verbose: When 1, print ``batch_number score accuracy`` at each
            evaluation.

    Returns:
        Tuple ``(best_idx, best_score, best_weights)``: the zero-based batch
        index of the best evaluation, its mean reward, and the weights at
        that point. If no evaluation scores above 0 (or none is run),
        ``best_idx`` and ``best_score`` stay 0 and ``best_weights`` holds the
        weights the model had when training started.
    """
    # Start from a snapshot of the current weights rather than an empty list,
    # so the result can always be passed to ``set_weights``.
    best_weights = model.get_weights()
    best_idx = 0
    best_score = 0
    for i in np.arange(max_batch):
        one_batch(model, optimizer, batch_size)
        if (i+1) % test_lapse == 0 or i + 1 == max_batch:
            score, _, accuracy = test_model(model, n_test_rounds)
            if best_score < score:
                best_score = score
                best_idx = i
                best_weights = model.get_weights()
            if verbose == 1:
                print(i+1, score, accuracy)
    return best_idx, best_score, best_weights

def create_model(n_hidden_layers, n_dense_units, ratio_dropout):
    """Build the policy network mapping an 8-slot state to action probabilities.

    Every hidden stage is Dense -> BatchNormalization -> ReLU -> Dropout,
    followed by a final 8-way softmax layer.

    Args:
        n_hidden_layers: Number of hidden stages.
        n_dense_units: Width of each hidden Dense layer.
        ratio_dropout: Rate passed to each Dropout layer.

    Returns:
        A Keras ``Model`` with input shape ``(8,)`` and a softmax output of
        size 8.
    """
    state_input = Input(shape=(8,))

    # One factory per layer of a hidden stage. Each layer is constructed and
    # applied before the next one is created.
    stage = (
        lambda: Dense(n_dense_units),
        BatchNormalization,
        ReLU,
        lambda: Dropout(ratio_dropout),
    )

    hidden = state_input
    for _ in np.arange(n_hidden_layers):
        for make_layer in stage:
            hidden = make_layer()(hidden)

    return Model(state_input, Dense(8, activation='softmax')(hidden))




# Playground / experiments (实验场)

In [24]:
def create_model_policy_value(n_hidden_layers, n_dense_units, ratio_dropout):
    """Build a two-headed network with a shared trunk.

    The trunk consists of ``n_hidden_layers`` stages of
    Dense -> BatchNormalization -> ReLU -> Dropout. On top of it sit two
    heads: an 8-way softmax (action probabilities) and a single linear unit
    (value).

    Args:
        n_hidden_layers: Number of hidden stages in the trunk.
        n_dense_units: Width of each hidden Dense layer.
        ratio_dropout: Rate passed to each Dropout layer.

    Returns:
        A Keras ``Model`` with input shape ``(8,)`` whose outputs are
        ``[probs, values]``.
    """

    def hidden_stage(tensor):
        # Layers are created and applied one after another, in this order.
        tensor = Dense(n_dense_units)(tensor)
        tensor = BatchNormalization()(tensor)
        tensor = ReLU()(tensor)
        return Dropout(ratio_dropout)(tensor)

    state_input = Input(shape=(8,))

    trunk = state_input
    for _ in np.arange(n_hidden_layers):
        trunk = hidden_stage(trunk)

    policy_head = Dense(8, activation='softmax')(trunk)
    value_head = Dense(1)(trunk)
    return Model(state_input, [policy_head, value_head])

def one_batch(model, optimizer, batch_size=128, n_test_rounds=10000):
    """Run one policy-gradient update on a freshly sampled batch of states.

    Works with both network flavours built in this notebook: a policy-only
    model whose call returns the action probabilities, and a policy/value
    model whose call returns ``[probs, values]``. Only the policy output
    enters the loss; a value output, if present, is ignored and its weights
    are left untouched.

    One action per state is sampled from the policy. The reward of each
    sampled action, minus the optimal reward for that state, is standardised
    across the batch and used to weight the log-probability of the sampled
    action.

    Args:
        model: Keras model as described above.
        optimizer: Optimizer used to apply the gradients.
        batch_size: Number of random states in the batch.
        n_test_rounds: Unused here; kept in the signature.

    NOTE(review): the model is called without an explicit ``training``
    argument, so Dropout/BatchNormalization presumably run in inference mode
    during this update -- confirm that this is intended.
    """
    states = make_states(batch_size)
    _, optimal_values = get_optimal_actions_and_values(states)
    with tf.GradientTape() as tape:
        outputs = model(states)
        # A multi-output model returns a list/tuple with the policy first;
        # a single-output model returns the probabilities directly.
        probs = outputs[0] if isinstance(outputs, (list, tuple)) else outputs

        actions = np.array([np.random.choice(8, p=prob) for prob in probs.numpy()])
        rewards = np.array([run(state, action) for state, action in zip(states, actions)])
        rewards = rewards - optimal_values
        # Standardise over the batch; epsilon guards against zero variance.
        rewards = (rewards - np.mean(rewards)) / (np.std(rewards) + 1e-8)

        action_probs = tf.gather_nd(probs, [[i, a] for i, a in enumerate(actions)])
        log_probs = tf.math.log(action_probs)
        loss = -tf.reduce_mean(log_probs * rewards)

    variables = model.trainable_variables
    gradients = tape.gradient(loss, variables)
    # Variables that do not feed the loss (e.g. an unused value head) get a
    # ``None`` gradient; skip them instead of handing ``None`` to the
    # optimizer.
    grads_and_vars = [(g, v) for g, v in zip(gradients, variables) if g is not None]
    optimizer.apply_gradients(grads_and_vars)

In [25]:
# Instantiate the two-headed (policy + value) network for the experiments below.
model = create_model_policy_value(
    n_hidden_layers=2,
    n_dense_units=256,
    ratio_dropout=0.2,
)

In [26]:
# Draw 10 random states to probe the model with.
states = make_states(10)

In [30]:
# Index 1 selects the second model output (the value head of the policy/value
# model); display its predictions for the sampled states as a NumPy array.
model(states)[1].numpy()

array([[ 0.03974233],
       [ 0.04402713],
       [ 0.0739365 ],
       [-0.00650854],
       [ 0.02607722],
       [ 0.03477475],
       [ 0.03529577],
       [-0.00736863],
       [ 0.05722967],
       [-0.02273199]], dtype=float32)

In [6]:
# Policy-only network and the Adam optimizer used for the training run.
model = create_model(
    n_hidden_layers=2,
    n_dense_units=256,
    ratio_dropout=0.2,
)
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)

In [20]:
# Train for 1000 batches of 128 states, evaluating on 10000 fresh states
# every 20 batches and printing each evaluation.
best_idx, best_score, best_weights = train(
    model,
    optimizer,
    one_batch,
    max_batch=1000,
    batch_size=128,
    test_lapse=20,
    n_test_rounds=10000,
    verbose=1,
)

20 0.8960870149877559 0.8304
40 0.9076194765525001 0.8413
60 0.9029266044766424 0.8532
80 0.9038879968016187 0.853
100 0.9020123545903204 0.8316
120 0.9071614226348869 0.8422
140 0.9073540566056941 0.8305
160 0.9041681096216482 0.8346
180 0.9061435086673835 0.8217
200 0.9095106784353649 0.8369
220 0.9025841494865614 0.8319
240 0.905591764061468 0.8318
260 0.9095371271485961 0.8436
280 0.8983308780198241 0.8244
300 0.9046434174915423 0.8332
320 0.899258424497148 0.8412
340 0.9092407083110465 0.836
360 0.9180629451771088 0.8427
380 0.9133893399209071 0.8342
400 0.9062845166730105 0.8369
420 0.8953698476621019 0.824
440 0.9048430723551579 0.8366
460 0.9104745820641178 0.8477
480 0.9102073951387556 0.8491
500 0.9041883633057569 0.8392
520 0.9059591794708518 0.8281
540 0.909668978266422 0.84
560 0.9045257576769312 0.8425
580 0.9080147208624019 0.8435
600 0.9122698918135024 0.8448
620 0.9024951778338013 0.8391
640 0.9089059605319711 0.8186
660 0.9048960805727633 0.8277
680 0.9066121751905685

In [22]:
# Evaluate the model as it stands after training: (mean reward, std, accuracy).
test_model(model, n_test_rounds=10000)

(np.float64(0.9155448423118432),
 np.float64(0.3680161888532063),
 np.float64(0.8786))

In [21]:
# Rebuild the same architecture, load the best weights recorded during
# training, and evaluate that snapshot: (mean reward, std, accuracy).
best_model = clone_model(model)
best_model.set_weights(best_weights)
test_model(best_model, n_test_rounds=10000)

(np.float64(0.9087939350877746),
 np.float64(0.3691985078241972),
 np.float64(0.8767))