In [1]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import Input, Dense, BatchNormalization, ReLU, Dropout
from tensorflow.keras.models import Model, clone_model
from itertools import product

def make_state():
    """Build a random 8-slot state in which three distinct slots are assigned.

    Three positions, chosen without replacement, receive independent draws
    from ``np.random.uniform`` (default range [0, 1)); the remaining five
    slots stay at zero.

    Returns:
        np.ndarray: float array of shape (8,).
    """
    board = np.zeros(8)
    # Draw order matters for seeded reproducibility: values first, then the
    # positions they are written to.
    fill_values = np.random.uniform(size=3)
    fill_slots = np.random.choice(8, size=3, replace=False)
    board[fill_slots] = fill_values
    return board

def run(state, action):
    """Return the reward for picking slot ``action`` in ``state``.

    A non-zero slot pays its own value. An empty slot pays the sum of its two
    ring neighbours (indices wrap modulo 8), but only when both neighbours
    are non-zero; otherwise the reward is 0.
    """
    own = state[action]
    if own != 0:
        return own

    left = state[(action - 1) % 8]
    right = state[(action + 1) % 8]
    if left == 0 or right == 0:
        # At least one neighbour is empty, so the empty slot pays nothing.
        return 0
    return left + right

def get_optimal_action_and_value(state):
    """Find the highest-paying action for ``state`` by trying all eight.

    Returns:
        tuple: ``(best_action, best_value)``. Ties resolve to the lowest
        action index because ``argmax`` returns the first maximum.
    """
    payoffs = np.array([run(state, slot) for slot in np.arange(8)])
    best = payoffs.argmax()
    return best, payoffs[best]

def get_optimal_actions_and_values(state_list):
    """Solve every state in ``state_list`` for its best action and payoff.

    Returns:
        tuple: two arrays aligned with ``state_list`` -- the best action
        index for each state, and the reward that action earns.
    """
    best_actions, best_values = [], []
    for state in state_list:
        action, value = get_optimal_action_and_value(state)
        best_actions.append(action)
        best_values.append(value)
    return np.array(best_actions), np.array(best_values)
    
def get_optimal_statics(n_rounds):
    """Estimate the reward of always playing the best action.

    Samples ``n_rounds`` fresh states and returns ``(mean, std)`` of the
    best achievable reward across them.
    """
    sampled_states = np.array([make_state() for _ in np.arange(n_rounds)])
    best_values = get_optimal_actions_and_values(sampled_states)[1]
    return np.mean(best_values), np.std(best_values)

def get_random_policy_statics(n_rounds):
    """Estimate the reward of a uniformly random policy.

    For each of ``n_rounds`` rounds a fresh state is sampled and a uniformly
    random action in ``0..7`` is played. Returns ``(mean, std)`` of the
    rewards.
    """
    rewards = []
    for _ in np.arange(n_rounds):
        # State is sampled before the action so the RNG stream order is fixed.
        state = make_state()
        action = np.random.choice(8)
        rewards.append(run(state, action))
    return np.mean(rewards), np.std(rewards)

def get_model_actions(model, state_list):
    """Return the greedy (highest-scoring) action for each state.

    ``model`` is called directly on the stacked batch, its output is
    converted with ``.numpy()``, and the arg-max is taken along axis 1.
    """
    batch = np.array(state_list)
    scores = model(batch).numpy()
    return scores.argmax(axis=1)

def test_model(model, n_test_rounds):
    """Evaluate the model's greedy policy on freshly sampled states.

    Returns:
        tuple: ``(mean_reward, std_reward, accuracy)`` where ``accuracy`` is
        the fraction of states on which the greedy action equals the optimal
        action.
    """
    states = np.array([make_state() for _ in np.arange(n_test_rounds)])
    chosen = get_model_actions(model, states)
    best, _ = get_optimal_actions_and_values(states)

    rewards = [run(state, action) for state, action in zip(states, chosen)]
    hit_rate = np.mean(chosen == best)
    return np.mean(rewards), np.std(rewards), hit_rate

def one_batch(model, batch_size, n_test_rounds=0):
    """Run one reward-weighted update on freshly sampled states.

    Actions are sampled from the model's output distribution and rewards are
    standardised across the batch. Each row's target vector holds that
    standardised reward at the sampled action and zeros elsewhere. The model
    is then fitted on those targets with whatever loss it was compiled with.

    Args:
        model: callable returning, for a (batch, 8) array of states, an
            object whose ``.numpy()`` gives one length-8 probability vector
            per state; must also provide ``fit``.
        batch_size: number of states to sample for this update.
        n_test_rounds: when > 0, evaluate with ``test_model`` afterwards.

    Returns:
        The result of ``test_model(model, n_test_rounds)`` when
        ``n_test_rounds > 0``, otherwise ``None``.
    """
    state_list = np.array([make_state() for _ in np.arange(batch_size)])
    prob_list = model(state_list).numpy()
    actions = np.array([np.random.choice(8, p=prob) for prob in prob_list])
    values = np.array([run(state, action) for state, action in zip(state_list, actions)])

    advantages = values - np.mean(values)
    std = np.std(values)
    # If every sampled reward is identical, std is 0 and dividing would fill
    # the targets with NaN, which corrupts the weights on fit. The centred
    # rewards are already all zero in that case, so leave them unscaled.
    if std > 0:
        advantages = advantages / std

    y_target_list = np.zeros((batch_size, 8))
    for row, action in enumerate(actions):
        y_target_list[row, action] = advantages[row]

    model.fit(state_list, y_target_list, verbose=0)
    if n_test_rounds > 0:
        return test_model(model, n_test_rounds)

def one_batch_supervised(model, batch_size, n_test_rounds=0):
    """Fit the model on one batch of one-hot optimal-action labels.

    Fresh states are sampled, each is labelled with its optimal action as a
    one-hot length-8 vector, and ``model.fit`` is called once on the batch.

    Returns:
        The result of ``test_model(model, n_test_rounds)`` when
        ``n_test_rounds > 0``, otherwise ``None``.
    """
    states = np.array([make_state() for _ in np.arange(batch_size)])
    best_actions, _ = get_optimal_actions_and_values(states)

    one_hot = np.zeros((batch_size, 8))
    for row, action in enumerate(best_actions):
        one_hot[row, action] = 1

    model.fit(np.array(states), np.array(one_hot), verbose=0)
    if not n_test_rounds > 0:
        return None
    return test_model(model, n_test_rounds)

def train(model, max_batch=200, batch_size=128, n_test_rounds=10000, verbose=0):
    """Train with ``one_batch`` and remember the best-scoring weights.

    After every batch the model is evaluated through ``one_batch``, which
    returns ``(mean_reward, std_reward, accuracy)`` from ``test_model``. The
    weights with the highest mean reward seen so far are kept.

    Args:
        model: model accepted by ``one_batch``.
        max_batch: number of training batches to run.
        batch_size: states sampled per batch.
        n_test_rounds: evaluation states per batch; when <= 0 no evaluation
            happens, so no best weights are recorded.
        verbose: when 1, print ``index, score, std, accuracy`` per batch.

    Returns:
        tuple: ``(best_idx, best_score, best_weights)``; ``best_weights`` is
        an empty list if no batch ever scored above 0.
    """
    best_weights = []
    best_idx = 0
    best_score = 0
    for i in np.arange(max_batch):
        result = one_batch(model, batch_size, n_test_rounds)
        if result is None:
            # one_batch skipped evaluation (n_test_rounds <= 0); the batch
            # was still trained, there is just nothing to score.
            continue
        # one_batch forwards test_model's three-tuple, not four values.
        score, std, accuracy = result
        if best_score < score:
            best_score = score
            best_idx = i
            best_weights = model.get_weights()
        if verbose == 1:
            print(i, score, std, accuracy)
    return best_idx, best_score, best_weights

def train_supervised(model, max_batch=200, batch_size=128, n_test_rounds=10000, verbose = 0):
    """Train with ``one_batch_supervised`` and keep the best-scoring weights.

    Each batch is followed by an evaluation; the weights giving the highest
    mean reward so far are remembered.

    Returns:
        tuple: ``(best_idx, best_score, best_weights)``; ``best_weights`` is
        an empty list if no batch ever scored above 0.
    """
    top_weights, top_step, top_score = [], 0, 0
    for step in np.arange(max_batch):
        # Only the mean reward is used; std and accuracy are discarded.
        score, _std, _accuracy = one_batch_supervised(model, batch_size, n_test_rounds)
        if top_score < score:
            top_score, top_step = score, step
            top_weights = model.get_weights()
        if verbose == 1:
            print(step, score)
    return top_step, top_score, top_weights


def create_model(n_hidden_layers, n_dense_units, ratio_dropout, optimizer):
    """Build and compile a fully connected softmax classifier over 8 actions.

    The network takes an 8-element input, applies ``n_hidden_layers`` blocks
    of Dense -> BatchNormalization -> ReLU -> Dropout, and ends in an 8-way
    softmax. It is compiled with categorical cross-entropy.

    Args:
        n_hidden_layers: number of hidden blocks.
        n_dense_units: width of each hidden Dense layer.
        ratio_dropout: rate passed to each Dropout layer.
        optimizer: optimizer passed through to ``Model.compile``.

    Returns:
        The compiled model.
    """
    def hidden_block(tensor):
        # Layers are created and applied one at a time, in this exact order.
        tensor = Dense(n_dense_units)(tensor)
        tensor = BatchNormalization()(tensor)
        tensor = ReLU()(tensor)
        return Dropout(ratio_dropout)(tensor)

    inputs = Input(shape=(8,))

    hidden = inputs
    for _ in np.arange(n_hidden_layers):
        hidden = hidden_block(hidden)

    probabilities = Dense(8, activation='softmax')(hidden)
    network = Model(inputs, probabilities)
    network.compile(optimizer=optimizer, loss='categorical_crossentropy')
    return network


## 试验场

## Evaluate greedy vs. on-policy (sampled) actions

In [2]:
def test_model_on_policy(model, n_test_rounds):
    """Evaluate both the greedy and the sampled (on-policy) behaviour.

    For ``n_test_rounds`` fresh states the model's output is used twice: once
    taking the arg-max action, and once sampling an action from the output
    probabilities.

    Returns:
        tuple: ``(greedy_mean_reward, greedy_accuracy,
        sampled_mean_reward, sampled_accuracy)`` where accuracy is the
        fraction of states on which the chosen action equals the optimal one.
    """
    states = np.array([make_state() for _ in np.arange(n_test_rounds)])
    probs = model(np.array(states)).numpy()

    greedy = probs.argmax(axis=1)
    sampled = np.array([np.random.choice(8, p=row) for row in probs])
    best, _ = get_optimal_actions_and_values(states)

    def rewards_for(actions):
        # Reward earned on each state by the paired action.
        return np.array([run(state, action) for state, action in zip(states, actions)])

    greedy_rewards = rewards_for(greedy)
    sampled_rewards = rewards_for(sampled)
    return (
        np.mean(greedy_rewards),
        np.mean(greedy == best),
        np.mean(sampled_rewards),
        np.mean(sampled == best),
    )

In [3]:
def one_batch_onpolicy(model, batch_size, n_test_rounds=0):
    """Run one reward-weighted update, then evaluate greedy and sampled play.

    Actions are sampled from the model's output distribution and rewards are
    standardised across the batch. Each row's target vector holds that
    standardised reward at the sampled action and zeros elsewhere. The model
    is then fitted on those targets with whatever loss it was compiled with.

    Args:
        model: callable returning, for a (batch, 8) array of states, an
            object whose ``.numpy()`` gives one length-8 probability vector
            per state; must also provide ``fit``.
        batch_size: number of states to sample for this update.
        n_test_rounds: when > 0, evaluate with ``test_model_on_policy``
            afterwards.

    Returns:
        The result of ``test_model_on_policy(model, n_test_rounds)`` when
        ``n_test_rounds > 0``, otherwise ``None``.
    """
    state_list = np.array([make_state() for _ in np.arange(batch_size)])
    prob_list = model(state_list).numpy()
    actions = np.array([np.random.choice(8, p=prob) for prob in prob_list])
    values = np.array([run(state, action) for state, action in zip(state_list, actions)])

    advantages = values - np.mean(values)
    std = np.std(values)
    # If every sampled reward is identical, std is 0 and dividing would fill
    # the targets with NaN, which corrupts the weights on fit. The centred
    # rewards are already all zero in that case, so leave them unscaled.
    if std > 0:
        advantages = advantages / std

    y_target_list = np.zeros((batch_size, 8))
    for row, action in enumerate(actions):
        y_target_list[row, action] = advantages[row]

    model.fit(state_list, y_target_list, verbose=0)
    if n_test_rounds > 0:
        return test_model_on_policy(model, n_test_rounds)

In [4]:
# One hidden block of 512 units with a dropout rate of 0.5, optimised with Adam.
model = create_model(
    n_hidden_layers=1,
    n_dense_units=512,
    ratio_dropout=0.5,
    optimizer='adam',
)

In [5]:
# Train for 200 on-policy batches of 128 states each. After every batch the
# model is evaluated on 10000 fresh states and one line is printed:
# batch index, greedy mean reward, greedy accuracy, sampled mean reward,
# sampled accuracy.
for i in np.arange(200):
    values, accuracy, values_on_policy, accuracy_on_policy = one_batch_onpolicy(model, batch_size=128, n_test_rounds=10000)
    print(i, values, accuracy, values_on_policy, accuracy_on_policy)

0 0.3098974367525685 0.1378 0.2776780321328625 0.1279
1 0.42117728046687 0.213 0.28451701060718243 0.1268
2 0.4932152483466641 0.2671 0.2751537531708597 0.1272
3 0.56084751670181 0.3259 0.2895824399317276 0.1307
4 0.6207547040950259 0.358 0.2825327960402378 0.124
5 0.6387761625442787 0.3537 0.29464959540274727 0.1392
6 0.6318307879510128 0.3416 0.2933396257875227 0.1316
7 0.6046968020614106 0.3303 0.2889653679617586 0.128
8 0.5972844345643653 0.3281 0.2896693636369332 0.1274
9 0.5740356985893008 0.3094 0.28935958369048087 0.1357
10 0.5974169080614558 0.3253 0.3011134589674598 0.1361
11 0.6127966331725198 0.337 0.2906699519483163 0.1323
12 0.6329336850891722 0.3554 0.30001744592839885 0.1316
13 0.6780351817746898 0.3969 0.3027177660722706 0.1347
14 0.7073715869612727 0.4258 0.30401165840860356 0.135
15 0.7113411562239901 0.427 0.308152643468992 0.1463
16 0.7429540700977776 0.4413 0.30888090074297153 0.1414
17 0.7488386109308713 0.457 0.3120838057626316 0.1432
18 0.7517132376294161 0.474

In [6]:
# Run 1200 more on-policy batches on whatever `model` is currently bound in
# the kernel (it is not re-created here), printing the same five columns:
# batch index, greedy mean reward, greedy accuracy, sampled mean reward,
# sampled accuracy.
for i in np.arange(1200):
    values, accuracy, values_on_policy, accuracy_on_policy = one_batch_onpolicy(model, batch_size=128, n_test_rounds=10000)
    print(i, values, accuracy, values_on_policy, accuracy_on_policy)

0 0.7596746994490317 0.5198 0.5336555711091575 0.3227
1 0.761241966911362 0.5214 0.523972488015776 0.3184
2 0.7721907829875001 0.5263 0.515286283925396 0.3077
3 0.7735662648023394 0.5447 0.5170596158511611 0.3193
4 0.7355438846071795 0.4935 0.5212519011002285 0.3124
5 0.7419624688795773 0.4998 0.5190053190302633 0.3211
6 0.7566430856433743 0.5151 0.5086927625974109 0.2991
7 0.7462757579491105 0.5197 0.5017425806027797 0.3075
8 0.7452287729165866 0.5096 0.500008578300694 0.292
9 0.7297574424519787 0.5046 0.4973857951797705 0.2985
10 0.7323799342328607 0.4871 0.5070305822852562 0.3053
11 0.7434147178005929 0.5026 0.5161043912488729 0.3101
12 0.7529530554615645 0.5133 0.5173735933501303 0.3131
13 0.7681134440949118 0.5369 0.5285362334285805 0.3283
14 0.7572488044931192 0.5214 0.522511674478576 0.3184
15 0.7119613966591327 0.4498 0.5216645773931258 0.3041
16 0.7364966309586787 0.4916 0.5209277771602568 0.3085
17 0.7181549222414441 0.4521 0.5230306671061047 0.3063
18 0.7420207480124998 0.48

# 总结

效果同样会变差