In [11]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import Input, Dense, BatchNormalization, ReLU, Dropout
from tensorflow.keras.models import Model, clone_model
from itertools import product

def make_states(n):
    states = np.zeros((n,8))
    for i in np.arange(n):
        idx = np.random.choice(8, size=3, replace=False)
        states[i, idx] = np.random.uniform(0.0, 1.0, size=3)
    return states
    
def run(state, action):
    if state[action] != 0:
        return state[action]
    else:
        before, after = state[(action - 1) % 8], state[(action + 1) % 8]
        if before != 0 and after != 0:
            return before + after
        else:
            return 0

def get_optimal_action_and_value(state):
    values = np.array([run(state, action) for action in np.arange(8)])
    optimal_action = values.argmax()
    return optimal_action, values[optimal_action]

def get_optimal_actions_and_values(state_list):
    res = [get_optimal_action_and_value(state) for state in state_list]
    actions = np.array([pairs[0] for pairs in res])
    values = np.array([pairs[1] for pairs in res])
    return actions, values
    
def get_optimal_statics(n_rounds):
    state_list = make_states(n_rounds)
    _, values = get_optimal_actions_and_values(state_list)
    return np.mean(values), np.std(values)

def get_random_policy_statics(n_rounds):
    values = [run(state, np.random.choice(8)) for state in make_states(n_rounds)]
    return np.mean(values), np.std(values)

def get_model_actions(model, state_list):
    return model(np.array(state_list)).numpy().argmax(axis = 1)

def test_model(model, n_test_rounds):
    state_list = make_states(n_test_rounds)
    actions = get_model_actions(model, state_list)
    values = [run(state, action) for state, action in zip(state_list, actions)]
    optimal_actions, _ = get_optimal_actions_and_values(state_list)
    accuracy = np.mean(actions == optimal_actions)
    return np.mean(values), np.std(values), accuracy

def one_batch(model, optimizer, batch_size=128):
    states = make_states(batch_size)
    _, optimal_values = get_optimal_actions_and_values(states)
    with tf.GradientTape() as tape:
        probs = model(states)

        actions = np.array([np.random.choice(8, p=prob) for prob in probs.numpy()])
        rewards = np.array([run(state, action) for state, action in zip(states, actions)])
        rewards = rewards - optimal_values
        rewards = (rewards - np.mean(rewards)) / (np.std(rewards) + 1e-8)

        action_probs = tf.gather_nd(probs, [[i, a] for i, a in enumerate(actions)])
        log_probs = tf.math.log(action_probs)
        loss = -tf.reduce_mean(log_probs * rewards)
    
    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))

# def one_batch_supervised(model, batch_size, n_test_rounds=0):
#     state_list = make_states(batch_size)
#     y_target_list = np.zeros((batch_size, 8))
#     optimal_actions, _ = get_optimal_actions_and_values(state_list)
#     for i in np.arange(batch_size):
#         y_target_list[i, optimal_actions[i]] = 1
#     model.fit(np.array(state_list), np.array(y_target_list), verbose=0)
#     if n_test_rounds > 0:
#         return test_model(model, n_test_rounds)

def train(model, optimizer, one_batch, max_batch=200, batch_size=32, test_lapse = 10, n_test_rounds=10000, verbose = 0):
    best_weights = []
    best_idx = 0
    best_score = 0
    for i in np.arange(max_batch):
        one_batch(model, optimizer, batch_size)
        if (i+1) % test_lapse == 0 or i + 1 == max_batch:
            score, _, accuracy = test_model(model, n_test_rounds)
            if best_score < score:
                best_score = score
                best_idx = i
                best_weights = model.get_weights()
            if verbose == 1:
                print(i+1, score, accuracy)
    return best_idx, best_score, best_weights

# def train_supervised(one_batch_supervised, model, max_batch=200, batch_size=128, n_test_rounds=10000, verbose = 0):
#     best_weights = []
#     best_idx = 0
#     best_score = 0
#     for i in np.arange(max_batch):
#         score, std, accuracy = one_batch_supervised(model, batch_size, n_test_rounds)
#         if best_score < score:
#             best_score = score
#             best_idx = i
#             best_weights = model.get_weights()
#         if verbose == 1:
#             print(i, score, accuracy)
#     return best_idx, best_score, best_weights


def create_model(n_hidden_layers, n_dense_units, ratio_dropout):
    input_shape = (8,) 
    inputs = Input(shape=input_shape)

    x = inputs
    for i in np.arange(n_hidden_layers):
        x = Dense(n_dense_units)(x) 
        x = BatchNormalization()(x)
        x = ReLU()(x)
        x = Dropout(ratio_dropout)(x)
    
    outputs = Dense(8, activation='softmax')(x)
    model = Model(inputs, outputs)
    return model


# 试验场

In [12]:
model = create_model(n_hidden_layers=2, n_dense_units=512, ratio_dropout=0.5)
optimizer = tf.optimizers.Adam(0.001)

In [13]:
_ = train(model, optimizer, one_batch, max_batch=4000, batch_size=32, test_lapse = 100, n_test_rounds=10000, verbose = 1)

100 0.7276345527966676 0.4018
200 0.7911470032968214 0.5035
300 0.8103553989961112 0.5408
400 0.8009515855604905 0.5304
500 0.8333228817961957 0.5727
600 0.8356970170070003 0.6025
700 0.8463818129769335 0.6202
800 0.8416413510755975 0.6159
900 0.8569093292938236 0.6528
1000 0.8616178567598625 0.6643
1100 0.8702505438347953 0.6788
1200 0.8679873873734052 0.6949
1300 0.8659297938542931 0.6809
1400 0.8696715835212111 0.7
1500 0.877057048602939 0.6983
1600 0.8875521754110081 0.7377
1700 0.8842725854823451 0.7284
1800 0.8809257462717529 0.7288
1900 0.8876177776934504 0.7485
2000 0.8864925948571823 0.7549
2100 0.8878385319938729 0.7612
2200 0.8912983265267219 0.7687
2300 0.8950987120801795 0.7668
2400 0.8929729204855655 0.7767
2500 0.8920962908107003 0.7787
2600 0.8941713616571216 0.7824
2700 0.897787475585905 0.7906
2800 0.8942291909535354 0.8076
2900 0.8946862497830752 0.7898
3000 0.9021817937038088 0.7976
3100 0.8972236849215762 0.7973
3200 0.8996050898433937 0.7957
3300 0.899122822963334

In [14]:
model = create_model(n_hidden_layers=2, n_dense_units=512, ratio_dropout=0.5)
optimizer = tf.optimizers.Adam(0.001)

In [15]:
_ = train(model, optimizer, one_batch, max_batch=4000, batch_size=32, test_lapse = 100, n_test_rounds=10000, verbose = 1)

100 0.7519761029120887 0.4495
200 0.7893834719638262 0.5072
300 0.770999194918169 0.493
400 0.8066423764295776 0.5373
500 0.8136409161946815 0.5477
600 0.8036562079020012 0.566
700 0.8133766695459149 0.5891
800 0.8269362472087445 0.605
900 0.8325083844230161 0.607
1000 0.8218106473622429 0.6155
1100 0.8405433140028657 0.6203
1200 0.8210023132825274 0.5905
1300 0.837978421127829 0.622
1400 0.836082188424272 0.6367
1500 0.8416896395554911 0.629
1600 0.8452002162095439 0.6331
1700 0.8530886080349859 0.6729
1800 0.8637111266252657 0.6798
1900 0.8757167145337685 0.7153
2000 0.8839556967013963 0.7475
2100 0.8858641747309018 0.7431
2200 0.8655369692747167 0.7181
2300 0.8803766616628625 0.7528
2400 0.8871997306188294 0.7598
2500 0.8892972232710042 0.7677
2600 0.8784200908225263 0.7615
2700 0.8970306149585212 0.7888
2800 0.8954664887206687 0.7809
2900 0.8965240239212042 0.801
3000 0.8983178255503078 0.8118
3100 0.8982577881825772 0.7987
3200 0.8988590060961722 0.8102
3300 0.8935823004925196 0.8

In [16]:
model = create_model(n_hidden_layers=2, n_dense_units=512, ratio_dropout=0.5)
optimizer = tf.optimizers.Adam(0.001)

In [17]:
_ = train(model, optimizer, one_batch, max_batch=4000, batch_size=32, test_lapse = 100, n_test_rounds=10000, verbose = 1)

100 0.7255646257827664 0.4119
200 0.7687264791295916 0.4731
300 0.7768829942141884 0.4899
400 0.7848172653647887 0.5084
500 0.8010280137758524 0.5398
600 0.8143450219580173 0.5719
700 0.8135036067952068 0.5838
800 0.7989495996596852 0.5366
900 0.8191677695701768 0.5853
1000 0.8236937464904062 0.5969
1100 0.8424128181006918 0.6252
1200 0.8490940668078644 0.6449
1300 0.8650191516273419 0.7016
1400 0.8602154450232938 0.6764
1500 0.8652006613292702 0.6727
1600 0.8715325069354456 0.6949
1700 0.8654420301942349 0.7181
1800 0.8912190722023204 0.7516
1900 0.887906675435571 0.7643
2000 0.8921933270906611 0.763
2100 0.8963376218472189 0.7741
2200 0.8942250907387866 0.7911
2300 0.8893091081893015 0.7692
2400 0.8920781043103543 0.7692
2500 0.892379203092595 0.7801
2600 0.9020895272165191 0.8089
2700 0.8921508195139771 0.7891
2800 0.9014301409277927 0.7979
2900 0.8962258664775614 0.8049
3000 0.9003873941851882 0.7741
3100 0.8939416459050066 0.784
3200 0.8976175655979222 0.81
3300 0.898892305229114 

In [21]:
create_model(4, 64, 0.2).summary()