In [1]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import Input, Dense, BatchNormalization, ReLU, Dropout
from tensorflow.keras.models import Model, clone_model
from itertools import product

def make_state():
    """Create a random 8-slot state with exactly three non-zero entries.

    Three distinct slot indices are picked at random and filled with values
    drawn uniformly from [UNIFORM_LOW, UNIFORM_HIGH); all other slots are 0.
    UNIFORM_LOW and UNIFORM_HIGH are module-level globals that must be
    defined before this function is called.

    Returns:
        np.ndarray of shape (8,).
    """
    # The amounts are drawn before the slot indices on purpose: that is the
    # order in which the global NumPy RNG is consumed by a single
    # `state[choice(...)] = uniform(...)` assignment (right-hand side first).
    amounts = np.random.uniform(low=UNIFORM_LOW, high=UNIFORM_HIGH, size=3)
    slots = np.random.choice(8, size=3, replace=False)
    board = np.zeros(8)
    board[slots] = amounts
    return board

def run(state, action):
    """Return the payoff of choosing slot `action` in `state`.

    Rules, as implemented:
      * a non-zero slot pays its own value;
      * an empty slot pays the sum of its two circular neighbours
        (indices wrap modulo 8), but only if both neighbours are non-zero;
      * otherwise the payoff is 0.

    Args:
        state: indexable sequence of 8 numbers.
        action: slot index in 0..7.
    """
    own = state[action]
    if own != 0:
        return own

    left = state[(action - 1) % 8]
    right = state[(action + 1) % 8]
    # An empty slot only pays out when it sits between two filled slots.
    if left == 0 or right == 0:
        return 0
    return left + right

def get_optimal_action_and_value(state):
    """Brute-force the best action for one state.

    Evaluates `run` for each of the 8 possible actions and returns the
    pair (best_action, best_payoff). Ties resolve to the lowest index,
    as `argmax` does.
    """
    payoffs = np.array([run(state, slot) for slot in range(8)])
    best = payoffs.argmax()
    return best, payoffs[best]

def get_optimal_actions_and_values(state_list):
    """Brute-force the best action and payoff for every state in `state_list`.

    Returns:
        (actions, values): two NumPy arrays, aligned with `state_list`.
    """
    solved = [get_optimal_action_and_value(s) for s in state_list]
    best_actions = np.array([action for action, _ in solved])
    best_values = np.array([value for _, value in solved])
    return best_actions, best_values
    
def get_optimal_statics(n_rounds):
    """Mean and standard deviation of the optimal payoff over random states.

    Generates `n_rounds` states with `make_state` and scores each one with
    its brute-force best action.

    Returns:
        (mean, std) of the optimal payoffs.
    """
    states = np.array([make_state() for _ in np.arange(n_rounds)])
    best_values = get_optimal_actions_and_values(states)[1]
    return np.mean(best_values), np.std(best_values)

def get_random_policy_statics(n_rounds):
    """Mean and standard deviation of the payoff of a uniformly random policy.

    For each of `n_rounds` rounds a fresh state is generated and one of the
    8 actions is chosen uniformly at random.

    Returns:
        (mean, std) of the observed payoffs.
    """
    payoffs = []
    for _ in np.arange(n_rounds):
        # State first, then action: keeps the RNG consumption order fixed.
        state = make_state()
        action = np.random.choice(8)
        payoffs.append(run(state, action))
    return np.mean(payoffs), np.std(payoffs)

def get_model_actions(model, state_list):
    """Greedy actions of `model` for a batch of states.

    `model` is called on the stacked states; its output must expose
    `.numpy()` returning a 2-D array of per-action scores. The action with
    the highest score in each row is returned.
    """
    batch = np.array(state_list)
    scores = model(batch).numpy()
    return scores.argmax(axis=1)

def test_model(model, n_test_rounds):
    """Evaluate the greedy policy of `model` on freshly generated states.

    Args:
        model: callable accepted by `get_model_actions`.
        n_test_rounds: number of random states to evaluate on.

    Returns:
        (mean payoff, std of payoff, accuracy), where accuracy is the
        fraction of states on which the model's action equals the
        brute-force optimal action.
    """
    states = np.array([make_state() for _ in np.arange(n_test_rounds)])
    chosen = get_model_actions(model, states)
    best_actions, _ = get_optimal_actions_and_values(states)

    payoffs = [run(s, a) for s, a in zip(states, chosen)]
    hit_rate = np.mean(chosen == best_actions)
    return np.mean(payoffs), np.std(payoffs), hit_rate

def train(func, model, max_batch=200, batch_size=128, n_test_rounds=10000, verbose = 0):
    """Run `func` repeatedly and keep the weights of the best-scoring round.

    Args:
        func: training step called as `func(model, batch_size, n_test_rounds)`;
            must return a 3-tuple `(score, std, accuracy)`.
        model: object exposing `get_weights()`; `func` is expected to update
            it in place.
        max_batch: number of rounds to run.
        batch_size: forwarded to `func`.
        n_test_rounds: forwarded to `func`.
        verbose: when 1, print the round index and score after every round.

    Returns:
        (best_idx, best_score, best_weights). If no round produced a
        comparable score (e.g. `max_batch == 0` or every score is NaN),
        `best_score` is `-inf` and `best_weights` is the snapshot taken
        before training, so the result can always be passed to
        `set_weights`.
    """
    best_idx = 0
    # Start from -inf rather than 0 so that rounds with non-positive scores
    # are still eligible; otherwise best_weights could stay empty.
    best_score = -np.inf
    best_weights = model.get_weights()
    for i in np.arange(max_batch):
        score, _std, _accuracy = func(model, batch_size, n_test_rounds)
        if score > best_score:
            best_score = score
            best_idx = i
            best_weights = model.get_weights()
        if verbose == 1:
            print(i, score)
    return best_idx, best_score, best_weights

def create_model(n_hidden_layers, n_dense_units, ratio_dropout, optimizer):
    """Build and compile the policy network.

    Architecture: 8 inputs -> `n_hidden_layers` x
    [Dense(n_dense_units) -> BatchNormalization -> ReLU -> Dropout(ratio_dropout)]
    -> Dense(8, softmax).

    Args:
        n_hidden_layers: number of hidden blocks.
        n_dense_units: width of every hidden Dense layer.
        ratio_dropout: dropout rate applied after each ReLU.
        optimizer: anything accepted by `Model.compile(optimizer=...)`.

    Returns:
        A compiled Keras `Model` using categorical cross-entropy loss.
    """
    net_input = Input(shape=(8,))

    hidden = net_input
    for _ in np.arange(n_hidden_layers):
        hidden = Dense(n_dense_units)(hidden)
        hidden = BatchNormalization()(hidden)
        hidden = ReLU()(hidden)
        hidden = Dropout(ratio_dropout)(hidden)

    # One probability per action (8 slots).
    action_probs = Dense(8, activation='softmax')(hidden)

    network = Model(net_input, action_probs)
    network.compile(optimizer=optimizer, loss='categorical_crossentropy')
    return network


In [2]:
def one_batch(model, batch_size, n_test_rounds=0):
    """Run one policy-gradient style update on a freshly sampled batch.

    Steps:
      1. generate `batch_size` random states;
      2. sample one action per state from the model's output distribution;
      3. score each (state, action) with `run` and standardise the scores;
      4. fit the model on targets that are zero everywhere except at the
         sampled action, where they hold the standardised score. With a
         categorical cross-entropy loss (as `create_model` compiles) this
         weights the log-probability of each sampled action by its score.

    Args:
        model: compiled Keras model mapping (batch, 8) states to (batch, 8)
            action probabilities.
        batch_size: number of states to sample.
        n_test_rounds: if > 0, evaluate with `test_model` on that many
            fresh states after the update.

    Returns:
        The `test_model` result `(mean, std, accuracy)` when
        `n_test_rounds > 0`, otherwise None.
    """
    std_eps = 1e-8  # below this the batch carries no usable learning signal

    state_list = np.array([make_state() for _ in np.arange(batch_size)])
    prob_list = model(state_list).numpy()
    actions = np.array([np.random.choice(8, p=prob) for prob in prob_list])
    values = np.array([run(state, action) for state, action in zip(state_list, actions)])

    mean, std = np.mean(values), np.std(values)
    if std < std_eps:
        # All sampled returns are (numerically) identical: dividing by std
        # would produce NaN/inf targets and corrupt the weights, so use a
        # zero signal instead.
        advantages = np.zeros(len(values))
    else:
        advantages = (values - mean) / std

    y_target_list = np.zeros((batch_size, 8))
    y_target_list[np.arange(batch_size), actions] = advantages

    # NOTE(review): fit() is called without batch_size, so Keras applies its
    # own default mini-batch size rather than one step over the whole batch
    # - confirm this is intended.
    model.fit(state_list, y_target_list, verbose = 0)
    if n_test_rounds > 0:
        return test_model(model, n_test_rounds)
    return None

# Batch reuse (batch复用): keep one fixed batch of states and train on it repeatedly

In [3]:
def one_batch_reuse(model, state_list, n_batch_reuse=128, n_test_rounds=0):
    """Run several policy-gradient style updates on one fixed batch of states.

    Each pass re-samples actions from the current model for the same
    `state_list`, scores them with `run`, standardises the scores and fits
    the model on targets that hold the standardised score at the sampled
    action and zero elsewhere.

    Args:
        model: compiled Keras model mapping (batch, 8) states to (batch, 8)
            action probabilities.
        state_list: the fixed batch of states to reuse.
        n_batch_reuse: number of update passes over `state_list`.
        n_test_rounds: if > 0, evaluate with `test_model` on that many
            fresh states after all passes.

    Returns:
        The `test_model` result `(mean, std, accuracy)` when
        `n_test_rounds > 0`, otherwise None.
    """
    std_eps = 1e-8  # below this the batch carries no usable learning signal

    states = np.array(state_list)
    batch_size = len(state_list)
    rows = np.arange(batch_size)

    for _ in np.arange(n_batch_reuse):
        prob_list = model(states).numpy()
        actions = np.array([np.random.choice(8, p=prob) for prob in prob_list])
        values = np.array([run(state, action) for state, action in zip(state_list, actions)])

        mean, std = np.mean(values), np.std(values)
        if std < std_eps:
            # Identical returns for every sample: dividing by std would give
            # NaN/inf targets, so fall back to a zero signal.
            advantages = np.zeros(len(values))
        else:
            advantages = (values - mean) / std

        y_target_list = np.zeros((batch_size, 8))
        y_target_list[rows, actions] = advantages
        model.fit(state_list, y_target_list, verbose = 0)

    if n_test_rounds > 0:
        return test_model(model, n_test_rounds)
    return None

In [4]:
# Bounds of the uniform distribution that make_state() reads for the three
# non-zero slot values.
UNIFORM_LOW, UNIFORM_HIGH = 5.0, 10.0

# One hidden block of 512 units with a dropout rate of 0.5, optimised with Adam.
model = create_model(n_hidden_layers=1, n_dense_units=512, ratio_dropout=0.5, optimizer='adam')

# Run 200 rounds of one_batch (a fresh batch each round), printing the test
# score per round, and keep the weights of the best-scoring round.
best_idx, best_score, best_weights = train(one_batch, model, 200, verbose = 1)
# Load the best weights into a separate copy of the architecture and re-test it.
best_model = clone_model(model)
best_model.set_weights(best_weights)
# Displayed tuple: (best round index, mean payoff, payoff std, accuracy).
best_idx, *test_model(best_model, 10000)

0 4.829852597140758
1 5.3212651831936135
2 5.903501837983301
3 6.175481934500547
4 6.524013034005621
5 6.960248471191531
6 7.266198477449879
7 7.6739663902835655
8 8.172840346250446
9 8.273870126878052
10 8.250145323337566
11 8.206516480497227
12 8.102744957821692
13 8.170213875771996
14 8.39474178665364
15 8.488371605010244
16 8.780472318534702
17 8.978606005481293
18 8.918554870867352
19 8.927616191377403
20 8.891693653628755
21 8.956906451418938
22 8.961960289948415
23 9.318032237687891
24 9.613071646099009
25 9.970287557663319
26 10.333431423311092
27 10.327541011059807
28 10.114674321076768
29 10.118625523716442
30 10.285271762693162
31 10.351948513386533
32 10.45116516744979
33 10.454698220585792
34 10.604032559776835
35 10.518224724330537
36 10.555349285368052
37 10.470463279510257
38 10.403277707736045
39 10.222455497652772
40 10.385566157362595
41 10.648740732517386
42 10.679795899040473
43 10.691086561260477
44 10.533677589733141
45 10.677003227543292
46 10.870935325178918
47

(np.int64(159),
 np.float64(11.082705312103693),
 np.float64(5.238672533240142),
 np.float64(0.6291))

In [5]:
# Grab the current weights of `model` (after the 200 training rounds) for inspection.
tmp = model.get_weights()

In [6]:
# Show the first 30 entries of row 0 of the first weight array
# (presumably the first Dense layer's kernel - TODO confirm).
tmp[0][0][0:30]

array([ 0.08510263,  0.07545429,  0.0392608 , -0.10232776, -0.03367168,
        0.01047329, -0.04693352, -0.16310753,  0.01435614, -0.06441449,
        0.04211158, -0.262705  ,  0.0636092 ,  0.06030984,  0.02522157,
        0.03171688,  0.00837675, -0.07020491, -0.0626253 ,  0.02308307,
       -0.0819066 , -0.15769088, -0.0139401 , -0.02882517, -0.05267152,
       -0.0334819 , -0.02682133, -0.01047497, -0.03932264,  0.06411365],
      dtype=float32)

In [7]:
# Fixed batch of 128 random states, reused by the cells below.
state_list = np.array([make_state() for i in np.arange(128)])

In [8]:
def test_model_on_batch(model, state_list):
    """Evaluate the greedy policy of `model` on a given batch of states.

    Returns:
        (mean payoff, accuracy), where accuracy is the fraction of states
        on which the model's action equals the brute-force optimal action.
    """
    picked = get_model_actions(model, state_list)
    payoffs = [run(s, a) for s, a in zip(state_list, picked)]
    best_actions, _ = get_optimal_actions_and_values(state_list)

    mean_payoff = np.mean(payoffs)
    hit_rate = np.mean(picked == best_actions)
    return mean_payoff, hit_rate

In [9]:
# Baseline on the fixed batch before any reuse training: (mean payoff, accuracy).
test_model_on_batch(model, state_list)

(np.float64(10.600948056395714), np.float64(0.6015625))

In [10]:
# One reuse update on the fixed batch, then test on 10000 fresh states:
# prints (mean payoff, payoff std, accuracy).
print(one_batch_reuse(model, state_list, n_batch_reuse=1, n_test_rounds=10000))

(np.float64(10.52099657003782), np.float64(5.981220806239014), np.float64(0.5958))


In [11]:
# Re-check the fixed batch after the first reuse update: (mean payoff, accuracy).
test_model_on_batch(model, state_list)

(np.float64(10.502968050650784), np.float64(0.625))

In [12]:
# A second reuse update on the same fixed batch, again tested on 10000 fresh states.
print(one_batch_reuse(model, state_list, n_batch_reuse=1, n_test_rounds=10000))

(np.float64(10.495729253295416), np.float64(5.973484498340809), np.float64(0.5937))


In [13]:
# Re-check the fixed batch after the second reuse update: (mean payoff, accuracy).
test_model_on_batch(model, state_list)

(np.float64(10.475521630743097), np.float64(0.625))

In [14]:
# 100 further reuse updates on the fixed batch. After each update, line "1"
# reports performance on 10000 fresh states and line "2" reports performance
# on the reused batch itself.
for i in np.arange(100):
    print(i, 1, one_batch_reuse(model, state_list, n_batch_reuse=1, n_test_rounds=10000))
    print(i, 2, test_model_on_batch(model, state_list))

0 1 (np.float64(10.667087691596421), np.float64(5.775281702274988), np.float64(0.5975))
0 2 (np.float64(10.864046963782354), np.float64(0.6484375))
1 1 (np.float64(10.735377288180697), np.float64(5.764734596505275), np.float64(0.6161))
1 2 (np.float64(10.900635412809567), np.float64(0.625))
2 1 (np.float64(10.43106647953918), np.float64(6.136465409961053), np.float64(0.5898))
2 2 (np.float64(10.389355858403299), np.float64(0.5859375))
3 1 (np.float64(10.087004160794542), np.float64(6.340228896885957), np.float64(0.5703))
3 2 (np.float64(10.145508072604372), np.float64(0.5859375))
4 1 (np.float64(10.473065372825872), np.float64(6.004023234212252), np.float64(0.5909))
4 2 (np.float64(10.414621966591216), np.float64(0.59375))
5 1 (np.float64(10.422364150349491), np.float64(6.109371519276432), np.float64(0.5885))
5 2 (np.float64(10.444192961938978), np.float64(0.6015625))
6 1 (np.float64(10.448177567939117), np.float64(6.056523280981272), np.float64(0.5886))
6 2 (np.float64(10.627779856242

# Batch reuse (batch复用), repeated with a freshly initialised model

In [15]:
# Restart the experiment: same value range, a newly created (untrained) model,
# and a new fixed batch of 128 states.
UNIFORM_LOW, UNIFORM_HIGH = 5.0, 10.0
model = create_model(n_hidden_layers=1, n_dense_units=512, ratio_dropout=0.5, optimizer='adam')
state_list = np.array([make_state() for i in np.arange(128)])

In [16]:
# 1000 reuse updates on the single fixed batch, starting from the new model.
# Line "1": performance on 10000 fresh states; line "2": performance on the
# reused batch itself.
for i in np.arange(1000):
    print(i, 1, one_batch_reuse(model, state_list, n_batch_reuse=1, n_test_rounds=10000))
    print(i, 2, test_model_on_batch(model, state_list))

0 1 (np.float64(4.09619780064372), np.float64(5.058627201964093), np.float64(0.1327))
0 2 (np.float64(4.700337694562586), np.float64(0.1328125))
1 1 (np.float64(5.436439183710285), np.float64(5.238624814630774), np.float64(0.1749))
1 2 (np.float64(5.486343785601537), np.float64(0.171875))
2 1 (np.float64(5.972798259322848), np.float64(5.201813677860041), np.float64(0.1948))
2 2 (np.float64(6.4266287286250625), np.float64(0.203125))
3 1 (np.float64(6.204466208098834), np.float64(5.143973636695313), np.float64(0.2031))
3 2 (np.float64(6.536094619296377), np.float64(0.1953125))
4 1 (np.float64(6.426560748604298), np.float64(5.06448920229552), np.float64(0.2126))
4 2 (np.float64(6.6811978184699115), np.float64(0.203125))
5 1 (np.float64(6.74283428453969), np.float64(5.035563913214284), np.float64(0.2211))
5 2 (np.float64(7.016059271656267), np.float64(0.203125))
6 1 (np.float64(7.080899059349371), np.float64(4.893136014301907), np.float64(0.2329))
6 2 (np.float64(7.416547634101159), np.flo

In [17]:
# Final check on the reused batch after the loop above: (mean payoff, accuracy).
test_model_on_batch(model, state_list)

(np.float64(5.927529886774805), np.float64(0.2265625))