In [1]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import Input, Dense, BatchNormalization, ReLU, Dropout
from tensorflow.keras.models import Model, clone_model
from itertools import product

def make_state():
    """Create a random 8-cell state with exactly three populated cells.

    Returns:
        A length-8 float array that is zero everywhere except at three
        distinct, randomly chosen positions, which hold uniform samples
        drawn from [0, 1).
    """
    # Draw the amounts before the positions: this matches the order in which
    # the global NumPy RNG is consumed by a single indexed assignment.
    amounts = np.random.uniform(size=3)
    positions = np.random.choice(8, size=3, replace=False)

    board = np.zeros(8)
    board[positions] = amounts
    return board

def run(state, action):
    """Return the payoff of choosing cell ``action`` in ``state``.

    The state is treated as a ring: the neighbours of a cell wrap around the
    ends of the array. The ring size is taken from ``len(state)`` rather than
    being fixed at 8, so states of any length are handled.

    Args:
        state: 1-D sequence of cell values; a value of 0 marks an empty cell.
        action: Index of the chosen cell.

    Returns:
        * the cell's own value if the chosen cell is non-empty;
        * the sum of both ring neighbours if the chosen cell is empty and
          both neighbours are non-empty;
        * ``0`` otherwise.
    """
    chosen = state[action]
    if chosen != 0:
        return chosen

    n_cells = len(state)
    before = state[(action - 1) % n_cells]
    after = state[(action + 1) % n_cells]
    # An empty cell only pays out when it sits between two populated cells.
    if before != 0 and after != 0:
        return before + after
    return 0

def get_optimal_action_and_value(state):
    """Find the best of the 8 possible actions for one state by brute force.

    Args:
        state: A single state as accepted by ``run``.

    Returns:
        ``(action, value)``: the index of the highest-paying action (the
        first one on ties, per ``argmax``) and the payoff it yields.
    """
    payoffs = []
    for candidate in range(8):
        payoffs.append(run(state, candidate))
    payoffs = np.array(payoffs)

    best = np.argmax(payoffs)
    return best, payoffs[best]

def get_optimal_actions_and_values(state_list):
    """Compute the optimal action and its value for every state.

    Args:
        state_list: Iterable of states.

    Returns:
        An array with one row per state; column 0 holds the optimal action
        index and column 1 the corresponding payoff.
    """
    rows = []
    for state in state_list:
        rows.append(get_optimal_action_and_value(state))
    return np.array(rows)
    
def get_optimal_statics(n_rounds):
    """Estimate the mean and standard deviation of the optimal payoff.

    Args:
        n_rounds: Number of random states to sample.

    Returns:
        ``(mean, std)`` of the optimal payoff over the sampled states.
    """
    sampled_states = np.array([make_state() for _ in np.arange(n_rounds)])
    # Column 1 of the result holds the optimal payoff of each state.
    best_payoffs = get_optimal_actions_and_values(sampled_states)[:, 1]
    mean_payoff = np.mean(best_payoffs)
    std_payoff = np.std(best_payoffs)
    return mean_payoff, std_payoff

def get_baseline_value(n_rounds):
    """Estimate the mean payoff of picking an action uniformly at random.

    Args:
        n_rounds: Number of independent (state, random action) trials.

    Returns:
        The mean payoff over all trials.
    """
    payoffs = []
    for _ in np.arange(n_rounds):
        # The state is drawn before the action, in that order, each round.
        state = make_state()
        action = np.random.choice(8)
        payoffs.append(run(state, action))
    return np.mean(payoffs)

def get_model_actions(model, state_list):
    """Pick, for every state, the action the model scores highest.

    Args:
        model: Callable that maps a batch array to an object exposing
            ``.numpy()`` (one row of scores per state).
        state_list: Batch of states.

    Returns:
        1-D array holding the arg-max column index of each output row.
    """
    batch = np.array(state_list)
    scores = model(batch).numpy()
    return np.argmax(scores, axis=1)

def test_model(model, n_test_rounds):
    """Evaluate the model's greedy policy on freshly sampled states.

    Args:
        model: Model accepted by ``get_model_actions``.
        n_test_rounds: Number of random test states to generate.

    Returns:
        ``(mean, std, accuracy)``: mean and standard deviation of the
        payoffs obtained by the model's arg-max actions, and the fraction
        of states on which that payoff is within 1e-6 of the optimum.
    """
    test_states = np.array([make_state() for _ in np.arange(n_test_rounds)])
    chosen_actions = get_model_actions(model, test_states)

    achieved = [run(s, a) for s, a in zip(test_states, chosen_actions)]
    best_possible = get_optimal_actions_and_values(test_states)[:, 1]

    # A round counts as a hit when the obtained payoff matches the optimum.
    is_hit = np.abs(best_possible - achieved) < 1e-6
    accuracy = np.sum(is_hit) / n_test_rounds
    return np.mean(achieved), np.std(achieved), accuracy

def train(model, max_batch=200, batch_size=128, n_test_rounds=10000, verbose = 0):
    """Train ``model`` batch by batch and remember the best-scoring weights.

    After every call to ``one_batch`` the model is evaluated; whenever the
    returned score strictly exceeds the best seen so far, the current weights
    are snapshotted.

    Args:
        model: Model to train in place.
        max_batch: Number of training batches to run.
        batch_size: Number of sampled states per training batch.
        n_test_rounds: Number of test states used for the evaluation after
            each batch. Must be positive, since the evaluation result is
            needed to select the best weights.
        verbose: If 1, print the batch index and score after each batch.

    Returns:
        ``(best_idx, best_score, best_weights)``: index of the best batch,
        its score, and the weight list captured at that point. If no batch
        scores above 0, ``best_weights`` holds the weights the model had
        when ``train`` was called.

    Raises:
        ValueError: If ``one_batch`` returns no evaluation result (which it
            does when ``n_test_rounds`` is not positive).
    """
    # Seed the snapshot with the current weights so the returned value can
    # always be passed to ``set_weights``, even if no batch ever improves on
    # the initial score of 0 (an empty list would make ``set_weights`` fail).
    best_weights = model.get_weights()
    best_idx = 0
    best_score = 0
    for i in np.arange(max_batch):
        result = one_batch(model, batch_size, n_test_rounds)
        if result is None:
            raise ValueError(
                "one_batch returned no evaluation result; "
                "n_test_rounds must be positive to select the best weights"
            )
        # Only the third element (the score) drives model selection.
        _, _, score = result
        if best_score < score:
            best_score = score
            best_idx = i
            best_weights = model.get_weights()
        if verbose == 1:
            print(i, score)
    return best_idx, best_score, best_weights

def create_model(n_hidden_layers, n_dense_units, ratio_dropout, optimizer):
    """Build and compile a fully connected network over the 8-cell state.

    Each hidden block is Dense -> BatchNormalization -> ReLU -> Dropout; the
    output layer is an 8-way softmax.

    Args:
        n_hidden_layers: Number of hidden blocks to stack.
        n_dense_units: Width of every hidden Dense layer.
        ratio_dropout: Dropout rate applied at the end of each hidden block.
        optimizer: Optimizer passed through to ``model.compile``.

    Returns:
        The compiled model, using the categorical cross-entropy loss.
    """
    def hidden_block(tensor):
        # Layers are created and applied one after another, in this order.
        tensor = Dense(n_dense_units)(tensor)
        tensor = BatchNormalization()(tensor)
        tensor = ReLU()(tensor)
        return Dropout(ratio_dropout)(tensor)

    net_input = Input(shape=(8,))

    hidden = net_input
    for _ in np.arange(n_hidden_layers):
        hidden = hidden_block(hidden)

    net_output = Dense(8, activation='softmax')(hidden)

    model = Model(net_input, net_output)
    model.compile(optimizer=optimizer, loss='categorical_crossentropy')
    return model


In [2]:
def one_batch(model, batch_size, n_test_rounds=0):
    """Sample one batch of actions from the model and fit it on the payoffs.

    For every sampled state an action is drawn from the model's output
    distribution. The training target is zero everywhere except at the
    sampled action, where it holds the raw payoff from ``run``.

    Args:
        model: Model whose output rows are valid probability vectors over
            the 8 actions.
        batch_size: Number of random states to sample.
        n_test_rounds: If positive, evaluate the model afterwards on this
            many fresh states.

    Returns:
        The result of ``test_model`` when ``n_test_rounds > 0``; otherwise
        ``None``.
    """
    states = np.array([make_state() for _ in np.arange(batch_size)])
    action_probs = model(np.array(states)).numpy()

    sampled_actions = [np.random.choice(8, p=p) for p in action_probs]
    payoffs = [run(s, a) for s, a in zip(states, sampled_actions)]

    # One row per state: the payoff sits in the column of the sampled action.
    targets = np.zeros((batch_size, 8))
    targets[np.arange(batch_size), sampled_actions] = payoffs

    # NOTE(review): batch_size is not forwarded to fit, so Keras applies its
    # own default mini-batch size here - confirm that is intended.
    model.fit(states, targets, verbose=0)

    if n_test_rounds > 0:
        return test_model(model, n_test_rounds)


In [3]:
# Build the network: one hidden block of 512 units with 50% dropout, Adam optimizer.
model = create_model(n_hidden_layers=1, n_dense_units=512, ratio_dropout=0.5, optimizer='adam')

In [4]:
# Train for 200 batches, printing the batch index and test score after each.
best_idx, best_score, best_weights = train(model, 200, verbose = 1)
# Load the best-scoring weight snapshot into a copy of the architecture.
best_model = clone_model(model)
best_model.set_weights(best_weights)
# Show the best batch index followed by a fresh (mean, std, accuracy) evaluation.
best_idx, *test_model(best_model, 10000)

0 0.121
1 0.1327
2 0.1552
3 0.2076
4 0.2474
5 0.266
6 0.2907
7 0.3147
8 0.3508
9 0.3485
10 0.3414
11 0.3354
12 0.3148
13 0.3083
14 0.3444
15 0.3739
16 0.394
17 0.3865
18 0.3926
19 0.3832
20 0.3685
21 0.3442
22 0.3653
23 0.3759
24 0.3937
25 0.3934
26 0.4014
27 0.3887
28 0.4029
29 0.4205
30 0.4645
31 0.4731
32 0.4859
33 0.4922
34 0.4617
35 0.4646
36 0.4801
37 0.4518
38 0.4795
39 0.4836
40 0.4755
41 0.453
42 0.4568
43 0.4509
44 0.443
45 0.4329
46 0.4422
47 0.459
48 0.4462
49 0.4622
50 0.4755
51 0.4753
52 0.5121
53 0.5377
54 0.5073
55 0.4795
56 0.5137
57 0.533
58 0.5197
59 0.5277
60 0.5087
61 0.4984
62 0.4919
63 0.4949
64 0.5356
65 0.5547
66 0.5329
67 0.5184
68 0.5086
69 0.5267
70 0.5302
71 0.5274
72 0.5011
73 0.5321
74 0.5233
75 0.5058
76 0.5128
77 0.5412
78 0.5467
79 0.5351
80 0.5036
81 0.5329
82 0.5678
83 0.5579
84 0.5584
85 0.5502
86 0.5096
87 0.4989
88 0.529
89 0.5132
90 0.5186
91 0.5505
92 0.5626
93 0.5435
94 0.5487
95 0.5524
96 0.5347
97 0.5378
98 0.5478
99 0.572
100 0.5628
101 0.57

(np.int64(102),
 np.float64(0.8202412159296478),
 np.float64(0.3931043585683344),
 np.float64(0.5665))

In [5]:
# Evaluate the best snapshot again; test_model samples new random states on every call.
best_idx, *test_model(best_model, 10000)

(np.int64(102),
 np.float64(0.8240462081729213),
 np.float64(0.38881203033636336),
 np.float64(0.5709))

In [6]:
# Evaluate the model as it stands after the last training batch, for comparison with the best snapshot.
test_model(model, 10000)

(np.float64(0.7499971639603831),
 np.float64(0.35196283800930295),
 np.float64(0.459))

## 仅按输赢给奖励（选中最优动作为 +1，否则为 -1），并对奖励做归一化

In [7]:
def one_batch(model, batch_size, n_test_rounds=0):
    """Sample one batch of actions and fit on standardized win/loss rewards.

    For every sampled state an action is drawn from the model's output
    distribution. The reward is +1 when the sampled action equals the
    optimal action and -1 otherwise. Rewards are then standardized across
    the batch (zero mean, unit standard deviation, with a 1e-6 guard in the
    denominator) and placed in the target column of the sampled action.

    Args:
        model: Model whose output rows are valid probability vectors over
            the 8 actions.
        batch_size: Number of random states to sample.
        n_test_rounds: If positive, evaluate the model afterwards on this
            many fresh states.

    Returns:
        The result of ``test_model`` when ``n_test_rounds > 0``; otherwise
        ``None``.
    """
    states = np.array([make_state() for _ in np.arange(batch_size)])
    action_probs = model(np.array(states)).numpy()

    sampled_actions = [np.random.choice(8, p=p) for p in action_probs]
    best_actions = get_optimal_actions_and_values(states)[:, 0]

    # Win (+1) if the sampled action is the optimal one, loss (-1) otherwise.
    is_win = np.asarray(sampled_actions) == best_actions
    rewards = np.where(is_win, 1.0, -1.0)
    rewards = (rewards - rewards.mean()) / (rewards.std() + 1e-6)

    # One row per state: the reward sits in the column of the sampled action.
    targets = np.zeros((batch_size, 8))
    targets[np.arange(batch_size), sampled_actions] = rewards

    model.fit(states, targets, verbose=0)

    if n_test_rounds > 0:
        return test_model(model, n_test_rounds)


In [8]:
# Start again from a freshly initialized network with the same hyperparameters.
model = create_model(n_hidden_layers=1, n_dense_units=512, ratio_dropout=0.5, optimizer='adam')

In [9]:
# train looks up one_batch by name when it runs, so this uses whichever one_batch was defined last.
best_idx, best_score, best_weights = train(model, 200, verbose = 1)
# Load the best-scoring weight snapshot into a copy of the architecture.
best_model = clone_model(model)
best_model.set_weights(best_weights)
# Show the best batch index followed by a fresh (mean, std, accuracy) evaluation.
best_idx, *test_model(best_model, 10000)

0 0.1654
1 0.1764
2 0.195
3 0.249
4 0.3131
5 0.3562
6 0.3801
7 0.4079
8 0.4021
9 0.407
10 0.4087
11 0.4244
12 0.4374
13 0.441
14 0.4701
15 0.4779
16 0.4652
17 0.4726
18 0.4648
19 0.4706
20 0.4505
21 0.4663
22 0.5041
23 0.5233
24 0.5383
25 0.5458
26 0.5411
27 0.5287
28 0.5269
29 0.5126
30 0.5333
31 0.5195
32 0.5348
33 0.5242
34 0.5493
35 0.5627
36 0.5681
37 0.5467
38 0.5402
39 0.5379
40 0.5432
41 0.5597
42 0.5764
43 0.5822
44 0.5745
45 0.562
46 0.5572
47 0.5555
48 0.568
49 0.5725
50 0.568
51 0.5626
52 0.5748
53 0.5619
54 0.5592
55 0.552
56 0.5592
57 0.5537
58 0.5924
59 0.5955
60 0.5832
61 0.5993
62 0.5867
63 0.5978
64 0.615
65 0.6101
66 0.6017
67 0.6116
68 0.6077
69 0.6
70 0.6377
71 0.6495
72 0.6615
73 0.6434
74 0.6435
75 0.6381
76 0.6292
77 0.6314
78 0.6506
79 0.674
80 0.6666
81 0.6663
82 0.6622
83 0.6437
84 0.6432
85 0.6312
86 0.6495
87 0.6558
88 0.6631
89 0.663
90 0.6548
91 0.6527
92 0.6681
93 0.666
94 0.6669
95 0.6578
96 0.6604
97 0.6736
98 0.6868
99 0.6612
100 0.6569
101 0.6676
102

(np.int64(123),
 np.float64(0.8473169079760741),
 np.float64(0.4365368078402005),
 np.float64(0.6884))

In [10]:
# Evaluate the best snapshot again; test_model samples new random states on every call.
best_idx, *test_model(best_model, 10000)

(np.int64(123),
 np.float64(0.8521283289017614),
 np.float64(0.4315049022615584),
 np.float64(0.6994))

In [11]:
# Evaluate the model as it stands after the last training batch, for comparison with the best snapshot.
test_model(model, 10000)

(np.float64(0.7763383614375445),
 np.float64(0.48880674804989865),
 np.float64(0.5952))