In [1]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import Input, Dense, BatchNormalization, ReLU, Dropout
from tensorflow.keras.models import Model, clone_model
from itertools import product

def make_state():
    state = np.zeros(8)
    state[np.random.choice(8, size=3, replace=False)] = np.random.uniform(size=3)
    return state

def run(state, action):
    state_cyclic = np.concatenate((state, state[0:2]))
    if state_cyclic[action] != 0:
        return state_cyclic[action]
    elif state_cyclic[action-1] != 0 and state_cyclic[action+1] != 0:
        return state_cyclic[action-1] + state_cyclic[action+1]
    else:
        return 0

def get_optimal_value(state):
    value = state.max()

    state_cyclic = np.concatenate((state, state[0:2]))
    for i in np.arange(1, 9):
        if state_cyclic[i] == 0 and state_cyclic[i-1] != 0 and state_cyclic[i+1] != 0:
            value2 = state_cyclic[i-1] + state_cyclic[i+1]
            if value2 > value:
                value = value2
    return value

def get_optimal_statics(n_rounds):
    values = np.array([get_optimal_value(make_state()) for i in np.arange(n_rounds)])
    return np.mean(values), np.std(values)

def get_baseline_value(n_rounds):
    values = [run(make_state(), np.random.choice(8)) for i in np.arange(n_rounds)]
    return np.mean(values)
    

def get_optimal_action(state):
    action = state.argmax()
    value = state[action]
    state_cyclic = np.concatenate((state, state[0:2]))
    for i in np.arange(1, 9):
        if state_cyclic[i] == 0 and state_cyclic[i-1] != 0 and state_cyclic[i+1] != 0:
            value2 = state_cyclic[i-1] + state_cyclic[i+1]
            if value2 > value:
                value = value2
                action = i % 8
    return action

def get_optimal_actions(state_list):
    return np.array([get_optimal_action(state) for state in state_list])
    

def get_model_actions(model, state_list):
    return model(np.array(state_list)).numpy().argmax(axis = 1)

def test_model(model, n_test_rounds):
    state_list = np.array([make_state() for i in np.arange(n_test_rounds)])
    actions = get_model_actions(model, state_list)
    values = [run(state, action) for state, action in zip(state_list, actions)]
    return np.mean(values)

def test_model_accuracy(model, n_test_rounds):
    count = 0
    state_list = [make_state() for i in np.arange(n_test_rounds)]
    optimal_values = np.array([get_optimal_value(state) for state in state_list])
    model_actions = get_model_actions(model, state_list)
    model_values = np.array([run(state, action) for state, action in zip(state_list, model_actions)])
    return np.sum(np.abs(optimal_values - model_values) < 1e-6)/n_test_rounds

def one_batch_supervised(model, batch_size, n_test_rounds=0):
    state_list = []
    y_target_list = []
    # rewards = 0
    for i in np.arange(batch_size):
        state = make_state()
        action = get_optimal_action(state)
        y_target = np.zeros(8)
        y_target[action] = 1
        state_list.append(state)
        y_target_list.append(y_target)
    model.fit(np.array(state_list), np.array(y_target_list), verbose=0)
    if n_test_rounds > 0:
        return (test_model(model, n_test_rounds))

def train(model, max_batch=200, batch_size=128, n_test_rounds=10000, verbose = 0):
    best_weights = []
    best_idx = 0
    best_score = 0
    for i in np.arange(max_batch):
        score = one_batch(model, batch_size, n_test_rounds)
        if best_score < score:
            best_score = score
            best_idx = i
            best_weights = model.get_weights()
        if verbose == 1:
            print(i, score)
    return best_idx, best_score, best_weights

def train_supervised(model, max_batch=200, batch_size=128, n_test_rounds=10000, verbose = 0):
    best_weights = []
    best_idx = 0
    best_score = 0
    for i in np.arange(max_batch):
        score = one_batch_supervised(model, batch_size, n_test_rounds)
        if best_score < score:
            best_score = score
            best_idx = i
            best_weights = model.get_weights()
        if verbose == 1:
            print(i, score)
    return best_idx, best_score, best_weights


def create_model(n_hidden_layers, n_dense_units, ratio_dropout, optimizer):
    input_shape = (8,) 
    inputs = Input(shape=input_shape)

    x = inputs
    for i in np.arange(n_hidden_layers):
        x = Dense(n_dense_units)(x) 
        x = BatchNormalization()(x)
        x = ReLU()(x)
        x = Dropout(ratio_dropout)(x)
    
    outputs = Dense(8, activation='softmax')(x)
    model = Model(inputs, outputs)
    model.compile(optimizer=optimizer, 
                  loss='categorical_crossentropy')
    return model


In [2]:
def one_batch(model, batch_size, n_test_rounds=0):
    y_target_list = np.zeros((batch_size, 8))
    state_list = np.array([make_state() for i in np.arange(batch_size)])
    prob_list = model(np.array(state_list)).numpy()
    actions = [np.random.choice(8, p=prob) for prob in prob_list]
    values = [run(state, action) for state, action in zip(state_list, actions)]
    for i in np.arange(batch_size):
        y_target_list[i, actions[i]] = values[i]
    model.fit(state_list, np.array(y_target_list), verbose = 0)
    if n_test_rounds > 0:
        return (test_model_accuracy(model, n_test_rounds))


In [3]:
model = create_model(n_hidden_layers=1, n_dense_units=512, ratio_dropout=0.5, optimizer='adam')

In [4]:
best_idx, best_score, best_weights = train(model, 200, verbose = 1)
best_model = clone_model(model)
best_model.set_weights(best_weights)
best_idx, test_model(best_model, 10000), test_model_accuracy(best_model, 10000)

0 0.1652
1 0.2143
2 0.223
3 0.2157
4 0.2402
5 0.2746
6 0.3085
7 0.3169
8 0.3381
9 0.3438
10 0.3408
11 0.3464
12 0.3635
13 0.3868
14 0.4065
15 0.4025
16 0.3745
17 0.3723
18 0.3882
19 0.403
20 0.4046
21 0.3977
22 0.3824
23 0.3528
24 0.317
25 0.3209
26 0.342
27 0.3669
28 0.4271
29 0.4453
30 0.4491
31 0.4037
32 0.3706
33 0.3605
34 0.3501
35 0.3539
36 0.389
37 0.4079
38 0.4259
39 0.4309
40 0.4197
41 0.4338
42 0.4496
43 0.4316
44 0.4388
45 0.4378
46 0.4125
47 0.3917
48 0.4029
49 0.3969
50 0.4181
51 0.4133
52 0.4125
53 0.4075
54 0.4033
55 0.3947
56 0.4073
57 0.4254
58 0.4429
59 0.4124
60 0.4065
61 0.4092
62 0.4055
63 0.4175
64 0.4402
65 0.4521
66 0.4658
67 0.465
68 0.4595
69 0.4033
70 0.4153
71 0.4219
72 0.4352
73 0.4471
74 0.4578
75 0.4246
76 0.436
77 0.4391
78 0.4195
79 0.4195
80 0.4221
81 0.4293
82 0.4269
83 0.4093
84 0.4138
85 0.4173
86 0.419
87 0.4285
88 0.4395
89 0.425
90 0.4291
91 0.4281
92 0.4279
93 0.4387
94 0.4288
95 0.447
96 0.4665
97 0.4779
98 0.4744
99 0.4834
100 0.5046
101 0.511

(np.int64(101), np.float64(0.8550526694847563), np.float64(0.4984))

In [5]:
best_idx, test_model(best_model, 10000), test_model_accuracy(best_model, 10000)

(np.int64(101), np.float64(0.8505431153828918), np.float64(0.5049))

In [6]:
test_model(model, 10000), test_model_accuracy(model, 10000)

(np.float64(0.7954792072723128), np.float64(0.4016))

## 仅按输赢

In [7]:
def one_batch(model, batch_size, n_test_rounds=0):
    y_target_list = np.zeros((batch_size, 8))
    state_list = np.array([make_state() for i in np.arange(batch_size)])
    prob_list = model(np.array(state_list)).numpy()
    actions = [np.random.choice(8, p=prob) for prob in prob_list]
    optimal_actions = get_optimal_actions(state_list)
    values = np.ones(batch_size)
    values[actions != optimal_actions] = -1
    for i in np.arange(batch_size):
        y_target_list[i, actions[i]] = values[i]
    model.fit(state_list, np.array(y_target_list), verbose = 0)
    if n_test_rounds > 0:
        return (test_model_accuracy(model, n_test_rounds))


In [8]:
model = create_model(n_hidden_layers=1, n_dense_units=512, ratio_dropout=0.5, optimizer='adam')

In [9]:
best_idx, best_score, best_weights = train(model, 200, verbose = 1)
best_model = clone_model(model)
best_model.set_weights(best_weights)
best_idx, test_model(best_model, 10000), test_model_accuracy(best_model, 10000)

0 0.1772
1 0.1847
2 0.1847
3 0.1983
4 0.2144
5 0.2329
6 0.2582
7 0.2892
8 0.2844
9 0.3105
10 0.3266
11 0.3245
12 0.3209
13 0.3071
14 0.288
15 0.2805
16 0.2908
17 0.2798
18 0.2909
19 0.3148
20 0.3293
21 0.3339
22 0.3595
23 0.3661
24 0.3584
25 0.3628
26 0.3795
27 0.372
28 0.3474
29 0.3199
30 0.3179
31 0.3294
32 0.3322
33 0.3592
34 0.3759
35 0.3922
36 0.3917
37 0.3856
38 0.3974
39 0.4162
40 0.4461
41 0.4688
42 0.4677
43 0.4449
44 0.3986
45 0.3572
46 0.3201
47 0.3578
48 0.4207
49 0.4549
50 0.4919
51 0.4724
52 0.427
53 0.4414
54 0.4539
55 0.4509
56 0.4825
57 0.491
58 0.4928
59 0.4772
60 0.4842
61 0.4956
62 0.4866
63 0.4532
64 0.4705
65 0.4926
66 0.5137
67 0.5102
68 0.5167
69 0.522
70 0.5215
71 0.5157
72 0.4895
73 0.4755
74 0.4573
75 0.461
76 0.5024
77 0.4963
78 0.4919
79 0.5176
80 0.4885
81 0.473
82 0.4343
83 0.4375
84 0.483
85 0.4781
86 0.5026
87 0.5372
88 0.5416
89 0.5474
90 0.5371
91 0.4989
92 0.4775
93 0.5264
94 0.5241
95 0.5088
96 0.5153
97 0.4982
98 0.4965
99 0.5013
100 0.4976
101 0.4

(np.int64(130), np.float64(0.7895633281217262), np.float64(0.5721))

In [10]:
best_idx, test_model(best_model, 10000), test_model_accuracy(best_model, 10000)

(np.int64(130), np.float64(0.7821927187467178), np.float64(0.566))

In [11]:
test_model(model, 10000), test_model_accuracy(model, 10000)

(np.float64(0.7928495540050106), np.float64(0.5247))

## 仅按输赢 归一化

In [12]:
def one_batch(model, batch_size, n_test_rounds=0):
    y_target_list = np.zeros((batch_size, 8))
    state_list = np.array([make_state() for i in np.arange(batch_size)])
    prob_list = model(np.array(state_list)).numpy()
    actions = [np.random.choice(8, p=prob) for prob in prob_list]
    optimal_actions = get_optimal_actions(state_list)
    values = np.ones(batch_size)
    values[actions != optimal_actions] = -1
    values = (values - np.mean(values))/(np.std(values)+1e-6)
    for i in np.arange(batch_size):
        y_target_list[i, actions[i]] = values[i]
    model.fit(state_list, np.array(y_target_list), verbose = 0)
    if n_test_rounds > 0:
        return (test_model_accuracy(model, n_test_rounds))


In [13]:
model = create_model(n_hidden_layers=1, n_dense_units=512, ratio_dropout=0.5, optimizer='adam')

In [14]:
best_idx, best_score, best_weights = train(model, 200, verbose = 1)
best_model = clone_model(model)
best_model.set_weights(best_weights)
best_idx, test_model(best_model, 10000), test_model_accuracy(best_model, 10000)

0 0.1353
1 0.1921
2 0.2316
3 0.2809
4 0.3063
5 0.3192
6 0.3418
7 0.3428
8 0.3567
9 0.3574
10 0.3573
11 0.3757
12 0.3628
13 0.3734
14 0.3814
15 0.4018
16 0.3868
17 0.3788
18 0.3767
19 0.3889
20 0.3954
21 0.424
22 0.4351
23 0.4504
24 0.4657
25 0.4874
26 0.4955
27 0.467
28 0.5011
29 0.5093
30 0.5067
31 0.5299
32 0.535
33 0.5275
34 0.5013
35 0.4828
36 0.462
37 0.459
38 0.4925
39 0.5076
40 0.5277
41 0.5282
42 0.5311
43 0.5194
44 0.5135
45 0.5055
46 0.5304
47 0.5368
48 0.5213
49 0.5235
50 0.5264
51 0.5238
52 0.5194
53 0.5389
54 0.5258
55 0.5344
56 0.5434
57 0.5316
58 0.5539
59 0.5611
60 0.5685
61 0.5665
62 0.5711
63 0.5787
64 0.5737
65 0.5645
66 0.5373
67 0.5195
68 0.491
69 0.5066
70 0.5303
71 0.5416
72 0.5391
73 0.5406
74 0.5547
75 0.5537
76 0.5767
77 0.5991
78 0.6039
79 0.6189
80 0.6168
81 0.6195
82 0.6025
83 0.5761
84 0.6007
85 0.6111
86 0.6036
87 0.5724
88 0.5739
89 0.6101
90 0.5996
91 0.5948
92 0.6103
93 0.633
94 0.6458
95 0.6559
96 0.6471
97 0.6628
98 0.6683
99 0.6651
100 0.6526
101 0.

(np.int64(98), np.float64(0.8583872570381754), np.float64(0.6655))

In [15]:
best_idx, test_model(best_model, 10000), test_model_accuracy(best_model, 10000)

(np.int64(98), np.float64(0.8613196933575142), np.float64(0.6643))

In [16]:
test_model(model, 10000), test_model_accuracy(model, 10000)

(np.float64(0.7676207804029727), np.float64(0.5196))

## 总结

命中最优解为赢，否则为输。

测试以下两种情况
* 赢奖励1，输奖励-1
* 赢奖励1，输奖励0

加上归一化后，命中最优解的几率达到0.66，不过平均每局得分不见优势