In [19]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import Input, Dense, BatchNormalization, ReLU, Dropout
from tensorflow.keras.models import Model, clone_model
from itertools import product

LOW, HIGH = 0.0, 1.0

def make_states(n):
    states = np.zeros((n,8))
    for i in np.arange(n):
        idx = np.random.choice(8, size=3, replace=False)
        states[i, idx] = np.random.uniform(LOW, HIGH, size=3)
    return states
    
def run(state, action):
    if state[action] != 0:
        return state[action]
    else:
        before, after = state[(action - 1) % 8], state[(action + 1) % 8]
        if before != 0 and after != 0:
            return before + after
        else:
            return 0

def get_optimal_action_and_value(state):
    values = np.array([run(state, action) for action in np.arange(8)])
    optimal_action = values.argmax()
    return optimal_action, values[optimal_action]

def get_optimal_actions_and_values(state_list):
    res = [get_optimal_action_and_value(state) for state in state_list]
    actions = np.array([pairs[0] for pairs in res])
    values = np.array([pairs[1] for pairs in res])
    return actions, values
    
def get_optimal_statics(n_rounds):
    state_list = make_states(n_rounds)
    _, values = get_optimal_actions_and_values(state_list)
    return np.mean(values), np.std(values)

def get_random_policy_statics(n_rounds):
    values = [run(state, np.random.choice(8)) for state in make_states(n_rounds)]
    return np.mean(values), np.std(values)

def get_model_actions(model, state_list):
    return model(np.array(state_list)).numpy().argmax(axis = 1)

def test_model(model, n_test_rounds):
    state_list = make_states(n_test_rounds)
    actions = get_model_actions(model, state_list)
    values = [run(state, action) for state, action in zip(state_list, actions)]
    optimal_actions, _ = get_optimal_actions_and_values(state_list)
    accuracy = np.mean(actions == optimal_actions)
    return np.mean(values), np.std(values), accuracy

def one_batch(model, batch_size, n_test_rounds=0):
    y_target_list = np.zeros((batch_size, 8))
    state_list = make_states(batch_size)
    prob_list = model(np.array(state_list)).numpy()
    actions = np.array([np.random.choice(8, p=prob) for prob in prob_list])
    values = np.array([run(state, action) for state, action in zip(state_list, actions)])
    _, optimal_values = get_optimal_actions_and_values(state_list)
    values = values - optimal_values
    values = (values - np.mean(values)) / (np.std(values) + 1e-8)
    for i in np.arange(batch_size):
        y_target_list[i, actions[i]] = values[i]
    model.fit(state_list, y_target_list, batch_size=batch_size, verbose = 0)
    if n_test_rounds > 0:
        return test_model(model, n_test_rounds)

def train(one_batch, model, max_batch=200, batch_size=128, n_test_rounds=10000, verbose = 0):
    best_weights = []
    best_idx = 0
    best_score = 0
    for i in np.arange(max_batch):
        score, std, accuracy = one_batch(model, batch_size, n_test_rounds)
        if best_score < score:
            best_score = score
            best_idx = i
            best_weights = model.get_weights()
        if verbose == 1:
            print(i, score, accuracy)
    return best_idx, best_score, best_weights

def create_model(n_hidden_layers, n_dense_units, ratio_dropout, optimizer):
    input_shape = (8,) 
    inputs = Input(shape=input_shape)

    x = inputs
    for i in np.arange(n_hidden_layers):
        x = Dense(n_dense_units)(x) 
        x = BatchNormalization()(x)
        x = ReLU()(x)
        x = Dropout(ratio_dropout)(x)
    
    outputs = Dense(8, activation='softmax')(x)
    model = Model(inputs, outputs)
    model.compile(optimizer=optimizer, 
                  loss='categorical_crossentropy')
    return model


# set batch_size=128

In [20]:
model = create_model(n_hidden_layers=1, n_dense_units=512, ratio_dropout=0.5, optimizer='adam')

In [21]:
best_idx, best_score, best_weights = train(one_batch, model, max_batch=1000, batch_size=32, verbose = 1)

0 0.19497561830010496 0.0971
1 0.22997505192626996 0.1243
2 0.25066678280064714 0.1391
3 0.27701373941278523 0.1591
4 0.28846858787159935 0.1587
5 0.31119188359192956 0.1748
6 0.3432668422625414 0.1891
7 0.3653652551737709 0.1915
8 0.3934475509264584 0.2119
9 0.4123686232568457 0.216
10 0.441563762550708 0.238
11 0.4746040715412092 0.2517
12 0.48798609168433743 0.257
13 0.5102129951114328 0.2814
14 0.5162483390077384 0.2738
15 0.5249760023624598 0.2848
16 0.5389037890138512 0.2958
17 0.5633309799513506 0.3109
18 0.572495733684711 0.3204
19 0.5768923981121842 0.3266
20 0.6017760053053139 0.3529
21 0.6166094313868125 0.3568
22 0.6321010631952042 0.3686
23 0.6327595708622059 0.3631
24 0.6467726631713371 0.3802
25 0.6461220329592805 0.3776
26 0.6553036724062569 0.3843
27 0.6477778420325034 0.3757
28 0.6492762533560108 0.3851
29 0.641311879210333 0.3778
30 0.6452073273672652 0.3791
31 0.6528971244133476 0.3828
32 0.6558397560487208 0.391
33 0.6523541116908813 0.3837
34 0.6554811402173827 0.

# set batch_size=32

In [22]:
model = create_model(n_hidden_layers=1, n_dense_units=512, ratio_dropout=0.5, optimizer='adam')

In [23]:
best_idx, best_score, best_weights = train(one_batch, model, max_batch=1000, batch_size=32, verbose = 1)

0 0.38196234086679254 0.1495
1 0.3910167131342437 0.1491
2 0.40755993537521307 0.1724
3 0.4457735203518855 0.1942
4 0.4610316579645524 0.1964
5 0.48497457999785665 0.2243
6 0.5147166964939455 0.2479
7 0.5325084414937288 0.2711
8 0.5478980666489487 0.2762
9 0.5683145733641879 0.2884
10 0.5726632235801914 0.289
11 0.594279646912925 0.3004
12 0.6030926791309003 0.3074
13 0.6090609619179463 0.3057
14 0.6183213439150137 0.3091
15 0.6243587700774306 0.3267
16 0.6284675044893142 0.3231
17 0.6295478595128299 0.3243
18 0.6340847314291175 0.3324
19 0.6364282951411694 0.3372
20 0.6407685246820455 0.3407
21 0.6448997519591274 0.3421
22 0.6505765770931529 0.3517
23 0.6536619197323668 0.3593
24 0.6595175903779669 0.3642
25 0.6643864413945078 0.369
26 0.6718300325548482 0.3748
27 0.6803411946522534 0.387
28 0.6793318302554551 0.3832
29 0.6848631471314744 0.3908
30 0.6829691008315529 0.3879
31 0.6918000200013951 0.4035
32 0.6921499686019144 0.3918
33 0.6996678593547249 0.4073
34 0.6982899356006064 0.4

# don't set batch_size

In [16]:
def one_batch_default_batch_size(model, batch_size, n_test_rounds=0):
    y_target_list = np.zeros((batch_size, 8))
    state_list = make_states(batch_size)
    prob_list = model(np.array(state_list)).numpy()
    actions = np.array([np.random.choice(8, p=prob) for prob in prob_list])
    values = np.array([run(state, action) for state, action in zip(state_list, actions)])
    _, optimal_values = get_optimal_actions_and_values(state_list)
    values = values - optimal_values
    values = (values - np.mean(values)) / (np.std(values) + 1e-8)
    for i in np.arange(batch_size):
        y_target_list[i, actions[i]] = values[i]
    model.fit(state_list, y_target_list, verbose = 0)
    if n_test_rounds > 0:
        return test_model(model, n_test_rounds)


In [17]:
model = create_model(n_hidden_layers=1, n_dense_units=512, ratio_dropout=0.5, optimizer='adam')

In [18]:
best_idx, best_score, best_weights = train(one_batch_default_batch_size, model, max_batch=1000, batch_size=128, verbose = 1)

0 0.41015902589028747 0.2178
1 0.4761621882897688 0.2514
2 0.5273920206446062 0.2751
3 0.5633889802282765 0.2934
4 0.5753561807164733 0.2913
5 0.5940722730810407 0.3007
6 0.6263486897271037 0.3357
7 0.6693391252802425 0.3517
8 0.6837345302884851 0.3671
9 0.6829210687093585 0.3692
10 0.6945143745043619 0.3846
11 0.6945884366203394 0.3919
12 0.7097954769088017 0.4085
13 0.7158671455349396 0.4168
14 0.7135069003332616 0.4134
15 0.6975465258123439 0.3982
16 0.6959688167552763 0.4015
17 0.7111424704118101 0.4207
18 0.7388728831665947 0.4449
19 0.7419391430677933 0.4449
20 0.746989399829677 0.4378
21 0.7523931862698653 0.4443
22 0.7329373720861586 0.4291
23 0.7387294928106 0.4365
24 0.748785623197507 0.4454
25 0.7471358845949042 0.453
26 0.7407532891315838 0.4521
27 0.7312832541604887 0.4442
28 0.7425468440922198 0.4585
29 0.7213610752997445 0.4431
30 0.7219695923681334 0.4452
31 0.7378587134210459 0.4677
32 0.7381852823168727 0.491
33 0.7482590059059144 0.4926
34 0.7404268082948654 0.4721
3