In [2]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import Input, Dense, BatchNormalization, ReLU, Dropout
from tensorflow.keras.models import Model, clone_model
from itertools import product

def make_state():
    """Build a random length-8 state vector with three populated slots.

    Three values are drawn from a uniform distribution bounded by the
    module-level globals UNIFORM_LOW and UNIFORM_HIGH (read at call time)
    and written to three distinct, randomly chosen positions. The other
    five positions stay at 0.

    Returns:
        np.ndarray of shape (8,).
    """
    # Values are drawn before positions; this is the order in which a
    # one-line subscript assignment consumes the global NumPy RNG.
    drawn_values = np.random.uniform(UNIFORM_LOW, UNIFORM_HIGH, size=3)
    slots = np.random.choice(8, size=3, replace=False)
    board = np.zeros(8)
    board[slots] = drawn_values
    return board

def run(state, action):
    """Return the reward for picking slot `action` in `state`.

    * If the chosen slot is non-zero, its value is the reward.
    * If the chosen slot is zero but both circular neighbours (indices
      taken modulo 8) are non-zero, the reward is the sum of the neighbours.
    * Otherwise the reward is 0.
    """
    picked = state[action]
    if picked != 0:
        return picked

    # Neighbours wrap around the ends of the 8-slot ring.
    left = state[(action - 1) % 8]
    right = state[(action + 1) % 8]
    if left == 0 or right == 0:
        return 0
    return left + right

def get_optimal_action_and_value(state):
    """Evaluate all eight actions on `state` and return the best one.

    Returns:
        (action, value): index of the highest reward (first one on ties,
        as given by argmax) and the reward obtained by that action.
    """
    payoffs = np.array([run(state, candidate) for candidate in range(8)])
    best = np.argmax(payoffs)
    return best, payoffs[best]

def get_optimal_actions_and_values(state_list):
    """Compute the optimal action and its reward for each state.

    Args:
        state_list: iterable of states accepted by
            get_optimal_action_and_value.

    Returns:
        (actions, values): two NumPy arrays aligned with `state_list`.
    """
    best_actions, best_values = [], []
    for state in state_list:
        action, value = get_optimal_action_and_value(state)
        best_actions.append(action)
        best_values.append(value)
    return np.array(best_actions), np.array(best_values)
    
def get_optimal_statics(n_rounds):
    """Estimate mean and standard deviation of the optimal reward.

    Generates `n_rounds` random states and, for each, takes the reward of
    the best available action.

    Returns:
        (mean, std) of those optimal rewards.
    """
    sampled_states = np.array([make_state() for _ in np.arange(n_rounds)])
    optimal_rewards = get_optimal_actions_and_values(sampled_states)[1]
    return np.mean(optimal_rewards), np.std(optimal_rewards)

def get_random_policy_statics(n_rounds):
    """Estimate mean and standard deviation of the reward of a uniform random policy.

    For each of `n_rounds` rounds a fresh state is generated and one of the
    eight actions is picked uniformly at random.

    Returns:
        (mean, std) of the collected rewards.
    """
    rewards = []
    for _ in np.arange(n_rounds):
        # The state is generated before the action is sampled.
        state = make_state()
        action = np.random.choice(8)
        rewards.append(run(state, action))
    return np.mean(rewards), np.std(rewards)

def get_model_actions(model, state_list):
    """Return the highest-scoring action for every state.

    Args:
        model: callable that takes a batch array and returns an object with
            a `.numpy()` method yielding a 2-D score array (one row per state).
        state_list: sequence of states; converted to a NumPy array first.

    Returns:
        1-D array with the argmax column index of each row.
    """
    batch = np.array(state_list)
    scores = model(batch).numpy()
    return np.argmax(scores, axis=1)

def test_model(model, n_test_rounds):
    """Evaluate the greedy (argmax) policy of `model` on fresh random states.

    Args:
        model: model accepted by get_model_actions.
        n_test_rounds: number of random states to evaluate on.

    Returns:
        (mean reward, reward std, accuracy), where accuracy is the fraction
        of states on which the model's action equals the optimal action.
    """
    states = np.array([make_state() for _ in np.arange(n_test_rounds)])
    chosen = get_model_actions(model, states)
    rewards = [run(state, action) for state, action in zip(states, chosen)]
    best_actions = get_optimal_actions_and_values(states)[0]
    hit_rate = np.mean(chosen == best_actions)
    return np.mean(rewards), np.std(rewards), hit_rate

def one_batch(model, batch_size, n_test_rounds=0):
    """Run one policy-sampling training step and optionally evaluate.

    Steps:
      1. Generate `batch_size` random states.
      2. Sample one action per state from the model's output distribution.
      3. Compute each sampled action's reward minus the optimal reward for
         that state (so the raw signal is never positive).
      4. Standardise that signal over the batch; if every sampled action was
         optimal (all zeros), use a weight of 1 for every sample instead.
      5. Fit the model on targets that are zero everywhere except at the
         sampled action, which carries the weight from step 4.

    Args:
        model: compiled Keras model with an 8-way probability output.
        batch_size: number of states to generate.
        n_test_rounds: if > 0, evaluate with test_model afterwards.

    Returns:
        The result of test_model(model, n_test_rounds) when
        n_test_rounds > 0, otherwise None.
    """
    states = np.array([make_state() for _ in np.arange(batch_size)])
    policy = model(states).numpy()
    sampled = np.array([np.random.choice(8, p=row) for row in policy])
    rewards = np.array([run(state, action) for state, action in zip(states, sampled)])

    # Gap to the best achievable reward for each state.
    regret = rewards - get_optimal_actions_and_values(states)[1]
    if np.all(regret == 0):
        weights = np.ones(batch_size)
    else:
        # The small epsilon guards against a zero standard deviation.
        weights = (regret - np.mean(regret)) / (np.std(regret) + 1e-6)

    targets = np.zeros((batch_size, 8))
    for row, (action, weight) in enumerate(zip(sampled, weights)):
        targets[row, action] = weight

    model.fit(states, targets, verbose=0)
    if n_test_rounds > 0:
        return test_model(model, n_test_rounds)
    return None

def one_batch_supervised(model, batch_size, n_test_rounds=0):
    """Run one supervised training step on optimal-action labels.

    Generates `batch_size` random states, one-hot encodes the optimal action
    of each state as the target, and fits the model on that batch.

    Args:
        model: compiled Keras model with an 8-way output.
        batch_size: number of states to generate.
        n_test_rounds: if > 0, evaluate with test_model afterwards.

    Returns:
        The result of test_model(model, n_test_rounds) when
        n_test_rounds > 0, otherwise None.
    """
    states = np.array([make_state() for _ in np.arange(batch_size)])
    best_actions = get_optimal_actions_and_values(states)[0]

    one_hot = np.zeros((batch_size, 8))
    for row, action in enumerate(best_actions):
        one_hot[row, action] = 1

    model.fit(states, one_hot, verbose=0)
    if n_test_rounds > 0:
        return test_model(model, n_test_rounds)
    return None

def _train_and_track_best(batch_fn, model, max_batch, batch_size, n_test_rounds, verbose):
    """Shared training loop: run `batch_fn` repeatedly and remember the best weights.

    Args:
        batch_fn: callable `(model, batch_size, n_test_rounds)` that trains
            the model for one batch and returns `(score, std, accuracy)`.
        model: Keras model, trained in place.
        max_batch: number of batches to run.
        batch_size: forwarded to `batch_fn`.
        n_test_rounds: forwarded to `batch_fn`; must be > 0, because the
            per-batch functions return None otherwise and the result could
            not be unpacked.
        verbose: when 1, print the batch index and its test score.

    Returns:
        (best_idx, best_score, best_weights) for the batch with the highest
        test score (the earliest such batch on ties).
    """
    best_idx = 0
    best_score = 0
    # Start from a snapshot of the current weights rather than an empty list,
    # so the returned value can always be handed to `set_weights`.
    best_weights = model.get_weights()
    for i in np.arange(max_batch):
        score, _std, _accuracy = batch_fn(model, batch_size, n_test_rounds)
        # The first batch is always recorded; comparing against an arbitrary
        # baseline of 0 would ignore runs whose scores never exceed 0.
        if i == 0 or best_score < score:
            best_score = score
            best_idx = i
            best_weights = model.get_weights()
        if verbose == 1:
            print(i, score)
    return best_idx, best_score, best_weights


def train(model, max_batch=200, batch_size=128, n_test_rounds=10000, verbose = 0):
    """Train `model` with the sampled-action update of `one_batch`.

    Runs `max_batch` batches, evaluating after each one on `n_test_rounds`
    fresh states, and keeps the weights of the best-scoring batch.

    Returns:
        (best_idx, best_score, best_weights).
    """
    return _train_and_track_best(one_batch, model, max_batch, batch_size,
                                 n_test_rounds, verbose)


def train_supervised(model, max_batch=200, batch_size=128, n_test_rounds=10000, verbose = 0):
    """Train `model` with the supervised update of `one_batch_supervised`.

    Runs `max_batch` batches, evaluating after each one on `n_test_rounds`
    fresh states, and keeps the weights of the best-scoring batch.

    Returns:
        (best_idx, best_score, best_weights).
    """
    return _train_and_track_best(one_batch_supervised, model, max_batch, batch_size,
                                 n_test_rounds, verbose)


def create_model(n_hidden_layers, n_dense_units, ratio_dropout, optimizer):
    """Build and compile a feed-forward classifier over 8-element states.

    Architecture: an 8-feature input, followed by `n_hidden_layers` repeats
    of Dense(n_dense_units) -> BatchNormalization -> ReLU ->
    Dropout(ratio_dropout), followed by an 8-way softmax output.

    Args:
        n_hidden_layers: number of hidden blocks.
        n_dense_units: width of each hidden Dense layer.
        ratio_dropout: dropout rate applied at the end of each hidden block.
        optimizer: optimizer passed to `Model.compile`.

    Returns:
        The model, compiled with categorical cross-entropy loss.
    """
    inputs = Input(shape=(8,))

    # Each factory is invoked immediately before its layer is applied, so
    # layers are still constructed and called one at a time, in order.
    hidden_block = (
        lambda: Dense(n_dense_units),
        BatchNormalization,
        ReLU,
        lambda: Dropout(ratio_dropout),
    )
    tensor = inputs
    for _ in np.arange(n_hidden_layers):
        for build_layer in hidden_block:
            tensor = build_layer()(tensor)

    outputs = Dense(8, activation='softmax')(tensor)
    network = Model(inputs, outputs)
    network.compile(optimizer=optimizer, loss='categorical_crossentropy')
    return network


In [3]:
# Bounds of the uniform distribution make_state() draws its three slot values from.
UNIFORM_LOW, UNIFORM_HIGH = 5.0, 10.0

In [4]:
# One hidden block (Dense(512) -> BatchNormalization -> ReLU -> Dropout(0.5)), compiled with Adam.
model = create_model(n_hidden_layers=1, n_dense_units=512, ratio_dropout=0.5, optimizer='adam')

In [5]:
# Train for 200 batches without per-batch logging; train() returns the best-scoring batch and its weights.
best_idx, best_score, best_weights = train(model, 200, verbose = 0)
# Rebuild the same architecture and load the tracked best weights into it.
best_model = clone_model(model)
best_model.set_weights(best_weights)
# Display: best batch index, then (mean reward, reward std, accuracy) on 10000 fresh states.
best_idx, *test_model(best_model, 10000)

(np.int64(121),
 np.float64(11.917078576955625),
 np.float64(4.170351741393003),
 np.float64(0.7005))

In [9]:
# Bounds of the uniform distribution make_state() draws its three slot values from.
UNIFORM_LOW, UNIFORM_HIGH = 5.0, 10.0

In [10]:
# Fresh model: one hidden block (Dense(512) -> BatchNormalization -> ReLU -> Dropout(0.5)), compiled with Adam.
model = create_model(n_hidden_layers=1, n_dense_units=512, ratio_dropout=0.5, optimizer='adam')

In [11]:
# Train for 200 batches; verbose = 1 prints each batch index with its test score.
best_idx, best_score, best_weights = train(model, 200, verbose = 1)
# Rebuild the same architecture and load the tracked best weights into it.
best_model = clone_model(model)
best_model.set_weights(best_weights)
# Display: best batch index, then (mean reward, reward std, accuracy) on 10000 fresh states.
best_idx, *test_model(best_model, 10000)

0 5.859772199429873
1 6.649910802938405
2 7.182770792510998
3 7.444730045543821
4 7.839250948992575
5 8.206684940035936
6 8.583210277715896
7 8.767560992575476
8 9.0467092044756
9 9.24135345995374
10 9.461900042482926
11 9.567358043990676
12 9.625860866248592
13 9.627008385384059
14 9.562824261077074
15 9.610443500808268
16 9.610287852748751
17 9.869147783553792
18 9.883781729936807
19 10.084164297307368
20 10.180141024851604
21 10.298985559328248
22 10.461633849643745
23 10.634580166963433
24 10.60294898230726
25 10.464426983485343
26 10.181268245485011
27 10.292459788368628
28 10.32104142110808
29 10.63340802076631
30 10.481183041019579
31 10.617782970945077
32 10.670291332000081
33 10.694350718600171
34 10.702025010052038
35 10.786547291506333
36 10.68923433791767
37 10.680490561547625
38 10.698183720461213
39 10.788041974810753
40 10.749974765493967
41 10.822473370693512
42 11.020476432531723
43 10.93027328617356
44 10.876230233621664
45 10.952410419634212
46 11.02031041825896
47 1

(np.int64(131),
 np.float64(11.788106353691113),
 np.float64(4.247634284843103),
 np.float64(0.6722))

In [6]:
# Bounds of the uniform distribution make_state() draws its three slot values from.
# NOTE(review): with a lower bound of 0.0 a drawn value can in principle be exactly 0,
# which run() treats as an empty slot.
UNIFORM_LOW, UNIFORM_HIGH = 0.0, 1.0

In [7]:
# Fresh model: one hidden block (Dense(512) -> BatchNormalization -> ReLU -> Dropout(0.5)), compiled with Adam.
model = create_model(n_hidden_layers=1, n_dense_units=512, ratio_dropout=0.5, optimizer='adam')

In [8]:
# Train for 200 batches; verbose = 1 prints each batch index with its test score.
best_idx, best_score, best_weights = train(model, 200, verbose = 1)
# Rebuild the same architecture and load the tracked best weights into it.
best_model = clone_model(model)
best_model.set_weights(best_weights)
# Display: best batch index, then (mean reward, reward std, accuracy) on 10000 fresh states.
best_idx, *test_model(best_model, 10000)

0 0.32138941683386857
1 0.3743567748213963
2 0.4745871913104861
3 0.5501132452071898
4 0.6042927062413624
5 0.6197183771638114
6 0.632424517047074
7 0.6465698171990285
8 0.646570565082957
9 0.6539081508812664
10 0.6646291383190305
11 0.67198066500429
12 0.679575691849443
13 0.6772665683996091
14 0.680298147727281
15 0.6875836240621935
16 0.6822401955709378
17 0.687293220371486
18 0.685973427380719
19 0.6983637551286989
20 0.712125230769575
21 0.7266583097464152
22 0.7257095480026791
23 0.7412735267825085
24 0.7450553323292073
25 0.7470416771004699
26 0.752128857526208
27 0.7398440491354806
28 0.7302399508688788
29 0.7337888802647574
30 0.7442289777349718
31 0.734038314034855
32 0.7377422011950829
33 0.7345938828539992
34 0.7423509924025783
35 0.7386087112550657
36 0.7393769040328196
37 0.7143578780504124
38 0.7096704036234133
39 0.7088954321163654
40 0.7113938877148285
41 0.7018512159141861
42 0.7307929184482704
43 0.7438025127967675
44 0.7468096248199138
45 0.7413340756893387
46 0.729

(np.int64(148),
 np.float64(0.853402228468626),
 np.float64(0.38556618195967474),
 np.float64(0.6588))

In [12]:
# Bounds of the uniform distribution make_state() draws its three slot values from.
# NOTE(review): with a lower bound of 0.0 a drawn value can in principle be exactly 0,
# which run() treats as an empty slot.
UNIFORM_LOW, UNIFORM_HIGH = 0.0, 1.0

In [13]:
# Fresh model: one hidden block (Dense(512) -> BatchNormalization -> ReLU -> Dropout(0.5)), compiled with Adam.
model = create_model(n_hidden_layers=1, n_dense_units=512, ratio_dropout=0.5, optimizer='adam')

In [14]:
# Train for 200 batches; verbose = 1 prints each batch index with its test score.
best_idx, best_score, best_weights = train(model, 200, verbose = 1)
# Rebuild the same architecture and load the tracked best weights into it.
best_model = clone_model(model)
best_model.set_weights(best_weights)
# Display: best batch index, then (mean reward, reward std, accuracy) on 10000 fresh states.
best_idx, *test_model(best_model, 10000)

0 0.4048440571605294
1 0.46907714344871937
2 0.5241689795315955
3 0.5983389932131802
4 0.6571888789869056
5 0.6669920895371766
6 0.6712338906983546
7 0.6685672676420754
8 0.6717508305137275
9 0.67244243854631
10 0.6696816399197021
11 0.6866256888909767
12 0.6886611251163901
13 0.6830409937065969
14 0.6838618393551094
15 0.7048174272860344
16 0.7181770447220849
17 0.7403015266077451
18 0.7202203160378858
19 0.7251449359241826
20 0.7296688863165743
21 0.7190596352279089
22 0.7107079408543651
23 0.7263799364264596
24 0.753383883626021
25 0.7779120167528337
26 0.7917240369875816
27 0.7915928506825103
28 0.7757310749060613
29 0.7431681313283256
30 0.7435742237918854
31 0.7476028336989029
32 0.7665090281763071
33 0.8061362112556949
34 0.8082043020368797
35 0.7748536273153855
36 0.757900395624263
37 0.7500588950918482
38 0.7537629016379496
39 0.7776852497506523
40 0.7592925272470504
41 0.7349566746338103
42 0.7497951115042597
43 0.7696443956054996
44 0.7882808526704966
45 0.7950609742492911
4

(np.int64(182),
 np.float64(0.8490978593935108),
 np.float64(0.4051342976818108),
 np.float64(0.66))