In [1]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import Input, Dense, BatchNormalization, ReLU, Dropout
from tensorflow.keras.models import Model, clone_model
from itertools import product

LOW, HIGH = 0.0, 1.0

# def make_state():
#     state = np.zeros(8)
#     state[np.random.choice(8, size=3, replace=False)] = np.random.uniform(size=3)
#     return state

def make_states(n):
    states = np.zeros((n,8))
    for i in np.arange(n):
        idx = np.random.choice(8, size=3, replace=False)
        states[i, idx] = np.random.uniform(LOW, HIGH, size=3)
    return states

def run(state, action):
    if state[action] != 0:
        return state[action]
    else:
        before, after = state[(action - 1) % 8], state[(action + 1) % 8]
        if before != 0 and after != 0:
            return before + after
        else:
            return 0

def get_optimal_action_and_value(state):
    values = np.array([run(state, action) for action in np.arange(8)])
    optimal_action = values.argmax()
    return optimal_action, values[optimal_action]

def get_optimal_actions_and_values(state_list):
    res = [get_optimal_action_and_value(state) for state in state_list]
    actions = np.array([pairs[0] for pairs in res])
    values = np.array([pairs[1] for pairs in res])
    return actions, values
    
def get_optimal_statics(n_rounds):
    state_list = make_states(n_rounds)
    _, values = get_optimal_actions_and_values(state_list)
    return np.mean(values), np.std(values)

def get_random_policy_statics(n_rounds):
    values = [run(state, np.random.choice(8)) for state in make_states(n_rounds)]
    return np.mean(values), np.std(values)

def get_model_actions(model, state_list):
    return model(np.array(state_list)).numpy().argmax(axis = 1)

def test_model(model, n_test_rounds):
    state_list = make_states(n_test_rounds)
    actions = get_model_actions(model, state_list)
    values = [run(state, action) for state, action in zip(state_list, actions)]
    optimal_actions, _ = get_optimal_actions_and_values(state_list)
    accuracy = np.mean(actions == optimal_actions)
    return np.mean(values), np.std(values), accuracy

def one_batch(model, batch_size, n_test_rounds=0):
    y_target_list = np.zeros((batch_size, 8))
    state_list = make_states(batch_size)
    prob_list = model(np.array(state_list)).numpy()
    actions = np.array([np.random.choice(8, p=prob) for prob in prob_list])
    values = np.array([run(state, action) for state, action in zip(state_list, actions)])
    mean, std = np.mean(values), np.std(values)
    values = (values - mean) / std
    for i in np.arange(batch_size):
        y_target_list[i, actions[i]] = values[i]
    model.fit(state_list, y_target_list, verbose = 0)
    if n_test_rounds > 0:
        return test_model(model, n_test_rounds)

def one_batch_supervised(model, batch_size, n_test_rounds=0):
    state_list = make_states(batch_size)
    y_target_list = np.zeros((batch_size, 8))
    optimal_actions, _ = get_optimal_actions_and_values(state_list)
    for i in np.arange(batch_size):
        y_target_list[i, optimal_actions[i]] = 1
    model.fit(np.array(state_list), np.array(y_target_list), verbose=0)
    if n_test_rounds > 0:
        return test_model(model, n_test_rounds)

def train(one_batch, model, max_batch=200, batch_size=128, n_test_rounds=10000, verbose = 0):
    best_weights = []
    best_idx = 0
    best_score = 0
    for i in np.arange(max_batch):
        score, std, accuracy = one_batch(model, batch_size, n_test_rounds)
        if best_score < score:
            best_score = score
            best_idx = i
            best_weights = model.get_weights()
        if verbose == 1:
            print(i, score, accuracy)
    return best_idx, best_score, best_weights

def train_supervised(model, max_batch=200, batch_size=128, n_test_rounds=10000, verbose = 0):
    best_weights = []
    best_idx = 0
    best_score = 0
    for i in np.arange(max_batch):
        score, std, accuracy = one_batch_supervised(model, batch_size, n_test_rounds)
        if best_score < score:
            best_score = score
            best_idx = i
            best_weights = model.get_weights()
        if verbose == 1:
            print(i, score)
    return best_idx, best_score, best_weights


def create_model(n_hidden_layers, n_dense_units, ratio_dropout):
    input_shape = (8,) 
    inputs = Input(shape=input_shape)

    x = inputs
    for i in np.arange(n_hidden_layers):
        x = Dense(n_dense_units)(x) 
        x = BatchNormalization()(x)
        x = ReLU()(x)
        x = Dropout(ratio_dropout)(x)
    
    outputs = Dense(8, activation='softmax')(x)
    model = Model(inputs, outputs)
    return model


In [2]:
def one_batch_autodiff(model, batch_size=128, n_test_rounds=10000):
    states = make_states(batch_size)
    _, optimal_values = get_optimal_actions_and_values(states)
    with tf.GradientTape() as tape:
        probs = model(states)

        actions = np.array([np.random.choice(8, p=prob) for prob in probs.numpy()])
        rewards = np.array([run(state, action) for state, action in zip(states, actions)])
        rewards = rewards - optimal_values
        rewards = (rewards - np.mean(rewards)) / (np.std(rewards) + 1e-8)

        action_probs = tf.gather_nd(probs, [[i, a] for i, a in enumerate(actions)])
        log_probs = tf.math.log(action_probs)
        loss = -tf.reduce_mean(log_probs * rewards)
    
    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))
    return test_model(model, n_test_rounds)

In [3]:
LOW, HIGH = 0.0, 1.0
model = create_model(1, 512, 0.5)
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)

In [4]:
best_idx, best_score, best_weights = train(one_batch_autodiff, model, max_batch=1000, batch_size=128, n_test_rounds=10000, verbose = 1)

0 0.22901126488729315 0.1134
1 0.2773744592576222 0.1394
2 0.29583670690277686 0.1489
3 0.31689257440517005 0.1496
4 0.3399804890610776 0.1648
5 0.35631664144870406 0.174
6 0.3763855061141363 0.1847
7 0.3931641921313486 0.1919
8 0.418730683636972 0.204
9 0.4340000548708129 0.2191
10 0.4590051532600219 0.2279
11 0.47863681727426854 0.2389
12 0.5058404558434253 0.2552
13 0.5226521038431029 0.2718
14 0.5382989949727579 0.2879
15 0.5530254797025219 0.2887
16 0.5731018966873996 0.3133
17 0.588954355762551 0.3205
18 0.6001878829517391 0.3218
19 0.6113103162862501 0.3231
20 0.6313882197724611 0.3435
21 0.6466089345961267 0.3583
22 0.6577887940697604 0.3564
23 0.6683811228598934 0.3602
24 0.6741402225986883 0.3653
25 0.6831823774937109 0.3759
26 0.6935504960657322 0.3783
27 0.6927931990190378 0.3838
28 0.7018341212014702 0.3871
29 0.7032266121190153 0.3895
30 0.7096159701108222 0.3921
31 0.7111349769625759 0.3964
32 0.7094057420956839 0.3922
33 0.7086301108583656 0.3938
34 0.7146585209592716 0

In [5]:
test_model(model, n_test_rounds=10000)

(np.float64(0.8644715765482164),
 np.float64(0.3728830070820806),
 np.float64(0.6642))

In [6]:
LOW, HIGH = 0.0, 1.0
model = create_model(1, 512, 0.5)
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)

In [7]:
best_idx, best_score, best_weights = train(one_batch_autodiff, model, max_batch=4000, batch_size=32, n_test_rounds=10000, verbose = 1)

0 0.31903624876567327 0.1276
1 0.3567682778132752 0.1504
2 0.3475719956922886 0.1466
3 0.36082579508587287 0.1721
4 0.3817886227147158 0.1942
5 0.4318067163274064 0.2367
6 0.495640428119772 0.2817
7 0.5340055067465058 0.2996
8 0.5535892931067925 0.3232
9 0.5658413498095469 0.3295
10 0.5803882289128595 0.3327
11 0.6001171399795171 0.3513
12 0.5975822356846102 0.3439
13 0.6088756285034111 0.3481
14 0.6118922099465535 0.3495
15 0.6243539888859725 0.3574
16 0.6297093855926867 0.3594
17 0.6370203772174078 0.368
18 0.6330752846087419 0.3614
19 0.6454745242236185 0.3631
20 0.6588223652336339 0.3846
21 0.6705929886607327 0.376
22 0.6627566447431152 0.3743
23 0.6703000516193101 0.3896
24 0.6682202620104735 0.3837
25 0.6638162477614938 0.3752
26 0.6649163212430002 0.3842
27 0.6573195590696328 0.3762
28 0.6551880843791694 0.3718
29 0.661254898080203 0.3748
30 0.6615665657792104 0.3876
31 0.6537517738160981 0.3742
32 0.6521161813079636 0.3744
33 0.6501654274016198 0.3702
34 0.6445077122855126 0.36

In [8]:
test_model(model, n_test_rounds=10000)

(np.float64(0.8898461000484845),
 np.float64(0.3795243443478929),
 np.float64(0.7673))

In [151]:
test_model(model, n_test_rounds=10000)

(np.float64(0.8217497851160271),
 np.float64(0.3467924302423787),
 np.float64(0.5647))

In [137]:
states = make_states(100)
model(states)

<tf.Tensor: shape=(100, 8), dtype=float32, numpy=
array([[9.99992490e-01, 5.28936596e-07, 5.30528434e-11, 2.33879541e-06,
        1.13300724e-09, 2.07271484e-07, 3.47202990e-06, 1.00838429e-06],
       [9.99999881e-01, 8.67419244e-12, 2.57586557e-08, 3.94267836e-08,
        1.78601550e-10, 1.34499416e-07, 4.61172801e-11, 3.30087823e-12],
       [9.99999881e-01, 1.16669369e-13, 9.67042335e-10, 5.82504357e-12,
        6.11961176e-11, 3.78782730e-08, 1.09093001e-07, 1.00275210e-08],
       [6.14557933e-08, 2.30886310e-09, 9.97769237e-01, 2.58338968e-08,
        2.98816616e-08, 2.23038183e-03, 2.04699319e-07, 7.82276396e-11],
       [6.33822992e-07, 5.74676157e-12, 3.33154025e-11, 5.68583722e-08,
        7.30261851e-10, 6.55111144e-05, 9.99933839e-01, 3.51089907e-10],
       [3.77447903e-03, 9.96040225e-01, 3.42114581e-05, 7.77787591e-06,
        1.54262722e-07, 4.66414463e-09, 8.12255385e-05, 6.19077327e-05],
       [6.30433703e-07, 9.99071002e-01, 9.03188542e-04, 5.60272952e-07,
        

In [138]:
states

array([[8.48806808, 0.        , 0.        , 0.        , 6.60376072,
        0.        , 0.        , 9.91509933],
       [0.        , 9.64808339, 0.        , 0.        , 6.59339783,
        0.        , 0.        , 5.10016045],
       [0.        , 6.96152903, 0.        , 0.        , 0.        ,
        0.        , 7.55799497, 8.36448771],
       [0.        , 5.12934277, 0.        , 7.21645069, 0.        ,
        7.93017538, 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 5.95137203,
        6.90278539, 0.        , 7.95314404],
       [5.84958625, 0.        , 5.01194041, 0.        , 0.        ,
        0.        , 0.        , 7.49009551],
       [5.26204332, 0.        , 9.11558584, 0.        , 0.        ,
        8.17603527, 0.        , 0.        ],
       [0.        , 0.        , 6.52148   , 9.51132693, 0.        ,
        0.        , 0.        , 5.96821148],
       [6.05693784, 0.        , 0.        , 9.17645322, 0.        ,
        9.7015964 , 0.      

In [131]:
model(states).numpy().argmax(axis=1)

array([0, 5, 0, 3, 7, 0, 0, 0, 1, 0, 5, 0, 4, 4, 4, 6, 1, 0, 1, 1, 0, 1,
       0, 0, 7, 0, 2, 0, 4, 1, 7, 4, 0, 2, 3, 3, 5, 2, 3, 7, 7, 7, 6, 1,
       3, 7, 4, 2, 6, 6, 6, 6, 2, 7, 4, 7, 3, 4, 1, 7, 4, 1, 6, 5, 0, 7,
       6, 6, 0, 5, 7, 2, 3, 2, 2, 6, 2, 6, 6, 3, 1, 4, 2, 6, 1, 3, 1, 0,
       7, 0, 6, 4, 6, 0, 7, 3, 5, 3, 2, 7])