In [1]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import Input, Dense, BatchNormalization, ReLU, Dropout
from tensorflow.keras.models import Model, clone_model
from itertools import product

def make_state():
    """Build a random 8-slot state with three sampled rewards.

    Rewards are drawn with ``np.random.uniform`` between the module-level
    globals ``UNIFORM_LOW`` and ``UNIFORM_HIGH``, which must be assigned
    before the first call.

    Returns:
        np.ndarray: float array of shape ``(8,)``; three randomly chosen
        slots hold the sampled rewards and the remaining five are 0.
    """
    board = np.zeros(8)
    # Rewards are drawn before the slot indices. Keep this order so that a
    # seeded run consumes the NumPy RNG stream in the same sequence.
    rewards = np.random.uniform(low=UNIFORM_LOW, high=UNIFORM_HIGH, size=3)
    slots = np.random.choice(8, size=3, replace=False)
    board[slots] = rewards
    return board

def run(state, action):
    """Return the reward for playing ``action`` on ``state``.

    Args:
        state: indexable sequence of 8 slot values; 0 marks an empty slot.
        action: index of the slot to pick.

    Returns:
        The slot's own value when it is non-zero. For an empty slot, the sum
        of its two cyclic neighbours when both are non-zero, otherwise 0.
    """
    picked = state[action]
    if picked != 0:
        return picked

    # Neighbours wrap around the 8-slot ring.
    left = state[(action - 1) % 8]
    right = state[(action + 1) % 8]
    if left == 0 or right == 0:
        return 0
    return left + right

def get_optimal_action_and_value(state):
    """Find the best action for ``state`` by scoring all 8 actions with ``run``.

    Returns:
        tuple: ``(action, value)`` where ``action`` is the index of the
        highest reward (the first such index on ties, as ``argmax`` returns)
        and ``value`` is that reward.
    """
    rewards = np.array([run(state, slot) for slot in range(8)])
    best = np.argmax(rewards)
    return best, rewards[best]

def get_optimal_actions_and_values(state_list):
    """Compute the optimal action and its reward for every state.

    Args:
        state_list: iterable of states accepted by
            ``get_optimal_action_and_value``.

    Returns:
        tuple: ``(actions, values)`` as two NumPy arrays aligned with
        ``state_list``.
    """
    solved = [get_optimal_action_and_value(one_state) for one_state in state_list]
    best_actions = np.array([action for action, _ in solved])
    best_values = np.array([value for _, value in solved])
    return best_actions, best_values
    
def get_optimal_statics(n_rounds):
    """Estimate the mean and standard deviation of the optimal reward.

    Args:
        n_rounds: number of random states to sample with ``make_state``.

    Returns:
        tuple: ``(mean, std)`` of the optimal reward over the sampled states.
    """
    sampled_states = np.array([make_state() for _ in range(n_rounds)])
    best_values = get_optimal_actions_and_values(sampled_states)[1]
    return np.mean(best_values), np.std(best_values)

def get_random_policy_statics(n_rounds):
    """Estimate the mean and standard deviation of a uniform-random policy.

    Args:
        n_rounds: number of (random state, random action) pairs to score.

    Returns:
        tuple: ``(mean, std)`` of the rewards returned by ``run``.
    """
    rewards = []
    for _ in range(n_rounds):
        # The state is drawn before the action so the RNG is consumed in a
        # fixed order within each round.
        state = make_state()
        action = np.random.choice(8)
        rewards.append(run(state, action))
    return np.mean(rewards), np.std(rewards)

def get_model_actions(model, state_list):
    """Return the highest-scoring action index for each state.

    Args:
        model: callable taking a NumPy batch and returning an object with a
            ``.numpy()`` method that yields a 2-D score array.
        state_list: sequence of states; converted to one NumPy batch.

    Returns:
        np.ndarray: per-row ``argmax`` of the model output.
    """
    batch = np.array(state_list)
    scores = model(batch).numpy()
    return scores.argmax(axis=1)

def test_model(model, n_test_rounds):
    """Evaluate the argmax policy of ``model`` on freshly sampled states.

    Args:
        model: model accepted by ``get_model_actions``.
        n_test_rounds: number of random states to evaluate on.

    Returns:
        tuple: ``(mean_reward, std_reward, accuracy)``. Accuracy is the
        fraction of states where the model's action equals the action
        returned by ``get_optimal_actions_and_values``.
    """
    eval_states = np.array([make_state() for _ in range(n_test_rounds)])
    chosen = get_model_actions(model, eval_states)
    rewards = [run(state, action) for state, action in zip(eval_states, chosen)]
    best_actions = get_optimal_actions_and_values(eval_states)[0]
    hit_rate = np.mean(chosen == best_actions)
    return np.mean(rewards), np.std(rewards), hit_rate

def train(model, max_batch=200, batch_size=128, n_test_rounds=10000, verbose=0):
    """Run ``one_batch`` repeatedly and remember the best-scoring weights.

    Args:
        model: model whose weights are snapshotted via ``get_weights``; it is
            otherwise only passed through to ``one_batch``.
        max_batch: number of ``one_batch`` iterations to run.
        batch_size: number of states sampled per iteration.
        n_test_rounds: number of evaluation states per iteration. Must be > 0,
            because the ``(score, std, accuracy)`` tuple returned by
            ``one_batch`` is unpacked here.
        verbose: when 1, print ``iteration score`` after every iteration.

    Returns:
        tuple: ``(best_idx, best_score, best_weights)`` for the iteration with
        the highest evaluation score.
    """
    # Seed the snapshot with the current weights and the score with -inf so
    # the returned weights can always be loaded with ``set_weights``, even
    # when no iteration scores above 0 or ``max_batch`` is 0.
    best_weights = model.get_weights()
    best_idx = 0
    best_score = -np.inf
    for i in np.arange(max_batch):
        # ``n_test_rounds`` is passed by keyword so it cannot bind to a
        # different positional parameter if ``one_batch`` is redefined with
        # extra arguments.
        score, _std, _accuracy = one_batch(model, batch_size, n_test_rounds=n_test_rounds)
        if score > best_score:
            best_score = score
            best_idx = i
            best_weights = model.get_weights()
        if verbose == 1:
            print(i, score)
    return best_idx, best_score, best_weights

def create_model(n_hidden_layers, n_dense_units, ratio_dropout, optimizer):
    """Build and compile an MLP with 8 inputs and an 8-way softmax output.

    Each hidden block is Dense -> BatchNormalization -> ReLU -> Dropout.

    Args:
        n_hidden_layers: number of hidden blocks.
        n_dense_units: width of every hidden Dense layer.
        ratio_dropout: rate passed to each Dropout layer.
        optimizer: optimizer passed to ``Model.compile``.

    Returns:
        The compiled Keras ``Model`` using ``categorical_crossentropy`` loss.
    """
    inputs = Input(shape=(8,))

    hidden = inputs
    for _ in range(n_hidden_layers):
        hidden = Dense(n_dense_units)(hidden)
        hidden = BatchNormalization()(hidden)
        hidden = ReLU()(hidden)
        hidden = Dropout(ratio_dropout)(hidden)

    action_probs = Dense(8, activation='softmax')(hidden)
    policy = Model(inputs, action_probs)
    policy.compile(optimizer=optimizer, loss='categorical_crossentropy')
    return policy


## 不同奖励随机数区间

In [2]:
def one_batch(model, batch_size, n_test_rounds=0):
    """Sample one batch of episodes and fit ``model`` on raw-reward targets.

    For each of ``batch_size`` random states an action is sampled from the
    model's output distribution and scored with ``run``. The target row for a
    state is all zeros except at the sampled action, where it holds the
    unnormalised reward.

    Args:
        model: policy network; called for action probabilities and trained
            with ``fit``.
        batch_size: number of random states to sample.
        n_test_rounds: when > 0, evaluate afterwards with ``test_model``.

    Returns:
        The ``test_model`` tuple ``(mean, std, accuracy)`` when
        ``n_test_rounds > 0``; otherwise ``None``.
    """
    state_list = np.array([make_state() for _ in range(batch_size)])
    prob_list = model(state_list).numpy()
    actions = np.array([np.random.choice(8, p=prob) for prob in prob_list])
    values = np.array([run(state, action) for state, action in zip(state_list, actions)])

    # One (row, sampled action) cell per state receives that state's reward.
    y_target_list = np.zeros((batch_size, 8))
    y_target_list[np.arange(batch_size), actions] = values

    # NOTE(review): fit is called without batch_size, so Keras' default
    # mini-batch size applies — confirm this is the intended update granularity.
    model.fit(state_list, y_target_list, verbose=0)

    if n_test_rounds > 0:
        return test_model(model, n_test_rounds)
    return None


In [3]:
# NOTE(review): REWARD_OFFSET / REWARD_SCALE are not referenced by any other
# cell visible in this notebook — confirm whether they are still needed.
REWARD_OFFSET, REWARD_SCALE = 0, 1
# Fresh policy network: one hidden block of 512 units, dropout 0.5, Adam.
model = create_model(n_hidden_layers=1, n_dense_units=512, ratio_dropout=0.5, optimizer='adam')

In [4]:
# Experiment: rewards drawn with np.random.uniform(low=0.0, high=1.0).
# make_state reads these two globals at call time.
UNIFORM_LOW, UNIFORM_HIGH = 0.0, 1.0
best_idx, best_score, best_weights = train(model, 200, verbose = 0)
# Load the best snapshot into a clone so `model` keeps its final weights.
best_model = clone_model(model)
best_model.set_weights(best_weights)
# Displayed tuple: (best batch index, mean reward, reward std, accuracy),
# where the last three come from a fresh test_model evaluation.
best_idx, *test_model(best_model, 10000)

(np.int64(123),
 np.float64(0.8365815629062882),
 np.float64(0.3977246518784473),
 np.float64(0.6199))

In [5]:
# Experiment: same network configuration, rewards drawn with
# np.random.uniform(low=1.0, high=2.0).
model = create_model(n_hidden_layers=1, n_dense_units=512, ratio_dropout=0.5, optimizer='adam')
UNIFORM_LOW, UNIFORM_HIGH = 1.0, 2.0
best_idx, best_score, best_weights = train(model, 200, verbose = 0)
# Load the best snapshot into a clone so `model` keeps its final weights.
best_model = clone_model(model)
best_model.set_weights(best_weights)
# Displayed tuple: (best batch index, mean reward, reward std, accuracy).
best_idx, *test_model(best_model, 10000)

(np.int64(50),
 np.float64(2.2389738138722524),
 np.float64(0.8090951607156687),
 np.float64(0.5838))

In [6]:
# Experiment: same network configuration, rewards drawn with
# np.random.uniform(low=5.0, high=10.0).
model = create_model(n_hidden_layers=1, n_dense_units=512, ratio_dropout=0.5, optimizer='adam')
UNIFORM_LOW, UNIFORM_HIGH = 5.0, 10.0
best_idx, best_score, best_weights = train(model, 200, verbose = 0)
# Load the best snapshot into a clone so `model` keeps its final weights.
best_model = clone_model(model)
best_model.set_weights(best_weights)
# Displayed tuple: (best batch index, mean reward, reward std, accuracy).
best_idx, *test_model(best_model, 10000)

(np.int64(16),
 np.float64(7.350747637099523),
 np.float64(4.218747072648944),
 np.float64(0.2109))

## 奖励归一化

In [7]:
def one_batch(model, batch_size, n_test_rounds=0):
    """Sample one batch of episodes and fit ``model`` on standardised rewards.

    For each of ``batch_size`` random states an action is sampled from the
    model's output distribution and scored with ``run``. Rewards are
    standardised with the batch mean and standard deviation, then written
    into the target row at the sampled action (all other entries are 0).

    Args:
        model: policy network; called for action probabilities and trained
            with ``fit``.
        batch_size: number of random states to sample.
        n_test_rounds: when > 0, evaluate afterwards with ``test_model``.

    Returns:
        The ``test_model`` tuple ``(mean, std, accuracy)`` when
        ``n_test_rounds > 0``; otherwise ``None``.
    """
    state_list = np.array([make_state() for _ in range(batch_size)])
    prob_list = model(state_list).numpy()
    actions = np.array([np.random.choice(8, p=prob) for prob in prob_list])
    values = np.array([run(state, action) for state, action in zip(state_list, actions)])

    mean, std = np.mean(values), np.std(values)
    centered = values - mean
    # A batch whose rewards are all equal has std == 0; dividing would fill
    # the targets with NaN. In that case the targets are only centred.
    values = centered / std if std > 0 else centered

    # One (row, sampled action) cell per state receives that state's reward.
    y_target_list = np.zeros((batch_size, 8))
    y_target_list[np.arange(batch_size), actions] = values
    model.fit(state_list, y_target_list, verbose=0)

    if n_test_rounds > 0:
        return test_model(model, n_test_rounds)
    return None


In [8]:
# Reward range for the next run; make_state reads these globals at call time.
UNIFORM_LOW, UNIFORM_HIGH = 0.0, 1.0

In [9]:
# Fresh policy network: one hidden block of 512 units, dropout 0.5, Adam.
model = create_model(n_hidden_layers=1, n_dense_units=512, ratio_dropout=0.5, optimizer='adam')

In [10]:
# verbose = 1 prints the evaluation score after every batch.
best_idx, best_score, best_weights = train(model, 200, verbose = 1)
# Load the best snapshot into a clone so `model` keeps its final weights.
best_model = clone_model(model)
best_model.set_weights(best_weights)
# Displayed tuple: (best batch index, mean reward, reward std, accuracy).
best_idx, *test_model(best_model, 10000)

0 0.324025908559573
1 0.4009826038292922
2 0.4725800851877768
3 0.4682001248583665
4 0.48242460928692527
5 0.526010639594943
6 0.5995684726922754
7 0.6610100941869914
8 0.694042042888597
9 0.7132010591748202
10 0.7206624662568765
11 0.7069426140291699
12 0.6994634719437034
13 0.6893706104115694
14 0.6782858412019099
15 0.678353416610625
16 0.6951821867455328
17 0.7120587897349145
18 0.7136139617660476
19 0.7245669052488646
20 0.7311229041500975
21 0.7383053916842431
22 0.7464501957408393
23 0.7548443546635606
24 0.7603790103438267
25 0.772539416803183
26 0.7629567852560326
27 0.755279451133706
28 0.7676895911268188
29 0.772191245481607
30 0.7744088217068392
31 0.7588307739776661
32 0.7455319997377027
33 0.752548264503542
34 0.7552880817753481
35 0.7528597581473969
36 0.7641584280103145
37 0.7619194169196221
38 0.7448756324930199
39 0.7492611036392145
40 0.7568774633533556
41 0.7997111782752722
42 0.7968888939523878
43 0.7867231779284182
44 0.7826488565782526
45 0.7877733243007257
46 0.

(np.int64(67),
 np.float64(0.8563241051847839),
 np.float64(0.38296496005009173),
 np.float64(0.6396))

In [11]:
# Experiment: rewards drawn with np.random.uniform(low=5.0, high=10.0),
# trained with whichever one_batch definition is currently in the kernel.
UNIFORM_LOW, UNIFORM_HIGH = 5.0, 10.0

# Fresh policy network: one hidden block of 512 units, dropout 0.5, Adam.
model = create_model(n_hidden_layers=1, n_dense_units=512, ratio_dropout=0.5, optimizer='adam')

best_idx, best_score, best_weights = train(model, 200, verbose = 0)
# Load the best snapshot into a clone so `model` keeps its final weights.
best_model = clone_model(model)
best_model.set_weights(best_weights)
# Displayed tuple: (best batch index, mean reward, reward std, accuracy).
best_idx, *test_model(best_model, 10000)

(np.int64(36),
 np.float64(11.31440407582002),
 np.float64(5.000478602271562),
 np.float64(0.6284))

In [29]:
# NOTE(review): this cell's execution count is 29, i.e. it was last run after
# cells 21-28. one_batch and train are redefined in cell 21, so the recorded
# output below was presumably produced with those later definitions rather
# than the ones defined earlier in this section — re-run the notebook in
# order to confirm.
UNIFORM_LOW, UNIFORM_HIGH = 5.0, 10.0

# Fresh policy network: one hidden block of 512 units, dropout 0.5, Adam.
model = create_model(n_hidden_layers=1, n_dense_units=512, ratio_dropout=0.5, optimizer='adam')

best_idx, best_score, best_weights = train(model, 200, verbose = 0)
# Load the best snapshot into a clone so `model` keeps its final weights.
best_model = clone_model(model)
best_model.set_weights(best_weights)
# Displayed tuple: (best batch index, mean reward, reward std, accuracy).
best_idx, *test_model(best_model, 10000)

(np.int64(44),
 np.float64(10.605411811422607),
 np.float64(5.355002410544318),
 np.float64(0.5563))

In [12]:
# Baseline: (mean, std) of the optimal reward over 10000 random states, using
# the UNIFORM_LOW / UNIFORM_HIGH values currently set in the kernel.
get_optimal_statics(10000)

(np.float64(12.384626933669336), np.float64(3.5770176421557487))

## 奖励归一化-不减去平均值

In [13]:
def one_batch(model, batch_size, n_test_rounds=0):
    """Sample one batch of episodes and fit ``model`` on std-scaled rewards.

    For each of ``batch_size`` random states an action is sampled from the
    model's output distribution and scored with ``run``. Rewards are divided
    by the batch standard deviation (the mean is NOT subtracted), then
    written into the target row at the sampled action (other entries are 0).

    Args:
        model: policy network; called for action probabilities and trained
            with ``fit``.
        batch_size: number of random states to sample.
        n_test_rounds: when > 0, evaluate afterwards with ``test_model``.

    Returns:
        The ``test_model`` tuple ``(mean, std, accuracy)`` when
        ``n_test_rounds > 0``; otherwise ``None``.
    """
    state_list = np.array([make_state() for _ in range(batch_size)])
    prob_list = model(state_list).numpy()
    actions = np.array([np.random.choice(8, p=prob) for prob in prob_list])
    values = np.array([run(state, action) for state, action in zip(state_list, actions)])

    std = np.std(values)
    # A batch whose rewards are all equal has std == 0; dividing would fill
    # the targets with NaN/inf, so the rewards are left unscaled in that case.
    if std > 0:
        values = values / std

    # One (row, sampled action) cell per state receives that state's reward.
    y_target_list = np.zeros((batch_size, 8))
    y_target_list[np.arange(batch_size), actions] = values
    model.fit(state_list, y_target_list, verbose=0)

    if n_test_rounds > 0:
        return test_model(model, n_test_rounds)
    return None


In [14]:
# Reward range for the next run; make_state reads these globals at call time.
UNIFORM_LOW, UNIFORM_HIGH = 5.0, 10.0

In [15]:
# Fresh policy network: one hidden block of 512 units, dropout 0.5, Adam.
model = create_model(n_hidden_layers=1, n_dense_units=512, ratio_dropout=0.5, optimizer='adam')

In [16]:
# Train for 200 batches and keep the best-scoring weights.
best_idx, best_score, best_weights = train(model, 200, verbose = 0)
# Load the best snapshot into a clone so `model` keeps its final weights.
best_model = clone_model(model)
best_model.set_weights(best_weights)
# Displayed tuple: (best batch index, mean reward, reward std, accuracy).
best_idx, *test_model(best_model, 10000)

(np.int64(12),
 np.float64(9.037710135837656),
 np.float64(4.121299193723644),
 np.float64(0.3399))

## 奖励归一化-不除以标准差

In [17]:
def one_batch(model, batch_size, n_test_rounds=0):
    """Sample one batch of episodes and fit ``model`` on mean-centred rewards.

    For each of ``batch_size`` random states an action is sampled from the
    model's output distribution and scored with ``run``. The batch mean is
    subtracted from the rewards (no division by the standard deviation), and
    the result is written into the target row at the sampled action (other
    entries are 0).

    Args:
        model: policy network; called for action probabilities and trained
            with ``fit``.
        batch_size: number of random states to sample.
        n_test_rounds: when > 0, evaluate afterwards with ``test_model``.

    Returns:
        The ``test_model`` tuple ``(mean, std, accuracy)`` when
        ``n_test_rounds > 0``; otherwise ``None``.
    """
    state_list = np.array([make_state() for _ in range(batch_size)])
    prob_list = model(state_list).numpy()
    actions = np.array([np.random.choice(8, p=prob) for prob in prob_list])
    values = np.array([run(state, action) for state, action in zip(state_list, actions)])

    # Only the mean is needed in this variant.
    values = values - np.mean(values)

    # One (row, sampled action) cell per state receives that state's reward.
    y_target_list = np.zeros((batch_size, 8))
    y_target_list[np.arange(batch_size), actions] = values
    model.fit(state_list, y_target_list, verbose=0)

    if n_test_rounds > 0:
        return test_model(model, n_test_rounds)
    return None


In [18]:
# Reward range for the next run; make_state reads these globals at call time.
UNIFORM_LOW, UNIFORM_HIGH = 5.0, 10.0

In [19]:
# Fresh policy network: one hidden block of 512 units, dropout 0.5, Adam.
model = create_model(n_hidden_layers=1, n_dense_units=512, ratio_dropout=0.5, optimizer='adam')

In [20]:
# Train for 200 batches and keep the best-scoring weights.
best_idx, best_score, best_weights = train(model, 200, verbose = 0)
# Load the best snapshot into a clone so `model` keeps its final weights.
best_model = clone_model(model)
best_model.set_weights(best_weights)
# Displayed tuple: (best batch index, mean reward, reward std, accuracy).
best_idx, *test_model(best_model, 10000)

(np.int64(146),
 np.float64(10.49169860883918),
 np.float64(5.982210771388845),
 np.float64(0.575))

## 用 test_model 返回值做归一化

In [21]:
def one_batch(model, batch_size, mean=0, std=1, n_test_rounds=0):
    """Sample one batch of episodes and fit ``model`` on externally normalised rewards.

    For each of ``batch_size`` random states an action is sampled from the
    model's output distribution and scored with ``run``. Rewards are
    normalised with the caller-supplied ``mean`` and ``std`` (not with batch
    statistics), then written into the target row at the sampled action
    (other entries are 0).

    Args:
        model: policy network; called for action probabilities and trained
            with ``fit``.
        batch_size: number of random states to sample.
        mean: value subtracted from every reward.
        std: value the centred rewards are divided by; a value of 0 skips the
            division.
        n_test_rounds: when > 0, evaluate afterwards with ``test_model``.

    Returns:
        The ``test_model`` tuple ``(mean, std, accuracy)`` when
        ``n_test_rounds > 0``; otherwise ``None``.
    """
    state_list = np.array([make_state() for _ in range(batch_size)])
    prob_list = model(state_list).numpy()
    actions = np.array([np.random.choice(8, p=prob) for prob in prob_list])
    values = np.array([run(state, action) for state, action in zip(state_list, actions)])

    centered = values - mean
    # An evaluation where every reward is equal reports std == 0; dividing
    # would fill the targets with NaN/inf, so only centring is applied then.
    values = centered / std if std != 0 else centered

    # One (row, sampled action) cell per state receives that state's reward.
    y_target_list = np.zeros((batch_size, 8))
    y_target_list[np.arange(batch_size), actions] = values
    model.fit(state_list, y_target_list, verbose=0)

    if n_test_rounds > 0:
        return test_model(model, n_test_rounds)
    return None


def train(model, max_batch=200, batch_size=128, n_test_rounds=10000, verbose=0):
    """Train with rewards normalised by the previous evaluation's statistics.

    Before the loop the model is evaluated once with ``test_model``. Each
    iteration then passes the most recent evaluation ``(mean, std)`` to
    ``one_batch`` and replaces them with the statistics it returns.

    Args:
        model: model whose weights are snapshotted via ``get_weights``; it is
            otherwise passed through to ``test_model`` and ``one_batch``.
        max_batch: number of ``one_batch`` iterations to run.
        batch_size: number of states sampled per iteration.
        n_test_rounds: number of evaluation states per evaluation. Must be
            > 0, because the tuple returned by ``one_batch`` is unpacked here.
        verbose: when 1, print ``iteration mean`` after every iteration.

    Returns:
        tuple: ``(best_idx, best_score, best_weights)`` for the iteration with
        the highest evaluation mean.
    """
    # Seed the snapshot with the current weights and the score with -inf so
    # the returned weights can always be loaded with ``set_weights``, even
    # when no iteration scores above 0 or ``max_batch`` is 0.
    best_weights = model.get_weights()
    best_idx = 0
    best_score = -np.inf
    mean, std, _accuracy = test_model(model, n_test_rounds)
    for i in np.arange(max_batch):
        mean, std, _accuracy = one_batch(
            model, batch_size, mean=mean, std=std, n_test_rounds=n_test_rounds
        )
        if mean > best_score:
            best_score = mean
            best_idx = i
            best_weights = model.get_weights()
        if verbose == 1:
            print(i, mean)
    return best_idx, best_score, best_weights

In [22]:
# Reward range for the next run; make_state reads these globals at call time.
UNIFORM_LOW, UNIFORM_HIGH = 5.0, 10.0

In [23]:
# Fresh policy network: one hidden block of 512 units, dropout 0.5, Adam.
model = create_model(n_hidden_layers=1, n_dense_units=512, ratio_dropout=0.5, optimizer='adam')

In [24]:
# verbose = 1 prints the evaluation mean after every batch.
best_idx, best_score, best_weights = train(model, 200, verbose = 1)
# Load the best snapshot into a clone so `model` keeps its final weights.
best_model = clone_model(model)
best_model.set_weights(best_weights)
# Displayed tuple: (best batch index, mean reward, reward std, accuracy).
best_idx, *test_model(best_model, 10000)

0 4.231585917094805
1 5.491051363342523
2 6.213190833028432
3 7.285470936900939
4 7.895660811068464
5 8.30164959576419
6 8.730141687128494
7 8.98236819697338
8 9.266040121077285
9 9.357667937821564
10 9.540048164160147
11 9.603518234857834
12 9.782618769252414
13 9.810654878078562
14 9.81533259111374
15 9.943870575544574
16 10.061304793302345
17 10.15472793370369
18 10.310217960549917
19 10.380358647505517
20 10.353950152873997
21 10.487205065525796
22 10.377052851272378
23 10.51920339244479
24 10.417675211953576
25 10.331172750083851
26 10.297185246169994
27 10.182760981820978
28 10.030562135746095
29 10.219084992551068
30 10.359164977418049
31 10.59472277562399
32 10.460929440363772
33 10.502152001451952
34 10.628316176853692
35 10.669250687519595
36 10.85759815206089
37 11.06359115261493
38 10.893100927141344
39 10.869050441102107
40 10.813160526723369
41 10.434569782711364
42 10.305317898053175
43 10.020582350482739
44 10.056262676741538
45 10.408118520469081
46 10.560562366590114


(np.int64(56),
 np.float64(11.099993341046853),
 np.float64(5.153782088059489),
 np.float64(0.6062))

In [25]:
# Scratch: small 2x3 array used to try out paired integer-array indexing.
tmp = np.array([[1,2,3],[4,5,6]])

In [26]:
# Scratch: row and column index lists are paired element-wise, so this writes
# tmp[0, 1] and tmp[1, 2].
tmp[[0,1], [1,2]] = [9,9]

In [27]:
# Scratch: display the array after the paired-index assignment.
tmp

array([[1, 9, 3],
       [4, 5, 9]])

In [28]:
# Scratch: star-unpacking inside a tuple expression, the same form used by the
# `best_idx, *test_model(...)` result cells.
1, *(2,3,4)

(1, 2, 3, 4)