In [1]:
import tensorflow as tf

# length of one episode, in time steps (fixed)
trading_period = 60

# state vector layout: 1 z-score value, followed by a 3-way one-hot encoding
# of the trading algorithm's current position
state_dim = 1 + 3

# size of the RNN hidden state
h_dim = 20

# size of the action space
a_num = 4

# number of outputs of the first dense layer
layer1_out_num = 100

# optimizer learning rate
lr = 0.001

# batch size used for one parameter update
batch_size = 20

In [2]:
# clear the global default graph before building the network
tf.reset_default_graph()

# ---- policy network ----
# input: a batch of observation vectors, each h_dim wide
o = tf.placeholder(dtype=tf.float32, shape=[None, h_dim], name="observations")

# hidden layer: h_dim -> layer1_out_num, LeakyReLU activation, Xavier-initialised kernel
hidden_dense = tf.layers.Dense(
    units=layer1_out_num,
    activation=tf.keras.layers.LeakyReLU(),
    kernel_initializer=tf.contrib.layers.xavier_initializer(),
)
layer1 = hidden_dense(o)

# output layer: layer1_out_num -> a_num per-action scores, same activation and initialiser
# NOTE(review): these scores are later consumed as softmax logits; passing logits
# through LeakyReLU is unusual -- confirm this is intentional.
score_dense = tf.layers.Dense(
    units=a_num,
    activation=tf.keras.layers.LeakyReLU(),
    kernel_initializer=tf.contrib.layers.xavier_initializer(),
)
scores = score_dense(layer1)

# snapshot of the trainable variables defined so far; only these get trained
t_vars = tf.trainable_variables()

In [3]:
# display the collected trainable variables (kernel and bias of each dense layer)
t_vars

[<tf.Variable 'dense/kernel:0' shape=(20, 100) dtype=float32_ref>,
 <tf.Variable 'dense/bias:0' shape=(100,) dtype=float32_ref>,
 <tf.Variable 'dense_1/kernel:0' shape=(100, 4) dtype=float32_ref>,
 <tf.Variable 'dense_1/bias:0' shape=(4,) dtype=float32_ref>]

In [4]:
# probability of each action, used to sample an action while an episode runs
action_probs = tf.nn.softmax(scores)

# indices of the actions that were actually chosen, one per sample
input_actions = tf.placeholder(dtype=tf.int32, shape=[None], name="action_label")
# per-sample weight applied to the negative log-probability of the chosen action
advantages = tf.placeholder(dtype=tf.float32, shape=[None], name="adjusted_reward_signal")

# cross-entropy between the scores (as logits) and the chosen action index,
# i.e. the negative log-probability of the selected action
neg_log_select_prob = tf.nn.sparse_softmax_cross_entropy_with_logits(
    labels=input_actions,
    logits=scores,
)

# advantage-weighted mean over the batch is the scalar loss
weighted_neg_log_prob = neg_log_select_prob * advantages
loss = tf.reduce_mean(weighted_neg_log_prob)

# gradient of the loss w.r.t. each trainable variable, in t_vars order
grads = tf.gradients(ys=loss, xs=t_vars)

In [5]:
# display the gradient tensors (one per entry of t_vars, with matching shapes)
grads

[<tf.Tensor 'gradients/dense/MatMul_grad/MatMul_1:0' shape=(20, 100) dtype=float32>,
 <tf.Tensor 'gradients/dense/BiasAdd_grad/BiasAddGrad:0' shape=(100,) dtype=float32>,
 <tf.Tensor 'gradients/dense_1/MatMul_grad/MatMul_1:0' shape=(100, 4) dtype=float32>,
 <tf.Tensor 'gradients/dense_1/BiasAdd_grad/BiasAddGrad:0' shape=(4,) dtype=float32>]

In [6]:
def _zeros_slot_for(var):
    """Return a non-trainable variable of zeros shaped like `var`'s initial value."""
    return tf.Variable(tf.zeros_like(var.initialized_value()), trainable=False)


# one gradient accumulator per trainable variable, in t_vars order
accum_grads = [_zeros_slot_for(var) for var in t_vars]

In [7]:
# display the accumulator variables (shapes mirror the trainable variables)
accum_grads

[<tf.Variable 'Variable:0' shape=(20, 100) dtype=float32_ref>,
 <tf.Variable 'Variable_1:0' shape=(100,) dtype=float32_ref>,
 <tf.Variable 'Variable_2:0' shape=(100, 4) dtype=float32_ref>,
 <tf.Variable 'Variable_3:0' shape=(4,) dtype=float32_ref>]

In [8]:
# ops that overwrite every gradient accumulator with zeros
reset_grads = [
    accumulator.assign(tf.zeros_like(accumulator))
    for accumulator in accum_grads
]

In [9]:
# display the assign ops that zero the accumulators
reset_grads

[<tf.Tensor 'Assign:0' shape=(20, 100) dtype=float32_ref>,
 <tf.Tensor 'Assign_1:0' shape=(100,) dtype=float32_ref>,
 <tf.Tensor 'Assign_2:0' shape=(100, 4) dtype=float32_ref>,
 <tf.Tensor 'Assign_3:0' shape=(4,) dtype=float32_ref>]

In [10]:
# ops that add each gradient, scaled by 1/batch_size, onto its accumulator;
# accumulators and gradients are paired by position
evaluate_batch = [
    slot.assign_add(gradient / batch_size)
    for slot, gradient in zip(accum_grads, grads)
]

In [11]:
# display the assign-add ops that accumulate the scaled gradients
evaluate_batch

[<tf.Tensor 'AssignAdd:0' shape=(20, 100) dtype=float32_ref>,
 <tf.Tensor 'AssignAdd_1:0' shape=(100,) dtype=float32_ref>,
 <tf.Tensor 'AssignAdd_2:0' shape=(100, 4) dtype=float32_ref>,
 <tf.Tensor 'AssignAdd_3:0' shape=(4,) dtype=float32_ref>]

In [12]:
# Adam optimizer using the configured learning rate
adam = tf.train.AdamOptimizer(learning_rate=lr)

# apply_gradients expects (gradient, variable) pairs: feed it the accumulated
# gradients paired with the variables they belong to
grads_and_vars = list(zip(accum_grads, t_vars))
apply_grads = adam.apply_gradients(grads_and_vars)

In [13]:
# dump the default graph into the current directory so it can be inspected
# in TensorBoard
writer = tf.summary.FileWriter(logdir='.')
default_graph = tf.get_default_graph()
writer.add_graph(default_graph)
# push the pending graph event out to the event file
writer.flush()