# Reinforcement Learning with TensorFlow

In [17]:
#Build Policy with tensorflow
import tensorflow as tf
import numpy as np
from tensorflow.contrib.layers import fully_connected
# 1. Specify the neural network architecture
n_inputs = 4 # == env.observation_space.shape[0]
n_hidden = 4 # it's a simple task, we don't need more hidden neurons
n_outputs = 1 # only outputs the probability of accelerating left
# Weight initializer shared by both fully connected layers below.
initializer = tf.contrib.layers.variance_scaling_initializer()
# 2. Build the neural network
# Placeholder for a batch of observations: any number of rows, n_inputs columns.
X = tf.placeholder(tf.float32, shape=[None, n_inputs])
hidden = fully_connected(X, n_hidden, activation_fn=tf.nn.elu,
weights_initializer=initializer)
# No activation on the output layer: the raw logits are kept because they are
# passed to the cross-entropy loss in a later cell.
logits = fully_connected(hidden, n_outputs, activation_fn=None,
weights_initializer=initializer)
# The sigmoid turns the single logit into the probability of going left.
outputs = tf.nn.sigmoid(logits)
# 3. Select a random action based on the estimated probabilities
# Column 0 holds p(left); column 1 holds 1 - p(left), i.e. p(right).
p_left_and_right = tf.concat(axis=1, values=[outputs, 1 - outputs])
# tf.log converts the probabilities to log-probabilities before sampling one
# column index per row; index 0 is the "left" column, index 1 the "right" one.
action = tf.multinomial(tf.log(p_left_and_right), num_samples=1)
# NOTE: `init` is assigned again in a later cell, after the training op is built.
init = tf.global_variables_initializer()

## Target Probability

In [4]:
# Target probability of going left, derived from the sampled action:
# `action` is 0 when the left column was sampled (y = 1) and 1 when the
# right column was sampled (y = 0).
y = 1. - tf.to_float(action)
y

<tf.Tensor 'sub_2:0' shape=(?, 1) dtype=float32>

## Define the cost function and compute the gradients

In [7]:
# Step size handed to the Adam optimizer.
learning_rate = 0.01
# Loss between the target probability `y` and the network's raw `logits`
# (the sigmoid is applied inside the op, so the un-activated logits are passed).
cross_entropy = tf.nn.sigmoid_cross_entropy_with_logits(
    labels=y, logits=logits)
optimizer = tf.train.AdamOptimizer(learning_rate)
# Note that we call the optimizer's compute_gradients() method instead of
# minimize(): we want to tweak the gradients before we apply them.
# compute_gradients() returns a list of (gradient, variable) pairs, one pair
# per trainable variable.
grads_and_vars = optimizer.compute_gradients(cross_entropy)

## Put all the gradients into a list

In [8]:
# Keep only the gradient tensor from each (gradient, variable) pair, so the
# gradient values can be fetched conveniently in one go.
gradients = [grad_tensor for grad_tensor, _ in grads_and_vars]
gradients

[<tf.Tensor 'gradients_1/fully_connected/MatMul_grad/tuple/control_dependency_1:0' shape=(4, 4) dtype=float32>,
 <tf.Tensor 'gradients_1/fully_connected/BiasAdd_grad/tuple/control_dependency_1:0' shape=(4,) dtype=float32>,
 <tf.Tensor 'gradients_1/fully_connected_1/MatMul_grad/tuple/control_dependency_1:0' shape=(4, 1) dtype=float32>,
 <tf.Tensor 'gradients_1/fully_connected_1/BiasAdd_grad/tuple/control_dependency_1:0' shape=(1,) dtype=float32>]

## Calculate the gradients, choose the good ones, save them into a vector and then feed them back to the optimizer

In [12]:
# One placeholder per gradient tensor, so that gradient values computed
# outside the graph can be fed back in.
gradient_placeholders = []
# (placeholder, variable) pairs, mirroring the structure of grads_and_vars.
grads_and_vars_feed = []

for grad, variable in grads_and_vars:
    # Each placeholder takes the static shape of the gradient it stands in for.
    gradient_placeholder = tf.placeholder(tf.float32,shape=grad.get_shape())
    gradient_placeholders.append(gradient_placeholder)
    grads_and_vars_feed.append((gradient_placeholder, variable))
# Use apply_gradients() to apply the selected, updated gradients
training_op = optimizer.apply_gradients(grads_and_vars_feed)
# Re-created here, after apply_gradients() has been called on the optimizer.
init = tf.global_variables_initializer()
# Saver used to store the trained model
saver = tf.train.Saver()

# Execution phase
    On to the execution phase! We will need a couple of functions to compute the
    total discounted rewards, given the raw rewards, and to normalize the results
    across multiple episodes:

In [23]:
def discount_rewards(rewards, discount_rate):
    """Compute the discounted cumulative reward for every step of an episode.

    The value at step ``t`` is ``rewards[t]`` plus ``discount_rate`` times the
    value at step ``t + 1``; the last step is just its own reward.

    Args:
        rewards: Indexable sequence of per-step rewards for one episode.
        discount_rate: Factor applied to the accumulated future reward at
            each step going backwards.

    Returns:
        A NumPy float array with one discounted reward per step.
    """
    n_steps = len(rewards)
    discounted = np.empty(n_steps)
    running_total = 0
    # Walk backwards so each step can reuse the total accumulated after it.
    for idx in range(n_steps - 1, -1, -1):
        running_total = rewards[idx] + running_total * discount_rate
        discounted[idx] = running_total
    return discounted

def discount_and_normalize_rewards(all_rewards, discount_rate):
    """Discount the rewards of several episodes and normalize them jointly.

    Each episode is discounted with ``discount_rewards``; the mean and the
    standard deviation are then computed over the discounted rewards of all
    episodes together and used to standardize every episode.

    Args:
        all_rewards: Iterable of per-episode reward sequences.
        discount_rate: Discount factor forwarded to ``discount_rewards``.

    Returns:
        A list with one NumPy array per episode holding the normalized
        discounted rewards. When all discounted rewards are identical (zero
        standard deviation) the centred values, which are all zero, are
        returned instead of dividing by zero.
    """
    all_discounted_rewards = [discount_rewards(rewards, discount_rate)
                              for rewards in all_rewards]
    flat_rewards = np.concatenate(all_discounted_rewards)
    reward_mean = flat_rewards.mean()
    reward_std = flat_rewards.std()
    if reward_std == 0:
        # Every discounted reward equals the mean; dividing by a zero std
        # would produce NaN, so only centre the values.
        return [discounted_rewards - reward_mean
                for discounted_rewards in all_discounted_rewards]
    return [(discounted_rewards - reward_mean) / reward_std
            for discounted_rewards in all_discounted_rewards]

# Check

In [18]:
discount_rewards([10, 0, -50], discount_rate=0.8)

array([-22., -40., -50.])

In [24]:
discount_and_normalize_rewards([[10, 0, -50], [10, 20]], discount_rate=0.8)

[array([-0.28435071, -0.86597718, -1.18910299]),
 array([ 1.26665318,  1.0727777 ])]