In [1]:
import tensorflow as tf
import numpy as np

  from ._conv import register_converters as _register_converters


### The Bandits
Here, we define our bandits. For this example, we are using a four-armed bandit. Each bandit is represented by a threshold value. The pullBandit function draws a random number from a normal distribution with a mean of 0 and returns a positive reward when that number exceeds the threshold, so the lower a bandit's threshold, the more likely a positive reward will be returned. We want our agent to learn to always choose the bandit that is most likely to give that positive reward.

In [2]:
# Reward thresholds, one entry per arm. A pull pays +1 when a standard-normal
# draw exceeds the arm's threshold, so the arm with the lowest threshold
# (bandit 4, at -5) pays out most often.
bandits = [
    0.2,
    0,
    -0.2,
    -5,
]
num_bandits = len(bandits)
def pullBandit(bandit):
    """Pull one arm and return its reward.

    Args:
        bandit: The arm's threshold value (an entry of ``bandits``), not its
            index. Lower thresholds make a positive reward more likely.

    Returns:
        1 if a fresh standard-normal sample exceeds ``bandit``, otherwise -1.
    """
    # One sample from N(0, 1); the comparison below decides win or loss.
    draw = np.random.randn(1)
    if draw > bandit:
        return 1
    return -1

### The Agent
The code below establishes our simple neural agent. It consists of one value for each of the bandits. Each value is an estimate of the return from choosing that bandit. We use a policy gradient method to update the agent: the value for the selected action is increased when the received reward is positive and decreased when it is negative.

In [3]:
# Clear the default graph before building the agent.
tf.reset_default_graph()

# Forward pass: one trainable value per arm, all initialised to 1. The greedy
# choice is the index of the largest value.
weights = tf.Variable(tf.ones(shape=[num_bandits]))
chosen_action = tf.argmax(weights, axis=0)

# Training inputs: the reward observed for a single pull and the index of the
# arm that was pulled.
reward_holder = tf.placeholder(dtype=tf.float32, shape=[1])
action_holder = tf.placeholder(dtype=tf.int32, shape=[1])

# Pick out the single weight that belongs to the pulled arm.
responsible_weight = tf.slice(weights, begin=action_holder, size=[1])

# Loss is -log(weight) * reward, so a gradient-descent step raises the weight
# after a positive reward and lowers it after a negative one.
loss = -tf.multiply(tf.log(responsible_weight), reward_holder)
optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.001)
update = optimizer.minimize(loss)

### Training the Agent
We will train our agent by taking actions in our environment and receiving rewards. Using the rewards and actions, we can know how to properly update our network in order to more often choose actions that will yield the highest rewards over time.

In [4]:
total_episodes = 1000
total_reward = np.zeros(num_bandits)  # Set scorecard for bandits to 0
e = 0.1  # Chance of taking a random action

init = tf.global_variables_initializer()

# Launch the TensorFlow graph
with tf.Session() as sess:
    sess.run(init)
    for i in range(total_episodes):
        # Epsilon-greedy: with probability e explore a random arm, otherwise
        # exploit the arm the network currently rates highest.
        if np.random.rand(1) < e:
            action = np.random.randint(num_bandits)
        else:
            action = sess.run(chosen_action)
        # Get our reward from picking one of the bandits. pullBandit takes the
        # arm's threshold value, not its index.
        reward = pullBandit(bandits[action])
        # Apply one gradient step for the (reward, action) pair.
        sess.run(
            update,
            feed_dict={
                reward_holder: [reward],
                action_holder: [action],
            },
        )
        # Update our running tally of scores
        total_reward[action] += reward
        if i % 50 == 0:
            print("Running reward for the %d bandits: %s" % (num_bandits, str(total_reward)))
    # Read the weights in their own run, after training has finished. Fetching
    # them in the same run as `update` leaves it unspecified whether the value
    # is from before or after the step. Reading here also keeps `ww` defined
    # when total_episodes is 0.
    ww = sess.run(weights)

print("\n\nThe agent thinks bandit %d is the most promising..." % (np.argmax(ww)+1))
# The best arm is the one with the lowest threshold, hence the argmax of the
# negated thresholds.
if np.argmax(ww) == np.argmax(-np.array(bandits)):
    print("... and it was right!")
else:
    print("... and it was wrong!")

Running reward for the 4 bandits: [1. 0. 0. 0.]
Running reward for the 4 bandits: [-1. -1.  1. 14.]
Running reward for the 4 bandits: [-1. -1.  0. 61.]
Running reward for the 4 bandits: [ -2.   1.  -1. 105.]
Running reward for the 4 bandits: [ -1.   0.  -1. 151.]
Running reward for the 4 bandits: [ -1.  -1.   1. 198.]
Running reward for the 4 bandits: [  0.  -2.   1. 244.]
Running reward for the 4 bandits: [  0.  -1.   2. 288.]
Running reward for the 4 bandits: [  1.  -1.   3. 330.]
Running reward for the 4 bandits: [  0.  -2.   3. 378.]
Running reward for the 4 bandits: [ -4.  -2.   3. 424.]
Running reward for the 4 bandits: [ -4.  -1.   3. 469.]
Running reward for the 4 bandits: [ -4.  -2.   3. 518.]
Running reward for the 4 bandits: [ -5.  -2.   3. 561.]
Running reward for the 4 bandits: [ -5.  -4.   3. 609.]
Running reward for the 4 bandits: [ -6.  -5.   4. 656.]
Running reward for the 4 bandits: [ -6.  -5.   3. 703.]
Running reward for the 4 bandits: [ -6.  -5.   3. 747.]
Running 