In [2]:
import tensorflow as tf
import numpy as np

In [20]:
# List out our bandits. Each value is the threshold a standard-normal draw must
# exceed for pullBandit to pay out, so a LOWER value means a positive reward is
# more likely. Currently bandit 4 (index#3) is set to most often provide a
# positive reward.
bandits = [0.2, 0, -0.2, -0.5]
# Number of arms; also the length of the weight vector built in a later cell.
num_bandits = len(bandits)
def pullBandit(bandit):
    """Pull one arm and report whether it paid out.

    A single sample is drawn from the standard normal distribution and
    compared against the arm's threshold.

    Args:
        bandit: Threshold for the arm; the lower it is, the more likely the
            draw exceeds it.

    Returns:
        1 (positive reward) if the draw is strictly greater than ``bandit``,
        otherwise -1 (negative reward).
    """
    # Draw a one-element sample and unwrap it to a scalar.
    sample = np.random.randn(1)[0]
    return 1 if sample > bandit else -1

In [21]:
# Clear the default graph before (re)building the network, so that re-running
# this cell starts from an empty graph.
tf.reset_default_graph()

# These two lines establish the feed-forward part of the network. This does the actual choosing.
# One trainable weight per bandit, all initialised to 1.
weights = tf.Variable(tf.ones([num_bandits]))
# The greedy action is the index of the largest weight (argmax along axis 0).
chosen_action = tf.argmax(weights, 0)

# These print the symbolic graph objects (name, shape, dtype), not numeric values;
# values only exist once the ops are run inside a session.
print("Weight : ", weights)
print("Chosen action : ", chosen_action)

Weight :  <tf.Variable 'Variable:0' shape=(4,) dtype=float32_ref>
Chosen action :  Tensor("ArgMax:0", shape=(), dtype=int64)


In [22]:
# The next six lines establish the training procedure. We feed the reward and chosen action into the network
# to compute the loss, and use it to update the network.
# Reward received for the pulled arm: a float32 vector of length 1.
reward_holder = tf.placeholder(shape=[1],dtype=tf.float32)
# Index of the arm that was pulled: an int32 vector of length 1.
action_holder = tf.placeholder(shape=[1],dtype=tf.int32)
# Slice out the single weight belonging to the chosen action (start=action, size=1).
responsible_weight = tf.slice(weights,action_holder,[1])
# Loss is -(log(w) * reward): minimising it raises the weight after a positive
# reward and lowers it after a negative one.
loss = -(tf.log(responsible_weight)*reward_holder)
# Plain gradient descent with a fixed learning rate of 0.001.
optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.001)
# Op that applies one gradient step on `loss` when run.
update = optimizer.minimize(loss)

# These print the symbolic graph objects, not numeric values.
print("Rewards : ", reward_holder)
print("Actions : ", action_holder)
print("Responsible weights : ", responsible_weight)
print("Loss : ", loss)
print("Optimizer : ", optimizer)
print("Update : ", update)

Rewards :  Tensor("Placeholder:0", shape=(1,), dtype=float32)
Actions :  Tensor("Placeholder_1:0", shape=(1,), dtype=int32)
Responsible weights :  Tensor("Slice:0", shape=(1,), dtype=float32)
Loss :  Tensor("Neg:0", shape=(1,), dtype=float32)
Optimizer :  <tensorflow.python.training.gradient_descent.GradientDescentOptimizer object at 0x111499c88>
Update :  name: "GradientDescent"
op: "NoOp"
input: "^GradientDescent/update_Variable/ApplyGradientDescent"



In [26]:
total_episodes = 2000 # Set total number of episodes to train agent on.
total_reward = np.zeros(num_bandits) # Set scoreboard for bandits to 0.
e = 0.2 # Set the chance of taking a random action.

init = tf.global_variables_initializer()

# Launch the tensorflow graph
with tf.Session() as sess:
    sess.run(init)
    for i in range(total_episodes):
        # Epsilon-greedy choice: with probability e explore a random bandit,
        # otherwise exploit the bandit the network currently rates highest.
        if np.random.rand(1) < e:
            action = np.random.randint(num_bandits)
        else:
            action = sess.run(chosen_action)

        reward = pullBandit(bandits[action]) # Get our reward from picking one of the bandits.

        # Update the network with one gradient step for this (action, reward) pair.
        sess.run(update, feed_dict={reward_holder: [reward], action_holder: [action]})

        # Update our running tally of scores.
        total_reward[action] += reward
        if i % 50 == 0:
            print("Running reward for the " + str(num_bandits) + " bandits: " + str(total_reward))

    # Read the learned weights once, after all updates have been applied.
    # Fetching `weights` in the same run() call as `update` does not guarantee
    # whether the pre- or post-update value is returned, and would leave `ww`
    # undefined if total_episodes were 0.
    ww = sess.run(weights)

# The agent's pick is the bandit with the largest learned weight.
best_guess = np.argmax(ww)
print("The agent thinks bandit " + str(best_guess + 1) + " is the most promising....")

# The truly best bandit has the LOWEST threshold, since a lower threshold is
# exceeded more often by the standard-normal draw in pullBandit.
if best_guess == np.argmin(bandits):
    print("...and it was right!")
else:
    print("...and it was wrong!")

Running reward for the 4 bandits: [-1.  0.  0.  0.]
Running reward for the 4 bandits: [ -2.  -3.  -1.  19.]
Running reward for the 4 bandits: [ -2.   0.   0.  45.]
Running reward for the 4 bandits: [ -2.  -2.   1.  58.]
Running reward for the 4 bandits: [  1.  -2.   4.  74.]
Running reward for the 4 bandits: [  2.  -4.   5.  84.]
Running reward for the 4 bandits: [  1.  -2.   4.  92.]
Running reward for the 4 bandits: [   4.   -4.    4.  109.]
Running reward for the 4 bandits: [   4.   -2.    5.  134.]
Running reward for the 4 bandits: [   3.   -5.    4.  157.]
Running reward for the 4 bandits: [   1.   -3.    2.  163.]
Running reward for the 4 bandits: [   0.   -3.    3.  167.]
Running reward for the 4 bandits: [  -1.   -5.    3.  184.]
Running reward for the 4 bandits: [  -2.   -7.    3.  211.]
Running reward for the 4 bandits: [   0.   -5.    2.  222.]
Running reward for the 4 bandits: [   2.   -9.    4.  238.]
Running reward for the 4 bandits: [  -1.  -11.    4.  267.]
Running rewa