In [45]:
import tensorflow as tf
import numpy as np

In [46]:
#List out our bandits. Each entry is the threshold that pullBandit compares against a
#standard-normal draw: a pull pays +1 only when the draw exceeds the threshold, so a lower
#value means a better arm. Currently bandit 4 (index#3) is set to most often provide a positive reward.
bandits = [-0.05,0,0.8,-0.5]
num_bandits = len(bandits)
def pullBandit(bandit):
    """Pull one arm and report the outcome as +1 (win) or -1 (loss).

    A single standard-normal sample is drawn. The pull wins when that sample
    is strictly greater than ``bandit``; otherwise it loses. Lower thresholds
    therefore win more often.
    """
    draw = np.random.randn(1)
    # `draw` is a length-1 array; comparing it to a scalar yields a single
    # boolean that the conditional expression can evaluate directly.
    return 1 if draw > bandit else -1

In [47]:
# Clear the default graph before building the ops below.
tf.reset_default_graph()

#These two lines establish the feed-forward part of the network. This does the actual choosing.
#There is one weight per bandit, all starting at 1; the chosen action is the index of the largest weight.
weights = tf.Variable(tf.ones([num_bandits]))
chosen_action = tf.argmax(weights,0)

#The next six lines establish the training procedure. We feed the reward and chosen action into the network
#to compute the loss, and use it to update the network.
# Both placeholders hold exactly one element: the reward (float32) and the index of the arm pulled (int32).
reward_holder = tf.placeholder(shape=[1],dtype=tf.float32)
action_holder = tf.placeholder(shape=[1],dtype=tf.int32)
# Length-1 slice of `weights` starting at the fed action index, i.e. the weight of the arm that was pulled.
responsible_weight = tf.slice(weights,action_holder,[1])
# loss = -log(weight of pulled arm) * reward, so minimising it raises the weight after a +1 reward
# and lowers it after a -1 reward.
loss = -(tf.log(responsible_weight)*reward_holder)
# Plain gradient descent with a fixed step size of 0.001; running `update` applies one step.
optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.001)
update = optimizer.minimize(loss)

### Training the Agent

In [48]:
total_episodes = 50000 #Set total number of episodes to train agent on.
total_reward = np.zeros(num_bandits) #Set scoreboard for bandits to 0.
e = 0.1 #Set the chance of taking a random action.

# Replaces the deprecated tf.initialize_all_variables().
init = tf.global_variables_initializer()

# Launch the tensorflow graph
with tf.Session() as sess:
    sess.run(init)
    # Read the starting weights so `ww` is defined for the summary below even
    # when total_episodes is 0 and the loop body never runs.
    ww = sess.run(weights)
    for i in range(total_episodes):

        #Choose either a random action or one from our network.
        if np.random.rand(1) < e:
            # Explore: pick any arm uniformly at random.
            action = np.random.randint(num_bandits)
        else:
            # Exploit: pick the arm with the largest current weight.
            action = sess.run(chosen_action)

        reward = pullBandit(bandits[action]) #Get our reward from picking one of the bandits.

        #Update the network.
        # One gradient step on the pulled arm's weight; `resp` is that arm's weight
        # and `ww` is the full weight vector fetched in the same run.
        _,resp,ww = sess.run([update,responsible_weight,weights], feed_dict={reward_holder:[reward],action_holder:[action]})

        #Update our running tally of scores.
        total_reward[action] += reward
        if i % 50 == 0:
            print ("Running reward for the " + str(num_bandits) + " bandits: " + str(total_reward))

# Report the arm with the largest learned weight (1-based for display).
print ("The agent thinks bandit " + str(np.argmax(ww)+1) + " is the most promising....")
# The truly best arm is the one with the lowest threshold in `bandits`.
if np.argmax(ww) == np.argmin(bandits):
    print ("...and it was right!")
else:
    print ( "...and it was wrong!")

Instructions for updating:
Use `tf.global_variables_initializer` instead.
Running reward for the 4 bandits: [ 1.  0.  0.  0.]
Running reward for the 4 bandits: [ -1.  -2.  -4.  22.]
Running reward for the 4 bandits: [ -1.  -2.  -5.  43.]
Running reward for the 4 bandits: [ -2.  -2.  -5.  64.]
Running reward for the 4 bandits: [ -2.  -4.  -6.  89.]
Running reward for the 4 bandits: [  -2.   -3.   -6.  106.]
Running reward for the 4 bandits: [  -1.   -4.   -8.  122.]
Running reward for the 4 bandits: [  -2.   -4.   -7.  126.]
Running reward for the 4 bandits: [  -1.   -3.   -8.  147.]
Running reward for the 4 bandits: [   0.   -4.   -8.  157.]
Running reward for the 4 bandits: [   0.   -3.   -9.  173.]
Running reward for the 4 bandits: [  -1.   -3.  -11.  196.]
Running reward for the 4 bandits: [   0.   -2.  -11.  220.]
Running reward for the 4 bandits: [   0.   -1.  -12.  236.]
Running reward for the 4 bandits: [  -1.   -1.  -12.  247.]
Running reward for the 4 bandits: [  -2.   -2.  -1

Running reward for the 4 bandits: [ -8.00000000e+00   1.00000000e+00  -7.80000000e+01   2.29000000e+03]
Running reward for the 4 bandits: [   -8.     4.   -78.  2303.]
Running reward for the 4 bandits: [   -8.     3.   -79.  2311.]
Running reward for the 4 bandits: [   -8.     3.   -79.  2323.]
Running reward for the 4 bandits: [   -8.     4.   -79.  2342.]
Running reward for the 4 bandits: [   -6.     4.   -79.  2346.]
Running reward for the 4 bandits: [ -6.00000000e+00   2.00000000e+00  -8.10000000e+01   2.36000000e+03]
Running reward for the 4 bandits: [   -6.     3.   -83.  2377.]
Running reward for the 4 bandits: [   -4.     3.   -83.  2387.]
Running reward for the 4 bandits: [   -4.     0.   -86.  2413.]
Running reward for the 4 bandits: [   -5.     0.   -87.  2423.]
Running reward for the 4 bandits: [ -6.00000000e+00   1.00000000e+00  -8.60000000e+01   2.43200000e+03]
Running reward for the 4 bandits: [   -6.     0.   -88.  2441.]
Running reward for the 4 bandits: [ -6.00000000e

Running reward for the 4 bandits: [  -12.   -15.  -154.  4134.]
Running reward for the 4 bandits: [  -11.   -15.  -157.  4142.]
Running reward for the 4 bandits: [  -12.   -15.  -157.  4157.]
Running reward for the 4 bandits: [  -12.   -13.  -159.  4181.]
Running reward for the 4 bandits: [  -11.   -13.  -159.  4200.]
Running reward for the 4 bandits: [  -12.   -13.  -159.  4211.]
Running reward for the 4 bandits: [  -11.   -11.  -161.  4212.]
Running reward for the 4 bandits: [  -11.   -12.  -162.  4230.]
Running reward for the 4 bandits: [  -12.   -14.  -162.  4251.]
Running reward for the 4 bandits: [  -11.   -15.  -162.  4263.]
Running reward for the 4 bandits: [  -10.   -15.  -162.  4282.]
Running reward for the 4 bandits: [   -9.   -17.  -162.  4307.]
Running reward for the 4 bandits: [  -10.   -15.  -162.  4326.]
Running reward for the 4 bandits: [  -10.   -15.  -162.  4344.]
Running reward for the 4 bandits: [   -9.   -14.  -162.  4358.]
Running reward for the 4 bandits: [   -7

Running reward for the 4 bandits: [ -6.00000000e+00  -1.80000000e+01  -2.32000000e+02   6.22900000e+03]
Running reward for the 4 bandits: [ -6.00000000e+00  -1.90000000e+01  -2.32000000e+02   6.25000000e+03]
Running reward for the 4 bandits: [ -5.00000000e+00  -1.90000000e+01  -2.32000000e+02   6.26300000e+03]
Running reward for the 4 bandits: [ -5.00000000e+00  -1.90000000e+01  -2.32000000e+02   6.27700000e+03]
Running reward for the 4 bandits: [ -5.00000000e+00  -1.90000000e+01  -2.34000000e+02   6.29900000e+03]
Running reward for the 4 bandits: [ -4.00000000e+00  -1.60000000e+01  -2.33000000e+02   6.32600000e+03]
Running reward for the 4 bandits: [ -3.00000000e+00  -1.60000000e+01  -2.33000000e+02   6.34300000e+03]
Running reward for the 4 bandits: [ -2.00000000e+00  -1.70000000e+01  -2.33000000e+02   6.36700000e+03]
Running reward for the 4 bandits: [ -2.00000000e+00  -1.70000000e+01  -2.33000000e+02   6.38300000e+03]
Running reward for the 4 bandits: [ -1.00000000e+00  -1.90000000

Running reward for the 4 bandits: [  0.00000000e+00  -5.00000000e+00  -2.93000000e+02   7.66500000e+03]
Running reward for the 4 bandits: [  1.00000000e+00  -6.00000000e+00  -2.94000000e+02   7.68600000e+03]
Running reward for the 4 bandits: [  1.00000000e+00  -5.00000000e+00  -2.95000000e+02   7.71400000e+03]
Running reward for the 4 bandits: [  1.00000000e+00  -5.00000000e+00  -2.95000000e+02   7.73600000e+03]
Running reward for the 4 bandits: [  0.00000000e+00  -3.00000000e+00  -2.97000000e+02   7.75700000e+03]
Running reward for the 4 bandits: [  0.00000000e+00  -2.00000000e+00  -2.98000000e+02   7.77100000e+03]
Running reward for the 4 bandits: [ -1.00000000e+00  -3.00000000e+00  -3.00000000e+02   7.79300000e+03]
Running reward for the 4 bandits: [  0.00000000e+00  -3.00000000e+00  -3.00000000e+02   7.80600000e+03]
Running reward for the 4 bandits: [  1.00000000e+00  -2.00000000e+00  -2.99000000e+02   7.82300000e+03]
Running reward for the 4 bandits: [  1.00000000e+00  -3.00000000

Running reward for the 4 bandits: [  1.30000000e+01  -2.00000000e+00  -3.39000000e+02   9.10100000e+03]
Running reward for the 4 bandits: [  1.30000000e+01  -2.00000000e+00  -3.40000000e+02   9.12000000e+03]
Running reward for the 4 bandits: [  1.40000000e+01  -2.00000000e+00  -3.40000000e+02   9.13700000e+03]
Running reward for the 4 bandits: [  1.40000000e+01  -1.00000000e+00  -3.40000000e+02   9.14600000e+03]
Running reward for the 4 bandits: [  1.20000000e+01  -1.00000000e+00  -3.41000000e+02   9.16300000e+03]
Running reward for the 4 bandits: [   14.     0.  -340.  9173.]
Running reward for the 4 bandits: [   12.     0.  -340.  9175.]
Running reward for the 4 bandits: [   12.     0.  -340.  9185.]
Running reward for the 4 bandits: [  1.10000000e+01  -1.00000000e+00  -3.42000000e+02   9.19900000e+03]
Running reward for the 4 bandits: [  1.40000000e+01  -1.00000000e+00  -3.43000000e+02   9.22500000e+03]
Running reward for the 4 bandits: [  1.40000000e+01  -2.00000000e+00  -3.4300000

Running reward for the 4 bandits: [  2.50000000e+01  -5.00000000e+00  -3.87000000e+02   1.05560000e+04]
Running reward for the 4 bandits: [  2.50000000e+01  -5.00000000e+00  -3.88000000e+02   1.05650000e+04]
Running reward for the 4 bandits: [  2.60000000e+01  -4.00000000e+00  -3.91000000e+02   1.05820000e+04]
Running reward for the 4 bandits: [  2.60000000e+01  -5.00000000e+00  -3.93000000e+02   1.06070000e+04]
Running reward for the 4 bandits: [  2.70000000e+01  -5.00000000e+00  -3.94000000e+02   1.06190000e+04]
Running reward for the 4 bandits: [  2.70000000e+01  -5.00000000e+00  -3.93000000e+02   1.06400000e+04]
Running reward for the 4 bandits: [  2.60000000e+01  -5.00000000e+00  -3.93000000e+02   1.06610000e+04]
Running reward for the 4 bandits: [  2.70000000e+01  -4.00000000e+00  -3.96000000e+02   1.06800000e+04]
Running reward for the 4 bandits: [  2.60000000e+01  -4.00000000e+00  -3.98000000e+02   1.06950000e+04]
Running reward for the 4 bandits: [  2.60000000e+01  -4.00000000

Running reward for the 4 bandits: [  3.00000000e+01  -2.00000000e+00  -4.65000000e+02   1.21020000e+04]
Running reward for the 4 bandits: [  3.20000000e+01  -2.00000000e+00  -4.65000000e+02   1.21160000e+04]
Running reward for the 4 bandits: [  3.20000000e+01  -1.00000000e+00  -4.65000000e+02   1.21350000e+04]
Running reward for the 4 bandits: [    32.      0.   -467.  12150.]
Running reward for the 4 bandits: [    33.      0.   -470.  12164.]
Running reward for the 4 bandits: [    34.      0.   -471.  12176.]
Running reward for the 4 bandits: [    35.      0.   -472.  12180.]
Running reward for the 4 bandits: [  3.50000000e+01  -1.00000000e+00  -4.72000000e+02   1.22010000e+04]
Running reward for the 4 bandits: [  3.60000000e+01  -1.00000000e+00  -4.72000000e+02   1.22220000e+04]
Running reward for the 4 bandits: [  3.40000000e+01  -1.00000000e+00  -4.73000000e+02   1.22370000e+04]
Running reward for the 4 bandits: [  3.30000000e+01  -1.00000000e+00  -4.72000000e+02   1.22470000e+04]


Running reward for the 4 bandits: [    29.      0.   -537.  13497.]
Running reward for the 4 bandits: [  2.90000000e+01  -2.00000000e+00  -5.36000000e+02   1.35100000e+04]
Running reward for the 4 bandits: [  2.90000000e+01  -2.00000000e+00  -5.36000000e+02   1.35300000e+04]
Running reward for the 4 bandits: [  2.90000000e+01  -2.00000000e+00  -5.36000000e+02   1.35420000e+04]
Running reward for the 4 bandits: [  2.80000000e+01  -2.00000000e+00  -5.36000000e+02   1.35510000e+04]
Running reward for the 4 bandits: [  3.30000000e+01  -2.00000000e+00  -5.37000000e+02   1.35650000e+04]
Running reward for the 4 bandits: [  3.20000000e+01  -3.00000000e+00  -5.37000000e+02   1.35850000e+04]
Running reward for the 4 bandits: [  3.20000000e+01  -3.00000000e+00  -5.37000000e+02   1.36010000e+04]
Running reward for the 4 bandits: [  3.40000000e+01  -3.00000000e+00  -5.36000000e+02   1.36220000e+04]
Running reward for the 4 bandits: [  3.30000000e+01  -3.00000000e+00  -5.38000000e+02   1.36450000e+

Running reward for the 4 bandits: [  3.90000000e+01   9.00000000e+00  -6.02000000e+02   1.49230000e+04]
Running reward for the 4 bandits: [  3.90000000e+01   1.10000000e+01  -6.02000000e+02   1.49290000e+04]
Running reward for the 4 bandits: [  3.90000000e+01   1.00000000e+01  -6.05000000e+02   1.49390000e+04]
Running reward for the 4 bandits: [  4.10000000e+01   8.00000000e+00  -6.06000000e+02   1.49520000e+04]
Running reward for the 4 bandits: [  4.10000000e+01   8.00000000e+00  -6.06000000e+02   1.49660000e+04]
Running reward for the 4 bandits: [  4.00000000e+01   9.00000000e+00  -6.08000000e+02   1.49860000e+04]
Running reward for the 4 bandits: [  4.00000000e+01   9.00000000e+00  -6.09000000e+02   1.50090000e+04]
Running reward for the 4 bandits: [  4.20000000e+01   9.00000000e+00  -6.09000000e+02   1.50270000e+04]
Running reward for the 4 bandits: [  4.20000000e+01   1.20000000e+01  -6.10000000e+02   1.50530000e+04]
Running reward for the 4 bandits: [  4.20000000e+01   1.30000000

Running reward for the 4 bandits: [    44.     24.   -669.  16554.]
Running reward for the 4 bandits: [    44.     25.   -671.  16583.]
Running reward for the 4 bandits: [    45.     25.   -671.  16600.]
Running reward for the 4 bandits: [    45.     25.   -673.  16620.]
Running reward for the 4 bandits: [    46.     27.   -677.  16649.]
Running reward for the 4 bandits: [    46.     26.   -677.  16664.]
Running reward for the 4 bandits: [    44.     26.   -677.  16686.]
Running reward for the 4 bandits: [    45.     25.   -678.  16701.]
Running reward for the 4 bandits: [    45.     26.   -677.  16713.]
Running reward for the 4 bandits: [    45.     26.   -679.  16729.]
Running reward for the 4 bandits: [    46.     24.   -679.  16744.]
Running reward for the 4 bandits: [    45.     22.   -679.  16747.]
Running reward for the 4 bandits: [    45.     22.   -679.  16761.]
Running reward for the 4 bandits: [    44.     24.   -680.  16775.]
Running reward for the 4 bandits: [    43.     2