## Policy Gradient based Multi-armed Bandit

Import libraries

In [1]:
import tensorflow as tf
import numpy as np

Defining Bandit

In [2]:
# Each entry is the threshold a standard-normal draw must exceed for that arm
# to pay out, so the lower the value, the more likely a positive reward.
bandits = [
    0.2,
    0,
    -0.2,
    -5,
]
# Number of arms the agent can choose between.
num_bandits = len(bandits)

def pull_bandit(bandit):
    """Pull one arm and report whether it paid out.

    A single sample is drawn from the standard normal distribution and
    compared against ``bandit``, the threshold of the chosen arm.

    Args:
        bandit: Threshold value of the arm being pulled.

    Returns:
        1 (positive reward) when the sample is strictly greater than the
        threshold, otherwise -1 (negative reward).
    """
    sample = np.random.randn(1)
    # Only a draw strictly above the threshold counts as a win.
    return 1 if sample > bandit else -1

Defining Agent

In [3]:
tf.reset_default_graph()

# One trainable weight per bandit arm, all initialised to 1.
weights = tf.Variable(tf.ones([num_bandits]))
# Greedy choice: the index of the largest weight.
choosen_action = tf.argmax(weights, 0)

# Fed at each training step with the reward received and the arm that was pulled.
reward_holder = tf.placeholder(shape=[1], dtype=tf.float32)
action_holder = tf.placeholder(shape=[1], dtype=tf.int32)
# Slice out the single weight that belongs to the pulled arm.
responsible_weight = tf.slice(weights, action_holder, [1])
# Loss is -log(weight) * reward: for a positive weight, a gradient-descent step
# raises the weight after a positive reward and lowers it after a negative one.
loss = -(tf.log(responsible_weight) * reward_holder)
optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.001)
# Running this op applies one gradient-descent step to the weights.
update = optimizer.minimize(loss)

Training the Agent :

Our agent is trained by taking actions in the environment and receiving rewards. Using those rewards and actions, we update the network so that it more often chooses the actions that yield the highest rewards over time.

In [4]:
total_episodes = 1000 #This parameter sets how many times we are going to play the game.
total_reward = np.zeros(num_bandits) #Initializing the scoreboard of bandits to zero.
e = 0.2 #This parameter sets the chance of taking random action.

init = tf.global_variables_initializer()

with tf.Session() as sess:
    sess.run(init)
    for i in range(total_episodes):
        # Epsilon-greedy selection: with probability e pull a random arm,
        # otherwise pull the arm whose weight is currently the largest.
        if np.random.rand() < e:
            action = np.random.randint(num_bandits)
        else:
            action = sess.run(choosen_action)

        # Pulling the chosen arm yields a reward of +1 or -1.
        reward = pull_bandit(bandits[action])

        # Apply one gradient step for the (action, reward) pair just observed.
        sess.run(update, feed_dict={reward_holder: [reward], action_holder: [action]})

        # Keep the per-arm running total and report it every 50 episodes.
        total_reward[action] += reward
        if i % 50 == 0:
            print("Scoreboard  :  " + str(total_reward))

    # Read the weights once, after the final update. Fetching them in the same
    # run() call as the update op does not guarantee whether the pre- or
    # post-update value is returned, and would leave ww undefined when
    # total_episodes is 0.
    ww = sess.run(weights)

# The arm with the largest learned weight is the agent's pick; it is correct
# when it matches the arm with the lowest threshold in bandits.
best_arm = np.argmax(ww)
print("Our trained agent believes the most promising agent is : " + str(best_arm + 1))
if best_arm == np.argmax(-np.array(bandits)):
    print("Yes! your agent is good.")
else:
    print("Sorry, better train you agent.")

Scoreboard  :  [-1.  0.  0.  0.]
Scoreboard  :  [ -4.  -1.   0.  28.]
Scoreboard  :  [ -5.  -3.   2.  69.]
Scoreboard  :  [  -4.   -1.    2.  108.]
Scoreboard  :  [  -2.   -1.    0.  150.]
Scoreboard  :  [  -2.    0.    0.  191.]
Scoreboard  :  [  -1.    3.    0.  233.]
Scoreboard  :  [  -2.    9.   -1.  273.]
Scoreboard  :  [  -2.   10.   -1.  318.]
Scoreboard  :  [  -5.   10.    0.  362.]
Scoreboard  :  [  -1.   12.    1.  405.]
Scoreboard  :  [   0.   15.    1.  449.]
Scoreboard  :  [   0.   16.    1.  490.]
Scoreboard  :  [  -1.   17.   -1.  530.]
Scoreboard  :  [   0.   18.   -2.  569.]
Scoreboard  :  [   0.   14.   -3.  614.]
Scoreboard  :  [   0.   14.   -4.  657.]
Scoreboard  :  [   0.   13.   -3.  703.]
Scoreboard  :  [   0.   13.    0.  744.]
Scoreboard  :  [   0.   11.    2.  780.]
Our trained agent believes the most promising agent is : 4
Yes! your agent is good.
