## Solving the Contextual Bandit Problem Using Policy Gradients

Import Libraries

In [1]:
import tensorflow as tf
import tensorflow.contrib.slim as slim
import numpy as np

The problem: Contextual Bandits
    
*  The environment consists of several states (bandits) that are independent of each other.
*  Given a state (bandit), the agent observes it and tries to take the best possible action (pull an arm), i.e. the one that results in the highest reward.

In [2]:
class contextual_bandit():
    """Toy contextual-bandit environment with a fixed table of arm values.

    Each row of ``bandits`` is one bandit (the state) and each column is one
    of its arms (the action). Pulling an arm draws a standard-normal sample
    and pays +1 when the sample exceeds the arm's value, otherwise -1, so the
    arm with the lowest value in a row pays off most often.
    """

    def __init__(self):
        # Arm values, laid out as one row per bandit and one column per arm.
        self.bandits = np.array([
            [0.2, 0.0, -0.0, -5],
            [0.1, -5, 1, 0.25],
            [-5, 5, 5, 5],
        ])
        self.num_bandits, self.num_actions = self.bandits.shape
        # Index of the bandit currently being played.
        self.state = 0

    def getBandit(self):
        """Pick a bandit uniformly at random, store it as the state and return its index."""
        chosen = np.random.randint(0, self.num_bandits)
        self.state = chosen
        return chosen

    def pullArm(self, action):
        """Pull arm ``action`` of the current bandit and return the reward (+1 or -1)."""
        threshold = self.bandits[self.state, action]
        # A single standard-normal draw decides the outcome of this pull.
        draw = np.random.randn(1)
        return 1 if draw > threshold else -1

Let's define the agent...

In [3]:
class agent():
    """Single-layer policy network trained with a policy-gradient loss.

    Args:
        lr: learning rate handed to the gradient-descent optimizer.
        s_size: number of states, used as the one-hot encoding width.
        a_size: number of actions, used as the layer's output width.

    Attributes used by callers:
        state_in: int32 placeholder of shape [1] holding the current state.
        output: flattened layer output, one value per action.
        choosen_action: index of the largest entry of ``output``.
        reward_holder / action_holder: placeholders of shape [1] fed when training.
        update: op that applies one gradient-descent step on ``loss``.
    """

    def __init__(self, lr, s_size, a_size):
        # --- Forward pass: state index -> one-hot -> one bias-free sigmoid layer.
        self.state_in = tf.placeholder(tf.int32, shape=[1])
        one_hot_state = slim.one_hot_encoding(self.state_in, s_size)
        # Without a bias and with a one-hot input, the output for a state
        # depends only on the weights attached to that state.
        layer_out = slim.fully_connected(
            one_hot_state,
            a_size,
            biases_initializer=None,
            activation_fn=tf.nn.sigmoid,
            weights_initializer=tf.ones_initializer(),
        )
        self.output = tf.reshape(layer_out, [-1])
        self.choosen_action = tf.argmax(self.output, 0)

        # --- Training: feed the reward and the action taken, then descend on
        # -(log(output[action]) * reward).
        self.reward_holder = tf.placeholder(tf.float32, shape=[1])
        self.action_holder = tf.placeholder(tf.int32, shape=[1])
        # Output entry belonging to the action that was actually taken.
        self.responsible_weight = tf.slice(self.output, self.action_holder, [1])
        log_weight = tf.log(self.responsible_weight)
        self.loss = -(log_weight * self.reward_holder)
        self.update = tf.train.GradientDescentOptimizer(learning_rate=lr).minimize(self.loss)

Training the agent:

In [12]:
# Start from an empty graph so re-running this cell does not pile up
# duplicate placeholders and variables.
tf.reset_default_graph()

# Create the contextual bandit environment.
cBandit = contextual_bandit()
# Create the agent, sized to the environment's states and actions.
myAgent = agent(lr=0.001, s_size=cBandit.num_bandits, a_size=cBandit.num_actions)
# The weights that are optimized during training to maximize reward
# (the first, and only, trainable variable created by the agent).
weights = tf.trainable_variables()[0]

# The total number of games played by the agent.
total_episodes = 10000
# Scoreboard: cumulative reward for every (bandit, action) pair, starting at zero.
total_rewards = np.zeros([cBandit.num_bandits, cBandit.num_actions])
# The chance of taking a random (exploratory) action.
e = 0.1

init = tf.global_variables_initializer()

with tf.Session() as sess:
    sess.run(init)
    for i in range(total_episodes):
        # Get a random state from the environment.
        s = cBandit.getBandit()

        # Epsilon-greedy: choose either a random action or the network's pick.
        if(np.random.rand(1) < e):
            action = np.random.randint(cBandit.num_actions)
        else:
            action = sess.run(myAgent.choosen_action, feed_dict={myAgent.state_in: [s]})

        # Perform the action (pull the arm) to observe the reward.
        reward = cBandit.pullArm(action)

        # Update the network with the observed reward for the action taken.
        feed_dict = {myAgent.reward_holder: [reward], myAgent.action_holder: [action], myAgent.state_in: [s]}
        sess.run(myAgent.update, feed_dict=feed_dict)

        # Update our scoreboard.
        total_rewards[s, action] += reward
        if(i%500==0):
            print("The average reward for each of the " + str(cBandit.num_bandits) + " bandits: " + str(np.mean(total_rewards, axis=1)))

    # Read the trained weights once, after the last update. Fetching them in
    # the same run() call as the update op would leave it unspecified whether
    # the pre- or post-update value is returned, and `ww` would not exist at
    # all if no episode were played.
    ww = sess.run(weights)

# Let's evaluate our actions: for each bandit, the learned action is the one
# with the largest weight and the best arm is the one with the lowest value.
for a in range(cBandit.num_bandits):
    best_action = np.argmax(ww[a])
    print("The most likely action is " + str(best_action+1) + " for bandit " + str(a+1))
    if(best_action == np.argmin(cBandit.bandits[a])):
        print(":) Yes, its correct!")
    else:
        print(":( No, it isn't!")

The average reward for each of the 3 bandits: [ 0.    0.25  0.  ]
The average reward for each of the 3 bandits: [ 31.25  41.75  34.75]
The average reward for each of the 3 bandits: [ 71.25  79.5   68.5 ]
The average reward for each of the 3 bandits: [ 108.5   120.    102.25]
The average reward for each of the 3 bandits: [ 147.5   155.5   137.25]
The average reward for each of the 3 bandits: [ 183.    189.75  174.  ]
The average reward for each of the 3 bandits: [ 217.25  230.75  212.25]
The average reward for each of the 3 bandits: [ 253.    268.75  247.  ]
The average reward for each of the 3 bandits: [ 290.75  307.    283.  ]
The average reward for each of the 3 bandits: [ 333.5   344.25  316.  ]
The average reward for each of the 3 bandits: [ 370.    382.25  353.  ]
The average reward for each of the 3 bandits: [ 408.75  425.75  380.75]
The average reward for each of the 3 bandits: [ 449.    465.25  414.5 ]
The average reward for each of the 3 bandits: [ 489.75  502.75  446.25]
The 