# Contextual Bandit
[reference](https://medium.com/emergent-future/simple-reinforcement-learning-with-tensorflow-part-1-5-contextual-bandits-bff01d1aad9c)

[ *bandit problem*, *stateful without transition* ]

![](block.png)

Basically, we have multiple bandits with multiple arms. Which bandit's arm are we pulling? That is the context or **state** in RL terms. This state is given as input to our neural network which produces an action based on the state.

In [2]:
import tensorflow as tf
import numpy as np

In [64]:
class ContextualBandit(object):
    
    def __init__(self):
        self.state = 0
        self.bandits = np.array([[0.2,0,-0.0,-5],[0.1,-5,1,0.25],[-5,5,5,5]])
        self.num_bandits = self.bandits.shape[0]
        self.num_actions = self.bandits.shape[1]
        
    def sampleBandit(self): # return the index of a random state
        self.state = np.random.randint(self.num_bandits)
        return self.state
    
    def stateOneHot(self):
        one_hot_state = np.zeros(self.num_bandits)
        one_hot_state[self.state] = 1
        return one_hot_state
        
    def pullArm(self, action):
        return 1 if self.bandits[self.state][action] < np.random.randn(1) else -1  

In [65]:
class Agent(object):
    
    def __init__(self, lr, num_bandits, num_actions):
        self.state = tf.placeholder(shape=[1], dtype=tf.int32)
        W = tf.Variable(tf.zeros([num_bandits, num_actions]))
        b = tf.Variable(tf.zeros([num_actions]))
        y = tf.nn.softmax(tf.nn.sigmoid(tf.gather(W,self.state) + b))
        # flatten
        y = tf.reshape(y, [-1])
        
        # action
        self.action = tf.argmax(y,0)
        
        # train function
        self.reward_ = tf.placeholder(shape=[1], dtype=tf.float32)
        self.action_ = tf.placeholder(shape=[1], dtype=tf.int32)
        credit_y = tf.slice(y, self.action_, [1])
        loss = -(tf.log(credit_y)*self.reward_)
        self.train_fn = tf.train.GradientDescentOptimizer(learning_rate=lr).minimize(loss)

In [66]:
# params
tf.reset_default_graph()

cbandit = ContextualBandit()
agent = Agent(lr=0.001, num_bandits=cbandit.num_bandits, num_actions=cbandit.num_actions)
num_epi = 10000
rall = np.zeros(cbandit.bandits.shape)
e = 0.1

## Training

In [67]:
init = tf.initialize_all_variables()
with tf.Session() as sess:
    # init session
    sess.run(init)
    for i in range(num_epi):
        # get state
        state_v = cbandit.sampleBandit()
        # choose an action
        if np.random.randn(1) > e:
            action_v = sess.run(agent.action, feed_dict = {agent.state : [state_v]})
        else:
            action_v = np.random.randint(cbandit.num_actions)
        # let us see what the reward is, for the chosen action
        reward_v = cbandit.pullArm(action_v)
        # update the neural network (agent) based on the reward and
        #  the aciton that led to it
        sess.run(agent.train_fn, feed_dict = {
                agent.state : [state_v],
                agent.reward_ : [reward_v],
                agent.action_ : [action_v]
            })
        # accumulate reward
        rall[cbandit.state][action_v] += reward_v
        
        if i % 1000 == 0:
            print('Reward status : {}'.format(rall))

Reward status : [[ 0.  0.  0.  0.]
 [ 0.  0.  0.  0.]
 [ 1.  0.  0.  0.]]
Reward status : [[  -5.    5.   -7.  192.]
 [  -3.   42.  -33.  -14.]
 [ 198.  -41.  -53.  -44.]]
Reward status : [[  -7.   10.  -12.  400.]
 [ -20.   91.  -72.  -25.]
 [ 390.  -83.  -99.  -92.]]
Reward status : [[ -13.   10.    4.  601.]
 [ -29.  135. -107.  -29.]
 [ 584. -136. -129. -130.]]
Reward status : [[ -21.    1.   -1.  782.]
 [ -33.  192. -127.  -24.]
 [ 794. -176. -167. -179.]]
Reward status : [[ -35.   -6.    0.  975.]
 [ -64.  233. -166.  -26.]
 [ 994. -216. -207. -229.]]
Reward status : [[  -43.     2.     6.  1184.]
 [  -77.   284.  -191.   -48.]
 [ 1191.  -258.  -250.  -273.]]
Reward status : [[  -53.    -8.    16.  1386.]
 [ -124.   342.  -212.   -44.]
 [ 1382.  -304.  -283.  -331.]]
Reward status : [[  -70.   -11.    15.  1595.]
 [ -130.   395.  -247.   -51.]
 [ 1556.  -338.  -320.  -381.]]
Reward status : [[  -73.    -6.    22.  1788.]
 [ -152.   433.  -281.   -53.]
 [ 1732.  -382.  -367.  -428

In [73]:
for i,item in enumerate(rall):
    print('The best action for bandit #{0} is action #{1}'.format(i+1, np.argmax(item) + 1))

The best action for bandit #1 is action #4
The best action for bandit #2 is action #2
The best action for bandit #3 is action #1
