# The Multi-Armed Bandit Problem

### Dependencies

In [1]:
import numpy as np
import tensorflow as tf

### n-armed bandit

In [2]:
# Threshold of each bandit arm, listed in arm order. pull_bandit returns a
# reward of +1 when a standard-normal draw exceeds the arm's threshold, so
# the last arm (-5) pays out most often.
bandits = [
    0.2,
    0,
    -0.2,
    -5,
]
# Number of arms available to the agent.
num_bandits = len(bandits)

In [3]:
def pull_bandit(bandit):
    """
    Pull a bandit.

    Draws one sample from a standard normal distribution (mean 0,
    standard deviation 1) and compares it with the bandit's value.

    :param bandit: float
        The threshold value of the bandit being pulled.

    :return reward: int
        1 if the sample is strictly greater than ``bandit``, otherwise -1.
        The lower the bandit value, the more likely a positive reward is.
    """
    # Draw a scalar rather than a one-element array so the comparison
    # below yields a plain boolean instead of relying on the truth value
    # of an array.
    sample = np.random.randn()
    return 1 if sample > bandit else -1

## The Agent

In [7]:
# Start from an empty default graph so re-running this cell does not
# accumulate duplicate ops.
tf.reset_default_graph()

# Weights: one trainable value per bandit, all initialised to 1; each
#          value estimates how good a chosen (pulled) bandit is.
# Action:  the index of the bandit with the maximum weight value. This is
#          built with tf.argmax (not np.argmax) so that it is a graph op
#          evaluated against the current weights when run in a session.
weights = tf.Variable(tf.ones(shape=[num_bandits]))
choosen_action = tf.argmax(weights, axis=0)

# Reward: reward from pulling a bandit/lever
# Action: the action that led to the reward
reward_placeholder = tf.placeholder(tf.float32, shape=[1])
action_placeholder = tf.placeholder(tf.int32, shape=[1])

# Select the single weight that belongs to the action that was taken.
responsible_weight = tf.slice(weights, action_placeholder, [1])
print('Weights:', weights)
print('R weights:', responsible_weight)
# loss = -log(π) * A
loss = -tf.log(responsible_weight) * reward_placeholder

# Plain gradient descent on the loss above.
optimizer = tf.train.GradientDescentOptimizer(learning_rate=1e-3)
train = optimizer.minimize(loss)

Weights: <tf.Variable 'Variable:0' shape=(4,) dtype=float32_ref>
R weights: Tensor("Slice:0", shape=(1,), dtype=float32)


## Training the Agent