# The Multi-Armed Bandit Problem

### Dependencies

In [1]:
import numpy as np
import tensorflow as tf

### n-armed bandit

In [2]:
# Threshold of each arm. `pull_bandit` pays +1 when a standard-normal draw
# exceeds the threshold, so the lowest value is the most rewarding arm.
bandits = [
    0.2,
    0,
    -0.2,
    -5,
]
num_bandits = len(bandits)

In [3]:
def pull_bandit(bandit):
    """
    Sample the reward of a single pull.

    One value is drawn from the standard normal distribution
    (mean 0, unit variance) and compared against the threshold
    of the pulled bandit.

    :param bandit: float
        Threshold of the bandit being pulled.

    :return reward: int
        1 when the drawn value is strictly greater than
        `bandit`, -1 otherwise. A lower threshold therefore
        makes a positive reward more likely.
    """
    draw = np.random.randn(1)
    if draw > bandit:
        return 1
    return -1

## The Agent

In [4]:
# Start from an empty default graph so re-running this cell does not
# stack duplicate ops on top of a previous definition.
tf.reset_default_graph()

# Weights: one value per bandit, estimating how good pulling
#          that bandit is. Every estimate starts at 1.
# Action:  the index of the bandit with the largest weight.
initial_weights = tf.ones([num_bandits])
weights = tf.Variable(initial_weights)
choosen_action = tf.argmax(weights, axis=0)

# Reward: the reward obtained from pulling a bandit/lever.
# Action: the action that led to that reward.
reward_placeholder = tf.placeholder(dtype=tf.float32, shape=(1,))
action_placeholder = tf.placeholder(dtype=tf.int32, shape=(1,))
# One-element slice of `weights` belonging to the action that was taken.
responsible_weight = tf.slice(weights, begin=action_placeholder, size=[1])

# loss = -log(π) * A
log_weight = tf.log(responsible_weight)
loss = tf.negative(log_weight) * reward_placeholder

# Plain gradient descent on the loss above.
optimizer = tf.train.GradientDescentOptimizer(learning_rate=1e-3)
train = optimizer.minimize(loss)

## Training the Agent

### TensorFlow's `Session`

In [5]:
# Op that assigns every global variable (here: `weights`) its initial value.
init = tf.global_variables_initializer()

# The session stays open because the training cell below reuses it.
sess = tf.Session()
sess.run(init)

### Hyperparameters

In [6]:
# episodes:     number of pulls performed during training.
# log_interval: a progress line is printed every `log_interval` episodes.
episodes, log_interval = 1000, 100

# Running total of the rewards collected from each bandit.
rewards = np.zeros(num_bandits)

# Epsilon: exploration threshold. The agent takes a random action whenever
# the number sampled in the training loop is greater than `e`; otherwise it
# picks the bandit with the largest weight.
e = 0.7

### Training

In [7]:
print('* - random action')

for i in range(episodes):
    # Explore or exploit.
    # The draw is uniform on [0, 1) so that `e` behaves as a probability:
    # the agent exploits with probability `e` and explores with
    # probability `1 - e`. (A standard-normal draw, as used previously,
    # does not map `e` onto a probability.)
    if np.random.rand() > e:
        action = np.random.randint(num_bandits)
        is_random = True
    else:
        action = sess.run(choosen_action)
        is_random = False
    # Reward from taking that action
    reward = pull_bandit(bandits[action])
    # Train the net: one gradient step on the weight of the pulled bandit.
    sess.run(train,
             feed_dict={reward_placeholder: [reward],
                        action_placeholder: [action]})
    rewards[action] += reward
    # Logging
    if i % log_interval == 0:
        rand_msg = '*' if is_random else ''
        print(f'{i}: Bandit {action} looks good\tRewards: {rewards}{rand_msg}')

# Final verdict.
# Read the weights once training is over, so the values are guaranteed to
# include the last update, and report the bandit the agent actually prefers
# (largest weight) rather than whatever action the last episode happened to
# take, which may have been a random exploration step.
final_weights = sess.run(weights)
best_bandit = np.argmax(final_weights)
print(f'\nAgent says bandit {best_bandit} is the best...')
# The lowest threshold gives the highest chance of a positive reward.
if best_bandit == np.argmin(bandits):
    print('...and it was right!')
else:
    print('...and it was wrong!')

* - random action
0: Bandit 0 looks good	Rewards: [-1.  0.  0.  0.]
100: Bandit 0 looks good	Rewards: [ -3.   1.  -2.  75.]*
200: Bandit 3 looks good	Rewards: [  -4.    5.    1.  155.]
300: Bandit 3 looks good	Rewards: [  -3.    4.    5.  239.]
400: Bandit 3 looks good	Rewards: [  -1.    6.    5.  323.]
500: Bandit 3 looks good	Rewards: [  -4.    9.    2.  404.]
600: Bandit 3 looks good	Rewards: [  -5.    8.    7.  487.]
700: Bandit 3 looks good	Rewards: [  -6.   11.    9.  565.]
800: Bandit 0 looks good	Rewards: [ -11.   13.   16.  647.]*
900: Bandit 3 looks good	Rewards: [ -14.   13.   13.  731.]

Agent says bandit 3 is the best...
...and it was right!
