# Contextual Bandit Problem

## Reinforcement Learning approach

In [1]:
# dependencies
import sys
import os

import numpy as np
import tensorflow as tf

### The `ContextualBandit` (Slot machine)

In [2]:
class ContextualBandit():
    """
    Contextual Bandit environment.

    By default, three four-armed bandits are used: there are three
    bandits (the contexts/states) and each of them has four arms that
    can be pulled.

    Each bandit has different success probabilities for each arm, and
    as such requires different actions to obtain the best result.

    The agent should learn how to always choose the bandit-arm that
    will most often give a positive reward, depending on the bandit
    presented.

    -------- Attributes -------
    :bandits
        2-D float array of per-arm thresholds, one row per bandit and
        one column per arm.
    :state
        Index of the bandit currently presented (starts at 0).
    :num_bandits
        Number of rows in `bandits`.
    :num_actions
        Number of columns in `bandits`.

    -------- Methods --------
    getBandit:
        Picks a random bandit, stores it in `state` and returns it.
    pullBandit:
        Returns a reward (1 or -1) for an action on the current bandit.

    :param bandits: optional 2-D array-like of thresholds. When omitted,
        the built-in three-by-four table is used.
    :raises ValueError: if `bandits` is not a non-empty 2-D table.
    """

    def __init__(self, bandits=None):
        if bandits is None:
            bandits = [[.2,  0,  0,  -5],
                       [.1, -5,  1, .25],
                       [-5,  5,  5,   5]]
        self.bandits = np.array(bandits, dtype=float)
        if self.bandits.ndim != 2 or self.bandits.size == 0:
            raise ValueError('bandits must be a non-empty 2-D table of '
                             'shape (num_bandits, num_actions)')
        self.state = 0
        self.num_bandits = self.bandits.shape[0]
        self.num_actions = self.bandits.shape[1]

    def getBandit(self):
        """Select a random bandit, remember it as the current state and return its index."""
        self.state = np.random.randint(0, self.num_bandits)
        return self.state

    def pullBandit(self, action):
        """
        Returns a reward for an action on the current bandit.

        A random number is drawn from a standard normal distribution
        and compared with the threshold stored for the chosen arm.
        The lower that threshold, the more likely a positive reward
        will be returned.

        :param action: index of the arm to pull.
        :return: 1 if the draw exceeds the arm's threshold, otherwise -1.
        """
        threshold = self.bandits[self.state, action]
        # Draw a scalar rather than a 1-element array so the comparison
        # below yields a single boolean instead of relying on the
        # truthiness of an array.
        draw = np.random.randn()
        return 1 if draw > threshold else -1

### The `Agent` (Gambler)

In [3]:
class Agent():
    """
    Maps the current state (bandit index) to an action (arm index).

    The state is one-hot encoded and passed through a single fully
    connected layer with a sigmoid activation, no bias, and weights
    initialised to ones. Each output is therefore tied to one
    (bandit, arm) pair, and the chosen action is the arm with the
    largest output for the presented bandit.

    Training uses a policy-gradient style loss: the negative log of
    the output for the action that was taken, scaled by the received
    reward, minimised with Adam.

    Exposed graph nodes:
        state          int32 placeholder, shape [1]
        output         layer output flattened to 1-D
        action         argmax over `output`
        reward_holder  float32 placeholder, shape [1]
        action_holder  int32 placeholder, shape [1]
        weight         the single element of `output` at `action_holder`
        loss           mean of the negative log-weight times reward
        global_step    non-trainable step counter
        trainer        Adam minimise op (also increments `global_step`)

    :param learning_rate: float
    :param state_size: int
    :param action_size: int
    """

    def __init__(self, learning_rate, state_size, action_size):
        with tf.variable_scope('agent') as scope:
            # --- input: which bandit is being presented ---
            self.state = tf.placeholder(tf.int32, shape=[1], name='state')
            one_hot_state = tf.contrib.layers.one_hot_encoding(self.state, state_size)

            # --- single-layer feed-forward network ---
            activations = tf.contrib.layers.fully_connected(
                one_hot_state,
                action_size,
                activation_fn=tf.nn.sigmoid,
                weights_initializer=tf.ones_initializer(),
                biases_initializer=None,
                scope=scope,
            )

            # --- flattened network output and greedy action ---
            self.output = tf.reshape(activations, shape=[-1])
            self.action = tf.argmax(self.output, axis=0, name='action')

            # --- training inputs and the output picked for the taken action ---
            self.reward_holder = tf.placeholder(
                tf.float32, shape=[1], name='reward_holder')
            self.action_holder = tf.placeholder(
                tf.int32, shape=[1], name='action_holder')
            self.weight = tf.slice(
                self.output, self.action_holder, size=[1], name='weight')

            # --- loss: -log(selected output) * reward ---
            log_weight = tf.log(self.weight)
            scaled_neg_log = -(log_weight * self.reward_holder)
            self.loss = tf.reduce_mean(scaled_neg_log, name='loss')

            # --- optimisation ---
            self.global_step = tf.Variable(0, trainable=False, name='global_step')
            adam = tf.train.AdamOptimizer(learning_rate=learning_rate)
            self.trainer = adam.minimize(self.loss, global_step=self.global_step)

### Environment and Agent Setup

In [4]:
# Clear the default graph before building the agent's ops in it.
tf.reset_default_graph()

# The environment determines the size of the agent's input (number of
# bandits) and output (number of arms per bandit).
bandit = ContextualBandit()
agent = Agent(learning_rate=1e-2, 
              state_size=bandit.num_bandits, 
              action_size=bandit.num_actions)
# First trainable variable in the graph — presumably the fully connected
# layer's weight matrix, since Agent builds no bias and marks its step
# counter as non-trainable.
weights = tf.trainable_variables()[0]

### Tensorflow's Session

In [5]:
# Open a session and run the initializer op for the global variables
# so the graph can be evaluated.
sess = tf.Session()
init = tf.global_variables_initializer()
sess.run(init)

### Tensorboard

In [6]:
# Directory layout for this run:
#   saved/<env_name>/tensorboard/log  -> TensorBoard event files
#   saved/<env_name>/models           -> model checkpoints
env_name = f"{bandit.num_bandits}-armed-bandit{'s' if bandit.num_bandits > 1 else ''}"
saved_dir = os.path.join('saved', env_name)
tensorboard_dir = os.path.join(saved_dir, 'tensorboard')
logdir = os.path.join(tensorboard_dir, 'log')

# Record the agent's loss as a scalar summary and merge all summaries
# into one op that the training loop can evaluate.
tf.summary.scalar('agent/loss', agent.loss)
merged = tf.summary.merge_all()

model_dir = os.path.join(saved_dir, 'models')
model_path = os.path.join(model_dir, 'model.ckpt')

writer = tf.summary.FileWriter(logdir=logdir, graph=sess.graph)
saver = tf.train.Saver()

if tf.gfile.Exists(model_dir):
    # Restoring is best-effort: any failure is reported and training
    # simply continues from the freshly initialised variables.
    try:
        last_ckpt = tf.train.latest_checkpoint(model_dir)
        if last_ckpt is None:
            # The directory exists but holds no checkpoint yet. Report it
            # explicitly rather than letting `saver.restore` fail on a
            # `None` path and show up as an error.
            sys.stdout.write(f'INFO: No checkpoint found in {model_dir}')
            sys.stdout.flush()
        else:
            sys.stdout.write('INFO: Restoring latest checkpoint\n')

            saver.restore(sess=sess, save_path=last_ckpt)

            sys.stdout.write(f'INFO: Successfully restored last checkpoint — {last_ckpt}')
            sys.stdout.flush()
    except Exception as ex:
        sys.stderr.write(f'ERR: Could not load checkpoint. {ex}')
        sys.stderr.flush()
else:
    tf.gfile.MakeDirs(model_dir)

    sys.stdout.write(f'INFO: Checkpoint file does not exist.\n'
                     f'Creating checkpoint in {model_dir}')
    sys.stdout.flush()

INFO: Restoring latest checkpoint
INFO:tensorflow:Restoring parameters from saved/3-armed-bandits/models/model.ckpt-9901
INFO: Successfully restored last checkpoint — saved/3-armed-bandits/models/model.ckpt-9901

### Hyperparameters

In [7]:
# Number of training iterations; each one pulls a single arm and
# applies a single update to the agent.
episodes = 10000
# A checkpoint and a summary are written every `save_interval` episodes.
save_interval = 100
# Exploration rate: probability of pulling a random arm instead of the
# arm chosen by the agent.
e = 0.1
# Running reward total for every (bandit, arm) pair.
rewards = np.zeros(shape=[bandit.num_bandits, bandit.num_actions])

### Training the `Agent`

In [8]:
for i in range(episodes):
    # The environment presents a random bandit (the context).
    state = bandit.getBandit()

    # Epsilon-greedy: explore with probability `e`, otherwise take the
    # arm the agent currently rates highest for this bandit.
    if np.random.rand(1) < e:
        action = np.random.randint(bandit.num_actions)
    else:
        action = sess.run(agent.action, feed_dict={agent.state: [state]})
    
    reward = bandit.pullBandit(action)
    
    # One update on the (state, action, reward) triple just observed.
    feed_dict = {agent.reward_holder: [reward], 
                 agent.action_holder: [action],
                 agent.state: [state]}
    _, i_global, _weights = sess.run([agent.trainer, agent.global_step, weights], feed_dict=feed_dict)
    
    
    # Periodically checkpoint the model and log the loss summary.
    if i%save_interval == 0:
        saver.save(sess=sess, save_path=model_path, global_step=agent.global_step)
        summary = sess.run(merged, feed_dict=feed_dict)
        writer.add_summary(summary=summary, global_step=i_global)
        
    rewards[state, action] += reward
    # NOTE: the value printed as "Mean reward" is, per bandit, the mean
    # over its arms of the accumulated reward totals.
    sys.stdout.write(f'\rEpisode: {i+1:,}\tGlobal step: {i_global:,}'
                     f'\tMean reward: {np.mean(rewards, axis=1)}')
    sys.stdout.flush()

# The in-loop save only fires when `i % save_interval == 0`, so without
# this the updates made after the last such episode would never be
# checkpointed. Skip it when the final episode was already saved.
if episodes > 0 and (episodes - 1) % save_interval != 0:
    saver.save(sess=sess, save_path=model_path, global_step=agent.global_step)
# Push any buffered summaries to disk now that training has finished.
writer.flush()

Episode: 10,000	Global step: 19,901	Mean reward: [ 764.5  753.   701.5]5]