In [1]:
import numpy as np
import tensorflow as tf

# Short alias for the `tf.contrib.slim` module.
slim = tf.contrib.slim

### The `ContextualBandit` (Slot machine)

In [None]:
class ContextualBandit():
    """
    Contextual Bandit environment.

    Three four-armed bandits are used here: each row of ``bandits``
    is one bandit (a state) and each column is one of the four arms
    that can be pulled on that bandit.

    Each bandit has a different success threshold for each arm, and
    as such requires different actions to obtain the best result.

    The agent should learn how to always choose the bandit-arm that
    will most often give a positive reward, depending on the bandit
    presented.

    -------- Attributes -------
    :bandits
        2-D array of per-arm thresholds (rows: bandits, columns: arms).
        The lower the threshold, the more likely a positive reward.
    :state
        Row index of the bandit currently presented.
    :num_bandits
        Number of bandits (rows of ``bandits``).
    :num_actions
        Number of arms per bandit (columns of ``bandits``).

    -------- Methods --------
    getBandit:
        Returns a random state.
    pullBandit:
        Returns a reward for an action.
    """

    def __init__(self):
        self.bandits = np.array([[.2,  0, -0,  -5],
                                 [.1, -5,  1, .25],
                                 [-5,  5,  5,   5]])
        self.state = 0
        self.num_bandits = self.bandits.shape[0]
        self.num_actions = self.bandits.shape[1]

    def getBandit(self):
        """
        Returns a random state.

        A bandit index is drawn uniformly from ``[0, num_bandits)``,
        stored in ``self.state`` and returned.
        """
        # randint (not randn): we need an integer row index into
        # `bandits`; the upper bound is exclusive.
        self.state = np.random.randint(0, len(self.bandits))
        return self.state

    def pullBandit(self, action):
        """
        Returns a reward for an action.

        A random number from a standard normal distribution (mean 0)
        is generated and compared with the threshold of the chosen arm
        on the current bandit. The lower the threshold, the more
        likely a positive reward will be returned.

        :param action: int, index of the arm to pull
        :return: 1 if the sample exceeds the threshold, otherwise -1
        """
        threshold = self.bandits[self.state, action]
        # Scalar draw, so the comparison yields a plain boolean.
        sample = np.random.randn()
        return 1 if sample > threshold else -1

### The `Agent` (Gambler)

In [None]:
class Agent():
    """
    The agent takes as input the current state, and returns an action.

    This allows the agent to take actions which are conditioned
    on the state of the environment, a critical step toward being
    able to solve full RL problems.

    The agent uses a single set of weights, within which each
    value is an estimate of the value of the return from choosing
    a particular arm given a bandit.

    A policy gradient method is used to update the agent
    by moving the value for the selected action toward the
    received reward.

    :param learning_rate: float
    :param state_size: int
    :param action_size: int

    -------- Attributes -------
    :state
        int32 placeholder of shape [1] for the current bandit index.
    :output
        Flattened network output, one value per action.
    :action
        Index of the largest entry of ``output``.
    :reward_holder
        float32 placeholder of shape [1] for the received reward.
    :action_holder
        int32 placeholder of shape [1] for the action that was taken.
    :weight
        The single entry of ``output`` selected by ``action_holder``.
    :trainer
        Op that applies one Adam step minimising the loss.
    """

    def __init__(self, learning_rate, state_size, action_size):
        # inputs: the state index is one-hot encoded before being
        # fed to the network
        self.state = tf.placeholder(tf.int32, shape=[1])
        state_oh = tf.contrib.layers.one_hot_encoding(self.state, state_size)
        # feed forward net: a single bias-free layer with sigmoid
        # activation and all weights initialised to one
        output = tf.contrib.layers.fully_connected(state_oh, action_size,
                                                   activation_fn=tf.nn.sigmoid,
                                                   weights_initializer=tf.ones_initializer(),
                                                   biases_initializer=None)
        # action & network output: flatten, then pick the largest entry
        self.output = tf.reshape(output, shape=[-1])
        self.action = tf.argmax(self.output, axis=0)
        # reward, action, weights: slice out the output entry that
        # belongs to the action actually taken
        self.reward_holder = tf.placeholder(tf.float32, shape=[1])
        self.action_holder = tf.placeholder(tf.int32, shape=[1])
        self.weight = tf.slice(self.output, self.action_holder, size=[1])
        # loss function: negative log of the selected output, scaled
        # by the reward
        loss = -(tf.log(self.weight) * self.reward_holder)
        # Optimizer
        global_step = tf.Variable(0, trainable=False, name='global_step')
        # Name must match the one used on the next line; it was
        # previously misspelled, raising NameError on construction.
        optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
        self.trainer = optimizer.minimize(loss, global_step=global_step)