In [1]:
import tensorflow as tf
import numpy as np

  from ._conv import register_converters as _register_converters


# The Contextual Bandits
Here we define our contextual bandits. In this example, we are using three four-armed bandits. What this means is that each bandit has four arms that can be pulled. Each bandit has different success probabilities for each arm, and as such requires different actions to obtain the best result. The `pullArm` method draws a random number from a normal distribution with a mean of 0 and returns a positive reward when that number exceeds the value stored for the chosen arm. The lower an arm's value, the more likely a positive reward will be returned. We want our agent to learn to always choose the bandit-arm that will most often give a positive reward, depending on the bandit presented.

In [2]:
class contextual_bandit():
    """Environment made of several multi-armed bandits.

    Each row of ``bandits`` is one bandit (the state) and each column holds
    the value for one arm. A pull pays 1 when a standard-normal draw exceeds
    that value and -1 otherwise, so lower values pay out more often.
    """

    def __init__(self):
        # Index of the bandit currently presented to the agent.
        self.state = 0
        # One row per bandit, one column per arm. The lowest entry in each
        # row marks the best arm: arms 4, 2 and 1, respectively.
        arm_values = [
            [0.2, 0, -0.0, -5],
            [0.1, -5, 1, 0.25],
            [-5, 5, 5, 5],
        ]
        self.bandits = np.array(arm_values)
        self.num_bandits, self.num_actions = self.bandits.shape

    def getBandit(self):
        """Pick a bandit at random, store it as the state and return it."""
        bandit_count = self.bandits.shape[0]
        self.state = np.random.randint(0, bandit_count)
        return self.state

    def pullArm(self, action):
        """Pull arm ``action`` of the current bandit and return 1 or -1."""
        arm_value = self.bandits[self.state, action]
        draw = np.random.randn(1)
        if draw > arm_value:
            return 1
        return -1

### The policy-based agent
The code below establishes our simple neural agent. It takes as input the current state and returns an action. This allows the agent to take actions which are conditioned on the state of the environment, a critical step toward being able to solve full RL problems. The agent uses a single set of weights, within which each value is an estimate of the value of the return from choosing a particular arm given a bandit. We use a policy gradient method to update the agent by moving the value for the selected action toward the received reward.

In [3]:
class agent():
    """Single-layer network that maps a state index to an action.

    Args:
        lr: learning rate passed to the gradient-descent optimizer.
        s_size: depth of the one-hot encoding applied to the state.
        a_size: number of outputs of the fully connected layer.
    """

    def __init__(self, lr, s_size, a_size):
        # Forward pass: the integer state is one-hot encoded and pushed
        # through one bias-free, sigmoid-activated layer whose weights
        # start at one.
        self.state_in = tf.placeholder(dtype = tf.int32, shape = [1])
        one_hot_state = tf.one_hot(self.state_in, s_size)
        layer_out = tf.contrib.layers.fully_connected(
            one_hot_state,
            a_size,
            activation_fn = tf.nn.sigmoid,
            weights_initializer = tf.ones_initializer(),
            biases_initializer = None,
        )
        # Flatten to 1-D so the greedy action is a plain argmax.
        self.output = tf.reshape(layer_out, [-1])
        self.chosen_action = tf.argmax(self.output, 0)

        # Training: the reward and the action that was taken are fed in,
        # the output entry for that action is sliced out, and the loss
        # -(log(entry) * reward) is minimised by gradient descent.
        self.reward_holder = tf.placeholder(dtype = tf.float32, shape = [1])
        self.action_holder = tf.placeholder(dtype = tf.int32, shape = [1])
        self.responsible_weight = tf.slice(self.output, self.action_holder, [1])
        log_weight = tf.log(self.responsible_weight)
        self.loss = -(log_weight * self.reward_holder)
        sgd = tf.train.GradientDescentOptimizer(learning_rate = lr)
        self.update = sgd.minimize(self.loss)

### Training the agent
We will train our agent by getting a state from the environment, taking an action, and receiving a reward. Using these three things, we can know how to properly update our network in order to more often choose actions given states that will yield the highest rewards over time.

In [4]:
# Build a fresh graph, then train the agent on the contextual bandit with an
# epsilon-greedy policy and report which arm it prefers for each bandit.
tf.reset_default_graph()

cBandit = contextual_bandit()
myAgent = agent(lr = 0.001, s_size = cBandit.num_bandits, a_size = cBandit.num_actions)
# The agent's single layer has no biases, so its weight matrix is the first
# trainable variable in the graph.
weights = tf.trainable_variables()[0]

total_episodes = 10000
# Cumulative reward collected per (bandit, action) pair.
total_reward = np.zeros([cBandit.num_bandits, cBandit.num_actions])
e = 0.1  # chance of taking a random action

init = tf.global_variables_initializer()

with tf.Session() as sess:
    sess.run(init)
    for i in range(total_episodes):
        s = cBandit.getBandit()  # Get a state from the environment

        # Choose either a random action or one from our network
        if np.random.rand() < e:
            action = np.random.randint(cBandit.num_actions)
        else:
            action = sess.run(
                myAgent.chosen_action, feed_dict = {
                    myAgent.state_in: [s]
                }
            )
        # Get our reward for this action given this bandit
        reward = cBandit.pullArm(action)
        # Update our network
        feed_dict = {
            myAgent.reward_holder: [reward],
            myAgent.action_holder: [action],
            myAgent.state_in: [s],
        }
        sess.run(myAgent.update, feed_dict = feed_dict)
        # Update our running tally of scores
        total_reward[s, action] += reward
        if i % 500 == 0:
            # total_reward is a cumulative tally, so this prints the running
            # total averaged across each bandit's arms.
            print("Mean reward for each of the %d bandits: %s" % (
                cBandit.num_bandits, str(np.mean(total_reward, axis = 1))
            ))
    # Read the trained weights once, after the last update, instead of
    # fetching them on every step. This also keeps `ww` defined when
    # total_episodes is 0.
    ww = sess.run(weights)

for a in range(cBandit.num_bandits):
    best_action = np.argmax(ww[a])
    print("The agent thinks action %d for bandit %d is the most promising..." % (
        best_action+1, a+1
    ))
    # The arm with the lowest value pays out most often, so it is the target.
    if best_action == np.argmin(cBandit.bandits[a]):
        print("... and it was right!")
    else:
        print("... and it was wrong!")

Instructions for updating:
Use the retry module or similar alternatives.
Mean reward for each of the 3 bandits: [ 0.   -0.25  0.  ]
Mean reward for each of the 3 bandits: [38.5  34.5  36.25]
Mean reward for each of the 3 bandits: [79.25 72.75 66.75]
Mean reward for each of the 3 bandits: [122.25 109.5   97.  ]
Mean reward for each of the 3 bandits: [159.5  148.25 131.  ]
Mean reward for each of the 3 bandits: [196.   187.5  169.25]
Mean reward for each of the 3 bandits: [238.75 222.   206.5 ]
Mean reward for each of the 3 bandits: [277.   254.   248.25]
Mean reward for each of the 3 bandits: [311.75 294.75 285.75]
Mean reward for each of the 3 bandits: [353.25 332.5  320.  ]
Mean reward for each of the 3 bandits: [387.75 371.   357.5 ]
Mean reward for each of the 3 bandits: [427.75 409.   392.5 ]
Mean reward for each of the 3 bandits: [461.75 452.   430.  ]
Mean reward for each of the 3 bandits: [500.5  490.25 467.5 ]
Mean reward for each of the 3 bandits: [541.   524.75 503.  ]
Mean r