In [0]:
import numpy as np
import tensorflow as tf
import tensorflow.contrib.slim as slim
import random

In [0]:
# Number of slot machines; each machine is one state (context) of the bandit problem.
num_machines = 3
# Number of levers (actions) available on every machine.
num_levers = 4

class ContextualBandit:
  """Contextual multi-armed bandit environment.

  The environment holds a table ``bandits[machine][lever]`` of payout
  thresholds drawn uniformly from [-2, 2]. The higher a lever's threshold,
  the more likely pulling it yields a reward of +1 rather than -1.

  Attributes:
    state: index of the machine currently being played.
    bandits: list (one entry per machine) of lists (one entry per lever)
      of float payout thresholds.
  """

  def __init__(self, n_machines=None, n_levers=None):
    """Create the payout table.

    Args:
      n_machines: number of machines (states). Defaults to the module-level
        ``num_machines`` when omitted.
      n_levers: number of levers per machine. Defaults to the module-level
        ``num_levers`` when omitted.
    """
    if n_machines is None:
      n_machines = num_machines
    if n_levers is None:
      n_levers = num_levers
    self.state = 0
    self.bandits = [
        [random.uniform(-2, 2) for _ in range(n_levers)]
        for _ in range(n_machines)
    ]

  def get_bandit(self):
    """Pick a machine uniformly at random, store it as the current state and return its index."""
    # The range comes from this instance's own table rather than the
    # module-level num_machines, so changing that global after construction
    # cannot produce a state that is out of range for self.bandits.
    self.state = np.random.randint(0, len(self.bandits))
    return self.state

  def pull_arm(self, selected_lever):
    """Pull ``selected_lever`` on the current machine and return the reward.

    Args:
      selected_lever: index of the lever to pull on machine ``self.state``.

    Returns:
      1 if the lever's payout threshold exceeds a fresh standard-normal
      draw, otherwise -1.
    """
    # A standard-normal sample (not a probability): the lever pays off when
    # its threshold is above this draw.
    draw = np.random.normal(loc=0, scale=1, size=None)
    return 1 if self.bandits[self.state][selected_lever] > draw else -1

In [0]:
def to_onehot(num, total_size):
  """Return a float vector of length ``total_size`` that is 1 at index ``num`` and 0 elsewhere."""
  encoded = np.zeros((total_size,), dtype=float)
  encoded[num] = 1.0
  return encoded


class Agent:
  """Single-layer policy network that maps a discrete state to per-action scores.

  Attributes (tensors/ops created in the current default graph):
    state_in: int32 placeholder of shape [1] holding the state index.
    output: flattened sigmoid outputs of the fully connected layer, one per action.
    chosen_action: argmax over ``output``.
    reward_holder: float32 placeholder of shape [1] for the observed reward.
    action_holder: int32 placeholder of shape [1] for the action that was taken.
    responsible_weight: the single element of ``output`` at ``action_holder``.
    loss: ``-(log(responsible_weight) * reward_holder)``.
    update: gradient-descent op that minimizes ``loss``.
  """

  def __init__(self, lr, s_size, a_size):
    """Build the forward and training parts of the graph.

    Args:
      lr: learning rate passed to the gradient-descent optimizer.
      s_size: depth of the one-hot state encoding (number of states).
      a_size: number of outputs of the fully connected layer (number of actions).
    """
    # Feed-forward part: the agent takes a state index, one-hot encodes it
    # and produces one score per action.
    self.state_in = tf.placeholder(shape=[1], dtype=tf.int32)
    one_hot_state = slim.one_hot_encoding(self.state_in, s_size)
    # No bias and all-ones initial weights: every action starts out with the
    # same score for every state.
    action_scores = slim.fully_connected(
        one_hot_state,
        a_size,
        biases_initializer=None,
        activation_fn=tf.nn.sigmoid,
        weights_initializer=tf.ones_initializer(),
    )
    self.output = tf.reshape(action_scores, [-1])
    self.chosen_action = tf.argmax(self.output, 0)

    # Training procedure: the reward and the action that was taken are fed
    # in to compute the loss, which is then used to update the network.
    self.reward_holder = tf.placeholder(shape=[1], dtype=tf.float32)
    self.action_holder = tf.placeholder(shape=[1], dtype=tf.int32)
    # Pick out the score of the action that was actually taken.
    self.responsible_weight = tf.slice(self.output, self.action_holder, [1])
    self.loss = -(tf.log(self.responsible_weight) * self.reward_holder)
    sgd = tf.train.GradientDescentOptimizer(learning_rate=lr)
    self.update = sgd.minimize(self.loss)

In [22]:
# Clear the default graph so the Agent below is built into a fresh graph.
tf.reset_default_graph()

bandit = ContextualBandit()
agent = Agent(lr=0.001, s_size=num_machines, a_size=num_levers)
# First trainable variable of the fresh graph; the Agent builds a single
# bias-free fully connected layer, so this is that layer's weight matrix.
weights = tf.trainable_variables()[0]

total_episodes = 10000
# Running per-(machine, lever) reward sums and pull counts.
total_reward = np.zeros([num_machines, num_levers])
total_trials = np.zeros([num_machines, num_levers])

# Probability of exploring with a uniformly random lever instead of the
# agent's current best guess.
eps = 0.1

# tf.initialize_all_variables() is deprecated in TF 1.x;
# tf.global_variables_initializer() is its drop-in replacement.
init = tf.global_variables_initializer()

with tf.Session() as sess:
  sess.run(init)

  for episode in range(total_episodes):
    state = bandit.get_bandit()

    # Epsilon-greedy action selection.
    prob_random_action = random.uniform(0, 1)
    if prob_random_action < eps:
      action = np.random.randint(0, num_levers)
    else:
      action = sess.run(agent.chosen_action, feed_dict={agent.state_in: [state]})

    reward = bandit.pull_arm(action)

    # One gradient step on the loss for the (state, action, reward) just observed.
    sess.run(
        agent.update,
        feed_dict={
            agent.reward_holder: [reward],
            agent.action_holder: [action],
            agent.state_in: [state],
        },
    )

    total_reward[state, action] += reward
    total_trials[state, action] += 1

    # Report the cumulative reward table every 100 episodes.
    if episode % 100 == 0:
      print('Episode:', episode, 'rewards:')
      print(total_reward)
  print('Finally agent tried each action so many times')
  print(total_trials)

Episode: 0 rewards:
[[1. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]
Episode: 100 rewards:
[[26.  0. -4.  0.]
 [-8. -8. -7. -7.]
 [ 0. 30.  2.  1.]]
Episode: 200 rewards:
[[ 48.   2.  -4.  -1.]
 [-15. -15. -15. -15.]
 [  0.  57.   2.   1.]]
Episode: 300 rewards:
[[ 72.   2.  -6.  -2.]
 [-23. -23. -23. -23.]
 [  0.  86.   2.   1.]]
Episode: 400 rewards:
[[ 96.   3.  -6.  -5.]
 [-30. -31. -31. -30.]
 [  0. 108.   3.   2.]]
Episode: 500 rewards:
[[126.   4.  -7.  -6.]
 [-38. -37. -37. -37.]
 [  0. 131.   3.  -1.]]
Episode: 600 rewards:
[[151.   4.  -7.  -8.]
 [-45. -45. -44. -45.]
 [  0. 159.   5.   0.]]
Episode: 700 rewards:
[[182.   4.  -7.  -9.]
 [-52. -51. -52. -52.]
 [  2. 184.   5.   1.]]
Episode: 800 rewards:
[[207.   3.  -8.  -9.]
 [-59. -59. -60. -60.]
 [  4. 204.   7.   1.]]
Episode: 900 rewards:
[[234.   4.  -9.  -9.]
 [-70. -71. -70. -69.]
 [  3. 226.   7.   1.]]
Episode: 1000 rewards:
[[259.   4. -10. -10.]
 [-77. -77. -77. -76.]
 [  4. 254.   8.   3.]]
Episode: 1100 rewards:
[[