In [0]:
import tensorflow as tf
import numpy as np
import random

In [0]:
# Number of machines (bandit arms) we can choose from. Every machine pays a
# random reward, but each one has a different probability of success.
num_machines = 3

# Seed both random number generators this notebook draws from (NumPy's global
# generator and the stdlib `random` module) so that re-running the notebook
# from this cell onwards reproduces the same sequence of rewards and actions.
random_seed = 42
np.random.seed(random_seed)
random.seed(random_seed)

# One reward threshold per machine, evenly spaced over [-2, 2] and stored as a
# plain Python list so it can be indexed by the chosen action.
reward_thresholds = np.linspace(start=-2, stop=2, num=num_machines).tolist()

def get_reward(reward_threshold):
  """Sample a stochastic reward of +1 or -1 for one pull of a machine.

  A single value is drawn from a standard normal distribution; the pull
  succeeds when that draw falls strictly below `reward_threshold`, so a
  larger threshold means a higher chance of success.

  Args:
    reward_threshold: number the standard-normal draw is compared against.

  Returns:
    1 if the draw is below the threshold, otherwise -1.
  """
  draw = np.random.normal(loc=0.0, scale=1.0, size=None)  # one scalar sample from N(0, 1)
  if draw < reward_threshold:
    return 1
  return -1

In [3]:
# Quick sanity check of get_reward: show the configured thresholds, then print
# the rewards obtained for a handful of sampled thresholds.
print('reward_thresholds:', reward_thresholds)

# Note: these ten thresholds are drawn uniformly from [0, 1); they are not the
# per-machine values in `reward_thresholds`.
sampled_thresholds = np.random.uniform(0, 1, size=10)
sampled_rewards = [get_reward(threshold) for threshold in sampled_thresholds]
print(*sampled_rewards, sep='\n')

reward_thresholds: [-2.0, 0.0, 2.0]
1
-1
1
1
-1
1
1
1
1
1


In [0]:
# Let's build out our model

tf.reset_default_graph()

# One trainable weight per machine, all starting at 1. The greedy action is
# the index of the largest weight.
weights = tf.Variable(tf.ones([num_machines]))
chosen_action = tf.argmax(weights, axis=0)

# Fed at every training step: the reward that was observed and the index of
# the machine that produced it (each as a length-1 vector).
reward_holder = tf.placeholder(shape=[1], dtype=tf.float32)
action_holder = tf.placeholder(shape=[1], dtype=tf.int32)

# Pick out the single weight belonging to the action that was taken.
responsible_weight = tf.slice(input_=weights, begin=action_holder, size=[1])

# Loss is -log(w) * reward. A gradient step moves w by learning_rate * reward / w,
# so a run of negative rewards can push w to zero or below, where log() returns
# -inf/NaN and the NaN would be written back into `weights` for good. Clamping
# the argument of log() keeps the loss finite; while w >= min_weight the value
# and gradient are exactly the same as without the clamp (below it the gradient
# for that weight is zero, so the weight simply stops moving).
min_weight = 1e-8
loss = -(tf.log(tf.maximum(responsible_weight, min_weight)) * reward_holder)
optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.001)
update_model = optimizer.minimize(loss)

In [11]:
# Let's train our model

num_episodes = 1000                      # number of pulls performed during training
total_reward = np.zeros(num_machines)    # cumulative reward collected from each machine
num_trials = np.zeros(num_machines)      # how many times each machine has been played

eps = 0.1                                # probability of picking a random machine (exploration)

# tf.initialize_all_variables() is deprecated; tf.global_variables_initializer()
# is its direct replacement and returns the same kind of init op.
init = tf.global_variables_initializer()


def select_action(sess, eps):
  """Choose a machine with an epsilon-greedy rule.

  Args:
    sess: session object whose `run` method evaluates `chosen_action`.
    eps: exploration probability; a uniform draw from [0, 1] below this
      value triggers a random choice.

  Returns:
    A uniformly random index in [0, num_machines) when exploring, otherwise
    the result of evaluating `chosen_action` in `sess`.
  """
  explore = random.uniform(0, 1) < eps
  if not explore:
    # Exploit: ask the graph for the machine with the largest weight.
    return sess.run(chosen_action)
  # Explore: ignore the weights and pick any machine.
  return np.random.randint(low=0, high=num_machines, size=None)


with tf.Session() as sess:
  # Run the init op so the variables hold their starting values.
  sess.run(init)

  for episode in range(num_episodes):
    # Epsilon-greedy choice of machine, then sample that machine's reward.
    action = select_action(sess, eps)
    reward = get_reward(reward_thresholds[action])

    # One optimizer step on the weight of the machine that was just played.
    step_inputs = {reward_holder: [reward], action_holder: [action]}
    sess.run(update_model, feed_dict=step_inputs)

    # Per-machine bookkeeping.
    num_trials[action] += 1
    total_reward[action] += reward

    # Report progress on every 20th episode.
    if not episode % 20:
      print('episode:', episode, 'rewards:', total_reward)

print('The actions have been tried for following number of times', num_trials)

episode: 0 rewards: [-1.  0.  0.]
episode: 20 rewards: [-1.  0. 16.]
episode: 40 rewards: [-4.  0. 33.]
episode: 60 rewards: [-4.  0. 49.]
episode: 80 rewards: [-4.  0. 69.]
episode: 100 rewards: [-5.  0. 86.]
episode: 120 rewards: [ -5.  -1. 105.]
episode: 140 rewards: [ -6.  -2. 121.]
episode: 160 rewards: [ -6.  -3. 138.]
episode: 180 rewards: [ -7.  -3. 151.]
episode: 200 rewards: [ -8.  -2. 169.]
episode: 220 rewards: [ -8.  -2. 187.]
episode: 240 rewards: [ -8.  -2. 207.]
episode: 260 rewards: [ -8.  -2. 227.]
episode: 280 rewards: [ -9.  -2. 246.]
episode: 300 rewards: [ -9.  -2. 266.]
episode: 320 rewards: [ -9.  -2. 284.]
episode: 340 rewards: [ -9.  -3. 303.]
episode: 360 rewards: [-13.  -2. 316.]
episode: 380 rewards: [-13.  -2. 334.]
episode: 400 rewards: [-14.  -1. 348.]
episode: 420 rewards: [-14.  -1. 366.]
episode: 440 rewards: [-15.  -2. 384.]
episode: 460 rewards: [-16.  -2. 401.]
episode: 480 rewards: [-17.  -3. 415.]
episode: 500 rewards: [-18.  -3. 430.]
episode: 5