In [1]:
#Multi-armed bandit exploration

#import statements
import numpy as np

In [3]:
#set up all valid parameters
#for now we'll be going by the example given in this video:
#https://www.youtube.com/watch?v=e3L4VocZnnQ
#later on in the code we'll go over solutions for k-armed bandits

# Scenario parameters for the restaurant example.
num_meals = 300                   # total number of meals to simulate
means = [10, 8, 5]                # mean happiness of each restaurant
stdevs = [5, 4, 2.5]              # standard deviation of happiness, aligned index-by-index with means
optimal = num_meals * max(means)  # optimal happiness value on average (always eating at the best restaurant)

In [10]:
#the first naive strategy: explore only
def basic_naive_explore():
  """Return the expected regret of the explore-only strategy.

  The meals are split evenly across all restaurants and each visit is valued
  at that restaurant's mean happiness, so no random sampling is involved and
  the result is deterministic. Reads the module-level means, num_meals and
  optimal.

  Returns:
    optimal minus the expected total happiness of the even split.
  """
  # every restaurant gets the same (possibly fractional) share of the meals
  visits_each = num_meals / len(means)

  # expected happiness earned at each restaurant, summed in restaurant order
  expected_total = sum([visits_each * mu for mu in means])

  return optimal - expected_total
basic_naive_explore()

700.0

In [13]:
#the second naive strategy: exploit only
def basic_naive_exploit():
  """Simulate one run of the exploit-only strategy and return its regret.

  Every restaurant is visited exactly once; the one with the best first
  visit is then chosen for all remaining meals. Happiness values are drawn
  from np.random.normal using the module-level means and stdevs, so the
  result varies from run to run. Also reads num_meals and optimal.

  Returns:
    optimal minus the total happiness collected over the whole run.
  """
  # one trial meal per restaurant
  first_visits = []
  for mu, sigma in zip(means, stdevs):
    first_visits.append(np.random.normal(mu, sigma, 1)[0])

  # commit to whichever restaurant looked best on its single trial
  favourite = first_visits.index(max(first_visits))
  fav_mean, fav_std = means[favourite], stdevs[favourite]

  # the trial meals count toward the total as well
  total = sum(first_visits)

  remaining = num_meals - len(means)
  while remaining > 0:
    total = total + np.random.normal(fav_mean, fav_std, 1)[0]
    remaining -= 1

  return optimal - total
basic_naive_exploit()

697.7241573114848

In [20]:
#the epsilon greedy strategy
# Exploration happens on 1 out of every `epsilon` meals (a uniform draw from
# 0..epsilon-1 equal to 0), so epsilon = 10 gives a ten percent chance of exploration.
epsilon = 10

def basic_epsilon_greedy():
  """Simulate one run of the epsilon-greedy strategy and return its regret.

  Every restaurant is visited once to seed its running average. For each
  remaining meal, with probability 1/epsilon a restaurant is chosen uniformly
  at random (explore); otherwise the restaurant with the highest running
  average is chosen (exploit). Every meal, including the seeding visits,
  counts toward happiness and updates the chosen restaurant's running
  average. Reads the module-level means, stdevs, num_meals, optimal and
  epsilon; happiness values come from np.random.normal, so results vary
  from run to run.

  Returns:
    optimal minus the total happiness collected over the whole run.
  """
  num_restaurants = len(means)

  # one seeding visit per restaurant; these meals are part of the run, so
  # they are added to happiness just like in basic_naive_exploit
  samples = [np.random.normal(i, j, 1)[0] for i, j in zip(means, stdevs)]
  happiness = sum(samples)
  averages = list(samples)
  num_values = [1] * num_restaurants

  meals_left = num_meals - num_restaurants
  # in one go we have all of our explore/exploit decisions; a 0 means explore
  epsilon_preds = np.random.randint(0, epsilon, meals_left)

  for eps in epsilon_preds:
    if eps == 0:
      # explore: any restaurant, chosen uniformly
      res = np.random.randint(0, num_restaurants, 1)[0]
    else:
      # exploit: best running average so far (ties go to the lowest index)
      res = max(range(num_restaurants), key=averages.__getitem__)

    res_score = np.random.normal(means[res], stdevs[res], 1)[0]
    happiness = happiness + res_score

    # incremental mean: new_avg = old_avg + (score - old_avg) / n
    num_values[res] = num_values[res] + 1
    averages[res] = averages[res] + (res_score - averages[res]) / num_values[res]

  regret = optimal - happiness
  return regret
basic_epsilon_greedy()

141.53532193260753