In [19]:
import numpy as np

def pull_bandit_arm(bandits, bandit_number):
  """Pull arm in position bandit_number and return the obtained reward."""
  result = np.random.uniform()
  return int(result <= bandits[bandit_number])

In [20]:
import numpy as np

def take_epsilon_greedy_action(epsilon, average_rewards):
    """Take random action with probability epsilon, else take best action."""
    result = np.random.uniform()
    if result < epsilon:
        return np.random.randint(0, len(average_rewards))  # Random action
    else:
        return np.argmax(average_rewards)  # Greedy action

In [21]:
# Probability of success of each bandit
bandits = [0.1, 0.3, 0.05, 0.55, 0.4]
num_iterations = 1000
epsilon = 0.1

# Store info to know which one is the best action in each moment
total_rewards = [0 for _ in range(len(bandits))]
total_attempts = [0 for _ in range(len(bandits))]
average_rewards = [0.0 for _ in range(len(bandits))]

for iteration in range(num_iterations+1):
  action = take_epsilon_greedy_action(epsilon, average_rewards)
  reward = pull_bandit_arm(bandits, action)
  
  # Store result
  total_rewards[action] += reward
  total_attempts[action] += 1
  average_rewards[action] = total_rewards[action] / float(total_attempts[action])
  
  if iteration % 100 == 0:
    print('Average reward for bandits in iteration {} is {}'.format(iteration,
                                  ['{:.2f}'.format(elem) for elem in average_rewards]))

# Print results
best_bandit = np.argmax(average_rewards)
print('\nBest bandit is {} with an average observed reward of {:.4f}'
      .format(best_bandit, average_rewards[best_bandit]))
print('Total observed reward in the {} episodes has been {}'
      .format(num_iterations, sum(total_rewards)))

Average reward for bandits in iteration 0 is ['0.00', '0.00', '0.00', '0.00', '0.00']
Average reward for bandits in iteration 100 is ['0.16', '0.67', '0.00', '0.49', '0.00']
Average reward for bandits in iteration 200 is ['0.16', '0.45', '0.00', '0.53', '0.33']
Average reward for bandits in iteration 300 is ['0.15', '0.45', '0.00', '0.54', '0.33']
Average reward for bandits in iteration 400 is ['0.15', '0.46', '0.00', '0.56', '0.17']
Average reward for bandits in iteration 500 is ['0.15', '0.43', '0.08', '0.55', '0.12']
Average reward for bandits in iteration 600 is ['0.16', '0.40', '0.13', '0.56', '0.11']
Average reward for bandits in iteration 700 is ['0.16', '0.38', '0.13', '0.54', '0.11']
Average reward for bandits in iteration 800 is ['0.16', '0.32', '0.12', '0.54', '0.17']
Average reward for bandits in iteration 900 is ['0.17', '0.38', '0.10', '0.55', '0.19']
Average reward for bandits in iteration 1000 is ['0.16', '0.33', '0.08', '0.55', '0.18']

Best bandit is 3 with an average