In [101]:
import numpy as np
import matplotlib.pyplot as plt

In [102]:
#make the slot machines with probability of reward (all rewards are same)
machines = np.array([0.2, 0.5, 0.6, 0.1])
cumulative_rewards = np.zeros_like(machines)
picks = np.zeros_like(machines)

In [103]:
#generate reward based on probability and machine choice
def gen_reward(machines, idx):
    rewards = np.zeros_like(machines)
    picks = np.zeros_like(machines)
    if np.random.uniform(0,1,1) < machines[idx]:
        rewards[idx] = 1
    picks[idx] = 1
    print(f"Choice {picks}")
    return rewards, picks

In [104]:
def accumulate_rewards(new_reward, prev_cumulative_rewards):
    return prev_cumulative_rewards + new_reward

In [105]:
def apply_policy(policy, machines, cumulative_rewards, old_picks, **kwargs):
    new_reward, pick = policy(machines, cumulative_rewards, old_picks, **kwargs)
    cumulative_rewards = accumulate_rewards(new_reward, cumulative_rewards)
    cumulative_picks = old_picks + pick
    return cumulative_rewards, cumulative_picks

In [106]:
def greedy_policy(machines, cumulative_rewards, picks, **kwargs):
    means = cumulative_rewards / picks
    max_mean = np.max(means)
    max_mean_idx = np.argmax(means)

    return gen_reward(machines, max_mean_idx) 

In [107]:
def epsilon_greedy_policy(machines, cumulative_rewards, picks, **kwargs):
    epsilon = kwargs.get("epsilon", 0.1)
    exploit = np.random.uniform(0,1,1) < 1-epsilon
    if exploit:
        print("Exploiting")
        return greedy_policy(machines, cumulative_rewards, picks)
    else:
        print("Exploring")
        idx = np.random.randint(0,machines.shape[0],1)
        pick = np.zeros_like(machines)
        pick[idx] = 1
        return gen_reward(machines, idx)

In [108]:
def decaying_epsilon_greedy_policy(machines, cumulative_rewards, picks, **kwargs):
    iters=kwargs.get("iters", 0)
    eps_0 = kwargs.get("eps_0", 0.1)
    epsilon = eps_0 * np.exp(-iters)
    print(f"Epsilon {epsilon}")
    return epsilon_greedy_policy(machines, cumulative_rewards, picks, epsilon=epsilon)

In [109]:
def upper_confidence_bound_policy(machines, cumulative_rewards, picks, **kwargs):
    steps = kwargs.get("iters", 0)
    confidence = kwargs.get("c", 0.1)
    mean = cumulative_rewards / picks

    mean_mod = mean + confidence * np.sqrt((1/np.log(steps))*picks)

    idx = np.argmax(mean_mod)
    return gen_reward(machines, idx)

In [110]:
for i in range(100):
    cumulative_rewards, picks = apply_policy(upper_confidence_bound_policy, machines, cumulative_rewards, picks, iters=i, c=0.1)
    print(f"Cumulative rewards: {cumulative_rewards}")

Choice [1. 0. 0. 0.]
Cumulative rewards: [0. 0. 0. 0.]
Choice [0. 1. 0. 0.]
Cumulative rewards: [0. 1. 0. 0.]
Choice [0. 0. 1. 0.]
Cumulative rewards: [0. 1. 1. 0.]
Choice [0. 0. 0. 1.]
Cumulative rewards: [0. 1. 1. 0.]
Choice [0. 1. 0. 0.]
Cumulative rewards: [0. 2. 1. 0.]
Choice [0. 1. 0. 0.]
Cumulative rewards: [0. 3. 1. 0.]
Choice [0. 1. 0. 0.]
Cumulative rewards: [0. 3. 1. 0.]
Choice [0. 0. 1. 0.]
Cumulative rewards: [0. 3. 2. 0.]
Choice [0. 0. 1. 0.]
Cumulative rewards: [0. 3. 3. 0.]
Choice [0. 0. 1. 0.]
Cumulative rewards: [0. 3. 3. 0.]
Choice [0. 1. 0. 0.]
Cumulative rewards: [0. 4. 3. 0.]
Choice [0. 1. 0. 0.]
Cumulative rewards: [0. 4. 3. 0.]
Choice [0. 0. 1. 0.]
Cumulative rewards: [0. 4. 4. 0.]
Choice [0. 0. 1. 0.]
Cumulative rewards: [0. 4. 4. 0.]
Choice [0. 1. 0. 0.]
Cumulative rewards: [0. 4. 4. 0.]
Choice [0. 0. 1. 0.]
Cumulative rewards: [0. 4. 5. 0.]
Choice [0. 0. 1. 0.]
Cumulative rewards: [0. 4. 5. 0.]
Choice [0. 0. 1. 0.]
Cumulative rewards: [0. 4. 5. 0.]
Choice [0.

  mean = cumulative_rewards / picks
  mean_mod = mean + confidence * np.sqrt((1/np.log(steps))*picks)
  mean_mod = mean + confidence * np.sqrt((1/np.log(steps))*picks)
  mean_mod = mean + confidence * np.sqrt((1/np.log(steps))*picks)
