In [8]:
import numpy as np 
import random 
from copy import deepcopy
from math import sqrt, log, exp 
from numba import jit

class Bandit:
    """Multi-armed bandit whose arms pay out 0/1 rewards."""

    class Arm:
        """Single arm that pays 1 with probability ``p`` and 0 otherwise."""

        def __init__(self, prob=0):
            self.p = prob

        def get(self):
            """Draw one reward: 1 if a uniform sample on [0, 1] is below ``self.p``, else 0."""
            draw = random.uniform(0, 1)
            if draw < self.p:
                return 1
            return 0

    def __init__(self, probs):
        """Build one ``Arm`` per entry of ``probs``, keeping their order."""
        self.arms = list(map(Bandit.Arm, probs))

    def get(self, arm_no):
        """Pull the arm at index ``arm_no`` and return its reward."""
        selected = self.arms[arm_no]
        return selected.get()


def eps_greedy(N, eps, bandit):
    """Play ``N`` rounds of the epsilon-greedy strategy.

    On each round, with probability ``eps`` an arm is chosen uniformly at
    random; otherwise the arm with the highest current estimate is chosen
    (ties resolve to the lowest index).

    Args:
        N: number of pulls to perform.
        eps: exploration probability; a uniform sample on [0, 1] is compared
            against it with ``<``.
        bandit: object with ``get(arm_no)`` returning a numeric reward. If it
            has an ``arms`` attribute, its length is the number of arms;
            otherwise 3 arms are assumed.

    Returns:
        ``(theta, total_reward)``: the per-arm running mean of observed
        rewards (0 for arms never pulled) and the sum of all rewards.
    """
    # Derive the arm count from the bandit instead of hard-coding 3; objects
    # that only expose ``get`` keep the previous 3-arm behaviour.
    n_arms = len(bandit.arms) if hasattr(bandit, "arms") else 3
    theta = [0] * n_arms
    count = [0] * n_arms
    total_reward = 0
    for _ in range(N):
        if random.uniform(0, 1) < eps:
            # Explore: uniform pick over all arm indices.
            It = random.choice(range(n_arms))
        else:
            # Exploit: first arm with the maximal estimate.
            It = theta.index(max(theta))
        count[It] += 1
        r_It = bandit.get(It)
        total_reward += r_It
        # Incremental update of the sample mean for the pulled arm.
        theta[It] += (1/count[It])*(r_It - theta[It])
    return theta, total_reward

def UCB(N, c, bandit):
    """Play ``N`` rounds of the upper-confidence-bound strategy.

    Each arm is first pulled once (in index order). Afterwards, at round
    ``t`` the arm maximising ``theta[j] + c*sqrt(2*log(t+1)/count[j])`` is
    pulled (ties resolve to the lowest index).

    Args:
        N: total number of pulls, including the initial one-per-arm phase.
        c: multiplier applied to the exploration bonus.
        bandit: object with ``get(arm_no)`` returning a numeric reward. If it
            has an ``arms`` attribute, its length is the number of arms;
            otherwise 3 arms are assumed.

    Returns:
        ``(theta, total_reward)``: the per-arm running mean of observed
        rewards and the sum of the rewards of all pulls.
    """
    n_arms = len(bandit.arms) if hasattr(bandit, "arms") else 3
    count = [0] * n_arms
    theta = [0] * n_arms
    total_reward = 0
    # Initialisation: one pull per arm, never exceeding the budget N.
    for t in range(min(n_arms, N)):
        count[t] = 1
        theta[t] = bandit.get(t)
        # These pulls are part of the N rounds, so their rewards count too.
        total_reward += theta[t]
    for t in range(n_arms, N):
        # Every arm has count >= 1 here, so the division is safe.
        temp = [theta[j] + c*sqrt(2*log(t+1)/count[j]) for j in range(n_arms)]
        It = temp.index(max(temp))
        count[It] += 1
        r_It = bandit.get(It)
        total_reward += r_It
        # Incremental update of the sample mean for the pulled arm.
        theta[It] += (1/count[It])*(r_It - theta[It])
    return theta, total_reward

def TS(N, params, bandit):
    """Play ``N`` rounds of Thompson sampling with Beta posteriors.

    Each round draws one sample per arm from ``Beta(a, b)``, pulls the arm
    with the largest sample (ties resolve to the lowest index), then adds the
    reward to that arm's ``a`` and ``1 - reward`` to its ``b``.

    Args:
        N: number of pulls to perform.
        params: sequence of ``[a, b]`` Beta parameters, one pair per arm. It
            is deep-copied, so the caller's object is not modified. Its
            length determines how many arms are played.
        bandit: object with ``get(arm_no)`` returning the reward of a pull.

    Returns:
        ``(theta, total_reward)``: the posterior mean ``a/(a+b)`` of every
        arm after the last round and the sum of all rewards.
    """
    ret_params = deepcopy(params)
    total_reward = 0
    for _ in range(N):
        # One posterior sample per arm, drawn in arm order.
        theta = [np.random.beta(p[0], p[1]) for p in ret_params]
        It = theta.index(max(theta))
        r_It = bandit.get(It)
        total_reward += r_It
        ret_params[It][0] += r_It
        ret_params[It][1] += 1-r_It
    theta = [p[0]/(p[0] + p[1]) for p in ret_params]
    return theta, total_reward

@jit
def choice(weights):
    """Sample an index according to ``weights``.

    A uniform sample on [0, 1] is compared against the running cumulative sum
    of ``weights``; the first index whose cumulative sum exceeds the sample is
    returned. The last index absorbs whatever remains, so rounding in the
    weights can never leave the sample unassigned.

    Args:
        weights: indexable sequence of probabilities (assumed to sum to 1 —
            this is not checked).

    Returns:
        The chosen index in ``range(len(weights))``; ``-1`` when ``weights``
        is empty.
    """
    rand = random.uniform(0, 1)
    last = len(weights) - 1
    cumulative = 0.0
    # Scan every index but the last; the final one is the fall-through case.
    for i in range(last):
        cumulative += weights[i]
        if rand < cumulative:
            return i
    return last

@jit
def softmax(l):
    """Return the softmax of the array ``l``.

    The maximum of ``l`` is subtracted before exponentiating so that the
    largest exponent is 0, which avoids overflow for large inputs.
    """
    # Exponentiate once and reuse the result for numerator and denominator.
    shifted = np.exp(l - np.max(l))
    return shifted / np.sum(shifted)

# The function receives a plain Python object (``bandit``), so it can only run
# in Numba's object mode; request it explicitly instead of relying on the
# implicit fallback of a bare ``@jit``.
@jit(forceobj=True)
def gradient(N, bandit, baseline=-1, step=1):
    """Play ``N`` rounds of the gradient bandit algorithm on a 3-armed bandit.

    Arm preferences ``H`` start at zero; each round the policy is
    ``softmax(H)``, an arm is sampled from it with ``choice``, and the
    preferences are moved by ``step * (reward - base)`` times
    ``(1 - policy[i])`` for the pulled arm and ``-policy[i]`` for the others.

    Args:
        N: number of pulls to perform.
        bandit: object with ``get(arm_no)`` returning the reward of a pull.
        baseline: fixed baseline subtracted from the reward; the sentinel
            ``-1`` selects the running average reward instead.
        step: step size of the preference update.

    Returns:
        ``(theta, total)``: ``theta[i]`` is the fraction of the ``N`` pulls
        spent on arm ``i`` (not a reward estimate), and ``total`` is the
        running average reward multiplied by ``N``.
    """
    # np.float / np.int were removed from NumPy; use explicit dtypes.
    H = np.zeros((3), dtype=np.float64)
    count = np.zeros((3), dtype=np.int64)
    avg_reward = 0
    for t in range(N):
        policy = softmax(H)
        It = choice(policy)
        r_It = bandit.get(It)
        count[It] += 1
        # Running mean over the t+1 rewards seen so far.
        avg_reward = (t/(t+1))*avg_reward + (1/(t+1))*r_It 
        base = avg_reward if baseline == -1 else baseline
        for i in range(3):
            if i == It: 
                H[i] += step*(r_It - base)*(1 - policy[i])
            else: 
                H[i] -= step*(r_It - base)*policy[i]
    theta = [c/N for c in count]
    return theta, avg_reward*N
    

In [2]:
# Experiment: epsilon-greedy with eps = 0.1, averaged over 1000 runs of 5000 pulls.
b = Bandit([0.9, 0.8, 0.7])
reward_total = 0
th_total = [0, 0, 0]
for _ in range(1000):
    th, re = eps_greedy(5000, 0.1, b)
    reward_total += re
    # Element-wise accumulation of the per-arm estimates.
    th_total = [acc + est for acc, est in zip(th_total, th)]
print("Average aggragate reward: {}".format(reward_total/1000))
print("Average theta: th_1 = {}, th_2 = {}, th_3 = {}".format(*[acc/1000 for acc in th_total]))

Average aggragate reward: 4446.639
Average theta: th_1 = 0.900159507301239, th_2 = 0.8007918450181126, th_3 = 0.7010354658938532


In [3]:
# Experiment: epsilon-greedy with eps = 0.5, averaged over 1000 runs of 5000 pulls.
b = Bandit([0.9, 0.8, 0.7])
reward_total = 0
th_total = [0, 0, 0]
for _ in range(1000):
    th, re = eps_greedy(5000, 0.5, b)
    reward_total += re
    # Element-wise accumulation of the per-arm estimates.
    th_total = [acc + est for acc, est in zip(th_total, th)]
print("Average aggragate reward: {}".format(reward_total/1000))
print("Average theta: th_1 = {}, th_2 = {}, th_3 = {}".format(*[acc/1000 for acc in th_total]))

Average aggragate reward: 4248.687
Average theta: th_1 = 0.9002200708744997, th_2 = 0.7995294760097454, th_3 = 0.6999810509648006


In [4]:
# Experiment: epsilon-greedy with eps = 0.9, averaged over 1000 runs of 5000 pulls.
b = Bandit([0.9, 0.8, 0.7])
reward_total = 0
th_total = [0, 0, 0]
for _ in range(1000):
    th, re = eps_greedy(5000, 0.9, b)
    reward_total += re
    # Element-wise accumulation of the per-arm estimates.
    th_total = [acc + est for acc, est in zip(th_total, th)]
print("Average aggragate reward: {}".format(reward_total/1000))
print("Average theta: th_1 = {}, th_2 = {}, th_3 = {}".format(*[acc/1000 for acc in th_total]))

Average aggragate reward: 4047.709
Average theta: th_1 = 0.8995240125627245, th_2 = 0.8004824330595257, th_3 = 0.6990227015118962


In [5]:
# Experiment: UCB with c = 1, averaged over 1000 runs of 5000 pulls.
b = Bandit([0.9, 0.8, 0.7])
reward_total = 0
th_total = [0, 0, 0]
for _ in range(1000):
    th, re = UCB(5000, 1, b)
    reward_total += re
    # Element-wise accumulation of the per-arm estimates.
    th_total = [acc + est for acc, est in zip(th_total, th)]
print("Average aggragate reward: {}".format(reward_total/1000))
print("Average theta: th_1 = {}, th_2 = {}, th_3 = {}".format(*[acc/1000 for acc in th_total]))

Average aggragate reward: 4386.713
Average theta: th_1 = 0.9001177894046423, th_2 = 0.7965628276828748, th_3 = 0.6921078673811206


In [6]:
# Experiment: UCB with c = 5, averaged over 1000 runs of 5000 pulls.
b = Bandit([0.9, 0.8, 0.7])
reward_total = 0
th_total = [0, 0, 0]
for _ in range(1000):
    th, re = UCB(5000, 5, b)
    reward_total += re
    # Element-wise accumulation of the per-arm estimates.
    th_total = [acc + est for acc, est in zip(th_total, th)]
print("Average aggragate reward: {}".format(reward_total/1000))
print("Average theta: th_1 = {}, th_2 = {}, th_3 = {}".format(*[acc/1000 for acc in th_total]))

Average aggragate reward: 4124.582
Average theta: th_1 = 0.9000463271269632, th_2 = 0.7996617018744864, th_3 = 0.6991316735593176


In [7]:
# Experiment: UCB with c = 10, averaged over 1000 runs of 5000 pulls.
b = Bandit([0.9, 0.8, 0.7])
reward_total = 0
th_total = [0, 0, 0]
for _ in range(1000):
    th, re = UCB(5000, 10, b)
    reward_total += re
    # Element-wise accumulation of the per-arm estimates.
    th_total = [acc + est for acc, est in zip(th_total, th)]
print("Average aggragate reward: {}".format(reward_total/1000))
print("Average theta: th_1 = {}, th_2 = {}, th_3 = {}".format(*[acc/1000 for acc in th_total]))

Average aggragate reward: 4062.801
Average theta: th_1 = 0.8998802453840349, th_2 = 0.799791114200115, th_3 = 0.6999170824489821


In [8]:
# Experiment: Thompson sampling with Beta(1, 1) parameters on every arm,
# averaged over 1000 runs of 5000 pulls.
b = Bandit([0.9, 0.8, 0.7])
reward_total = 0
th_total = [0, 0, 0]
for _ in range(1000):
    th, re = TS(5000, [[1,1], [1,1], [1,1]], b)
    reward_total += re
    # Element-wise accumulation of the per-arm posterior means.
    th_total = [acc + est for acc, est in zip(th_total, th)]
print("Average aggragate reward: {}".format(reward_total/1000))
print("Average theta: th_1 = {}, th_2 = {}, th_3 = {}".format(*[acc/1000 for acc in th_total]))

Average aggragate reward: 4485.132
Average theta: th_1 = 0.8998376042910347, th_2 = 0.7518697097593835, th_3 = 0.6281500933057753


In [9]:
# Experiment: Thompson sampling with Beta parameters (601, 401), (401, 601)
# and (2, 3), averaged over 1000 runs of 5000 pulls.
b = Bandit([0.9, 0.8, 0.7])
reward_total = 0
th_total = [0, 0, 0]
for _ in range(1000):
    th, re = TS(5000, [[601, 401], [401, 601], [2, 3]], b)
    reward_total += re
    # Element-wise accumulation of the per-arm posterior means.
    th_total = [acc + est for acc, est in zip(th_total, th)]
print("Average aggragate reward: {}".format(reward_total/1000))
print("Average theta: th_1 = {}, th_2 = {}, th_3 = {}".format(*[acc/1000 for acc in th_total]))

Average aggragate reward: 3804.395
Average theta: th_1 = 0.6910863698637356, th_2 = 0.4001996007984122, th_3 = 0.6684201315490548


In [4]:
# Experiment: gradient bandit with fixed baseline 0 (default step),
# averaged over 1000 runs of 5000 pulls.
b = Bandit([0.9, 0.8, 0.7])
reward_total = 0
th_total = [0, 0, 0]
for _ in range(1000):
    th, re = gradient(5000, b, 0)
    reward_total += re
    # Element-wise accumulation; gradient() reports per-arm pull fractions.
    th_total = [acc + frac for acc, frac in zip(th_total, th)]
print("Average aggragate reward: {}".format(reward_total/1000))
print("Average theta: th_1 = {}, th_2 = {}, th_3 = {}".format(*[acc/1000 for acc in th_total]))

Average aggragate reward: 4262.282
Average theta: th_1 = 0.6196346000000009, th_2 = 0.2870460000000005, th_3 = 0.09331940000000019


In [9]:
# Experiment: gradient bandit with fixed baseline 0.8 (default step),
# averaged over 1000 runs of 5000 pulls.
b = Bandit([0.9, 0.8, 0.7])
reward_total = 0
th_total = [0, 0, 0]
for _ in range(1000):
    th, re = gradient(5000, b, 0.8)
    reward_total += re
    # Element-wise accumulation; gradient() reports per-arm pull fractions.
    th_total = [acc + frac for acc, frac in zip(th_total, th)]
print("Average aggragate reward: {}".format(reward_total/1000))
print("Average theta: th_1 = {}, th_2 = {}, th_3 = {}".format(*[acc/1000 for acc in th_total]))

Average aggragate reward: 4488.284
Average theta: th_1 = 0.9849928000000011, th_2 = 0.007526199999999998, th_3 = 0.007480999999999993


In [10]:
# Experiment: gradient bandit with fixed baseline 5 (default step),
# averaged over 1000 runs of 5000 pulls.
b = Bandit([0.9, 0.8, 0.7])
reward_total = 0
th_total = [0, 0, 0]
for _ in range(1000):
    th, re = gradient(5000, b, 5)
    reward_total += re
    # Element-wise accumulation; gradient() reports per-arm pull fractions.
    th_total = [acc + frac for acc, frac in zip(th_total, th)]
print("Average aggragate reward: {}".format(reward_total/1000))
print("Average theta: th_1 = {}, th_2 = {}, th_3 = {}".format(*[acc/1000 for acc in th_total]))

Average aggragate reward: 4125.355
Average theta: th_1 = 0.49969440000000026, th_2 = 0.25001620000000024, th_3 = 0.2502894


In [11]:
# Experiment: gradient bandit with fixed baseline 20 (default step),
# averaged over 1000 runs of 5000 pulls.
b = Bandit([0.9, 0.8, 0.7])
reward_total = 0
th_total = [0, 0, 0]
for _ in range(1000):
    th, re = gradient(5000, b, 20)
    reward_total += re
    # Element-wise accumulation; gradient() reports per-arm pull fractions.
    th_total = [acc + frac for acc, frac in zip(th_total, th)]
print("Average aggragate reward: {}".format(reward_total/1000))
print("Average theta: th_1 = {}, th_2 = {}, th_3 = {}".format(*[acc/1000 for acc in th_total]))

Average aggragate reward: 4080.204
Average theta: th_1 = 0.4402403999999996, th_2 = 0.2798478000000011, th_3 = 0.2799118000000011


In [12]:
# Experiment: gradient bandit with the running-average baseline (baseline=-1)
# and step 0.2, averaged over 1000 runs of 5000 pulls.
b = Bandit([0.9, 0.8, 0.7])
reward_total = 0
th_total = [0, 0, 0]
for _ in range(1000):
    th, re = gradient(5000, b, -1, 0.2)
    reward_total += re
    # Element-wise accumulation; gradient() reports per-arm pull fractions.
    th_total = [acc + frac for acc, frac in zip(th_total, th)]
print("Average aggragate reward: {}".format(reward_total/1000))
print("Average theta: th_1 = {}, th_2 = {}, th_3 = {}".format(*[acc/1000 for acc in th_total]))

Average aggragate reward: 4459.882
Average theta: th_1 = 0.9467187999999999, th_2 = 0.026628400000000014, th_3 = 0.02665279999999999


In [13]:
# Experiment: gradient bandit with the running-average baseline (baseline=-1)
# and step 1, averaged over 1000 runs of 5000 pulls.
b = Bandit([0.9, 0.8, 0.7])
reward_total = 0
th_total = [0, 0, 0]
for _ in range(1000):
    th, re = gradient(5000, b, -1, 1)
    reward_total += re
    # Element-wise accumulation; gradient() reports per-arm pull fractions.
    th_total = [acc + frac for acc, frac in zip(th_total, th)]
print("Average aggragate reward: {}".format(reward_total/1000))
print("Average theta: th_1 = {}, th_2 = {}, th_3 = {}".format(*[acc/1000 for acc in th_total]))

Average aggragate reward: 4488.188
Average theta: th_1 = 0.9842648000000003, th_2 = 0.00790840000000001, th_3 = 0.007826800000000002


In [14]:
# Experiment: gradient bandit with the running-average baseline (baseline=-1)
# and step 2, averaged over 1000 runs of 5000 pulls.
b = Bandit([0.9, 0.8, 0.7])
reward_total = 0
th_total = [0, 0, 0]
for _ in range(1000):
    th, re = gradient(5000, b, -1, 2)
    reward_total += re
    # Element-wise accumulation; gradient() reports per-arm pull fractions.
    th_total = [acc + frac for acc, frac in zip(th_total, th)]
print("Average aggragate reward: {}".format(reward_total/1000))
print("Average theta: th_1 = {}, th_2 = {}, th_3 = {}".format(*[acc/1000 for acc in th_total]))

Average aggragate reward: 4489.696
Average theta: th_1 = 0.9857398000000006, th_2 = 0.0071183999999999935, th_3 = 0.0071417999999999855


In [15]:
# Experiment: gradient bandit with the running-average baseline (baseline=-1)
# and step 5, averaged over 1000 runs of 5000 pulls.
b = Bandit([0.9, 0.8, 0.7])
reward_total = 0
th_total = [0, 0, 0]
for _ in range(1000):
    th, re = gradient(5000, b, -1, 5)
    reward_total += re
    # Element-wise accumulation; gradient() reports per-arm pull fractions.
    th_total = [acc + frac for acc, frac in zip(th_total, th)]
print("Average aggragate reward: {}".format(reward_total/1000))
print("Average theta: th_1 = {}, th_2 = {}, th_3 = {}".format(*[acc/1000 for acc in th_total]))

Average aggragate reward: 4461.788
Average theta: th_1 = 0.9502760000000012, th_2 = 0.024842400000000014, th_3 = 0.024881600000000028
