In [3]:
import numpy as np

In [89]:
class BanditAlgorithm(object):
    
    def __init__(self, k_arms):
        pass
    
    def select_arm(self):
        pass
    
    def update(self, chosen_arm, reward):
        pass

In [126]:
class UCB(BanditAlgorithm):
    
    def __init__(self,k_arms,beta=1):
        super().__init__(k_arms)
        
        self.emperical_mean = np.zeros(k_arms)
        self.count = np.zeros(k_arms)
        self.beta = beta
        
    def select_arm(self):
        
        if 0 in self.count:
            return self.count.tolist().index(0)
        
        ucb_values = self.values + beta * np.sqrt(np.log(self.count.sum()) / self.counts)

        return np.argmax(ucb_values)

    def update(self, chosen_arm, reward):

        self.count[chosen_arm] += 1

        n = self.count[chosen_arm]

        new_emperical_mean = ((n - 1) / n) * self.emperical_mean[chosen_arm] + (1 / n) * reward

        self.emperical_mean[chosen_arm] = new_emperical_mean

In [139]:
class ThompsonSampling(BanditAlgorithm):
    
    def __init__(self,k_arms,beta=1):
        
        super().__init__(k_arms)
        
        self.successes = np.zeros(k_arms)
        self.fails = np.zeros(k_arms)
        
    def select_arm(self):

        theta = np.random.beta(a=(self.successes + 1), b=(self.fails + 1))

        return np.argmax(theta)

    def update(self, chosen_arm, reward):

        self.successes[chosen_arm] += reward
        self.fails[chosen_arm] += (1-reward)


In [140]:
class BernoulliArm(object):

    def __init__(self, mu):
        self.mu = mu
  
    def draw(self):
        return np.random.binomial(n=1, p=self.mu)       

In [141]:
np.zeros((2,5))

array([[0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.]])

In [142]:
class BanditSimulation(object):
    
    def __init__(self, k_arms, T, number_of_runs):
        self.T = T
        self.number_of_runs = number_of_runs
        self.k_arms = k_arms
        self.cumulative_regret = {"Thompson Sampling":np.zeros((T,number_of_runs)),
                                  "UCB":np.zeros((T,number_of_runs))}
    
    def run(self):
        
        for run_number in range(self.number_of_runs):
            
            mu_vector = np.random.uniform(size=self.k_arms)
              
            arms = [BernoulliArm(mu) for mu in mu_vector]
            
            ucb = UCB(self.k_arms)
          
            thompson_sampling = ThompsonSampling(self.k_arms)
            
            for t in range(self.T):
                
                ucb_chosen_arm = ucb.select_arm()
                ucb_reward = arms[ucb_chosen_arm].draw()
                ucb.update(ucb_chosen_arm, ucb_reward)
                ucb_regret = max(mu_vector) - ucb_reward 
                
                thompson_sampling_chosen_arm = thompson_sampling.select_arm()
                thompson_sampling_reward = arms[thompson_sampling_chosen_arm].draw()
                thompson_sampling.update(thompson_sampling_chosen_arm, thompson_sampling_reward)
                thompson_sampling_regret = max(mu_vector) - thompson_sampling_reward

                if t==0:
                    self.cumulative_regret["UCB"][t,run_number] = ucb_regret
                    self.cumulative_regret["Thompson Sampling"][t,run_number] = thompson_sampling_regret
                else:
                    self.cumulative_regret["UCB"][t,run_number] = ucb_regret + self.cumulative_regret["UCB"][t-1,run_number]
                    self.cumulative_regret["Thompson Sampling"][t,run_number] = thompson_sampling_regret + self.cumulative_regret["Thompson Sampling"][t-1,run_number]

                    
                

In [143]:
sim = BanditSimulation(6,500,8)

In [144]:
sim.run()

AttributeError: 'UCB' object has no attribute 'values'

In [45]:
ucb = UCB(7)

In [98]:
a = np.random.randn(3,6)

In [102]:
a

array([[-0.65221473,  0.35312628, -1.06987642,  0.55398235,  0.45936365,
         0.88627747],
       [ 0.1055198 , -1.37234696, -0.10268464, -0.98547921,  0.00611771,
        -1.47898373],
       [-0.74205881, -1.2999804 , -2.09954308, -0.25071237,  0.52428051,
         0.22179928]])

In [101]:
a[1,3]


-0.9854792134620183

In [57]:
np.random.choice([1,2,3])

3

In [67]:
np.where(ucb.count==0)

(array([0, 1, 2, 3, 4, 5, 6]),)

In [54]:
np.random.choice(np.where(ucb.count==0))

ValueError: a must be 1-dimensional

In [47]:
ucb.select_arm()

0