# In [1]:
import numpy as np
import random
import matplotlib.pyplot as plt
from tqdm import tqdm

# In [2]:
class MultiArmBandit:
    """Simulated k-armed bandit with epsilon-greedy or UCB action selection.

    Each arm ``a`` has a true value ``q_star[a]``; pulling it yields a reward
    drawn from a normal distribution centred on that value with standard
    deviation 1.  The agent keeps a value estimate per arm and updates it
    after every pull, either with the sample-average rule or with a constant
    step size ``alpha``.

    Parameters
    ----------
    steps : int
        Number of pulls in a single run.
    k : int
        Number of arms.
    mu, sigma : float
        Mean and standard deviation of the normal distribution the true arm
        values are drawn from at the start of each run.
    epsilon : float
        Probability of picking a uniformly random arm instead of the greedy
        one (ignored when ``upper_bound`` is true).
    stationary : bool
        If true, every arm gets its own independently drawn true value and
        those values stay fixed.  If false, all arms start from one shared
        draw and every true value takes a small random-walk step
        (normal, std 0.01) before each action selection.
    alpha : float or None
        Constant step size for the estimate update.  ``None`` (or any falsy
        value such as 0) selects the sample-average rule ``1 / n``.
    init_q : float
        Initial value estimate for every arm.
    upper_bound : bool
        If true, use upper-confidence-bound selection (exploration
        coefficient 2) instead of epsilon-greedy.

    After :meth:`play`, ``overall_average_rewards[t]`` holds the running mean
    reward up to step ``t`` averaged over runs, and
    ``overall_percentage_optimal[t]`` holds the fraction (0..1) of steps up
    to ``t`` on which the optimal arm was chosen, averaged over runs.
    """

    # Exploration coefficient used by the UCB rule.
    _UCB_C = 2

    def __init__(self, steps=10000, k=10,
                 mu=0, sigma=1, epsilon=0.01,
                 stationary=True, alpha=None,
                 init_q=0, upper_bound=False):
        self.steps = steps
        self.k = k
        self.epsilon = epsilon
        # Honour the caller's distribution parameters (they used to be
        # silently replaced by the constants 0 and 1).
        self.mu = mu
        self.sigma = sigma
        self.stationary = stationary
        self.alpha = alpha
        self.init_q = init_q
        self.upper_bound = upper_bound

        self.reset_values()

        self.overall_average_rewards = np.zeros(steps)
        self.overall_percentage_optimal = np.zeros(steps)

    def reset_values(self):
        """Draw fresh true arm values and clear all per-run statistics."""
        if self.stationary:
            self.q_star = np.random.normal(self.mu, self.sigma, self.k)
        else:
            # Non-stationary problems start with all arms equal; they only
            # diverge through the random walk applied in select_action.
            init_value = np.random.normal(self.mu, self.sigma)
            self.q_star = np.ones(self.k) * init_value

        self.estimates = np.ones(self.k) * self.init_q
        self.counts = np.zeros(self.k)

        self.total_reward = 0
        self.average_rewards = np.zeros(self.steps)

        self.optimal_count = 0
        self.percentage_optimal = np.zeros(self.steps)

    def _ucb_action(self, step):
        """Return the arm maximising estimate + UCB exploration bonus."""
        # An arm that has never been pulled has an unbounded bonus, so it is
        # chosen first (lowest index wins).  Handling it explicitly avoids
        # dividing by a zero count.
        untried = np.flatnonzero(self.counts == 0)
        if untried.size:
            return int(untried[0])

        bonus = self._UCB_C * np.sqrt(np.log(step + 1.0) / self.counts)
        return int(np.argmax(self.estimates + bonus))

    def select_action(self, step):
        """Choose an arm for the given 0-based step and return its index.

        Side effects: in the non-stationary setting the true values take a
        random-walk step first; ``optimal_count`` is incremented when the
        chosen arm is the currently optimal one.
        """
        if not self.stationary:
            # Update true values with a random walk.
            self.q_star += np.random.normal(0, 0.01, self.k)

        if self.upper_bound:
            selected_action = self._ucb_action(step)
        elif random.random() < self.epsilon:
            # Explore: uniformly random arm.
            selected_action = random.randint(0, self.k - 1)
        else:
            # Exploit: greedy arm (ties resolved towards the lowest index).
            selected_action = int(np.argmax(self.estimates))

        if selected_action == np.argmax(self.q_star):
            self.optimal_count += 1

        return selected_action

    def make_move(self, step):
        """Perform one pull: select an arm, sample a reward, update statistics.

        ``step`` is the 0-based index of the pull within the current run and
        is used both for UCB and as the slot written in the per-run arrays.
        """
        action = self.select_action(step)

        # Reward ~ Normal(q*(a), 1); numpy's second argument is the standard
        # deviation.
        reward_std = 1
        reward = np.random.normal(self.q_star[action], reward_std)

        # Incremental estimate update: Q <- Q + step_size * (R - Q).
        pulls = self.counts[action] + 1
        # A falsy alpha (None or 0) means "use the sample average".
        step_size = self.alpha if self.alpha else 1.0 / pulls
        self.estimates[action] += (reward - self.estimates[action]) * step_size
        self.counts[action] = pulls

        self.total_reward += reward
        self.average_rewards[step] = self.total_reward / (step + 1)
        self.percentage_optimal[step] = self.optimal_count / (step + 1)

    def play_game_once(self):
        """Run a single episode of ``self.steps`` pulls on the current state."""
        for step in range(self.steps):
            self.make_move(step)

    def play(self, runs=1000):
        """Run ``runs`` independent episodes and store the across-run averages.

        The results replace ``overall_average_rewards`` and
        ``overall_percentage_optimal``.  Accumulation starts from zero on
        every call, so calling ``play`` repeatedly does not mix the already
        averaged result of an earlier call into the new sums.
        """
        reward_sum = np.zeros(self.steps)
        optimal_sum = np.zeros(self.steps)

        for _ in tqdm(range(runs)):
            self.play_game_once()
            reward_sum += self.average_rewards
            optimal_sum += self.percentage_optimal
            # Fresh arm values and statistics for the next run.
            self.reset_values()

        self.overall_average_rewards = reward_sum / runs
        self.overall_percentage_optimal = optimal_sum / runs