In [1]:
import numpy as np

In [2]:
from scipy.stats import norm
import matplotlib.pyplot as plt

In [3]:
def plot_distribution(ax, pdfs, title='', x_range=(0., 10.), num_points=200):
    """
    Draw the Gaussian density of every arm on ``ax``.

    ax: matplotlib axes (or any object offering the same drawing methods).
    pdfs: iterable of objects exposing ``mean`` and ``var`` attributes.
    title: optional axes title; nothing is set when it is empty.
    x_range: (low, high) interval on which the densities are evaluated.
    num_points: number of evaluation points inside ``x_range``.

    Each density is drawn as a line with a translucent fill of the same
    colour and a dashed vertical marker at the arm's mean.
    """
    x = np.linspace(x_range[0], x_range[1], num_points)
    ymax = 0
    for pdf in pdfs:
        # norm.pdf expects the standard deviation, the arms store the variance.
        y = norm.pdf(x, pdf.mean, np.sqrt(pdf.var))

        line = ax.plot(x, y, lw=2)[0]
        # Use the line colour itself; the marker edge colour only matches it
        # while the rcParams are left at their defaults.
        c = line.get_color()
        ax.fill_between(x, y, 0, color=c, alpha=0.2)
        ax.autoscale(tight=True)
        ax.vlines(pdf.mean, 0, y.max(), colors=c, linestyles="--", lw=2)

        # Consider every sample (including the first one) so that a peak
        # located on the left edge is not cut off; keep 5% head-room.
        ymax = max(ymax, y.max() * 1.05)

    # With no densities ymax stays 0 and identical limits would be invalid.
    if ymax > 0:
        ax.set_ylim([0, ymax])
    if title:
        ax.set_title(title)

def plot_regret(ax, regret, title=''):
    """
    Plot a regret curve on ``ax``.

    ax: matplotlib axes (or any object offering the same drawing methods).
    regret: sequence of regret values, one per simulation step.
    title: optional axes title; nothing is set when it is empty.
    """
    ax.plot(regret)
    # Label the axes so the panel can be read on its own.
    ax.set_xlabel('step')
    ax.set_ylabel('cumulative regret')
    if title:
        ax.set_title(title)

def plot(regret, mab):
    """
    Show the regret curve and the arm densities side by side.

    regret: regret values forwarded to ``plot_regret`` (left panel).
    mab: list of arms forwarded to ``plot_distribution`` (right panel).
    """
    _, (regret_ax, density_ax) = plt.subplots(nrows=1, ncols=2, figsize=(6, 3))
    plot_regret(regret_ax, regret)
    plot_distribution(density_ax, mab)

In [4]:
class Arm:
    """A bandit arm whose rewards are drawn from a Gaussian N(mean, var)."""

    def __init__(self, mean, var):
        """
        mean: mean of the reward distribution.
        var: variance (not standard deviation) of the reward distribution.

        Raises ValueError when ``var`` is negative, because its square root
        would be NaN and every sample drawn from the arm would be NaN too.
        """
        if np.any(np.asarray(var) < 0):
            raise ValueError('variance must be non-negative, got ' + str(var))
        self.mean = mean
        self.var = var

    def sample(self):
        """Draw one reward from N(mean, var) using numpy's global random state."""
        # np.random.normal takes the standard deviation as its scale.
        return np.random.normal(self.mean, np.sqrt(self.var))

    def name(self):
        """Return a label of the form ``N(<mean>,<var>)``."""
        return 'N(' + str(self.mean) + ',' + str(self.var) + ')'

# Thompson Sampling

In this exercise we will run Thompson Sampling on a 2-armed bandit whose arms have Gaussian reward distributions.
For simplicity, assume the variance of each arm's distribution is known and only its mean is unknown.
Use a Gaussian distribution as the prior over each unknown mean.

## 1.
If the variance were also unknown, what prior distribution would be suitable?

## 2.
Implement the Thompson Sampling algorithm. For comparison, also implement the ϵ-Greedy and UCB algorithms.

In [5]:
class ThompsonSampling:
    """
    Exercise scaffold for a Thompson Sampling bandit policy.

    The method bodies are placeholders to be completed between the
    ``Your Code`` markers; as written they store nothing and return None.
    """

    def __init__(self, var_list, **kwargs):
        """
        Create the policy.

        var_list: variances of the arms, which are known to the policy.
        kwargs: extra keyword arguments; not used by the scaffold.
        """
        pass

    def select_arm(self, *args):
        """
        Choose the arm to pull next.

        Intended to return the index of the selected arm; the scaffold
        currently returns None. Positional arguments are accepted and ignored.
        """
        # ==================================== Your Code (Begin) ==================================
        
        # select arm based on estimate of prior distribution
        
        # return index of selected arm

        # ==================================== Your Code (End) ====================================
        pass

    def update(self, idx, reward):
        """
        Incorporate an observed reward.

        idx: presumably the index of the arm that was pulled -- confirm against the caller.
        reward: reward observed for that pull.
        """
        # ==================================== Your Code (Begin) ==================================
        
        # update prior based on reward
        
        # ==================================== Your Code (End) ====================================
        pass

In [6]:
class UCB:    
    """
    Exercise scaffold for an Upper Confidence Bound (UCB) bandit policy.

    The method bodies are placeholders to be completed between the
    ``Your Code`` markers; as written they store nothing and return None.
    """

    def __init__(self, n_bandits, c_level):
        """
        Create the policy.

        n_bandits: presumably the number of arms (the notebook passes 2) -- confirm.
        c_level: coefficient of uncertainty
        """
        pass
    
    def select_arm(self, t):
        """
        Choose the arm to pull next.

        t: step time

        Intended to return the index of the selected arm; the scaffold
        currently returns None.
        """
        # ==================================== Your Code (Begin) ==================================
        
        # select arm based on UCB
        
        # return index of selected arm

        # ==================================== Your Code (End) ====================================
        pass

    def update(self, idx, reward):
        """
        Incorporate an observed reward.

        idx: presumably the index of the arm that was pulled -- confirm against the caller.
        reward: reward observed for that pull.
        """
        # ==================================== Your Code (Begin) ==================================
        
        # update based on reward
        
        # ==================================== Your Code (End) ====================================
        pass

In [7]:
class eGreedy:    
    """
    Exercise scaffold for an epsilon-greedy bandit policy.

    The method bodies are placeholders to be completed between the
    ``Your Code`` markers; as written they store nothing and return None.
    """

    def __init__(self, n_bandits, epsilon):
        """
        Create the policy.

        n_bandits: presumably the number of arms (the notebook passes 2) -- confirm.
        epsilon: must be given (there is no default value).
        """
        pass
    
    def select_arm(self, *args):
        """
        Choose the arm to pull next.

        Intended to return the index of the selected arm; the scaffold
        currently returns None. Positional arguments are accepted and ignored.
        """
        # ==================================== Your Code (Begin) ==================================
        
        # select arm based on e-Greedy
        
        # return index of selected arm

        # ==================================== Your Code (End) ====================================
        pass

    def update(self, idx, reward):
        """
        Incorporate an observed reward.

        idx: presumably the index of the arm that was pulled -- confirm against the caller.
        reward: reward observed for that pull.
        """
        # ==================================== Your Code (Begin) ==================================
        
        # update based on reward
        
        # ==================================== Your Code (End) ====================================
        pass

## 3
Run the simulation for the arms defined in the cells below and describe how the regret differs when the variances of the arm distributions differ.

`run_sim1` must return the cumulative regret, defined as
$$
R(T)=\sum_{i=1}^2 N_i(T) \Delta_i
$$

where $N_i(T)$ is the number of times arm $i$ was selected up to step $T$, $\Delta_i=\mu^*-\mu_i$, $\mu^*$ is the largest mean among the arm distributions, and $\mu_i$ is the mean of the distribution of arm $i$.

To get the average regret, we run the simulation 50 times.

In [8]:
def run_sim1(policy, mab, step_num=100):
    """
    Run a simulation of a multi-armed bandit (exercise scaffold).

    policy: bandit policy object; the notebook passes ThompsonSampling,
        eGreedy and UCB instances.
    mab: list of arms; every arm must expose a ``mean`` attribute.
    step_num: number of iterations of the simulation loop.

    Returns the ``regret`` list. Until the marked section is implemented
    nothing is appended to it, so an empty list is returned.
    """
    # Largest true mean among the arms; available for computing the regret.
    best_mean = np.max([b.mean for b in mab])
    regret = []
    for k in range(step_num):          
        # ==================================== Your Code (Begin) ==================================
        
        # run policy algorithm and return cumulative regret
 
        # ==================================== Your Code (End) ====================================
        continue
    return regret

### 3.1
Assume Multi-Armed Bandit variables are as follows.

In [9]:
# Scenario 3.1: two arms with means 6 and 4, both with variance 0.5.
mab = [Arm(6, 0.5), Arm(4, 0.5)]

#### Thompson Sampling
run and describe the result.

In [None]:
# 50 independent runs, each with a fresh ThompsonSampling policy that is given the arm variances.
regret = [run_sim1(ThompsonSampling([b.var for b in mab]), mab) for _ in range(50)]
# Plot the step-wise mean over the runs next to the arm densities.
plot(np.mean(regret, axis=0), mab)

#### ϵ-Greedy
run for different values of ϵ and compare results.

In [None]:
# Placeholder: replace None with the epsilon value to try before running this cell.
epsilon = None
# 50 independent runs, each with a fresh eGreedy policy.
regret = [run_sim1(eGreedy(2, epsilon=epsilon), mab) for _ in range(50)]
# Plot the step-wise mean over the runs next to the arm densities.
plot(np.mean(regret, axis=0), mab)

#### UCB
run for different values of confidence level and compare results.


In [None]:
# Placeholder: replace None with the confidence-level coefficient to try before running this cell.
c_level = None
# 50 independent runs, each with a fresh UCB policy.
regret = [run_sim1(UCB(2, c_level=c_level), mab) for _ in range(50)]
# Plot the step-wise mean over the runs next to the arm densities.
plot(np.mean(regret, axis=0), mab)

### 3.2
Assume Multi-Armed Bandit variables are as follows.

In [19]:
# Scenario 3.2: same means (6 and 4) as before, but both variances raised to 10.
mab = [Arm(6, 10), Arm(4, 10)]

#### Thompson Sampling
run and compare results.

In [None]:
# 50 independent runs of 500 steps, each with a fresh ThompsonSampling policy.
regret = [run_sim1(ThompsonSampling([b.var for b in mab]), mab, step_num=500) for _ in range(50)]
# Plot the step-wise mean over the runs next to the arm densities.
plot(np.mean(regret, axis=0), mab)

#### ϵ-Greedy
run for different values of ϵ and compare results.

In [None]:
# Placeholder: replace None with the epsilon value to try before running this cell.
epsilon = None
# 50 independent runs of 500 steps, each with a fresh eGreedy policy.
regret = [run_sim1(eGreedy(2, epsilon=epsilon), mab, step_num=500) for _ in range(50)]
# Plot the step-wise mean over the runs next to the arm densities.
plot(np.mean(regret, axis=0), mab)

#### UCB
run for different values of confidence level and compare results.

In [None]:
# Placeholder: replace None with the confidence-level coefficient to try before running this cell.
c_level = None
# 50 independent runs of 500 steps, each with a fresh UCB policy.
regret = [run_sim1(UCB(2, c_level=c_level), mab, step_num=500) for _ in range(50)]
# Plot the step-wise mean over the runs next to the arm densities.
plot(np.mean(regret, axis=0), mab)

## 4
The simulation below assumes a non-stationary multi-armed bandit. Specifically, in this simulation the mean of the first arm's distribution changes at step 100. Describe the result of Thompson Sampling.

In [31]:
def run_sim2(ts, mab, step_num=200, change_step=100, new_mean=2):
    """
    Run a non-stationary bandit simulation (exercise scaffold).

    ts: policy object; the notebook passes ThompsonSampling and
        NewThompsonSampling instances.
    mab: list of arms; every arm must expose a ``mean`` attribute.
    step_num: number of iterations of the simulation loop.
    change_step: iteration at which the mean of the first arm is changed.
    new_mean: value assigned to ``mab[0].mean`` at ``change_step``.

    The first arm is modified in place during the run and its original mean
    is always restored before returning, even when the loop raises, so the
    same ``mab`` list can be reused for repeated runs.

    Returns the ``regret`` list. Until the marked section is implemented
    nothing is appended to it, so an empty list is returned.
    """
    init_mean = mab[0].mean
    best_mean = np.max([b.mean for b in mab])
    regret = []
    try:
        for i in range(step_num):
            if i == change_step:
                mab[0].mean = new_mean
                # The best arm may have changed, so refresh the reference mean.
                best_mean = np.max([b.mean for b in mab])

            # ==================================== Your Code (Begin) ==================================

            # run thompson sampling algorithm and return cumulative regret

            # ==================================== Your Code (End) ====================================
    finally:
        # Undo the in-place change so later runs start from the original arms.
        mab[0].mean = init_mean
    return regret

In [None]:
# Two arms with means 6 and 4, both with variance 0.5.
mab = [Arm(6, 0.5), Arm(4, 0.5)]
# 50 independent runs of the non-stationary simulation, each with a fresh ThompsonSampling policy.
regret = [run_sim2(ThompsonSampling([b.var for b in mab]), mab) for _ in range(50)]
# Plot the step-wise mean over the runs next to the arm densities.
plot(np.mean(regret, axis=0), mab)

### 4.1
Change the Thompson Sampling algorithm to improve the results in a non-stationary MAB.

In [33]:
class NewThompsonSampling:
    """
    Exercise scaffold for a Thompson Sampling variant aimed at
    non-stationary bandits.

    The method bodies are placeholders to be completed between the
    ``Your Code`` markers; as written they store nothing and return None.
    """

    def __init__(self, var_list, buffer_size=30, **kwargs):
        """
        Create the policy.

        var_list: variances of the arms (the notebook passes ``[b.var for b in mab]``).
        buffer_size: defaults to 30; presumably the number of recent rewards
            to remember -- confirm when implementing.
        kwargs: extra keyword arguments; not used by the scaffold.
        """
        pass

    def select_arm(self, *args):
        """
        Choose the arm to pull next.

        Intended to return the index of the selected arm; the scaffold
        currently returns None. Positional arguments are accepted and ignored.
        """
        # ==================================== Your Code (Begin) ==================================
        
        # select arm based on estimate of prior distribution

        # return index of selected arm

        # ==================================== Your Code (End) ====================================
        pass

    def update(self, idx, reward):
        """
        Incorporate an observed reward.

        idx: presumably the index of the arm that was pulled -- confirm against the caller.
        reward: reward observed for that pull.
        """
        # ==================================== Your Code (Begin) ==================================
        
        # update prior based on reward
        
        # ==================================== Your Code (End) ====================================
        pass

In [None]:
# Two arms with means 6 and 4, both with variance 0.5.
mab = [Arm(6, 0.5), Arm(4, 0.5)]
# 50 independent runs of the non-stationary simulation, each with a fresh NewThompsonSampling policy.
regret = [run_sim2(NewThompsonSampling([b.var for b in mab]), mab) for _ in range(50)]
# Plot the step-wise mean over the runs next to the arm densities.
plot(np.mean(regret, axis=0), mab)