# Colab Notebook created for ARAP module at UPC ETSETB.

**Authors**: Josep Vidal, Juan José Nieto, Margarita Cabrera-Bean


In [2]:
# Colab form field (string input); presumably the name of the student submitting the notebook.
Student = '' #@param {type:"string"}

# MULTI-ARMED BANDIT

[Open task description in another tab](https://drive.google.com/file/d/10g5cJlfS_VcfEcrmdn4uNaRvEsSah8ax/view?usp=sharing)


The multi-armed bandit (MAB) framework has attracted a lot of attention in various applications, from recommender systems and information retrieval to healthcare and finance, due to its stellar performance combined with certain attractive properties, such as learning from limited feedback. The multi-armed bandit field is currently flourishing, as novel problem settings and algorithms motivated by various practical applications are being introduced.

Use Matlab or Python to answer the following questions:



*   Use the code provided for the e-greedy algorithm. Check the reward on a single run for smaller values of the variance of the Gaussian f(r|a) in Example 2.1 of the slides. Draw conclusions.
*   Think of a practical application that can be modeled with an m-armed bandit. Guess a meaningful f(r|a) ∀a (Gaussian, binary Bernoulli, exponential, etc.) for that application. You may get some inspiration from [this paper](https://drive.google.com/file/d/1bOpOjRyHXAB91XF6uADV7tcWo2D2zHsK/view?usp=sharing). Assume stationarity over time.
*   Program it using the base code provided, in which Gaussian rewards have been assumed.
*   Check the average reward obtained at convergence for several values of the parameter associated with the UCB technique. Also check the number of correct decisions.
*   Plot the rewards for every action on a single run.
*   Make the environment non-stationary and include the appropriate changes.









In [None]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Seed NumPy's global random generator so the random draws made by the cells below
# are repeatable when the notebook is run top to bottom on a fresh kernel.
np.random.seed(4)

### Plot functions

In [None]:
def insert_labels(labels, ax):
    """Insert title and axis labels into a plot.

    Parameters
    ----------
    labels : dict
        May contain any of the keys 'title', 'xlabel' and 'ylabel'.
        Keys that are absent are skipped; other keys are ignored.
    ax : object
        Target axes. Only the setter matching each present key
        (``set_title``, ``set_xlabel``, ``set_ylabel``) is called.
    """
    # Same order as before: title, then x label, then y label.
    for key in ('title', 'xlabel', 'ylabel'):
        if key in labels:
            # Look the setter up lazily so an axes-like object only needs
            # the setters for the keys that are actually supplied.
            getattr(ax, 'set_' + key)(labels[key])

def generate_plot(r, ba, q, strg, par, optimum=None):
    """Draw the three summary panels of a bandit experiment.

    Panels, left to right: average reward, percentage of optimal actions,
    and the action-value curves Q(a).

    Parameters
    ----------
    r : sequence of 1-D arrays
        One average-reward curve per entry of ``par``.
    ba : sequence of 1-D arrays
        One percentage-of-optimal-actions curve per entry of ``par``.
    q : sequence of 1-D arrays
        One Q(a) curve per action; the panel is titled with the last entry
        of ``par``.
    strg : bool or int
        Truthy selects the 'UCB' / 'c' legend wording, falsy selects
        'e-greedy' / 'delta'.
    par : sequence
        Parameter values, one legend entry each. Must be non-empty because
        the last legend entry is used as the third panel's title.
    optimum : float, optional
        Level of the horizontal "optimum reward" reference line on the first
        panel. When omitted it falls back to ``np.max(meanA)``, read from the
        notebook-level ``meanA`` as the original code did.
    """
    method, symbol = ('UCB', 'c') if strg else ('e-greedy', 'delta')
    str_legend = ['{}, {}={}'.format(method, symbol, p) for p in par]

    conf_1 = {'title': 'Average reward', 'xlabel': 'steps', 'ylabel': 'avg reward'}
    conf_2 = {'title': 'Percentage optimal actions', 'xlabel': 'steps', 'ylabel': '% optimal actions'}
    conf_3 = {'title': str_legend[-1], 'xlabel': 'steps', 'ylabel': 'Q(a)'}

    fig, ax = plt.subplots(1, 3, figsize=(18, 7))
    for i, (data, conf) in enumerate(zip([r, ba, q], [conf_1, conf_2, conf_3])):
        for curve in data:
            ax[i].plot(curve)
        # Labels do not depend on the individual curves: set them once per panel.
        insert_labels(conf, ax[i])
        if i < 2:
            ax[i].legend(str_legend)

    # Include the optimum reward in the display. The line is sized from the
    # reward curves it is drawn over rather than from a leftover loop variable,
    # and is skipped when there is no reward curve to compare against.
    if len(r):
        if optimum is None:
            optimum = np.max(meanA)
        ax[0].plot(np.ones(len(r[0])) * optimum)

### Multi-armed bandit setting

In [None]:
# --- Bandit environment: m actions with Gaussian rewards ---------------------
m = 10                  # number of actions
dispMeansA = 1.5        # scale applied to the randomly drawn action means
dispStd = 0.05          # scale applied to the randomly drawn action standard deviations

# Draw order (means first, then standard deviations) fixes which random numbers
# each quantity receives under a given seed.
meanA = dispMeansA * np.random.randn(m)     # mean reward of every action
bestAction = np.argmax(meanA)               # index of the action with the largest mean
stdA = dispStd * np.random.rand(m)          # reward std of every action, uniform in [0, dispStd)

# Time constant for incremental estimation of Q in a time-varying environment
# (the stationary e-greedy update shown in this notebook uses 1/ta instead).
alpha = 0.01

### Simulation setting

In [None]:
# --- Experiment size ----------------------------------------------------------
NRuns, NSteps = 200, 500                # independent runs to be averaged, time steps per run

# --- Strategy switch ----------------------------------------------------------
Strat = 0                               # 0: e-greedy, 1: UCB

# --- Zero-initialised result buffers ------------------------------------------
r = np.zeros((NRuns, NSteps))           # instantaneous rewards, indexed [run, step]
BA = np.zeros((NRuns, NSteps))          # best action collection, indexed [run, step]
Q = np.zeros((m, NSteps))               # average reward per action, indexed [action, step]

### Decision taking setting

In [None]:
# Values swept for the selected strategy's parameter:
#   Strat truthy -> values of c (UCB)
#   Strat falsy  -> values of delta (e-greedy)
par = np.array([0.5, 1, 2]) if Strat else np.array([0, 0.1, 1])

### Random trials for each parameter of the algorithm and for all independent runs

In [None]:
# For every parameter value, simulate NRuns independent runs of NSteps steps and
# store the run-averaged reward and the percentage of optimal-action selections.
avg_r = []                                                      # one averaged reward curve per parameter
avg_ba = []                                                     # one % optimal-action curve per parameter
for e in range(len(par)):                                       # parameters for the method
    BA = np.zeros((NRuns, NSteps))                              # identifies if best action has been selected
    for i in range(NRuns):
        Q = np.zeros((m, NSteps))                               # average reward per action
        Q[:,0] = np.random.randn(m)*0.1                         # initialization of Q
        ta = np.zeros((m))                                      # times each action is selected
        for j in range(1, NSteps):
            # e-greedy
            if not Strat:
                greedy = np.argmax(Q[:,j-1])                    # select best action
                # Explore with probability min(1, m*delta/j), which decays with the step index.
                if np.random.rand() > min(1, m*par[e]/j):       # e-greedy with decaying epsilon
                    a = greedy
                else:
                    # Draw from m-1 values and skip over the greedy index, so the
                    # exploratory action is always different from the greedy one.
                    randIndex = np.random.randint(m-1)          # select an action other than greedy one
                    a = randIndex + (randIndex >= greedy)

                ta[a] += 1
                r[i,j] = meanA[a] + np.random.randn()*stdA[a]   # obtain the gaussian reward
                Q[:,j] = Q[:, j-1]                              # update Q function
                # Q[a,j] still holds the previous estimate here, so this is the
                # incremental sample-mean update with step size 1/ta[a].
                Q[a,j] += 1/ta[a] * (r[i,j] - Q[a, j])
            # UCB
            else:
                # COMPLETE THE CODE FOR UCB HERE
                # Fail loudly until the branch is written: a silent no-op would leave
                # `a` undefined (or stale from an earlier e-greedy run) and would plot
                # rewards and Q values that were never updated.
                raise NotImplementedError(
                    'UCB (Strat=1) is not implemented yet: select action `a`, then update '
                    'ta, r[i,j] and Q[:,j] as in the e-greedy branch.')

            BA[i,j] += bestAction == a

    # np.mean already returns a new array, so the buffers need no defensive copy.
    avg_r.append(np.mean(r, axis=0))
    avg_ba.append(np.mean(BA, axis=0)*100)


In [None]:
# Plot the averaged reward, the percentage of optimal actions and the Q(a) curves.
# Q is whatever the simulation loop assigned last, i.e. the estimates from the
# final run of the final parameter value.
generate_plot(avg_r, avg_ba, Q, Strat, par)