In [31]:
import numpy as np
from bokeh.plotting import figure, show
from bokeh.io import output_notebook
output_notebook()

In [38]:
# Build a random N-armed bandit and plot the value assigned to each arm.
N = 10                 # number of arms
Q = np.random.rand(N)  # one value per arm; later cells use Q[a] as the mean reward of arm a
# Function-call form of print: identical output on Python 2, and no longer a
# SyntaxError on Python 3.
print(Q)
p = figure(width=800, height=600)
p.vbar(x=range(N), width=0.5, bottom=0, top=Q)
show(p, notebook_handle=True)

[ 0.02519201  0.0325917   0.25339887  0.34291258  0.20260037  0.37306524
  0.64978318  0.44163615  0.61484768  0.5233074 ]


In [39]:
# Epsilon-greedy: compare several exploration rates on the same bandit `Q`.
#
# `p`  plots the running average reward per step.
# `pa` plots the running fraction of steps on which the best arm was pulled.

steps = 2000
p = figure(width=800, height=600)
pa = figure(width=800, height=600)
epsilon = [0, 0.02, 0.2]
colors = ['green', 'red', 'black']

# Loop-invariant helpers, computed once instead of once per epsilon / per step.
best_arm = np.argmax(Q)
ind = np.arange(1, steps + 1, dtype=float)  # 1..steps, divisor for running averages

for idx, eps in enumerate(epsilon):
    # Every epsilon must start from a blank slate. Sharing `q`/`n` across runs
    # lets later epsilons inherit the estimates learned by earlier ones, which
    # makes the curves incomparable.
    q = np.zeros(N)  # estimated value of each arm
    n = np.zeros(N)  # number of pulls of each arm
    rs = []          # reward received at each step
    ax = []          # arm chosen at each step
    for t in range(steps):
        # Explore with probability eps, otherwise exploit the current estimate.
        if np.random.rand() < eps:
            a = np.random.randint(0, N)
        else:
            a = np.argmax(q)
        r = Q[a] + np.random.normal(0, 1)  # noisy reward around the arm's value
        rs.append(r)
        ax.append(a)
        # Incremental sample mean of the rewards seen for arm `a`.
        n[a] += 1
        q[a] += (r - q[a]) / n[a]
    rs_ = np.cumsum(rs)
    ax_ = np.cumsum([axi == best_arm for axi in ax])
    p.circle(x=range(steps), y=rs_/ind, color=colors[idx], size=1, legend='Epsilon:%f' % eps)
    pa.circle(x=range(steps), y=ax_/ind, color=colors[idx], size=1, legend='Epsilon:%f' % eps)

# Position the legends once, after every series has been added.
p.legend.location = "bottom_left"
pa.legend.location = "bottom_left"
show(p)
show(pa)

In [22]:
# Display the cumulative-reward array `rs_` currently held in the kernel.
# NOTE(review): this cell's execution count (22) is lower than that of the
# cells before it, so the output shown may come from an older run -- confirm
# or re-run before relying on it.
rs_

array([    1.86567912,     2.79774701,     3.57923899, ...,  1784.5777636 ,
        1785.44299275,  1785.91473201])

In [40]:
# UCB: compare several exploration constants `c` on the same bandit `Q`.
#
# `p`  plots the running average reward per step.
# `pa` plots the running fraction of steps on which the best arm was pulled.

steps = 2000
p = figure(width=800, height=600)
pa = figure(width=800, height=600)
cs = [1, 2, 3]
colors = ['green', 'red', 'black']

# Loop-invariant helpers, computed once instead of once per c / per step.
best_arm = np.argmax(Q)
ind = np.arange(1, steps + 1, dtype=float)  # 1..steps, divisor for running averages

for idx, c in enumerate(cs):
    # Reset the learner for every c. Reusing `q`/`n` from the previous run
    # shrinks the exploration bonus from step 0 and carries over its estimates.
    q = np.zeros(N)  # estimated value of each arm
    n = np.zeros(N)  # number of pulls of each arm
    rs = []          # reward received at each step
    ax = []          # arm chosen at each step
    for t in range(steps):
        # Value estimate plus an exploration bonus that decays with pull count.
        # The +1 terms keep the log and the division finite at t == 0 / n == 0.
        a = np.argmax(q + c * np.sqrt(np.log(t+1)/(n+1)))
        r = Q[a] + np.random.normal(0, 1)  # noisy reward around the arm's value
        rs.append(r)
        ax.append(a)
        # Incremental sample mean of the rewards seen for arm `a`.
        n[a] += 1
        q[a] += (r - q[a]) / n[a]
    rs_ = np.cumsum(rs)
    ax_ = np.cumsum([axi == best_arm for axi in ax])
    p.circle(x=range(steps), y=rs_/ind, color=colors[idx], size=1, legend='C:%f' % c)
    pa.circle(x=range(steps), y=ax_/ind, color=colors[idx], size=1, legend='C:%f' % c)

# Position the legends once, after every series has been added.
p.legend.location = "bottom_left"
pa.legend.location = "bottom_left"
show(p)
show(pa)

In [41]:
# Gradient bandit -- shared configuration for the step-size comparison.
steps = 2000
alpha = [0.1, 0.2, 0.4]             # step sizes to compare
colors = ['green', 'red', 'black']  # one plot colour per step size

h = np.zeros(N)                     # action preferences, all equal at the start

p = figure(width=800, height=600)
pa = figure(width=800, height=600)

def softmax(x):
    """Return the softmax of ``x``, normalised along axis 0.

    The global maximum of ``x`` is subtracted before exponentiating so that
    large inputs do not overflow; this shift does not change the result.
    """
    shifted = x - np.max(x)
    weights = np.exp(shifted)
    total = weights.sum(axis=0)
    return weights / total

# Gradient bandit: compare several step sizes on the same bandit `Q`.
#
# `p`  plots the running average reward per step.
# `pa` plots the running fraction of steps on which the best arm was pulled.

# Loop-invariant helpers, computed once instead of once per alpha / per step.
best_arm = np.argmax(Q)
ind = np.arange(1, steps + 1, dtype=float)  # 1..steps, divisor for running averages

for idx, alp in enumerate(alpha):
    # Fresh preferences for every step size; otherwise each run starts from
    # the policy the previous alpha already learned.
    h = np.zeros(N)
    rs = []           # reward received at each step
    ax = []           # arm chosen at each step
    reward_sum = 0.0  # running total, so the baseline costs O(1) per step
    for t in range(steps):
        pi = softmax(h)
        # No `size` argument: yields a scalar arm index rather than a
        # 1-element array, so `r`, `rs` and `ax` hold plain scalars.
        a = np.random.choice(N, p=pi)
        r = Q[a] + np.random.normal(0, 1)  # noisy reward around the arm's value
        rs.append(r)
        ax.append(a)
        # Baseline is the mean of all rewards so far, current one included.
        reward_sum += r
        advantage = r - reward_sum / (t + 1)
        # Push every preference down in proportion to its probability, then
        # push the chosen arm up; net effect on h[a] is alp*advantage*(1 - pi[a]).
        h -= alp * advantage * pi
        h[a] += alp * advantage
    rs_ = np.cumsum(rs)
    ax_ = np.cumsum([axi == best_arm for axi in ax])
    p.circle(x=range(steps), y=rs_/ind, color=colors[idx], size=1, legend='Alpha:%f' % alp)
    pa.circle(x=range(steps), y=ax_/ind, color=colors[idx], size=1, legend='Alpha:%f' % alp)

# Position the legends once, after every series has been added.
p.legend.location = "bottom_left"
pa.legend.location = "bottom_left"
show(p)
show(pa)

array([  9.90412406e-01,   3.88307964e-04,   4.78145529e-04,
         4.22421151e-04,   3.73881678e-04,   4.07098410e-04,
         1.23934432e-03,   4.93706357e-03,   1.20191021e-03,
         1.39421411e-04])