In [1]:
import numpy as np
import pandas as pd
import time
from functools import partial

from scipy.stats import randint, uniform
from sim_lib import simulation

pd.options.mode.chained_assignment = None


### Epsilon-greedy policy: baseline

In [2]:
def eps_greedy(history: pd.DataFrame, eps: float):
    if uniform.rvs() < eps:
        n = history.shape[0]
        return history.index[randint.rvs(0, n)]

    ctr = history['clicks'] / (history['impressions'] + 10)
    n = np.argmax(ctr)
    return history.index[n]

policy = partial(eps_greedy, eps=0.06)

In [3]:
# seed for homework
seed = 18475
np.random.seed(seed=seed)

start = time.time()
output = simulation(policy, n=200000, seed=seed)
end = time.time()
end - start

1 impressions have been simulated
10001 impressions have been simulated
20001 impressions have been simulated
30001 impressions have been simulated
40001 impressions have been simulated
50001 impressions have been simulated
60001 impressions have been simulated
70001 impressions have been simulated
80001 impressions have been simulated
90001 impressions have been simulated
100001 impressions have been simulated
110001 impressions have been simulated
120001 impressions have been simulated
130001 impressions have been simulated
140001 impressions have been simulated
150001 impressions have been simulated
160001 impressions have been simulated
170001 impressions have been simulated
180001 impressions have been simulated
190001 impressions have been simulated


521.509269952774

In [4]:
# baseline regret
output['regret'], output['regret']/output['rounds'],  output['total_banners']

(1540.7609683932544, 0.007703804841966272, 184)

In [5]:
output['history']

Unnamed: 0,impressions,clicks,lifetime,p
153,18970.0,4199.0,18003.025431,0.220134
162,228.0,26.0,1537.166719,0.11378
172,163.0,32.0,19648.592394,0.219968
173,170.0,18.0,12771.47499,0.122694
180,26.0,3.0,4655.819793,0.020061
182,6.0,0.0,889.624649,0.004621
183,1.0,0.0,15187.163761,0.073886


### Upper Confidence Bound 

In [7]:
class UCB:
    def __init__(self, eps):
        self.eps = eps
        self.t = 1
        
    def __call__(self, history):
        teta = history['clicks']
        n = history['impressions'] + 1
        teta /= n
        idx = np.argmax(teta + self.eps * np.sqrt(2*np.log(self.t) / n))
        self.t += 1
        return history.index[idx]

Сначала переберем коэффициент $\varepsilon$ (exploitation-exploration) в далеком диапазоне, а потом больше значений вокруг получившегося значения.

In [10]:
seed = 18475
for i, eps in enumerate([0.1, 2, 10]):
    np.random.seed(seed=seed)
    s = time.time()
    policy = UCB(eps = eps)
    print(f"{i+1}. eps = {eps}")
    output = simulation(policy, n=200000, seed=seed)
    e = time.time()
    print('time spent: ', e - s)
    print('results:')
    print(output['regret'], output['regret']/output['rounds'],  output['total_banners'])
    print('--------------------------------------------------------------------')

1. eps = 0.1
1 impressions have been simulated
10001 impressions have been simulated
20001 impressions have been simulated
30001 impressions have been simulated
40001 impressions have been simulated
50001 impressions have been simulated
60001 impressions have been simulated
70001 impressions have been simulated
80001 impressions have been simulated
90001 impressions have been simulated
100001 impressions have been simulated
110001 impressions have been simulated
120001 impressions have been simulated
130001 impressions have been simulated
140001 impressions have been simulated
150001 impressions have been simulated
160001 impressions have been simulated
170001 impressions have been simulated
180001 impressions have been simulated
190001 impressions have been simulated
time spent:  428.83235454559326
results:
258.3158253051061 0.0012915791265255305 184
--------------------------------------------------------------------
2. eps = 2
1 impressions have been simulated
10001 impressions have

**Вывод:** слишком большие коэффициенты не подходят - даже при $\varepsilon = 2$ слишком много исследования среды. При $\varepsilon = 0.1$ уже получилось побить бейзлайн. Проверим еще значения вокруг $0.1$.

In [9]:
seed = 18475
for i, eps in enumerate(np.arange(0.05, 1, 0.1)):
    np.random.seed(seed=seed)
    s = time.time()
    policy = UCB(eps = eps)
    print(f"{i+1}. eps = {eps}")
    output = simulation(policy, n=200000, seed=seed)
    e = time.time()
    print('time spent: ', e - s)
    print('results:')
    print(output['regret'], output['regret']/output['rounds'],  output['total_banners'])
    print('--------------------------------------------------------------------')

1. eps = 0.05
1 impressions have been simulated
10001 impressions have been simulated
20001 impressions have been simulated
30001 impressions have been simulated
40001 impressions have been simulated
50001 impressions have been simulated
60001 impressions have been simulated
70001 impressions have been simulated
80001 impressions have been simulated
90001 impressions have been simulated
100001 impressions have been simulated
110001 impressions have been simulated
120001 impressions have been simulated
130001 impressions have been simulated
140001 impressions have been simulated
150001 impressions have been simulated
160001 impressions have been simulated
170001 impressions have been simulated
180001 impressions have been simulated
190001 impressions have been simulated
time spent:  451.9842474460602
results:
2424.8962079893777 0.012124481039946889 184
--------------------------------------------------------------------
2. eps = 0.15000000000000002
1 impressions have been simulated
1000

160001 impressions have been simulated
170001 impressions have been simulated
180001 impressions have been simulated
190001 impressions have been simulated
time spent:  432.03131556510925
results:
9153.641516745054 0.04576820758372527 184
--------------------------------------------------------------------
10. eps = 0.9500000000000002
1 impressions have been simulated
10001 impressions have been simulated
20001 impressions have been simulated
30001 impressions have been simulated
40001 impressions have been simulated
50001 impressions have been simulated
60001 impressions have been simulated
70001 impressions have been simulated
80001 impressions have been simulated
90001 impressions have been simulated
100001 impressions have been simulated
110001 impressions have been simulated
120001 impressions have been simulated
130001 impressions have been simulated
140001 impressions have been simulated
150001 impressions have been simulated
160001 impressions have been simulated
170001 impress

**Вывод:** $\varepsilon$, побеждающий бейзлайн лежит от примерно 0.1 до 0.25. При этом лучший результат получился при $\varepsilon = 0.1$.

In [11]:
seed = 18475
for i, eps in enumerate(np.arange(0.11, 0.15, 0.01)):
    np.random.seed(seed=seed)
    s = time.time()
    policy = UCB(eps = eps)
    print(f"{i+1}. eps = {eps}")
    output = simulation(policy, n=200000, seed=seed)
    e = time.time()
    print('time spent: ', e - s)
    print('results:')
    print(output['regret'], output['regret']/output['rounds'],  output['total_banners'])
    print('--------------------------------------------------------------------')

1. eps = 0.11
1 impressions have been simulated
10001 impressions have been simulated
20001 impressions have been simulated
30001 impressions have been simulated
40001 impressions have been simulated
50001 impressions have been simulated
60001 impressions have been simulated
70001 impressions have been simulated
80001 impressions have been simulated
90001 impressions have been simulated
100001 impressions have been simulated
110001 impressions have been simulated
120001 impressions have been simulated
130001 impressions have been simulated
140001 impressions have been simulated
150001 impressions have been simulated
160001 impressions have been simulated
170001 impressions have been simulated
180001 impressions have been simulated
190001 impressions have been simulated
time spent:  441.46486163139343
results:
260.841092152149 0.0013042054607607448 184
--------------------------------------------------------------------
2. eps = 0.12
1 impressions have been simulated
10001 impressions h

**Вывод**: наилучший результат получился для $\varepsilon = 0.1$. 

*regret(UCB)* = 0.0013 < *regret(eps-greedy)* = 0.0077