In [2]:
import numpy as np
import pandas as pd
import time
from functools import partial

from scipy.stats import randint, uniform
from sim_lib import simulation

# Turn off pandas' chained-assignment check (SettingWithCopyWarning) for the whole session.
# NOTE(review): presumably needed because of assignments made inside sim_lib — confirm.
pd.options.mode.chained_assignment = None


In [3]:
def eps_greedy(history: pd.DataFrame, eps: float):
    """Select a row of ``history`` with the epsilon-greedy rule.

    With probability ``eps`` a row is drawn uniformly at random
    (exploration); otherwise the row with the highest smoothed
    click-through rate ``clicks / (impressions + 10)`` is taken
    (exploitation).

    Args:
        history: Frame with ``clicks`` and ``impressions`` columns.
        eps: Exploration probability.

    Returns:
        The index label of the selected row of ``history``.
    """
    # The two scipy draws are kept exactly as they were so that a seeded
    # run consumes the random stream in the same way.
    if uniform.rvs() < eps:
        n = history.shape[0]
        return history.index[randint.rvs(0, n)]

    # The +10 in the denominator shrinks the rate of rows with few impressions.
    ctr = history['clicks'] / (history['impressions'] + 10)
    # idxmax returns the label of the first maximum directly, so the result
    # does not depend on whether argmax on a Series is positional.
    return ctr.idxmax()


# Baseline policy: eps_greedy with the exploration probability fixed at 0.06.
policy = partial(eps_greedy, eps=0.06)

In [4]:
# Fixed seed prescribed by the homework; also seeds numpy's global generator.
seed = 18475
np.random.seed(seed)

# Time the baseline run; the elapsed seconds are shown as the cell output.
start = time.time()
output = simulation(policy, n=200_000, seed=seed)
end = time.time()
end - start

1 impressions have been simulated
10001 impressions have been simulated
20001 impressions have been simulated
30001 impressions have been simulated
40001 impressions have been simulated
50001 impressions have been simulated
60001 impressions have been simulated
70001 impressions have been simulated
80001 impressions have been simulated
90001 impressions have been simulated
100001 impressions have been simulated
110001 impressions have been simulated
120001 impressions have been simulated
130001 impressions have been simulated
140001 impressions have been simulated
150001 impressions have been simulated
160001 impressions have been simulated
170001 impressions have been simulated
180001 impressions have been simulated
190001 impressions have been simulated


607.1075055599213

In [5]:
# Baseline summary: total regret, regret per round, and the total banner count.
(
    output['regret'],
    output['regret'] / output['rounds'],
    output['total_banners'],
)

(1540.7609683932544, 0.007703804841966272, 184)

In [6]:
# Show the history table returned by the last simulation run.
output['history']

Unnamed: 0,impressions,clicks,lifetime,p
153,18970.0,4199.0,18003.025431,0.220134
162,228.0,26.0,1537.166719,0.11378
172,163.0,32.0,19648.592394,0.219968
173,170.0,18.0,12771.47499,0.122694
180,26.0,3.0,4655.819793,0.020061
182,6.0,0.0,889.624649,0.004621
183,1.0,0.0,15187.163761,0.073886


# Upper confidence bound

In [7]:
class UpperConfidenceBound:
    """UCB-style selection policy that keeps its own round counter.

    Every call scores each row of ``history`` as::

        clicks / (impressions + 1)
            + tradeoff * sqrt(log(t) / (impressions + 1))

    where ``t`` is the number of calls made to this instance so far, and
    returns the index label of the best-scoring row.

    The counter is never reset, so use a fresh instance for every
    independent simulation run.
    """

    def __init__(self, tradeoff):
        """Store the exploration weight and start the round counter at zero.

        Args:
            tradeoff: Multiplier applied to the exploration term; ``0``
                leaves only the exploitation term.
        """
        self.tradeoff = tradeoff
        self.t = 0

    def __call__(self, history: pd.DataFrame):
        """Advance the round counter and pick the best-scoring row.

        Args:
            history: Frame with ``clicks`` and ``impressions`` columns.

        Returns:
            The index label of the row with the highest score.
        """
        self.t += 1
        # The +1 keeps both denominators non-zero for rows with 0 impressions.
        exploitation = history['clicks'] / (history['impressions'] + 1)
        # log(1) == 0, so the very first call has no exploration bonus.
        exploration = np.sqrt(np.log(self.t) / (history['impressions'] + 1))
        score = exploitation + self.tradeoff * exploration
        # idxmax returns the label of the first maximum directly, so the result
        # does not depend on whether argmax on a Series is positional.
        return score.idxmax()

С помощью Grid Search оптимизируем tradeoff. Симуляцию будем проводить на 50000 показов, чтобы побыстрее найти баланс.

In [8]:
# Coarse grid over the exploration weight; every run restarts from the same seed.
for tradeoff in (0.0, 0.25, 0.5, 0.75, 1.0):
    np.random.seed(seed)
    start = time.time()
    ucb_policy = UpperConfidenceBound(tradeoff)
    output = simulation(ucb_policy, n=50_000, seed=seed)
    end = time.time()
    print(f"tradeoff={tradeoff} result={output['regret'], output['regret']/output['rounds'],  output['total_banners']} time {end - start}")


1 impressions have been simulated
10001 impressions have been simulated
20001 impressions have been simulated
30001 impressions have been simulated
40001 impressions have been simulated
tradeoff=0.0 result=(4347.373087493353, 0.08694746174986705, 52) time 183.82172966003418
1 impressions have been simulated
10001 impressions have been simulated
20001 impressions have been simulated
30001 impressions have been simulated
40001 impressions have been simulated
tradeoff=0.25 result=(188.49624262724933, 0.0037699248525449864, 52) time 180.10674333572388
1 impressions have been simulated
10001 impressions have been simulated
20001 impressions have been simulated
30001 impressions have been simulated
40001 impressions have been simulated
tradeoff=0.5 result=(553.0571392630012, 0.011061142785260025, 52) time 179.22453451156616
1 impressions have been simulated
10001 impressions have been simulated
20001 impressions have been simulated
30001 impressions have been simulated
40001 impressions have

На данном этапе лучший регрет — 188 при tradeoff 0.25. Проверим близкие значения.

In [9]:
# Second pass: values around the best coarse-grid point, same seed for every run.
for tradeoff in (0.05, 0.15, 0.35):
    np.random.seed(seed)
    start = time.time()
    ucb_policy = UpperConfidenceBound(tradeoff)
    output = simulation(ucb_policy, n=50_000, seed=seed)
    end = time.time()
    print(f"tradeoff={tradeoff} result={output['regret'], output['regret']/output['rounds'],  output['total_banners']} time {end - start}")


1 impressions have been simulated
10001 impressions have been simulated
20001 impressions have been simulated
30001 impressions have been simulated
40001 impressions have been simulated
tradeoff=0.05 result=(1354.0384005915591, 0.027080768011831183, 52) time 181.20058965682983
1 impressions have been simulated
10001 impressions have been simulated
20001 impressions have been simulated
30001 impressions have been simulated
40001 impressions have been simulated
tradeoff=0.15 result=(69.92987091948771, 0.0013985974183897542, 52) time 183.04944682121277
1 impressions have been simulated
10001 impressions have been simulated
20001 impressions have been simulated
30001 impressions have been simulated
40001 impressions have been simulated
tradeoff=0.35 result=(333.23838142807404, 0.006664767628561481, 52) time 189.5778636932373


На данный момент лучший tradeoff — 0.15 (регрет 69.9); при 0.25 регрет был выше — 188. От 0.15 до 0.25 регрет увеличивается, поэтому поищем в районе ниже 0.15.

In [10]:
# Third pass: finer steps below 0.15, same seed for every run.
for tradeoff in (0.075, 0.1, 0.125):
    np.random.seed(seed)
    start = time.time()
    ucb_policy = UpperConfidenceBound(tradeoff)
    output = simulation(ucb_policy, n=50_000, seed=seed)
    end = time.time()
    print(f"tradeoff={tradeoff} result={output['regret'], output['regret']/output['rounds'],  output['total_banners']} time {end - start}")


1 impressions have been simulated
10001 impressions have been simulated
20001 impressions have been simulated
30001 impressions have been simulated
40001 impressions have been simulated
tradeoff=0.075 result=(46.73681910123482, 0.0009347363820246964, 52) time 180.17704129219055
1 impressions have been simulated
10001 impressions have been simulated
20001 impressions have been simulated
30001 impressions have been simulated
40001 impressions have been simulated
tradeoff=0.1 result=(76.34013721002074, 0.001526802744200415, 52) time 178.6619954109192
1 impressions have been simulated
10001 impressions have been simulated
20001 impressions have been simulated
30001 impressions have been simulated
40001 impressions have been simulated
tradeoff=0.125 result=(55.807639488154045, 0.001116152789763081, 52) time 182.29276514053345


Выбираем tradeoff с лучшим результатом — 0.075. Просимулируем на 200000 показов.

In [12]:
# Final full-length run with the tradeoff selected by the grid search.
best_tradeoff = 0.075
np.random.seed(seed=seed)
start = time.time()
output = simulation(UpperConfidenceBound(best_tradeoff), n=200000, seed=seed)
end = time.time()
# Report the value that was actually simulated; a `tradeoff` variable left over
# from an earlier loop would label this result with the wrong parameter.
print(f"tradeoff={best_tradeoff} result={output['regret'], output['regret']/output['rounds'],  output['total_banners']} time {end - start}")


1 impressions have been simulated
10001 impressions have been simulated
20001 impressions have been simulated
30001 impressions have been simulated
40001 impressions have been simulated
50001 impressions have been simulated
60001 impressions have been simulated
70001 impressions have been simulated
80001 impressions have been simulated
90001 impressions have been simulated
100001 impressions have been simulated
110001 impressions have been simulated
120001 impressions have been simulated
130001 impressions have been simulated
140001 impressions have been simulated
150001 impressions have been simulated
160001 impressions have been simulated
170001 impressions have been simulated
180001 impressions have been simulated
190001 impressions have been simulated
tradeoff=0.125 result=(133.3224466487009, 0.0006666122332435045, 184) time 739.410647392273


Получили регрет 133 при tradeoff 0.075. Бейзлайн (регрет 1540) побеждён.