In [3]:
import numpy as np
import pandas as pd
import time
from functools import partial

from scipy.stats import randint, uniform
from sim_lib import simulation

pd.options.mode.chained_assignment = None


In [4]:
def eps_greedy(history: pd.DataFrame, eps: float):
    """Pick a banner with an epsilon-greedy rule.

    A uniform draw below ``eps`` selects a uniformly random row of ``history``
    (exploration). Otherwise the row with the highest smoothed click-through
    rate, ``clicks / (impressions + 10)``, is selected (exploitation).

    Parameters
    ----------
    history : pd.DataFrame
        Table with ``'clicks'`` and ``'impressions'`` columns, one row per banner.
    eps : float
        Exploration threshold compared against a uniform draw.

    Returns
    -------
    The index label of the chosen row of ``history``.
    """
    explore = uniform.rvs() < eps
    if not explore:
        # The +10 in the denominator shrinks the CTR of rarely shown banners.
        smoothed_ctr = history['clicks'] / (history['impressions'] + 10)
        best_pos = np.argmax(smoothed_ctr)
        return history.index[best_pos]

    # Exploration branch: uniformly random row position in [0, number of rows).
    random_pos = randint.rvs(0, history.shape[0])
    return history.index[random_pos]

# Baseline policy: eps_greedy with eps bound to 0.06, so it is callable with `history` alone.
policy = partial(eps_greedy, eps=0.06)

In [5]:
# seed for homework
seed = 18475
np.random.seed(seed=seed)

# Time the baseline run. perf_counter() is a monotonic clock meant for measuring
# elapsed durations; time.time() can jump if the system clock is adjusted.
start = time.perf_counter()
output = simulation(policy, n=200000, seed=seed)
end = time.perf_counter()
base_time = end - start  # elapsed seconds for the baseline simulation
base_time

1 impressions have been simulated
10001 impressions have been simulated
20001 impressions have been simulated
30001 impressions have been simulated
40001 impressions have been simulated
50001 impressions have been simulated
60001 impressions have been simulated
70001 impressions have been simulated
80001 impressions have been simulated
90001 impressions have been simulated
100001 impressions have been simulated
110001 impressions have been simulated
120001 impressions have been simulated
130001 impressions have been simulated
140001 impressions have been simulated
150001 impressions have been simulated
160001 impressions have been simulated
170001 impressions have been simulated
180001 impressions have been simulated
190001 impressions have been simulated


344.92228960990906

In [6]:
# Baseline metrics taken from the eps-greedy simulation output.
base_regret = output['regret']
# `bese_regret` is the original (misspelled) name; kept as an alias because the
# comparison cell further down still reads it.
bese_regret = base_regret
base_avg_regret = base_regret / output['rounds']  # regret per round
base_banners = output['total_banners']

base_regret, base_avg_regret, base_banners

(1540.7609683932544, 0.007703804841966272, 184)

In [7]:
# Display the 'history' table returned by the baseline simulation run.
output['history']

Unnamed: 0,impressions,clicks,lifetime,p
153,18970.0,4199.0,18003.025431,0.220134
162,228.0,26.0,1537.166719,0.11378
172,163.0,32.0,19648.592394,0.219968
173,170.0,18.0,12771.47499,0.122694
180,26.0,3.0,4655.819793,0.020061
182,6.0,0.0,889.624649,0.004621
183,1.0,0.0,15187.163761,0.073886


# UCB policy

In [8]:
class UCB:
    """Upper-confidence-bound banner selection policy.

    Each call scores every row of ``history`` as
    ``clicks / n + c * sqrt(2 * ln(t) / n)`` with ``n = impressions + 1``,
    where ``t`` is the number of calls made so far (including the current one),
    and returns the index label of the highest-scoring row.
    """

    def __init__(self, c):
        """Store the exploration multiplier ``c`` and reset the call counter."""
        self.t = 0
        self.c = c

    def __call__(self, history: pd.DataFrame):
        """Return the index label of the row with the largest UCB score.

        ``history`` must provide ``'impressions'`` and ``'clicks'`` columns.
        """
        self.t += 1
        # Add 1 so that the division below never hits zero impressions.
        pulls = history['impressions'] + 1
        mean_reward = history['clicks'] / pulls
        bonus = np.sqrt(2 * np.log(self.t) / pulls)
        best_pos = np.argmax(mean_reward + self.c * bonus)
        return history.index[best_pos]

In [9]:
# Helper for sweeping the UCB exploration multiplier.
def tuning(list_of_c, n=200000, seed=18475):
    """Run one simulation per candidate multiplier and tabulate the results.

    Parameters
    ----------
    list_of_c : iterable of float
        Candidate values of the UCB multiplier ``c``; a fresh ``UCB`` policy is
        built for each one.
    n : int, default 200000
        Forwarded to ``simulation`` as ``n`` (presumably the number of
        impressions to simulate -- confirm in ``sim_lib``).
    seed : int, default 18475
        Applied to NumPy's global RNG before every run and also forwarded to
        ``simulation``, so each candidate starts from the same seed.

    Returns
    -------
    pd.DataFrame
        One row per candidate with columns ``'с'``, ``'regret'``,
        ``'avg_regret'``, ``'banners'`` and ``'time'`` (elapsed seconds),
        rounded to 3 decimals.
    """
    # NOTE: the first column name is the Cyrillic letter 'с', not the Latin 'c';
    # it is kept as-is because other cells look the column up by this name.
    df = pd.DataFrame(columns=['с', 'regret', 'avg_regret', 'banners', 'time'])
    for c in list_of_c:
        np.random.seed(seed=seed)
        # perf_counter() is monotonic, unlike the wall clock time.time().
        start = time.perf_counter()
        output = simulation(UCB(c=c), n=n, seed=seed)
        elapsed = time.perf_counter() - start
        cur_row = [c, output['regret'], output['regret']/output['rounds'], output['total_banners'], elapsed]
        df.loc[len(df)] = cur_row

        print(f"с : {cur_row[0]}, regret : {cur_row[1]}, avg_regret : {cur_row[2]}, banners : {cur_row[3]}, time : {cur_row[4]} ")
    return df.round(3)

# Попытка №1
Для начала пойдем по широкому промежутку и будем перебирать множитель с большим шагом

In [10]:
# Coarse grid: c = 0.1, 0.3, ..., 1.3 (7 points).
# np.linspace states both endpoints and the point count explicitly; np.arange with a
# fractional step can gain or lose the last element through floating-point rounding.
list_of_c = np.linspace(0.1, 1.3, 7)
df_scores = tuning(list_of_c)
df_scores

1 impressions have been simulated
10001 impressions have been simulated
20001 impressions have been simulated
30001 impressions have been simulated
40001 impressions have been simulated
50001 impressions have been simulated
60001 impressions have been simulated
70001 impressions have been simulated
80001 impressions have been simulated
90001 impressions have been simulated
100001 impressions have been simulated
110001 impressions have been simulated
120001 impressions have been simulated
130001 impressions have been simulated
140001 impressions have been simulated
150001 impressions have been simulated
160001 impressions have been simulated
170001 impressions have been simulated
180001 impressions have been simulated
190001 impressions have been simulated
с : 0.1, regret : 258.3158253051061, avg_regret : 0.0012915791265255305, banners : 184, time : 331.38958644866943 
1 impressions have been simulated
10001 impressions have been simulated
20001 impressions have been simulated
30001 imp

Unnamed: 0,с,regret,avg_regret,banners,time
0,0.1,258.316,0.001,184.0,331.39
1,0.3,1665.416,0.008,184.0,325.74
2,0.5,4013.026,0.02,184.0,345.908
3,0.7,6910.462,0.035,184.0,328.155
4,0.9,9810.278,0.049,184.0,381.529
5,1.1,12431.719,0.062,184.0,328.625
6,1.3,14592.226,0.073,184.0,335.217


Заметим, что лучший результат получаем при `c = 0.1`. При этом уже побеждаем бейзлайн почти в 6 раз.

# Попытка №2
Попытаемся теперь потюнить баланс exploration/exploitation, перебирая параметр в окрестности 0.1

In [11]:
# Grid around 0.1: c = 0.05, 0.10, 0.15, 0.20 (4 points).
# The previous np.arange(0.05, 0.2, 0.05) only produced 0.2 because of floating-point
# rounding (the stop value is nominally excluded); linspace makes the endpoint explicit.
list_of_c = np.linspace(0.05, 0.2, 4)
df_scores = tuning(list_of_c)
df_scores

1 impressions have been simulated
10001 impressions have been simulated
20001 impressions have been simulated
30001 impressions have been simulated
40001 impressions have been simulated
50001 impressions have been simulated
60001 impressions have been simulated
70001 impressions have been simulated
80001 impressions have been simulated
90001 impressions have been simulated
100001 impressions have been simulated
110001 impressions have been simulated
120001 impressions have been simulated
130001 impressions have been simulated
140001 impressions have been simulated
150001 impressions have been simulated
160001 impressions have been simulated
170001 impressions have been simulated
180001 impressions have been simulated
190001 impressions have been simulated
с : 0.05, regret : 2424.8962079893777, avg_regret : 0.012124481039946889, banners : 184, time : 333.98399662971497 
1 impressions have been simulated
10001 impressions have been simulated
20001 impressions have been simulated
30001 im

Unnamed: 0,с,regret,avg_regret,banners,time
0,0.05,2424.896,0.012,184.0,333.984
1,0.1,258.316,0.001,184.0,348.0
2,0.15,468.191,0.002,184.0,333.166
3,0.2,797.938,0.004,184.0,337.8


# Попытка №3
Ещё одна попытка: сузим диапазон значений и уменьшим шаг.

In [12]:
# Fine grid: c = 0.06, 0.07, ..., 0.11 (6 points).
# np.linspace states both endpoints and the point count explicitly; np.arange with a
# fractional step can gain or lose the last element through floating-point rounding.
list_of_c = np.linspace(0.06, 0.11, 6)
df_scores = tuning(list_of_c)
df_scores

1 impressions have been simulated
10001 impressions have been simulated
20001 impressions have been simulated
30001 impressions have been simulated
40001 impressions have been simulated
50001 impressions have been simulated
60001 impressions have been simulated
70001 impressions have been simulated
80001 impressions have been simulated
90001 impressions have been simulated
100001 impressions have been simulated
110001 impressions have been simulated
120001 impressions have been simulated
130001 impressions have been simulated
140001 impressions have been simulated
150001 impressions have been simulated
160001 impressions have been simulated
170001 impressions have been simulated
180001 impressions have been simulated
190001 impressions have been simulated
с : 0.06, regret : 224.59694995443272, avg_regret : 0.0011229847497721637, banners : 184, time : 342.09497833251953 
1 impressions have been simulated
10001 impressions have been simulated
20001 impressions have been simulated
30001 i

Unnamed: 0,с,regret,avg_regret,banners,time
0,0.06,224.597,0.001,184.0,342.095
1,0.07,152.618,0.001,184.0,330.646
2,0.08,197.727,0.001,184.0,370.73
3,0.09,228.796,0.001,184.0,338.681
4,0.1,258.316,0.001,184.0,368.861
5,0.11,260.841,0.001,184.0,367.377


# Сравнение с бейзлайном

In [13]:
# Build a two-row comparison table: baseline vs. the best UCB multiplier.
# The best row is the one with the lowest regret in the latest sweep, rather than
# a hard-coded float equality test on the multiplier.
best_row = df_scores.loc[[df_scores['regret'].idxmin()]]
best_c = best_row['с'].iloc[0]  # column name is the Cyrillic 'с' produced by tuning()

# rename() returns a new frame, so df_scores itself is left untouched.
ucb_record = best_row.rename(columns={"с": "model"})
ucb_record['model'] = 'c = ' + str(best_c)

# `bese_regret` (sic) is the baseline regret variable defined in the baseline-metrics cell.
new_record = pd.DataFrame([['baseline', bese_regret, base_avg_regret, base_banners, base_time]], columns=ucb_record.columns)
res_df = pd.concat([new_record, ucb_record], ignore_index=True)
res_df

Unnamed: 0,model,regret,avg_regret,banners,time
0,baseline,1540.760968,0.007704,184.0,344.92229
1,c = 0.07,152.618,0.001,184.0,330.646


Таким образом, получаем оптимальную policy при `c = 0.07` и побеждаем бейзлайн примерно в 10 раз (regret 152.6 против 1540.8).