In [2]:
import numpy as np
import pandas as pd
import time
from functools import partial

from scipy.stats import randint, uniform
from sim_lib import simulation

# Disable pandas' chained-assignment (SettingWithCopy) warning for the whole notebook.
pd.options.mode.chained_assignment = None

import warnings
# Hide every FutureWarning; note that this also hides library deprecation notices.
warnings.simplefilter(action='ignore', category=FutureWarning)

# seed for homework
# The same value seeds NumPy's global RNG here and is passed to simulation() in the cells below.
seed = 18475
np.random.seed(seed=seed)

Сначала посмотрим на бейзлайн

In [3]:
def eps_greedy(history: pd.DataFrame, eps: float):
    """Epsilon-greedy banner selection.

    Parameters
    ----------
    history : pd.DataFrame
        One row per banner; the 'clicks' and 'impressions' columns are read.
    eps : float
        Probability of picking a banner uniformly at random instead of the
        current best one.

    Returns
    -------
    The index label (taken from ``history.index``) of the chosen banner.
    """
    explore = uniform.rvs() < eps

    if not explore:
        # Exploit: CTR estimate with +10 added to the denominator, which
        # pulls the estimate towards zero for banners with few impressions.
        smoothed_ctr = history['clicks'] / (history['impressions'] + 10)
        best_pos = np.argmax(smoothed_ctr)
        return history.index[best_pos]

    # Explore: uniformly random row position in [0, number of banners).
    banner_count = history.shape[0]
    random_pos = randint.rvs(0, banner_count)
    return history.index[random_pos]

# Baseline policy: eps_greedy with the exploration probability fixed at 0.06.
policy = partial(eps_greedy, eps=0.06)

# Measure the wall-clock duration of the baseline simulation run.
start = time.time()
output = simulation(policy, n=200000, seed=seed)
end = time.time()
# Last expression of the cell, so the notebook displays the elapsed seconds.
end - start

1 impressions have been simulated
10001 impressions have been simulated
20001 impressions have been simulated
30001 impressions have been simulated
40001 impressions have been simulated
50001 impressions have been simulated
60001 impressions have been simulated
70001 impressions have been simulated
80001 impressions have been simulated
90001 impressions have been simulated
100001 impressions have been simulated
110001 impressions have been simulated
120001 impressions have been simulated
130001 impressions have been simulated
140001 impressions have been simulated
150001 impressions have been simulated
160001 impressions have been simulated
170001 impressions have been simulated
180001 impressions have been simulated
190001 impressions have been simulated


384.76067328453064

Выведем regret для бейзлайна, с которым будем сравнивать свою policy

In [4]:
# Report the baseline run stored in `output`: total regret, regret per round,
# number of banners and the history table.
print('Baseline results')
print('----------------\n')
print('Regret:', output['regret'])
print('Mean regret:', output['regret'] / output['rounds'])
print('Total banners:', output['total_banners'])
print('\nHistory:\n', output['history'])

Baseline results
----------------

Regret: 1540.7609683932544
Mean regret: 0.007703804841966272
Total banners: 184

History:
      impressions  clicks      lifetime         p
153      18970.0  4199.0  18003.025431  0.220134
162        228.0    26.0   1537.166719  0.113780
172        163.0    32.0  19648.592394  0.219968
173        170.0    18.0  12771.474990  0.122694
180         26.0     3.0   4655.819793  0.020061
182          6.0     0.0    889.624649  0.004621
183          1.0     0.0  15187.163761  0.073886


In [5]:
# Policy based on the Upper Confidence Bound (UCB1) algorithm
class UCB1:
    """Stateful UCB1-style banner selection policy.

    An instance is callable: each call receives the current ``history`` table
    and returns the index label of the banner to show. The instance counts
    its own calls and uses that counter as the algorithm step ``t``.

    Parameters
    ----------
    C : float
        Multiplier applied to the exploration term.
    """

    def __init__(self, C : float):
        self.C = C  # weight of the exploration term
        self.t = 0  # algorithm step: number of calls made so far

    def __call__(self, history):
        """Pick the banner with the highest upper confidence bound.

        Parameters
        ----------
        history : DataFrame-like
            One row per banner; the 'clicks' and 'impressions' columns are read.

        Returns
        -------
        The index label (taken from ``history.index``) of the chosen banner.
        """
        self.t += 1

        # The +1 in the denominator avoids division by zero for banners
        # that have not been shown yet.
        shows = history['impressions'] + 1
        mean_reward = history['clicks'] / shows
        bonus = np.sqrt(2 * np.log(self.t) / shows)

        # Row position of the banner with the largest UCB score.
        ucb_score = mean_reward + self.C * bonus
        best_pos = np.argmax(ucb_score)

        return history.index[best_pos]

Будем перебирать параметр C - множитель для компонента exploration. Начнем с 0.001 и переберем несколько степеней десятки. 

In [6]:
# Датасет для сохранения результатов
columns = ['C', 'Regret', 'Mean regret', 'Total banners']  # column layout of the results table
param_results = pd.DataFrame(columns=columns)  # results of the coarse sweep over C

In [7]:
# Coarse sweep over the exploration multiplier C: one full simulation per value.
# Rows are collected in a plain list and the table is built once after the loop.
# DataFrame.append was removed in pandas 2.0, and rebuilding the table from
# scratch also makes this cell safe to re-run without duplicating rows.
sweep_rows = []
for c_param in [0.001, 0.01, 0.1, 1, 10]:
    print(f"Parameter C = {c_param}")
    np.random.seed(seed)  # reset the global RNG before each run
    start = time.time()
    output = simulation(UCB1(c_param), n=200000, seed=seed)
    end = time.time()
    mean_regret = output['regret'] / output['rounds']
    print('Time:', end - start)
    print('Regret:', output['regret'])
    print('Mean regret:', mean_regret)
    print('Total banners:', output['total_banners'])
    print('---------------------------------------------\n')
    sweep_rows.append({
        'C': c_param,
        'Regret': output['regret'],
        'Mean regret': mean_regret,
        'Total banners': output['total_banners'],
    })

# Column dtypes are inferred from the collected values.
param_results = pd.DataFrame(sweep_rows, columns=columns)

Parameter C = 0.001
1 impressions have been simulated
10001 impressions have been simulated
20001 impressions have been simulated
30001 impressions have been simulated
40001 impressions have been simulated
50001 impressions have been simulated
60001 impressions have been simulated
70001 impressions have been simulated
80001 impressions have been simulated
90001 impressions have been simulated
100001 impressions have been simulated
110001 impressions have been simulated
120001 impressions have been simulated
130001 impressions have been simulated
140001 impressions have been simulated
150001 impressions have been simulated
160001 impressions have been simulated
170001 impressions have been simulated
180001 impressions have been simulated
190001 impressions have been simulated
Time: 447.24516797065735
Regret: 7570.250349143192
Mean regret: 0.03785125174571596
Total banners: 184
---------------------------------------------

Parameter C = 0.01
1 impressions have been simulated
10001 impre

Посмотрим на результаты:

In [8]:
param_results

Unnamed: 0,C,Regret,Mean regret,Total banners
0,0.001,7570.250349,0.037851,184.0
1,0.01,974.101387,0.004871,184.0
2,0.1,258.315825,0.001292,184.0
3,1.0,11185.916299,0.05593,184.0
4,10.0,28091.53449,0.140458,184.0


Лучший результат достигается при c = 0.1, тогда regret = 258. Бейзлайн побит, у него regret = 1540.

Теперь попробуем более точно подобрать коэффициент, рассмотрев значения вокруг 0.1. Так как regret у 0.01 меньше, чем у 1, возьмем основную часть значений в интервале между 0.01 и 0.1.

In [9]:
# Датасет для сохранения результатов
columns = ['C', 'Regret', 'Mean regret', 'Total banners']  # column layout of the results table
param_results_2 = pd.DataFrame(columns=columns)  # results of the second, finer sweep over C

In [10]:
# Second sweep: values of C around 0.1, one full simulation per value.
# Rows are collected in a plain list and the table is built once after the loop.
# DataFrame.append was removed in pandas 2.0, and rebuilding the table from
# scratch also makes this cell safe to re-run without duplicating rows.
sweep_rows = []
for c_param in [0.05, 0.06, 0.07, 0.08, 0.09, 0.11, 0.12, 0.13]:
    print(f"Parameter C = {c_param}")
    np.random.seed(seed)  # reset the global RNG before each run
    start = time.time()
    output = simulation(UCB1(c_param), n=200000, seed=seed)
    end = time.time()
    mean_regret = output['regret'] / output['rounds']
    print('Time:', end - start)
    print('Regret:', output['regret'])
    print('Mean regret:', mean_regret)
    print('Total banners:', output['total_banners'])
    print('---------------------------------------------\n')
    sweep_rows.append({
        'C': c_param,
        'Regret': output['regret'],
        'Mean regret': mean_regret,
        'Total banners': output['total_banners'],
    })

# Column dtypes are inferred from the collected values.
param_results_2 = pd.DataFrame(sweep_rows, columns=columns)

Parameter C = 0.05
1 impressions have been simulated
10001 impressions have been simulated
20001 impressions have been simulated
30001 impressions have been simulated
40001 impressions have been simulated
50001 impressions have been simulated
60001 impressions have been simulated
70001 impressions have been simulated
80001 impressions have been simulated
90001 impressions have been simulated
100001 impressions have been simulated
110001 impressions have been simulated
120001 impressions have been simulated
130001 impressions have been simulated
140001 impressions have been simulated
150001 impressions have been simulated
160001 impressions have been simulated
170001 impressions have been simulated
180001 impressions have been simulated
190001 impressions have been simulated
Time: 443.3818037509918
Regret: 2424.8962079893777
Mean regret: 0.012124481039946889
Total banners: 184
---------------------------------------------

Parameter C = 0.06
1 impressions have been simulated
10001 impre

In [11]:
param_results_2

Unnamed: 0,C,Regret,Mean regret,Total banners
0,0.05,2424.896208,0.012124,184.0
1,0.06,224.59695,0.001123,184.0
2,0.07,152.618217,0.000763,184.0
3,0.08,197.726862,0.000989,184.0
4,0.09,228.795585,0.001144,184.0
5,0.11,260.841092,0.001304,184.0
6,0.12,324.950685,0.001625,184.0
7,0.13,361.481241,0.001807,184.0


Самый низкий regret (≈153) был получен для C = 0.07.

Попробуем уточнить параметр в последний раз.

In [12]:
# Third sweep: values of C around 0.07, one full simulation per value.
# Rows are collected in a plain list and the table is built once after the loop.
# DataFrame.append was removed in pandas 2.0, and rebuilding the table from
# scratch also makes this cell safe to re-run without duplicating rows.
columns = ['C', 'Regret', 'Mean regret', 'Total banners']

sweep_rows = []
for c_param in [0.068, 0.069, 0.071, 0.072, 0.073, 0.074, 0.075]:
    print(f"Parameter C = {c_param}")
    np.random.seed(seed)  # reset the global RNG before each run
    start = time.time()
    output = simulation(UCB1(c_param), n=200000, seed=seed)
    end = time.time()
    mean_regret = output['regret'] / output['rounds']
    print('Time:', end - start)
    print('Regret:', output['regret'])
    print('Mean regret:', mean_regret)
    print('Total banners:', output['total_banners'])
    print('---------------------------------------------\n')
    sweep_rows.append({
        'C': c_param,
        'Regret': output['regret'],
        'Mean regret': mean_regret,
        'Total banners': output['total_banners'],
    })

# Column dtypes are inferred from the collected values.
param_results_3 = pd.DataFrame(sweep_rows, columns=columns)

Parameter C = 0.068
1 impressions have been simulated
10001 impressions have been simulated
20001 impressions have been simulated
30001 impressions have been simulated
40001 impressions have been simulated
50001 impressions have been simulated
60001 impressions have been simulated
70001 impressions have been simulated
80001 impressions have been simulated
90001 impressions have been simulated
100001 impressions have been simulated
110001 impressions have been simulated
120001 impressions have been simulated
130001 impressions have been simulated
140001 impressions have been simulated
150001 impressions have been simulated
160001 impressions have been simulated
170001 impressions have been simulated
180001 impressions have been simulated
190001 impressions have been simulated
Time: 442.6946156024933
Regret: 197.9538836174858
Mean regret: 0.000989769418087429
Total banners: 184
---------------------------------------------

Parameter C = 0.069
1 impressions have been simulated
10001 impr

In [13]:
param_results_3

Unnamed: 0,C,Regret,Mean regret,Total banners
0,0.068,197.953884,0.00099,184.0
1,0.069,131.705566,0.000659,184.0
2,0.071,127.169173,0.000636,184.0
3,0.072,196.655482,0.000983,184.0
4,0.073,140.367856,0.000702,184.0
5,0.074,169.743479,0.000849,184.0
6,0.075,169.291944,0.000846,184.0


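Итог: лучший regret = 127 (бейзлайн = 1540) достигается при C = 0.071. Замечание: для соседних значений C regret меняется немонотонно (например, 197 при C = 0.072 и 140 при C = 0.073), а все запуски сделаны с одним seed, поэтому различия в третьем знаке C, вероятно, лежат в пределах шума; надёжный вывод — оптимум находится около C ≈ 0.07.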