## Подключение библиотек

In [1]:
import numpy as np
import pandas as pd
import time
from functools import partial

from scipy.stats import randint, uniform
from sim_lib import simulation

# Turn off pandas' chained-assignment (SettingWithCopy) warning for the whole session.
# NOTE(review): presumably needed to silence warnings raised inside sim_lib — confirm.
pd.options.mode.chained_assignment = None

## E-Greedy baseline

In [2]:
def eps_greedy(history: pd.DataFrame, eps: float):
    """Epsilon-greedy banner selection.

    With probability ``eps`` a banner is drawn uniformly at random from
    ``history``; otherwise the banner with the highest smoothed click-through
    rate, ``clicks / (impressions + 1)``, is returned.

    Args:
        history: Frame with one row per banner. The ``'clicks'`` and
            ``'impressions'`` columns are read; the index supplies the value
            that is returned.
        eps: Exploration probability.

    Returns:
        The index label of the chosen row of ``history``.
    """
    explore = uniform.rvs() < eps
    if not explore:
        # Exploit: CTR is clicks divided by impressions; the +1 keeps the
        # ratio defined for banners that have not been shown yet.
        smoothed_ctr = history['clicks'] / (history['impressions'] + 1)
        return history.index[np.argmax(smoothed_ctr)]

    # Explore: uniformly random row position in [0, number of rows).
    return history.index[randint.rvs(0, history.shape[0])]


# Baseline policy: epsilon-greedy with eps fixed at 0.06.
policy = partial(eps_greedy, eps=0.06)

## Тестирование бейзлайна

In [3]:
# seed for homework
seed = 18475
np.random.seed(seed=seed)

# time.perf_counter() is a monotonic, high-resolution clock, so the measured
# duration cannot be skewed by system clock adjustments the way time.time() can.
start = time.perf_counter()
output = simulation(policy, n=200000, seed=seed)
end = time.perf_counter()
# Elapsed seconds for the baseline run (shown as the cell result).
end - start

  0%|          | 256/200000 [00:00<02:35, 1285.85it/s]

1 impressions have been simulated


  5%|▌         | 10277/200000 [00:07<02:12, 1436.16it/s]

10001 impressions have been simulated


 10%|█         | 20159/200000 [00:14<02:06, 1422.17it/s]

20001 impressions have been simulated


 15%|█▌        | 30151/200000 [00:21<01:58, 1432.28it/s]

30001 impressions have been simulated


 20%|██        | 40262/200000 [00:28<01:52, 1423.39it/s]

40001 impressions have been simulated


 25%|██▌       | 50182/200000 [00:35<01:44, 1433.70it/s]

50001 impressions have been simulated


 30%|███       | 60234/200000 [00:42<01:42, 1367.74it/s]

60001 impressions have been simulated


 35%|███▌      | 70158/200000 [00:49<01:30, 1430.27it/s]

70001 impressions have been simulated


 40%|████      | 80287/200000 [00:56<01:23, 1428.12it/s]

80001 impressions have been simulated


 45%|████▌     | 90271/200000 [01:03<01:15, 1444.46it/s]

90001 impressions have been simulated


 50%|█████     | 100247/200000 [01:10<01:09, 1435.20it/s]

100001 impressions have been simulated


 55%|█████▌    | 110288/200000 [01:17<01:02, 1438.70it/s]

110001 impressions have been simulated


 60%|██████    | 120149/200000 [01:24<00:55, 1428.59it/s]

120001 impressions have been simulated


 65%|██████▌   | 130252/200000 [01:31<00:49, 1423.03it/s]

130001 impressions have been simulated


 70%|███████   | 140222/200000 [01:38<00:41, 1434.67it/s]

140001 impressions have been simulated


 75%|███████▌  | 150210/200000 [01:45<00:34, 1440.58it/s]

150001 impressions have been simulated


 80%|████████  | 160239/200000 [01:52<00:27, 1447.44it/s]

160001 impressions have been simulated


 85%|████████▌ | 170154/200000 [01:59<00:20, 1437.07it/s]

170001 impressions have been simulated


 90%|█████████ | 180287/200000 [02:06<00:13, 1425.99it/s]

180001 impressions have been simulated


 95%|█████████▌| 190261/200000 [02:13<00:06, 1429.72it/s]

190001 impressions have been simulated


100%|██████████| 200000/200000 [02:19<00:00, 1428.82it/s]


139.99325895309448

In [4]:
# Baseline summary: (total regret, regret per round, total number of banners).
(
    output['regret'],
    output['regret'] / output['rounds'],
    output['total_banners'],
)

(1470.2117137660496, 0.007351058568830248, 184)

In [5]:
# Per-banner table returned by the baseline simulation under the 'history' key.
output['history']

Unnamed: 0,impressions,clicks,lifetime,p
153,19624.0,4357.0,18003.025431,0.220134
162,228.0,26.0,1537.166719,0.11378
172,163.0,32.0,19648.592394,0.219968
173,170.0,18.0,12771.47499,0.122694
180,27.0,3.0,4655.819793,0.020061
182,6.0,0.0,889.624649,0.004621
183,1.0,0.0,15187.163761,0.073886


## UCB policy
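Реализуем UCB-бандита: для каждого баннера считаем оценку $u_i = \hat{p}_i + c\,\sqrt{2 \ln N / n_i}$, где $\hat{p}_i$ — сглаженный CTR (клики, делённые на $n_i$), $n_i$ — число показов баннера плюс 1, $N$ — сумма $n_i$ по всем баннерам, $c$ — коэффициент при exploration-части, и показываем баннер с максимальным $u_i$.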

In [6]:
def ucb(history: pd.DataFrame, exploration_coef: float):
    """Choose a banner by an upper-confidence-bound score.

    Each banner is scored as

        clicks / n + exploration_coef * sqrt(2 * ln(N) / n)

    where ``n`` is the banner's impression count plus one and ``N`` is the sum
    of ``n`` over all banners. The banner with the largest score is returned.

    Args:
        history: Frame with one row per banner. The ``'clicks'`` and
            ``'impressions'`` columns are read; the index supplies the value
            that is returned.
        exploration_coef: Weight of the exploration term; ``0`` leaves only
            the smoothed click-through rate.

    Returns:
        The index label of the row with the highest score.
    """
    # Shift every count by one so that a banner with zero impressions does not
    # cause a division by zero below.
    shows = history['impressions'] + 1

    # Exploration bonus: grows with the overall number of impressions and
    # shrinks as a particular banner accumulates its own.
    log_total = np.log(np.sum(shows))
    bonus = np.sqrt(2 * log_total / shows)

    # Exploitation term (clicks over shifted impressions) plus the bonus,
    # scaled by the coefficient that balances the two parts.
    scores = history['clicks'] / shows + exploration_coef * bonus

    # Position of the best score, mapped back to the banner's index label.
    return history.index[np.argmax(scores)]

## Подбор гиперпараметра: коэффициент при exploration части

In [7]:
seed = 18475


def test_ucb_policy(exploration_coef: float, process_printing=False):
    """Run one UCB simulation and print its regret summary.

    Args:
        exploration_coef: Exploration weight bound into the ``ucb`` policy.
        process_printing: Forwarded unchanged to ``simulation``.

    Returns:
        None. The summary line is printed.
    """
    # Re-seed NumPy's global generator before every run so each call starts
    # from the same generator state.
    np.random.seed(seed=seed)
    output = simulation(
        partial(ucb, exploration_coef=exploration_coef),
        n=200000,
        seed=seed,
        process_printing=process_printing,
    )
    report = (
        f"coef: {exploration_coef}, regret: {output['regret']:.4f}, "
        f"average regret: {(output['regret'] / output['rounds']):.4f}, banners: {output['total_banners']}"
    )
    print(report)

Коэффициент при exploration-слагаемом не должен быть большим, так как в первую очередь нам важно учитывать уже накопленную статистику (exploitation-часть), а exploration-часть должна лишь слегка подталкивать к показу ещё недостаточно изученных баннеров.

In [8]:
# Coarse grid of exploration coefficients, from 0 (exploitation term only) up to 1.
exploration_coefs = [0, 0.02, 0.04, 0.06, 0.08, 0.1, 0.15, 0.2, 0.3, 0.5, 0.9, 1]

# One full simulation per coefficient; each call prints its own regret summary.
for coef in exploration_coefs:
    test_ucb_policy(coef)

100%|██████████| 200000/200000 [02:46<00:00, 1198.86it/s]


coef: 0, regret: 8482.3732, average regret: 0.0424, banners: 184


100%|██████████| 200000/200000 [02:53<00:00, 1154.05it/s]


coef: 0.02, regret: 275.2285, average regret: 0.0014, banners: 184


100%|██████████| 200000/200000 [02:57<00:00, 1126.76it/s]


coef: 0.04, regret: 7691.1916, average regret: 0.0385, banners: 184


100%|██████████| 200000/200000 [02:54<00:00, 1145.36it/s]


coef: 0.06, regret: 5841.3414, average regret: 0.0292, banners: 184


100%|██████████| 200000/200000 [02:51<00:00, 1168.92it/s]


coef: 0.08, regret: 164.4385, average regret: 0.0008, banners: 184


100%|██████████| 200000/200000 [02:46<00:00, 1198.38it/s]


coef: 0.1, regret: 207.1156, average regret: 0.0010, banners: 184


100%|██████████| 200000/200000 [02:48<00:00, 1184.20it/s]


coef: 0.15, regret: 419.0433, average regret: 0.0021, banners: 184


100%|██████████| 200000/200000 [02:48<00:00, 1186.59it/s]


coef: 0.2, regret: 750.6938, average regret: 0.0038, banners: 184


100%|██████████| 200000/200000 [02:51<00:00, 1168.99it/s]


coef: 0.3, regret: 1536.9049, average regret: 0.0077, banners: 184


100%|██████████| 200000/200000 [02:48<00:00, 1186.35it/s]


coef: 0.5, regret: 3730.8996, average regret: 0.0187, banners: 184


100%|██████████| 200000/200000 [02:49<00:00, 1178.98it/s]


coef: 0.9, regret: 9110.8636, average regret: 0.0456, banners: 184


100%|██████████| 200000/200000 [02:48<00:00, 1187.66it/s]

coef: 1, regret: 10383.9400, average regret: 0.0519, banners: 184





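Видим, что наименьшее значение регрета достигается при coef = 0.08. Заметим, что зависимость немонотонна (при coef = 0.04 и 0.06 регрет на порядок выше, чем при 0.02 и 0.08), поэтому оценка по одному seed довольно шумная. Чтобы уточнить результат, проверим значения коэффициента в окрестности этой точки: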

In [9]:
# Finer grid around 0.08, the best value found by the coarse search.
exploration_coefs = [0.065, 0.07, 0.075, 0.08, 0.085, 0.09]

# One full simulation per coefficient; each call prints its own regret summary.
for coef in exploration_coefs:
    test_ucb_policy(coef)

100%|██████████| 200000/200000 [02:47<00:00, 1190.97it/s]


coef: 0.065, regret: 910.1009, average regret: 0.0046, banners: 184


100%|██████████| 200000/200000 [02:49<00:00, 1177.47it/s]


coef: 0.07, regret: 148.3657, average regret: 0.0007, banners: 184


100%|██████████| 200000/200000 [02:52<00:00, 1159.91it/s]


coef: 0.075, regret: 162.4883, average regret: 0.0008, banners: 184


100%|██████████| 200000/200000 [02:52<00:00, 1157.36it/s]


coef: 0.08, regret: 164.4385, average regret: 0.0008, banners: 184


100%|██████████| 200000/200000 [02:50<00:00, 1171.69it/s]


coef: 0.085, regret: 176.6447, average regret: 0.0009, banners: 184


100%|██████████| 200000/200000 [02:51<00:00, 1169.01it/s]

coef: 0.09, regret: 188.5252, average regret: 0.0009, banners: 184





Финальное тестирование лучшего коэффициента coef = 0.07.

In [11]:
# Re-run the coefficient with the lowest regret from the fine grid, this time
# passing process_printing=True through to the simulation.
best_exploration_coef = 0.07
test_ucb_policy(best_exploration_coef, process_printing=True)

  0%|          | 223/200000 [00:00<02:57, 1125.70it/s]

1 impressions have been simulated


  5%|▌         | 10206/200000 [00:08<02:36, 1215.21it/s]

10001 impressions have been simulated


 10%|█         | 20197/200000 [00:16<02:27, 1221.05it/s]

20001 impressions have been simulated


 15%|█▌        | 30157/200000 [00:24<02:18, 1223.89it/s]

30001 impressions have been simulated


 20%|██        | 40143/200000 [00:33<02:11, 1214.66it/s]

40001 impressions have been simulated


 25%|██▌       | 50126/200000 [00:41<02:02, 1223.08it/s]

50001 impressions have been simulated


 30%|███       | 60145/200000 [00:49<01:54, 1216.21it/s]

60001 impressions have been simulated


 35%|███▌      | 70134/200000 [00:57<01:47, 1210.74it/s]

70001 impressions have been simulated


 40%|████      | 80218/200000 [01:05<01:36, 1236.22it/s]

80001 impressions have been simulated


 45%|████▌     | 90214/200000 [01:14<01:34, 1162.58it/s]

90001 impressions have been simulated


 50%|█████     | 100229/200000 [01:22<01:21, 1222.65it/s]

100001 impressions have been simulated


 55%|█████▌    | 110156/200000 [01:30<01:12, 1235.26it/s]

110001 impressions have been simulated


 60%|██████    | 120209/200000 [01:38<01:04, 1231.80it/s]

120001 impressions have been simulated


 65%|██████▌   | 130159/200000 [01:46<00:56, 1226.45it/s]

130001 impressions have been simulated


 70%|███████   | 140181/200000 [01:55<00:48, 1232.05it/s]

140001 impressions have been simulated


 75%|███████▌  | 150230/200000 [02:03<00:40, 1229.76it/s]

150001 impressions have been simulated


 80%|████████  | 160156/200000 [02:11<00:32, 1232.05it/s]

160001 impressions have been simulated


 85%|████████▌ | 170203/200000 [02:19<00:24, 1235.48it/s]

170001 impressions have been simulated


 90%|█████████ | 180178/200000 [02:27<00:16, 1230.27it/s]

180001 impressions have been simulated


 95%|█████████▌| 190231/200000 [02:35<00:07, 1230.17it/s]

190001 impressions have been simulated


100%|██████████| 200000/200000 [02:43<00:00, 1220.68it/s]

coef: 0.07, regret: 148.3657, average regret: 0.0007, banners: 184





Регрет значительно меньше, чем у бейзлайна (148.4 против 1470.2): баланс между exploitation и exploration подобран. Успех.