# sim_lib.py

In [1]:
import numpy as np
import pandas as pd

from scipy.stats import beta, uniform, bernoulli, expon
from typing import Callable

ALPHA = 1
BETA = 20
MU = 10 ** 4
MONITORING_FREQ = 10 ** 4
MAX_RANDOM = 1111111


def generate_new_banner(n, a=ALPHA, b=BETA, mu=MU, random_state=None):
    if random_state:
        random_state += 1
    p = beta.rvs(a, b, size=n, random_state=random_state)
    lifetimes = expon.rvs(scale=mu, size=n, random_state=random_state)

    return p, lifetimes


def simulation(policy: Callable, n=10 ** 6, initial_banners=9, seed=None):
    state = pd.DataFrame(np.zeros((initial_banners, 4)), columns=['impressions', 'clicks', 'lifetime', 'p'])
    state['p'], state['lifetime'] = generate_new_banner(initial_banners)
    regret = 0
    max_index = initial_banners
    borning_rate = initial_banners*(1-np.exp(-1/MU))
    random_state = seed

    for i in range(n):
        if uniform.rvs(random_state=random_state) < borning_rate or state.shape[0] < 2:
            p, lifetime = generate_new_banner(1, random_state=random_state)
            new_banner = pd.DataFrame({'impressions': 0, 'clicks': 0, 'lifetime': lifetime, 'p': p}, index=[max_index])
            state = pd.concat([state, new_banner], verify_integrity=True)
            max_index += 1

        index = policy(state[['impressions', 'clicks']].copy())

        assert index in state.index, 'Error, wrong action number'

        p = state.loc[index, 'p']
        reward = bernoulli.rvs(p)
        state.loc[index, 'impressions'] += 1
        state.loc[index, 'clicks'] += reward
        regret = regret + max(state['p']) - p

        state['lifetime'] = state['lifetime'] - 1
        state = state[state['lifetime'] > 0]
        if random_state:
            random_state = 7*random_state % MAX_RANDOM

        if not i % MONITORING_FREQ:
            print('{} impressions have been simulated'.format(i + 1))

    return {'regret': regret, 'rounds': n, 'total_banners': max_index, 'history': state}

# task3_example.ipynb

In [2]:
import numpy as np
import pandas as pd
import time
from functools import partial

from scipy.stats import randint, uniform

pd.options.mode.chained_assignment = None


In [3]:
def eps_greedy(history: pd.DataFrame, eps: float):
    if uniform.rvs() < eps:
        n = history.shape[0]
        return history.index[randint.rvs(0, n)]

    ctr = history['clicks'] / (history['impressions'] + 10)
    n = np.argmax(ctr)
    return history.index[n]

policy = partial(eps_greedy, eps=0.06)

In [4]:
# seed for homework
seed = 18475
np.random.seed(seed=seed)

start = time.time()
output = simulation(policy, n=200000, seed=seed)
end = time.time()
end - start

1 impressions have been simulated
10001 impressions have been simulated
20001 impressions have been simulated
30001 impressions have been simulated
40001 impressions have been simulated
50001 impressions have been simulated
60001 impressions have been simulated
70001 impressions have been simulated
80001 impressions have been simulated
90001 impressions have been simulated
100001 impressions have been simulated
110001 impressions have been simulated
120001 impressions have been simulated
130001 impressions have been simulated
140001 impressions have been simulated
150001 impressions have been simulated
160001 impressions have been simulated
170001 impressions have been simulated
180001 impressions have been simulated
190001 impressions have been simulated


716.3640353679657

In [5]:
# baseline regret
output['regret'], output['regret']/output['rounds'],  output['total_banners']

(1540.7609683932544, 0.007703804841966272, 184)

In [6]:
output['history']

Unnamed: 0,impressions,clicks,lifetime,p
153,18970.0,4199.0,18003.025431,0.220134
162,228.0,26.0,1537.166719,0.11378
172,163.0,32.0,19648.592394,0.219968
173,170.0,18.0,12771.47499,0.122694
180,26.0,3.0,4655.819793,0.020061
182,6.0,0.0,889.624649,0.004621
183,1.0,0.0,15187.163761,0.073886


# HW 3 solution

В качестве policy я взяла upper confidence bound. Чтобы тюнить баланс exploration и exploitation, я добавила коэффициент-множитель С к exploration части.

In [7]:
class UCB:
  def __init__(self, C=1.0):
    self.t = 0
    self.C = C

  def __call__(self, history: pd.DataFrame):
    self.t += 1
    ctr = history['clicks'] / (history['impressions'] + 1) #+1 в знаменателе, чтобы корректно обрабатывать новые баннеры, у которых не было показов
    exploration = np.sqrt(2 * np.log(self.t) / (history['impressions'] + 1))
    n = np.argmax(ctr + self.C * exploration)
    return history.index[n]

In [8]:
start = time.time()
output = simulation(UCB(), n=200000, seed=seed)
end = time.time()
end - start

1 impressions have been simulated
10001 impressions have been simulated
20001 impressions have been simulated
30001 impressions have been simulated
40001 impressions have been simulated
50001 impressions have been simulated
60001 impressions have been simulated
70001 impressions have been simulated
80001 impressions have been simulated
90001 impressions have been simulated
100001 impressions have been simulated
110001 impressions have been simulated
120001 impressions have been simulated
130001 impressions have been simulated
140001 impressions have been simulated
150001 impressions have been simulated
160001 impressions have been simulated
170001 impressions have been simulated
180001 impressions have been simulated
190001 impressions have been simulated


811.179233789444

In [9]:
#UCB policy regret
output['regret'], output['regret']/output['rounds'],  output['total_banners']

(11409.558433606822, 0.057047792168034114, 184)

Видим, что пока регрет сильно выше бейзлайна. Попробуем уменьшить его, выбрав оптимальный баланс exploitation и exploration с помощью подбора гиперпараметра C.

In [10]:
# сначала найдем примерные границы, в которых стоит искать оптимальный C.
C_grid = [0.0001, 0.001, 0.01, 0.1]

seed = 18475
np.random.seed(seed=seed)

for C in C_grid:
  print(f"C = {C}")
  output = simulation(UCB(C), n=200000, seed=seed)
  print(f"regret = {output['regret']}, C = {C}\n")

C = 0.0001
1 impressions have been simulated
10001 impressions have been simulated
20001 impressions have been simulated
30001 impressions have been simulated
40001 impressions have been simulated
50001 impressions have been simulated
60001 impressions have been simulated
70001 impressions have been simulated
80001 impressions have been simulated
90001 impressions have been simulated
100001 impressions have been simulated
110001 impressions have been simulated
120001 impressions have been simulated
130001 impressions have been simulated
140001 impressions have been simulated
150001 impressions have been simulated
160001 impressions have been simulated
170001 impressions have been simulated
180001 impressions have been simulated
190001 impressions have been simulated
regret = 7570.250349143192, C = 0.0001

C = 0.001
1 impressions have been simulated
10001 impressions have been simulated
20001 impressions have been simulated
30001 impressions have been simulated
40001 impressions have be

Видим, что самый маленький регрет получился при C = 0.1. Попробуем уменьшить его, подобрав более оптимальный гиперпараметр примерно того же порядка

In [11]:
С_grid = np.linspace(0.05, 0.3, 5)

seed = 18475
np.random.seed(seed=seed)

for C in С_grid:
  print(f"C = {C}")
  output = simulation(UCB(C), n=200000, seed=seed)
  print(f"regret = {output['regret']}, C = {C}\n")

C = 0.05
1 impressions have been simulated
10001 impressions have been simulated
20001 impressions have been simulated
30001 impressions have been simulated
40001 impressions have been simulated
50001 impressions have been simulated
60001 impressions have been simulated
70001 impressions have been simulated
80001 impressions have been simulated
90001 impressions have been simulated
100001 impressions have been simulated
110001 impressions have been simulated
120001 impressions have been simulated
130001 impressions have been simulated
140001 impressions have been simulated
150001 impressions have been simulated
160001 impressions have been simulated
170001 impressions have been simulated
180001 impressions have been simulated
190001 impressions have been simulated
regret = 2424.8962079893777, C = 0.05

C = 0.1125
1 impressions have been simulated
10001 impressions have been simulated
20001 impressions have been simulated
30001 impressions have been simulated
40001 impressions have been

KeyboardInterrupt: ignored

(Я остановила подсчеты, потому что увидела, что регрет растет и смотреть на большие C бессмысленно.

Видим, что совсем далеко от 0.1 все-таки нет смысла отходить, потому что регрет растет. Поищем оптимальный С, близкий к 0.1.

In [12]:
С_grid = np.linspace(0.075, 0.15, 5)

seed = 18475
np.random.seed(seed=seed)

for C in С_grid:
  print(f"C = {C}")
  output = simulation(UCB(C), n=200000, seed=seed)
  print(f"regret = {output['regret']}, C = {C}\n")

C = 0.075
1 impressions have been simulated
10001 impressions have been simulated
20001 impressions have been simulated
30001 impressions have been simulated
40001 impressions have been simulated
50001 impressions have been simulated
60001 impressions have been simulated
70001 impressions have been simulated
80001 impressions have been simulated
90001 impressions have been simulated
100001 impressions have been simulated
110001 impressions have been simulated
120001 impressions have been simulated
130001 impressions have been simulated
140001 impressions have been simulated
150001 impressions have been simulated
160001 impressions have been simulated
170001 impressions have been simulated
180001 impressions have been simulated
190001 impressions have been simulated
regret = 169.29194382429958, C = 0.075

C = 0.09375
1 impressions have been simulated
10001 impressions have been simulated
20001 impressions have been simulated
30001 impressions have been simulated
40001 impressions have b

Вывод: получили минимальный регрет равный 169 при C = 0.075, что значительно ниже регрета бейзлайна