In [1]:
# sim_lib.py

import numpy as np
import pandas as pd

from scipy.stats import beta, uniform, bernoulli, expon
from typing import Callable

ALPHA = 1
BETA = 20
MU = 10 ** 4
MONITORING_FREQ = 10 ** 4
MAX_RANDOM = 1111111


def generate_new_banner(n, a=ALPHA, b=BETA, mu=MU, random_state=None):
    if random_state:
        random_state += 1
    p = beta.rvs(a, b, size=n, random_state=random_state)
    lifetimes = expon.rvs(scale=mu, size=n, random_state=random_state)
    
    return p, lifetimes


def simulation(policy: Callable, n=10 ** 6, initial_banners=9, seed=None):
    state = pd.DataFrame(np.zeros((initial_banners, 4)), columns=['impressions', 'clicks', 'lifetime', 'p'])
    state['p'], state['lifetime'] = generate_new_banner(initial_banners)
    regret = 0
    max_index = initial_banners
    borning_rate = initial_banners*(1-np.exp(-1/MU))
    random_state = seed

    for i in range(n):
        if uniform.rvs(random_state=random_state) < borning_rate or state.shape[0] < 2:
            p, lifetime = generate_new_banner(1, random_state=random_state)
            new_banner = pd.DataFrame({'impressions': 0, 'clicks': 0, 'lifetime': lifetime, 'p': p}, index=[max_index])
            state = pd.concat([state, new_banner], verify_integrity=True)
            max_index += 1

        index = policy(state[['impressions', 'clicks']].copy())

        assert index in state.index, 'Error, wrong action number'

        p = state.loc[index, 'p']
        reward = bernoulli.rvs(p)
        state.loc[index, 'impressions'] += 1
        state.loc[index, 'clicks'] += reward
        regret = regret + max(state['p']) - p

        state['lifetime'] = state['lifetime'] - 1
        state = state[state['lifetime'] > 0]
        if random_state:
            random_state = 7*random_state % MAX_RANDOM

        if not i % MONITORING_FREQ:
            print('{} impressions have been simulated'.format(i + 1))

    return {'regret': regret, 'rounds': n, 'total_banners': max_index, 'history': state}

In [2]:
import time
from functools import partial

import optuna

from scipy.stats import randint

pd.options.mode.chained_assignment = None


In [None]:
def eps_greedy(history: pd.DataFrame, eps: float):
    if uniform.rvs() < eps:
        n = history.shape[0]
        return history.index[randint.rvs(0, n)]

    ctr = history['clicks'] / (history['impressions'] + 10)
    n = np.argmax(ctr)
    return history.index[n]

policy = partial(eps_greedy, eps=0.06)

In [None]:
# seed for homework
seed = 18475
np.random.seed(seed=seed)

start = time.time()
output = simulation(policy, n=200000, seed=seed)
end = time.time()
end - start

In [None]:
# baseline regret
output['regret'], output['regret']/output['rounds'],  output['total_banners']

In [None]:
output['history']

## Реализуем UCB

Baseline: (1540.7609683932544, 0.007703804841966272, 184)
### PS: в кэггле после перезапуска ноутбука у меня пропали output'ы ячеек, пересчитывать заново свой первый подбор гиперпараметров не стал, потому что он долгий

In [8]:
SEED = 18475
np.random.seed(seed=SEED)

In [9]:
def ucb(history: pd.DataFrame, c: float):
    q = history['impressions'] + 1
    ctr = history['clicks'] / q
    return history.index[np.argmax(ctr + c * np.sqrt(2 * np.log(np.sum(q)) / q))]

In [10]:
# будем подбирать параметры при меньшем n, посмотрим на каких значениях целевая метрика меньше
def objective(trial):
    c = trial.suggest_float("c", 0.0001, 3, log=True)
    policy = partial(ucb, c=c)
    regret = simulation(policy, n=100000, seed=SEED)['regret']
    return regret

In [None]:
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=10, show_progress_bar=1)

**Лучший результат оказался в окрестности 0.1. Попробуем найти С на этом отрезке. (На самом деле было бы более удобно пройтись по грубой сетке, а потом уточнят параметр с помощью байесовского подхода (Optuna))**

In [13]:
# будем подбирать параметры при меньшем n
def objective(trial):
    c = trial.suggest_float("c", 0.09, 0.11, log=True)
    policy = partial(ucb, c=c)
    regret = simulation(policy, n=100000, seed=SEED)['regret']
    return regret

In [17]:
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=5, show_progress_bar=1)

[I 2023-12-07 19:22:58,920] A new study created in memory with name: no-name-8ce522ec-4e12-427d-a3c0-9d5d87bd87ac


  0%|          | 0/5 [00:00<?, ?it/s]

1 impressions have been simulated
10001 impressions have been simulated
20001 impressions have been simulated
30001 impressions have been simulated
40001 impressions have been simulated
50001 impressions have been simulated
60001 impressions have been simulated
70001 impressions have been simulated
80001 impressions have been simulated
90001 impressions have been simulated
[I 2023-12-07 19:27:41,075] Trial 0 finished with value: 163.669814237115 and parameters: {'c': 0.10521199703467045}. Best is trial 0 with value: 163.669814237115.
1 impressions have been simulated
10001 impressions have been simulated
20001 impressions have been simulated
30001 impressions have been simulated
40001 impressions have been simulated
50001 impressions have been simulated
60001 impressions have been simulated
70001 impressions have been simulated
80001 impressions have been simulated
90001 impressions have been simulated
[I 2023-12-07 19:32:21,592] Trial 1 finished with value: 129.332791761909 and parame

In [18]:
# Лучший параметр
study.best_params

{'c': 0.09305900502870383}

In [19]:
policy = partial(ucb, c=study.best_params['c'])
regret = simulation(policy, n=200000, seed=SEED)

1 impressions have been simulated
10001 impressions have been simulated
20001 impressions have been simulated
30001 impressions have been simulated
40001 impressions have been simulated
50001 impressions have been simulated
60001 impressions have been simulated
70001 impressions have been simulated
80001 impressions have been simulated
90001 impressions have been simulated
100001 impressions have been simulated
110001 impressions have been simulated
120001 impressions have been simulated
130001 impressions have been simulated
140001 impressions have been simulated
150001 impressions have been simulated
160001 impressions have been simulated
170001 impressions have been simulated
180001 impressions have been simulated
190001 impressions have been simulated
{'regret': 601.4580850536672, 'rounds': 200000, 'total_banners': 184, 'history':      impressions  clicks      lifetime         p
153        148.0    24.0  18003.025431  0.220134
162          6.0     0.0   1537.166719  0.113780
172   

In [20]:
regret['regret']

601.4580850536672

### Baseline побит!