In [10]:
# sim_lib.py

import numpy as np
import pandas as pd

from scipy.stats import beta, uniform, bernoulli, expon
from typing import Callable

ALPHA = 1
BETA = 20
MU = 10 ** 4
MONITORING_FREQ = 10 ** 4
MAX_RANDOM = 1111111


def generate_new_banner(n, a=ALPHA, b=BETA, mu=MU, random_state=None):
    if random_state:
        random_state += 1
    p = beta.rvs(a, b, size=n, random_state=random_state)
    lifetimes = expon.rvs(scale=mu, size=n, random_state=random_state)
    
    return p, lifetimes


def simulation(policy: Callable, n=10 ** 6, initial_banners=9, seed=None):
    state = pd.DataFrame(np.zeros((initial_banners, 4)), columns=['impressions', 'clicks', 'lifetime', 'p'])
    state['p'], state['lifetime'] = generate_new_banner(initial_banners)
    regret = 0
    max_index = initial_banners
    borning_rate = initial_banners*(1-np.exp(-1/MU))
    random_state = seed

    for i in range(n):
        if uniform.rvs(random_state=random_state) < borning_rate or state.shape[0] < 2:
            p, lifetime = generate_new_banner(1, random_state=random_state)
            new_banner = pd.DataFrame({'impressions': 0, 'clicks': 0, 'lifetime': lifetime, 'p': p}, index=[max_index])
            state = pd.concat([state, new_banner], verify_integrity=True)
            max_index += 1

        index = policy(state[['impressions', 'clicks']].copy())

        assert index in state.index, 'Error, wrong action number'

        p = state.loc[index, 'p']
        reward = bernoulli.rvs(p)
        state.loc[index, 'impressions'] += 1
        state.loc[index, 'clicks'] += reward
        regret = regret + max(state['p']) - p

        state['lifetime'] = state['lifetime'] - 1
        state = state[state['lifetime'] > 0]
        if random_state:
            random_state = 7*random_state % MAX_RANDOM

        if not i % MONITORING_FREQ:
            print('{} impressions have been simulated'.format(i + 1))

    return {'regret': regret, 'rounds': n, 'total_banners': max_index, 'history': state}

In [2]:
import time
from functools import partial

import optuna

from scipy.stats import randint

pd.options.mode.chained_assignment = None


## Реализуем UCB

### Baseline: (1540.7609683932544, 0.007703804841966272, 184)

In [5]:
SEED = 18475
np.random.seed(seed=SEED)

In [6]:
def ucb(history: pd.DataFrame, c: float):
    q = history['impressions'] + 1
    ctr = history['clicks'] / q
    return history.index[np.argmax(ctr + c * np.sqrt(2 * np.log(np.sum(q)) / q))]

In [11]:
# Пройдём сначала по сетке
result = dict()
for c in [0.01, 0.05, 0.1, 0.3, 0.5]:
    policy = partial(ucb, c=c)
    regret = simulation(policy, n=200000, seed=SEED)['regret']
    result[c] = regret
    print(f'c = {c}, regret = {regret}')
print(result)

1 impressions have been simulated
10001 impressions have been simulated
20001 impressions have been simulated
30001 impressions have been simulated
40001 impressions have been simulated
50001 impressions have been simulated
60001 impressions have been simulated
70001 impressions have been simulated
80001 impressions have been simulated
90001 impressions have been simulated
100001 impressions have been simulated
110001 impressions have been simulated
120001 impressions have been simulated
130001 impressions have been simulated
140001 impressions have been simulated
150001 impressions have been simulated
160001 impressions have been simulated
170001 impressions have been simulated
180001 impressions have been simulated
190001 impressions have been simulated
c = 0.01, regret = 4136.430122886874
1 impressions have been simulated
10001 impressions have been simulated
20001 impressions have been simulated
30001 impressions have been simulated
40001 impressions have been simulated
50001 impre

In [14]:
# используем байесовский подход для поиска гиперпараметра в окрестности лучшего по сетке
def objective(trial):
    c = trial.suggest_float("c", 0.099, 0.11, log=True)
    policy = partial(ucb, c=c)
    regret = simulation(policy, n=200000, seed=SEED)['regret']
    return regret

In [15]:
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=5, show_progress_bar=1)

[I 2023-12-07 23:33:54,313] A new study created in memory with name: no-name-eac6a666-111f-4f2a-95f8-7c27b0fdf13b


  0%|          | 0/5 [00:00<?, ?it/s]

1 impressions have been simulated
10001 impressions have been simulated
20001 impressions have been simulated
30001 impressions have been simulated
40001 impressions have been simulated
50001 impressions have been simulated
60001 impressions have been simulated
70001 impressions have been simulated
80001 impressions have been simulated
90001 impressions have been simulated
100001 impressions have been simulated
110001 impressions have been simulated
120001 impressions have been simulated
130001 impressions have been simulated
140001 impressions have been simulated
150001 impressions have been simulated
160001 impressions have been simulated
170001 impressions have been simulated
180001 impressions have been simulated
190001 impressions have been simulated
[I 2023-12-07 23:43:25,196] Trial 0 finished with value: 523.9324593237982 and parameters: {'c': 0.10145143486682566}. Best is trial 0 with value: 523.9324593237982.
1 impressions have been simulated
10001 impressions have been simula

**Какую-то закономерность увидеть сложно. По-хорошему надо было сделать интервал поиска больше и сделать больше запусков на этом интервале. И первую сетку стоило сделать более частой, возможно. Тем не менее baseline побит и решение после перебора гиперпараметра улучшено.**

Best is trial 2 with value: 197.82996575688583