In [48]:
import numpy as np
import pandas as pd
import time
from functools import partial

from scipy.stats import randint, uniform
from sim_lib import simulation

# Disable pandas' chained-assignment check for the whole notebook, so
# SettingWithCopyWarning is neither raised nor printed.
pd.options.mode.chained_assignment = None


In [112]:
def eps_greedy(history: pd.DataFrame, eps: float, prior_impressions: float = 10):
    """Choose a banner with an epsilon-greedy rule over smoothed CTR.

    With probability ``eps`` a row of ``history`` is picked uniformly at
    random (exploration). Otherwise the row with the highest smoothed
    click-through rate ``clicks / (impressions + prior_impressions)`` is
    picked (exploitation); ties resolve to the first such row.

    Args:
        history: One row per candidate banner with ``clicks`` and
            ``impressions`` columns; the index holds the banner ids.
        eps: Exploration probability, compared against a U[0, 1) draw.
        prior_impressions: Pseudo-impressions added to the denominator so
            that rarely shown banners do not get an inflated CTR. Defaults
            to 10, the value that used to be hard-coded.

    Returns:
        The index label of the chosen row.

    Raises:
        ValueError: If ``history`` is empty, or every smoothed CTR is NaN
            on the exploitation branch.
    """
    # Same draws, in the same order, as before, so seeded runs stay reproducible.
    if uniform.rvs() < eps:
        n = history.shape[0]
        return history.index[randint.rvs(0, n)]

    ctr = history['clicks'] / (history['impressions'] + prior_impressions)
    # Take the argmax on a plain ndarray so the result is always a position.
    # np.argmax(Series) is dispatched to Series.argmax, which in old pandas
    # releases returned a label instead and would break the lookup below for
    # a non-default index. NaN entries are skipped, as Series.argmax does.
    best_pos = int(np.nanargmax(np.asarray(ctr, dtype=float)))
    return history.index[best_pos]

policy = partial(eps_greedy, eps=0.06)

In [115]:
# seed for homework
seed = 18475
np.random.seed(seed=seed)

# perf_counter is a monotonic, high-resolution clock intended for measuring
# elapsed time; time.time() can jump if the system clock is adjusted mid-run.
start = time.perf_counter()
output = simulation(policy, n=200000, seed=seed)
end = time.perf_counter()
# Wall-clock duration of the simulation in seconds (displayed as cell output).
end - start

1 impressions have been simulated
10001 impressions have been simulated
20001 impressions have been simulated
30001 impressions have been simulated
40001 impressions have been simulated
50001 impressions have been simulated
60001 impressions have been simulated
70001 impressions have been simulated
80001 impressions have been simulated
90001 impressions have been simulated
100001 impressions have been simulated
110001 impressions have been simulated
120001 impressions have been simulated
130001 impressions have been simulated
140001 impressions have been simulated
150001 impressions have been simulated
160001 impressions have been simulated
170001 impressions have been simulated
180001 impressions have been simulated
190001 impressions have been simulated


390.1041259765625

In [117]:
# baseline regret: total, per round, and the `total_banners` value
(
    output['regret'],
    output['regret'] / output['rounds'],
    output['total_banners'],
)

(1540.7609683932544, 0.007703804841966272, 184)

Напишем Thompson Sampling

In [107]:
class ThompsonSampling:
    """Thompson-sampling banner policy.

    On every call the per-banner Beta parameters are rebuilt from the
    statistics frame (``alpha = clicks + 1``,
    ``beta = impressions - clicks + 1``), one value is sampled per available
    banner, and the banner with the largest sample is returned.

    Banner ids (the frame's index) are used directly as positions in the
    ``alpha``/``beta`` arrays, so they are assumed to be non-negative
    integers -- TODO confirm against ``simulation``.
    """

    def __init__(self, arms):
        """Pre-allocate parameters for ``arms`` banner ids.

        Args:
            arms: Initial number of banner ids to allocate. The arrays grow
                automatically if a larger id shows up later.
        """
        self.arms = arms
        self.alpha = np.ones(arms)
        self.beta = np.ones(arms)

    def _ensure_capacity(self, size):
        """Grow ``alpha``/``beta`` (padding with ones) to hold ``size`` ids."""
        current = self.alpha.size
        if size <= current:
            return
        grown_alpha = np.ones(size)
        grown_beta = np.ones(size)
        grown_alpha[:current] = self.alpha
        grown_beta[:current] = self.beta
        self.alpha = grown_alpha
        self.beta = grown_beta
        self.arms = size

    def __call__(self, df):
        """Pick the banner to show next.

        Args:
            df: Frame indexed by banner id with ``clicks`` and
                ``impressions`` columns for the currently available banners.

        Returns:
            The index label of the selected banner.

        Raises:
            ValueError: If ``df`` has no rows.
        """
        avail_arms = list(df.index)
        if not avail_arms:
            raise ValueError('no banners available to choose from')

        # A banner id >= the pre-allocated size used to raise IndexError in
        # the middle of a long simulation; grow the arrays instead.
        self._ensure_capacity(max(avail_arms) + 1)

        self.alpha[avail_arms] = (df['clicks'] + 1).values
        self.beta[avail_arms] = (df['impressions'] - df['clicks'] + 1).values

        # Draws come from NumPy's global RNG, so np.random.seed controls them.
        theta = np.random.beta(self.alpha[avail_arms], self.beta[avail_arms])
        n = theta.argmax()
        action = avail_arms[n]

        return action
    

In [110]:
# Re-seed NumPy's global RNG, as the baseline cell does: the policy samples
# through np.random.beta, so without this the run depends on whatever state
# earlier cells left in the kernel.
# NOTE(review): redundant if `simulation` seeds the global RNG itself -- confirm.
np.random.seed(seed=seed)
output = simulation(ThompsonSampling(200), n=200000, seed=seed)

1 impressions have been simulated
10001 impressions have been simulated
20001 impressions have been simulated
30001 impressions have been simulated
40001 impressions have been simulated
50001 impressions have been simulated
60001 impressions have been simulated
70001 impressions have been simulated
80001 impressions have been simulated
90001 impressions have been simulated
100001 impressions have been simulated
110001 impressions have been simulated
120001 impressions have been simulated
130001 impressions have been simulated
140001 impressions have been simulated
150001 impressions have been simulated
160001 impressions have been simulated
170001 impressions have been simulated
180001 impressions have been simulated
190001 impressions have been simulated


In [111]:
# Thompson sampling: total regret, regret per round, and the `total_banners` value
(
    output['regret'],
    output['regret'] / output['rounds'],
    output['total_banners'],
)

(1107.6565198411663, 0.005538282599205832, 184)

Результат уже лучше, чем у ε-greedy. Thompson sampling, по идее, сам контролирует баланс exploration/exploitation, но раз нужно затюнить, то навесим коэффициент перед альфой и оверфитнемся на сид.

In [123]:
class ThompsonSampling:
    """Thompson-sampling banner policy with a tunable multiplier on alpha.

    On every call the per-banner Beta parameters are rebuilt from the
    statistics frame (``alpha = alpha_coef * (clicks + 1)``,
    ``beta = impressions - clicks + 1``), one value is sampled per available
    banner, and the banner with the largest sample is returned.

    Banner ids (the frame's index) are used directly as positions in the
    ``alpha``/``beta`` arrays, so they are assumed to be non-negative
    integers -- TODO confirm against ``simulation``.
    """

    def __init__(self, arms, alpha_coef=1.0):
        """Pre-allocate parameters for ``arms`` banner ids.

        Args:
            arms: Initial number of banner ids to allocate. The arrays grow
                automatically if a larger id shows up later.
            alpha_coef: Multiplier applied to ``clicks + 1`` when forming
                alpha. The default of 1.0 leaves alpha unscaled.
        """
        self.arms = arms
        self.alpha = np.ones(arms)
        self.beta = np.ones(arms)
        self.alpha_coef = alpha_coef

    def _ensure_capacity(self, size):
        """Grow ``alpha``/``beta`` (padding with ones) to hold ``size`` ids."""
        current = self.alpha.size
        if size <= current:
            return
        grown_alpha = np.ones(size)
        grown_beta = np.ones(size)
        grown_alpha[:current] = self.alpha
        grown_beta[:current] = self.beta
        self.alpha = grown_alpha
        self.beta = grown_beta
        self.arms = size

    def __call__(self, df):
        """Pick the banner to show next.

        Args:
            df: Frame indexed by banner id with ``clicks`` and
                ``impressions`` columns for the currently available banners.

        Returns:
            The index label of the selected banner.

        Raises:
            ValueError: If ``df`` has no rows.
        """
        avail_arms = list(df.index)
        if not avail_arms:
            raise ValueError('no banners available to choose from')

        # A banner id >= the pre-allocated size used to raise IndexError in
        # the middle of a long simulation; grow the arrays instead.
        self._ensure_capacity(max(avail_arms) + 1)

        self.alpha[avail_arms] = self.alpha_coef * (df['clicks'] + 1).values
        self.beta[avail_arms] = (df['impressions'] - df['clicks'] + 1).values

        # Draws come from NumPy's global RNG, so np.random.seed controls them.
        theta = np.random.beta(self.alpha[avail_arms], self.beta[avail_arms])
        n = theta.argmax()
        action = avail_arms[n]

        return action

In [125]:
# Candidate multipliers for alpha in ThompsonSampling.
coefs = [4, 10, 20]

for coef in coefs:
    # Reset NumPy's global RNG before each run so every coefficient is
    # evaluated on the same random stream and regrets are comparable.
    # NOTE(review): redundant if `simulation` seeds the global RNG itself -- confirm.
    np.random.seed(seed=seed)
    output = simulation(ThompsonSampling(200, coef), n=200000, seed=seed)
    print(f'coef: {coef}, regret: {output["regret"]}')

1 impressions have been simulated
10001 impressions have been simulated
20001 impressions have been simulated
30001 impressions have been simulated
40001 impressions have been simulated
50001 impressions have been simulated
60001 impressions have been simulated
70001 impressions have been simulated
80001 impressions have been simulated
90001 impressions have been simulated
100001 impressions have been simulated
110001 impressions have been simulated
120001 impressions have been simulated
130001 impressions have been simulated
140001 impressions have been simulated
150001 impressions have been simulated
160001 impressions have been simulated
170001 impressions have been simulated
180001 impressions have been simulated
190001 impressions have been simulated
coef: 4, regret: 765.5953339227856
1 impressions have been simulated
10001 impressions have been simulated
20001 impressions have been simulated
30001 impressions have been simulated
40001 impressions have been simulated
50001 impress

Регрет начинает медленно уменьшаться, оптимальный коэффициент перед альфой (alpha_coef) где-то близко.