In [1]:

import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import log_loss, roc_auc_score

In [2]:
data = pd.read_csv('data.csv')
data = data[data['date_time'] > '2021-09-02']
data = data.sort_values(by=['date_time']) 
data.tail()

Unnamed: 0,date_time,zone_id,banner_id,oaid_hash,campaign_clicks,os_id,country_id,banner_id0,rate0,g0,coeff_sum0,banner_id1,rate1,g1,coeff_sum1,impressions,clicks
1745969,2021-10-02 23:59:59.000000,0,1240,1602730444213689664,0,6,0,1240,0.067,0.023994,-6.596547,11815404,1.261,0.009313,-4.334192,1,0
13959634,2021-10-02 23:59:59.000000,76,223,7727064149323604529,2,3,0,223,0.001,0.092767,-3.738197,153,0.001,0.100974,-3.812352,1,0
13319080,2021-10-02 23:59:59.000000,20,1240,6641189974907647716,0,6,0,1240,0.067,0.011625,-5.709127,1235,0.067,0.011629,-5.707838,1,0
3336944,2021-10-02 23:59:59.000000,24,556,5831343304744661795,0,0,12,556,0.001,0.237553,-3.124159,385,0.001,0.252622,-3.259452,1,0
3226555,2021-10-02 23:59:59.000000,74,418,8774465000271096189,0,0,5,899,0.004,0.109321,-3.809907,418,0.01,0.039603,-4.673005,1,0


In [3]:
test_day = '2021-10-02'

In [4]:
#feature engeneering
data = data.drop(["impressions", "campaign_clicks", "rate0", "rate1"], axis=1)
data['hour'] = pd.to_datetime(data['date_time']).dt.hour # выборка всего на неделю, имеет смысл смотреть только на час

# разибиваем на test и train
train  = data[(data['date_time']<test_day)]
test = data[(data['date_time']>=test_day)]

test = test.drop(["date_time"], axis=1)
train = train.drop(["date_time"], axis=1)

train_target = train["clicks"]
train_x = train.drop(["g0", "g1", "coeff_sum0", "coeff_sum1", "clicks", "banner_id0", "banner_id1"], axis=1)

test_target = test["clicks"]
test_x = test.drop(["g0", "g1", "coeff_sum0", "coeff_sum1", "clicks", "banner_id0", "banner_id1"], axis=1)

oneHotEncoder = OneHotEncoder(handle_unknown='ignore', drop='first')
train_x = oneHotEncoder.fit_transform(train_x)
test_x = oneHotEncoder.transform(test_x)




Обучим обычную LogisticRegression на liblinear, l2 и c=0.01

In [5]:
model = LogisticRegression(solver="liblinear", penalty="l2", C=0.01)
model.fit(train_x, train_target)

In [6]:
test_predicted = model.predict_proba(test_x)
print(f'roc-auc: {roc_auc_score(test_target, test_predicted[:, 1])}')
print(f'log-loss: {log_loss(test_target, test_predicted)}')

roc-auc: 0.7792413811786512
log-loss: 0.13341114807408233


### Считаем clipped_ips

Сначала поймем, какова вероятность, что одна с.в. $\xi_1$ больше другой $\xi_2$. Это в точности означает, что $P(\xi_1-\xi_2 > 0)$

Т.к. это величины, которые распределены нормально (пусть $\xi_1$ ~ $N(\mu_1, \sigma_1^2)$, $\xi_2$ ~ $N(\mu_2, \sigma_2^2)$).
То велечина $\xi_1-\xi_2$ распределена как $N(\mu_1-\mu_2, \sigma_1^2+\sigma_2^2)$
А зная функцию распределения $F$, вероятность того, что величина положительна, вычисляется как $1-F(0)$

In [7]:
from scipy.stats import norm
from scipy.special import logit

In [8]:
def get_pi(m1, s1, m2, s2):
    return 1 - norm.cdf(0, loc=m1-m2, scale=np.sqrt(s1**2 + s2**2)+1e-10) # объяснение формулы выше

# считаем pi_0
pi_0 = get_pi(test['coeff_sum0'], test['g0'], test['coeff_sum1'], test['g1'])
pi_0

array([1.        , 0.99995022, 0.99998809, ..., 0.46875955, 0.65178789,
       1.        ])

Теперь считаем pi_1

In [9]:
coeff_sum0_new = logit(test_predicted[:, 1]) # используем старый результат, banner_id = banner_id0 итак

In [10]:
test_banner_1 = test.copy()
test_banner_1["banner_id"] = test_banner_1["banner_id1"] # чтобы посчитать pi_1 :  banner_id = banner_id1
test_banner_1 = test_banner_1.drop(["g0", "g1", "coeff_sum0", "coeff_sum1", "clicks", "banner_id0", "banner_id1"], axis=1)
test_banner_1 = oneHotEncoder.transform(test_banner_1)
test_banner_1_predicted = model.predict_proba(test_banner_1)



In [11]:
coeff_sum1_new = logit(test_banner_1_predicted[:, 1])

In [12]:
pi_1 = get_pi(coeff_sum0_new, test['g0'], coeff_sum1_new, test['g1'])
pi_1

array([1.        , 1.        , 1.        , ..., 0.50810802, 0.61279352,
       0.5       ])

In [13]:
clipped_ips = np.mean(test_target * np.minimum(pi_1/(pi_0 + 1e-10), 10))

In [14]:
clipped_ips

0.07126441664041822