In [1]:
import numpy as np
import pandas as pd
import datetime as dt
from sklearn.linear_model import LogisticRegression
from scipy.stats import norm
from scipy.special import logit
from sklearn.metrics import roc_auc_score, log_loss
from sklearn.preprocessing import OneHotEncoder

In [2]:
data = pd.read_csv('data.csv', parse_dates=['date_time'], date_parser=pd.to_datetime)
data.drop(['oaid_hash', 'rate0', 'rate1', 'campaign_clicks', 'impressions'], axis=1, inplace=True)

Сразу выбросим ненужные фичи

In [3]:
def feature_engineering(data: pd.DataFrame):
    test = data[data['date_time'].dt.date == dt.date(2021, 10, 2)]
    train = data[data['date_time'].dt.date < dt.date(2021, 10, 2)]
    test = test[test['banner_id0'] == test['banner_id']]
    test_0 = test.copy()
    y_train = train['clicks']
    y_test = test['clicks']
    test.drop(['date_time', 'clicks'], axis=1, inplace=True)
    train.drop(['date_time', 'clicks'], axis=1, inplace=True)
    test_b1 = test.copy()
    test_b1['banner_id'] = test_b1['banner_id1']
    
    OHE = OneHotEncoder(handle_unknown='ignore')
    train = OHE.fit_transform(train)
    test = OHE.transform(test)
    test_b1 = OHE.transform(test_b1)
    
    return train, test, test_b1, test_0, y_train, y_test

In [4]:
train, test, test_b1, test_0, y_train, y_test = feature_engineering(data)

In [5]:
model = LogisticRegression(C = 1, solver='liblinear')
model.fit(train, y_train)
y_pred = model.predict_proba(test)[:, 1]
y_pred_b1 = model.predict_proba(test_b1)[:, 1]
print('roc_auc =', roc_auc_score(y_test, y_pred), 'log_loss =', log_loss(y_test, y_pred))

roc_auc = 0.7703158091435933 log_loss = 0.13954169461322535


Теперь рассмотрим вероятность того, что одна случайная величина $X \sim N(\mu_X, \sigma_X^2)$ будет больше другой $Y \sim N(\mu_Y, \sigma_Y^2)$:

$P(X > Y) = P(X - Y > 0) = 1 - F_{X - Y}(0)$, причем

$Z = X - Y \sim N(\mu_X - \mu_Y, \sigma_X^2+\sigma_Y^2)$

Дальше у нас производятся операции деления, поэтому на всякий случай везде добавим небольшое слогаемое, чтобы избежать деления на 0

In [6]:
def pi(coeff_0, g_0, coeff_1, g_1):
    res = 1 - norm.cdf(0, loc = coeff_0 - coeff_1, scale = np.sqrt(g_0**2 + g_0**2) + 1e-9)
    return res

In [7]:
pi_0 = pi(test_0['coeff_sum0'], test_0['g0'], test_0['coeff_sum1'], test_0['g1'])

In [8]:
coeff_sum0_new = logit(y_pred)
coeff_sum1_new = logit(y_pred_b1)

pi_1 = pi(coeff_sum0_new, test_0['g0'], coeff_sum1_new, test_0['g1'])

In [9]:
clips = np.mean(y_test * np.minimum(pi_1 / (pi_0 + 1e-9), 10))
clips

0.07114695880379313