In [1]:
import numpy as np
import pandas as pd

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Преобразуем данные

In [3]:
df = pd.read_csv('/content/drive/My Drive/Copy of data.csv')
df.drop(['oaid_hash', 'campaign_clicks','impressions','rate0', 'rate1'], axis=1, inplace=True)
datetime = pd.to_datetime(df['date_time'])
df['date'] = datetime.dt.date
df.drop('date_time', axis=1, inplace=True)
df.head()

Unnamed: 0,zone_id,banner_id,os_id,country_id,banner_id0,g0,coeff_sum0,banner_id1,g1,coeff_sum1,clicks,date
0,0,0,0,0,1240,0.035016,-7.268846,0,0.049516,-5.369901,1,2021-09-27
1,1,1,0,1,1,0.054298,-2.657477,269,0.031942,-4.44922,1,2021-09-26
2,2,2,0,0,2,0.014096,-3.824875,21,0.014906,-3.939309,1,2021-09-26
3,3,3,1,1,3,0.015232,-3.461357,99,0.050671,-3.418403,1,2021-09-27
4,4,4,1,0,4,0.051265,-4.009026,11464230,0.032005,-2.828797,1,2021-09-27


Фильтруем данные по banner_id == banner_id0, по условиям задачи. Разделяем на трейн и тест. В тесте хранятся данные последнего дня. Также подготавливаем тест2, в котором столбцу banner_id присваиваем значения banner_id1. Для ускорения обучения будем использовать не весь тренировочный датасет.

In [4]:
from sklearn.preprocessing import OneHotEncoder

def feature_engineering(df: pd.DataFrame) -> pd.DataFrame:
  df = df[df['banner_id'] == df['banner_id0']]
  df = df.dropna()
  df_train = df[df['date'] < pd.Timestamp('2021-10-02').date()].sample(frac=0.01, random_state=72)

  X_train = df_train.drop(['clicks', 'date'], axis=1)
  y_train = df_train['clicks']

  df_test = df[df['date'] == pd.Timestamp('2021-10-02').date()]

  X_test = df_test.drop(['clicks', 'date'], axis=1)
  y_test = df_test['clicks']

  X_test2 = X_test.copy()
  X_test2['banner_id'] = X_test2['banner_id1']

  encoder = OneHotEncoder(handle_unknown='ignore').fit(X_train)
  X_train = encoder.transform(X_train)
  X_test = encoder.transform(X_test)
  X_test2 = encoder.transform(X_test2)

  return X_train, y_train, X_test, y_test, X_test2, df_test

In [5]:
X_train, y_train, X_test, y_test, X_test2, df_test = feature_engineering(df)

Будем использовать линейную модель

In [6]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, log_loss

model = LogisticRegression(solver='liblinear').fit(X_train, y_train)
y_pred = model.predict_proba(X_test)[:, 1]
roc_auc_metric = roc_auc_score(y_test, y_pred)
log_loss_metric = log_loss(y_test, y_pred)
print(f"roc_auc={roc_auc_metric}, log_loss={log_loss_metric}")


roc_auc=0.7666174687633642, log_loss=0.1402121439623107


$$P(N_0 > N_1) = P(N_0 - N_1 > 0) = 1 - P(N_0 - N_1 < 0) = 1 - F_{N_0-N_1} (0)$$    $$N_0-N_1 = N(mean_0 - mean_1, \sqrt{std_0^2 + std_1^2})$$

В функции присутствует небольшая добавка, чтобы избавиться от 0 в стандартном отклонении

In [10]:
from scipy.stats import norm
def get_pi(coeff_sum0, g0, coeff_sum1, g1):
  return 1 - norm.cdf(0, loc=coeff_sum0 - coeff_sum1, scale=np.sqrt(g0 ** 2 + g1 ** 2) + 1e-6)

pi_0 = get_pi(df_test['coeff_sum0'], df_test['g0'], df_test['coeff_sum1'], df_test['g1'])
print(pi_0)

[1.00000000e+00 1.00000000e+00 8.23530133e-11 ... 0.00000000e+00
 4.84896323e-01 4.95142993e-01]


Предскажем новые коэффициенты, применим logit функцию и посчитаем pi_1

In [11]:
from scipy.special import logit

coeff_sum0_new = logit(model.predict_proba(X_test)[:, 1])
coeff_sum1_new = logit(model.predict_proba(X_test2)[:, 1])

pi_1 = get_pi(coeff_sum0_new, df_test['g0'],coeff_sum1_new, df_test['g1'])
print(pi_1)

[1.00000000e+00 9.99993657e-01 8.90410772e-01 ... 9.99999987e-01
 1.55431223e-15 9.04947042e-01]


In [12]:
cips = np.mean(y_test * np.minimum(pi_1 / np.maximum(pi_0, 1e-10), 10))
print(cips)

0.0728390348928939
