In [14]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from scipy.sparse import hstack
from scipy.special import logit
from scipy.stats import norm
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import log_loss, roc_auc_score
from sklearn.linear_model import LogisticRegression

import warnings
# Suppresses every warning category for the rest of the session, so pandas /
# scikit-learn warnings raised by later cells will not be displayed.
warnings.filterwarnings("ignore")

In [2]:
# Read data.csv into a DataFrame and preview its first rows.
data = pd.read_csv("data.csv")
data.head()

Unnamed: 0,date_time,zone_id,banner_id,oaid_hash,campaign_clicks,os_id,country_id,banner_id0,rate0,g0,coeff_sum0,banner_id1,rate1,g1,coeff_sum1,impressions,clicks
0,2021-09-27 00:01:30.000000,0,0,5664530014561852622,0,0,0,1240,0.067,0.035016,-7.268846,0,0.01,0.049516,-5.369901,1,1
1,2021-09-26 22:54:49.000000,1,1,5186611064559013950,0,0,1,1,0.002,0.054298,-2.657477,269,0.004,0.031942,-4.44922,1,1
2,2021-09-26 23:57:20.000000,2,2,2215519569292448030,3,0,0,2,0.014,0.014096,-3.824875,21,0.014,0.014906,-3.939309,1,1
3,2021-09-27 00:04:30.000000,3,3,6262169206735077204,0,1,1,3,0.012,0.015232,-3.461357,99,0.006,0.050671,-3.418403,1,1
4,2021-09-27 00:06:21.000000,4,4,4778985830203613115,0,1,0,4,0.019,0.051265,-4.009026,11464230,6.79,0.032005,-2.828797,1,1


In [3]:
# Number of missing values per column.
data.isna().sum()

date_time              0
zone_id                0
banner_id              0
oaid_hash              0
campaign_clicks        0
os_id                  0
country_id             0
banner_id0             0
rate0                 69
g0                    69
coeff_sum0            69
banner_id1             0
rate1              19744
g1                 19744
coeff_sum1         19744
impressions            0
clicks                 0
dtype: int64

В новых колонках есть NaN.

In [4]:
# Render floats with six decimal places in DataFrame output.
pd.set_option("display.float_format", "{:.6f}".format)

Посмотрим подробнее на новые колонки.

In [5]:
# Summary statistics for the coeff_sum / g / rate columns.
new_columns = ['coeff_sum0', 'coeff_sum1', 'g0', 'g1', 'rate0', 'rate1']
data[new_columns].describe()

Unnamed: 0,coeff_sum0,coeff_sum1,g0,g1,rate0,rate1
count,15821403.0,15801728.0,15821403.0,15801728.0,15821403.0,15801728.0
mean,-3.97959,-3.968883,0.744076,0.548572,0.196925,1.415899
std,1.143982,1.186403,16.703576,14.191364,2.73344,8.689053
min,-8.58897,-9.562188,-0.017637,-0.069839,0.0,0.0
25%,-4.515871,-4.529519,0.016439,0.016152,0.005,0.004
50%,-3.921164,-3.928674,0.035393,0.035567,0.01,0.014
75%,-3.42128,-3.390867,0.080229,0.075468,0.03,0.05
max,0.314998,0.475618,691.088787,691.088453,100.0,100.0


В основном будем обрабатывать датасет так же, как и в первом дз.\
В новых колонках g0, g1 есть отрицательные числа. Стандартное отклонение не может принимать такие значения, поэтому удалим из датасета подобные наблюдения.\
Согласно описанию, для Домашней работы 4 колонка campaign_clicks не нужна, ее тоже уберем.\
Также в новых колонках есть NaN, такие строки тоже надо будет удалить.

In [6]:
def train_test_split(data: pd.DataFrame):
    """
    Split the data into train and test parts by calendar day.

    The last day present in ``date_time`` becomes the test part, every
    earlier day the train part. Test rows are additionally restricted to
    those where ``banner_id == banner_id0``.

    Parameters
    ----------
    data : pd.DataFrame
        Must contain a datetime-typed ``date_time`` column (``.dt`` is used
        on it) plus ``banner_id``, ``banner_id0``, ``banner_id1``, ``g0``,
        ``g1``, ``coeff_sum0``, ``coeff_sum1`` and ``clicks``.

    Returns
    -------
    X_train, Y_train : train features and the ``clicks`` target.
    X_test : test features.
    X_test_banner1 : copy of ``X_test`` with ``banner_id`` replaced by
        ``banner_id1``.
    Y_test : ``clicks`` for the test rows.
    X_test_ips : copy of the ``g0``, ``g1``, ``coeff_sum0``, ``coeff_sum1``
        columns for the test rows (kept for the CIPS computation).
    """
    ips_cols = ["g0", "g1", "coeff_sum0", "coeff_sum1"]
    non_feature_cols = ips_cols + ["clicks"]

    # Truncate timestamps to midnight once; this stays vectorised, whereas
    # `.dt.date` materialises a Python `date` object per row on every call.
    day = data["date_time"].dt.normalize()
    last_day = day.max()

    keep_cols = [col for col in data.columns if col != "date_time"]

    train = data.loc[day < last_day, keep_cols]
    Y_train = train.loc[:, "clicks"]
    # NOTE(review): banner_id0 / banner_id1 stay in the feature set here and
    # in the test frames below -- confirm this is intended.
    X_train = train.drop(columns=non_feature_cols)
    del train

    test = data.loc[day == last_day, keep_cols]
    test = test[test["banner_id"] == test["banner_id0"]]
    Y_test = test.loc[:, "clicks"]

    # Kept for CIPS.
    X_test_ips = test[ips_cols].copy()

    X_test = test.drop(columns=non_feature_cols)
    del test

    X_test_banner1 = X_test.copy()
    X_test_banner1["banner_id"] = X_test_banner1["banner_id1"]

    return X_train, Y_train, X_test, X_test_banner1, Y_test, X_test_ips

In [7]:
def feature_engineering(data: pd.DataFrame) -> tuple:
    """
    Clean the data, add a weekday feature, split it and one-hot encode it.

    Steps:
      1. drop the ``impressions``, ``campaign_clicks``, ``rate0`` and
         ``rate1`` columns;
      2. drop rows containing NaN;
      3. drop rows where ``g0`` or ``g1`` is negative;
      4. parse ``date_time`` and derive ``weekday`` from it;
      5. split with ``train_test_split``;
      6. one-hot encode every remaining feature column (encoder fitted on
         the train part only).

    The input frame is not modified.

    Returns
    -------
    tuple
        ``(X_train, Y_train, X_test, X_test_banner1, Y_test, X_test_ips)``
        where the three ``X_*`` feature matrices are the encoder's sparse
        output and the remaining items come straight from
        ``train_test_split``.
    """
    cleaned = (
        data
        .drop(columns=["impressions", "campaign_clicks", "rate0", "rate1"])
        .dropna()
    )

    # Drop negative standard deviations.
    valid_std = (cleaned["g0"] >= 0) & (cleaned["g1"] >= 0)
    cleaned = cleaned[valid_std]

    # Parse the timestamp and expose the day of week as its own feature.
    # `.assign` returns a new frame, so nothing is written into a filtered
    # slice (which would raise SettingWithCopyWarning).
    timestamps = pd.to_datetime(cleaned["date_time"], format="%Y-%m-%d %H:%M:%S.%f")
    cleaned = cleaned.assign(date_time=timestamps, weekday=timestamps.dt.weekday)

    # Train / test split.
    X_train, Y_train, X_test, X_test_banner1, Y_test, X_test_ips = train_test_split(cleaned)

    # Encoding.
    # NOTE(review): with drop='first' and handle_unknown='ignore', a category
    # unseen during fit is encoded as all zeros, i.e. the same as the dropped
    # first category -- confirm this is acceptable for X_test_banner1.
    encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=True, drop='first')
    X_train = encoder.fit_transform(X_train)
    X_test = encoder.transform(X_test)
    X_test_banner1 = encoder.transform(X_test_banner1)

    return X_train, Y_train, X_test, X_test_banner1, Y_test, X_test_ips

In [8]:
# Build the encoded train/test matrices, the targets and the columns kept for CIPS.
(
    X_train,
    Y_train,
    X_test,
    X_test_banner1,
    Y_test,
    X_test_ips,
) = feature_engineering(data)

In [9]:
# Drop the reference to the raw DataFrame; none of the cells shown below read it.
del data

Обучаем модель, как в дз1: логистическую регрессию с оптимизатором liblinear, L2-регуляризацией и C = 0.1.

In [10]:
# L2-regularised logistic regression fitted with the liblinear solver.
model = LogisticRegression(
    penalty="l2",
    C=0.1,
    solver="liblinear",
    random_state=42,
)
model.fit(X_train, Y_train)

In [11]:
# Score the test matrix once and reuse the probabilities for both metrics
# (previously predict_proba was evaluated twice on the same input).
test_proba = model.predict_proba(X_test)
test_roc = round(roc_auc_score(Y_test, test_proba[:, 1]), 3)
test_loss = round(log_loss(Y_test, test_proba), 3)
# NOTE: the heading below says "Training", but both metrics are computed on
# the test split (X_test / Y_test).
print('Training results')
print(f'ROC AUC: {test_roc}, Log-loss: {test_loss}')

Training results
ROC AUC: 0.8, Log-loss: 0.133


Рассчитаем вероятность того, что нормальная величина X больше нормальной величины Y:\
$P(X > Y) = P(X - Y > 0) = 1 - P(X - Y \le 0) = 1 - F_{X-Y}(0)$\
Так как X и Y нормально распределены и независимы, величина X-Y тоже имеет нормальное распределение. Её среднее равно разности средних X и Y, а дисперсия (квадрат стандартного отклонения) — сумме дисперсий X и Y: $X - Y \sim N(\mu_X - \mu_Y,\ \sigma_X^2 + \sigma_Y^2)$.

In [12]:
def calculate_prob(coeff_sum0, g0, coeff_sum1, g1):
    """
    Probability that X > Y for X ~ N(coeff_sum0, g0**2), Y ~ N(coeff_sum1, g1**2).

    X - Y is treated as normal with mean ``coeff_sum0 - coeff_sum1`` and
    standard deviation ``sqrt(g0**2 + g1**2)`` (i.e. X and Y are taken to be
    independent), so ``P(X > Y) = P(X - Y > 0)`` is the survival function of
    that distribution evaluated at 0.

    Parameters
    ----------
    coeff_sum0, coeff_sum1 : scalar or array-like
        Means of X and Y.
    g0, g1 : scalar or array-like
        Standard deviations of X and Y.

    Returns
    -------
    float or np.ndarray
        Element-wise probability, broadcast over the inputs.
    """
    # 1e-6 keeps the scale strictly positive when both deviations are zero.
    scale = np.sqrt(g0**2 + g1**2) + 1e-6
    # sf(0) == 1 - cdf(0), but computed without the cancellation that turns
    # small upper-tail probabilities into exactly 0.0.
    pi = norm.sf(0, loc=coeff_sum0 - coeff_sum1, scale=scale)
    return pi

In [13]:
# Upper bound applied to each importance weight, and a small constant that
# keeps the weight's denominator non-zero.
CIPS_MAX_WEIGHT = 10
CIPS_EPS = 1e-9

# P(score0 > score1) computed from the coeff_sum / g columns stored in the data.
pi_0 = calculate_prob(X_test_ips["coeff_sum0"], X_test_ips["g0"],
                      X_test_ips["coeff_sum1"], X_test_ips["g1"])

# New coefficients: the model's log-odds for each banner. decision_function
# returns them directly, whereas logit(predict_proba(...)) goes through a
# sigmoid first and yields +/-inf (then NaN after subtraction) once a
# probability saturates to 0 or 1.
coeff_sum0_pred = model.decision_function(X_test)
coeff_sum1_pred = model.decision_function(X_test_banner1)

# Same probability, with the model's log-odds as means and the stored g0 / g1.
pi_1 = calculate_prob(coeff_sum0_pred, X_test_ips["g0"],
                      coeff_sum1_pred, X_test_ips["g1"])

# Clipped IPS: clicks weighted by min(pi_1 / pi_0, CIPS_MAX_WEIGHT), averaged.
cips = np.mean(Y_test * np.minimum(pi_1 / (pi_0 + CIPS_EPS), CIPS_MAX_WEIGHT))
print("CIPS:", cips)

CIPS: 0.06314049347152971
