# Домашнее задание 4
Рекомендательные системы, Оксана Нырка

## Загрузка данных

In [40]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from scipy import sparse
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import log_loss
from sklearn.metrics import f1_score
from scipy.stats import norm
from scipy.special import logit

In [6]:
data = pd.read_csv('data.csv')

In [7]:
# date_time : object -> datetime
data.date_time = pd.to_datetime(data.date_time)
data.head()

Unnamed: 0,date_time,zone_id,banner_id,oaid_hash,campaign_clicks,os_id,country_id,banner_id0,rate0,g0,coeff_sum0,banner_id1,rate1,g1,coeff_sum1,impressions,clicks
0,2021-09-27 00:01:30,0,0,5664530014561852622,0,0,0,1240,0.067,0.035016,-7.268846,0,0.01,0.049516,-5.369901,1,1
1,2021-09-26 22:54:49,1,1,5186611064559013950,0,0,1,1,0.002,0.054298,-2.657477,269,0.004,0.031942,-4.44922,1,1
2,2021-09-26 23:57:20,2,2,2215519569292448030,3,0,0,2,0.014,0.014096,-3.824875,21,0.014,0.014906,-3.939309,1,1
3,2021-09-27 00:04:30,3,3,6262169206735077204,0,1,1,3,0.012,0.015232,-3.461357,99,0.006,0.050671,-3.418403,1,1
4,2021-09-27 00:06:21,4,4,4778985830203613115,0,1,0,4,0.019,0.051265,-4.009026,11464230,6.79,0.032005,-2.828797,1,1


In [8]:
# Удалим выброс за 1 сентября
data = data[data['date_time'] >= pd.to_datetime('2021-09-26')]
print(data.shape)

(15821471, 17)


In [9]:
# Оставим только наблюдения, где banner_id совпадает с banner_id0
data = data[data['banner_id'] == data['banner_id0']]

## Feature engineering

> Сделаем предобработку как во второй домашке (на основе комментариев по первой домашке) и без использования campaign_clicks

In [10]:
# в impressions все единицы - удалим колонку
data = data.drop('impressions', axis = 1)

# date_time
## достанем из date_time время суток и день недели
data['day'] = data['date_time'].dt.weekday
data['hour'] = data['date_time'].dt.hour


## interactions between hour and day
data['day_hour_interact'] = data['day'].astype(str) + data['hour'].astype(str)
data['day_hour_interact'] = pd.factorize(data['day_hour_interact'])[0]


# zone_id
counts = data['zone_id'].value_counts()
idxs = counts[counts < 20].index
data.loc[data['zone_id'].isin(idxs), 'zone_id'] = -1

# os_id
data.os_id = data.os_id.replace({9: 1, 10: 1, 7: 2, 8: 2})

In [26]:
# сделаем one-hot encoding для всех категориальных фичей
## разделим на обучающую и тестовую выборку: тестовая - последний день датасета
x_train = data[data['date_time'] < pd.to_datetime('2021-10-02')]
y_train = x_train['clicks']
x_test = data[~(data['date_time'] < pd.to_datetime('2021-10-02'))]
y_test = x_test['clicks']

In [27]:
## добавим тестовую для policy_1
x_test_1 = x_test.copy()
x_test_1['banner_id'] = x_test_1['banner_id1']

In [28]:
## удалим все фичи, которые не надо делать dummies
x_train = x_train.drop(columns = ['campaign_clicks', 'date_time', 'clicks', 'banner_id0', 'banner_id1', 'rate0', 'rate1', 'g0', 'g1', 'coeff_sum0', 'coeff_sum1', 'oaid_hash'])
x_test = x_test.drop(columns = ['campaign_clicks', 'date_time', 'clicks', 'banner_id0', 'banner_id1', 'rate0', 'rate1', 'g0', 'g1', 'coeff_sum0', 'coeff_sum1', 'oaid_hash'])
x_test_1 = x_test_1.drop(columns = ['campaign_clicks', 'date_time', 'clicks', 'banner_id0', 'banner_id1', 'rate0', 'rate1', 'g0', 'g1', 'coeff_sum0', 'coeff_sum1', 'oaid_hash'])


In [29]:
# encoder для one-hot encoding
encoder = OneHotEncoder(sparse_output=True, drop = 'first', handle_unknown='ignore')
X_train = encoder.fit_transform(x_train)
X_test = encoder.transform(x_test)
X_test_1 = encoder.transform(x_test_1)



## Линейная модель: линейная регрессия
> по сравнению с HW1 используется другой солвер

In [52]:
lr = LogisticRegression(C = .1, solver = 'liblinear', max_iter = 100000)
lr.fit(X_train, y_train)

In [53]:
# Результат на тесте
print("Model performance:")
y_pred = lr.predict_proba(X_test)
print('roc auc score: ', roc_auc_score(y_test, y_pred[:, 1]))
print('log loss score: ', log_loss(y_test, y_pred))

Model performance:
roc auc score:  0.7915547337615091
log loss score:  0.13428434368246028


In [21]:
# Test baseline
print("Test Baseline:")
pred = np.ones_like(y_test) * y_test.mean()
print(f"auc: {roc_auc_score(y_test, pred)}")
print(f"log_loss: {log_loss(y_test, pred)}")

Test Baseline:
auc: 0.5
log_loss: 0.15581177703186688


## CIPS

Определим вероятность того, что одна нормальная величина больше другой.

Пусть $\xi_0 \sim \mathcal{N}(\mu_0; \sigma_0^2)$, $\xi_1 \sim \mathcal{N}(\mu_1; \sigma_1^2)$. Посчитаем вероятность как:

$$Pr(\xi_0 > \xi_1) = Pr(\xi_0 - \xi_1 > 0) = 1 - Pr(\xi_0 - \xi_1 < 0) = 1 - F_{\xi_0 - \xi_1}(0)$$

где $\xi_0 - \xi_1 \sim \mathcal{N}(\mu_0 - \mu_1; \sigma_0^2 + \sigma_1^2)$.

In [34]:
# Вернем данные, т.к. нужные колонки были удалены перед one-hot encoding
x_test = data[~(data['date_time'] < pd.to_datetime('2021-10-02'))]

In [49]:
# Посчитаем policy_0
mu = x_test['coeff_sum0'] - x_test['coeff_sum1']
sigma = np.sqrt(x_test['g0']**2 + x_test['g1']**2) + 1e-8
pi_0 = 1 - norm.cdf(0, mu, sigma)

In [54]:
# Посчитаем policy_1
coeff_sum_new0 = logit(y_pred[:, 1])
y_pred_new = lr.predict_proba(X_test_1)
coeff_sum_new1 = logit(y_pred_new[:, 1])

In [55]:
mu = coeff_sum_new0 - coeff_sum_new1
sigma = np.sqrt(x_test['g0']**2 + x_test['g1']**2) + 1e-8
pi_1 = 1 - norm.cdf(0, mu, sigma)

$$CIPS = \frac{1}{n}\sum_i r_i\cdot min \left(\frac{\pi_1}{\pi_0}, \lambda\right)$$

In [56]:
# Посчитаем CIPS
lambda_ = 10
cips = np.mean(y_test*np.minimum(pi_1 / (pi_0 + 1e-8), lambda_))
print('CIPS: ', cips)

CIPS:  0.06273601501661438
