# Логистическая регрессия (из HW1)

In [18]:
import pandas as pd

In [19]:
data = pd.read_csv('/kaggle/input/ad-clicks/data.csv')
data.head()

Unnamed: 0,date_time,zone_id,banner_id,oaid_hash,campaign_clicks,os_id,country_id,banner_id0,rate0,g0,coeff_sum0,banner_id1,rate1,g1,coeff_sum1,impressions,clicks
0,2021-09-27 00:01:30.000000,0,0,5664530014561852622,0,0,0,1240,0.067,0.035016,-7.268846,0,0.01,0.049516,-5.369901,1,1
1,2021-09-26 22:54:49.000000,1,1,5186611064559013950,0,0,1,1,0.002,0.054298,-2.657477,269,0.004,0.031942,-4.44922,1,1
2,2021-09-26 23:57:20.000000,2,2,2215519569292448030,3,0,0,2,0.014,0.014096,-3.824875,21,0.014,0.014906,-3.939309,1,1
3,2021-09-27 00:04:30.000000,3,3,6262169206735077204,0,1,1,3,0.012,0.015232,-3.461357,99,0.006,0.050671,-3.418403,1,1
4,2021-09-27 00:06:21.000000,4,4,4778985830203613115,0,1,0,4,0.019,0.051265,-4.009026,11464230,6.79,0.032005,-2.828797,1,1


## Preprocessing

In [20]:
data = data.drop(columns = ['campaign_clicks', 'oaid_hash'])

In [21]:
data['os_id'] = data['os_id'].replace({9: 2, 10: 2, 7: 0, 8: 0})

In [22]:
# отфильтруем наблюдения, в которых banner_id не совпадает с banner_id0
data = data[data['banner_id'] == data['banner_id0']]

In [23]:
from sklearn.preprocessing import OneHotEncoder

  # удаляем impressions
data = data.drop('impressions', axis = 1)

# создаем фичи от времени
data.date_time = pd.to_datetime(data.date_time)
data['day'] = data['date_time'].dt.weekday
data['hour'] = data['date_time'].dt.hour

# добавляем интерсекции между часом и днем недели 
data['hour_day'] = data['day'].astype(str) + ' ' + data['hour'].astype(str)
data['hour_day'] = pd.factorize(data['hour_day'])[0]

# выделяем редкие zone id в категорию -1
zone_id_counts = data['zone_id'].value_counts()
idxs = zone_id_counts[zone_id_counts < 10].index

data.loc[data['zone_id'].isin(idxs), 'zone_id'] = -1

# делим выборку на train и test
data = data.sort_values(by='date_time', ignore_index=True)
cut_off_test = data[data['date_time'] < pd.to_datetime('2021-10-02')].index[-1]

y = data['clicks']
data_train, data_test_extra = data.iloc[:cut_off_test, :], data.iloc[cut_off_test:, :]
data_train = data_train.drop(columns = ['clicks', 'date_time', 'banner_id0', 'banner_id1', 'rate0', 'rate1', 'g0', 'g1', 'coeff_sum0', 'coeff_sum1'])
data_test = data_test_extra.drop(columns = ['clicks', 'date_time', 'banner_id0', 'banner_id1', 'rate0', 'rate1', 'g0', 'g1', 'coeff_sum0', 'coeff_sum1'])
y_train_all, y_test = y.iloc[:cut_off_test], y.iloc[cut_off_test:]

# делаем one hot encodings
enc = OneHotEncoder(drop='first', handle_unknown='ignore')
enc.fit(data_train)
transformed_train_all = enc.transform(data_train)
transformed_test = enc.transform(data_test)



Нам также пригодится тестовый датасет с banner_id = banner_id1, подготовим его и преобразуем

In [24]:
data_test_banner1 = data_test.copy()
data_test_banner1['banner_id'] = data_test_extra['banner_id1']
transformed_test_banner1 = enc.transform(data_test_banner1)



## Обучение

In [25]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, log_loss, f1_score

In [26]:
clf = LogisticRegression(solver='newton-cholesky', max_iter=10000, C=0.001, verbose=1)

In [27]:
clf.fit(transformed_train_all, y_train_all)

Newton iter=1
  Check Convergence
    1. max |gradient| 0.10565115105351917 <= 0.0001
Newton iter=2
  Check Convergence
    1. max |gradient| 0.03185142562600456 <= 0.0001
Newton iter=3
  Check Convergence
    1. max |gradient| 0.008291477186923626 <= 0.0001
Newton iter=4
  Check Convergence
    1. max |gradient| 0.0014869221757687345 <= 0.0001
Newton iter=5
  Check Convergence
    1. max |gradient| 0.00010640890421553799 <= 0.0001
Newton iter=6
  Check Convergence
    1. max |gradient| 9.26127529392145e-07 <= 0.0001
    2. Newton decrement 1.1060369832420306e-06 <= 0.0001
  Solver did converge at loss = 0.10644002828142879.


Посчитаем скор на тестовой выборке

In [28]:
y_pred_probs = clf.predict_proba(transformed_test)
y_pred = clf.predict(transformed_test)

print('log loss:', log_loss(y_test, y_pred_probs))

print('roc auc:', roc_auc_score(y_test, y_pred_probs[:, 1]))

log loss: 0.13841153794429203
roc auc: 0.7812494334307901


# Clipped IPS

Посчитаем вероятность, что одна случайная величина из нормального распределения больше другой.

$$\xi_1\sim \mathcal{N}(\mu_1, \sigma_1^2), \xi_2\sim \mathcal{N}(\mu_2, \sigma_2^2)$$
$$P(\xi_1 > xi_2) = P(\xi_1 - xi_2 > 0) = 1 - P(\xi_1 - \xi_2 < 0) = 1 - F_{\xi_1-\xi_2}(0),$$ где

$$\xi_1-\xi_2 \sim \mathcal{N}(\mu_1-\mu_2, \sigma_1^2 + \sigma_2^2)$$

$$F_{\xi_1-\xi_2}(0) = \Phi\left(\frac{\mu_2-\mu_1}{\sqrt{\sigma_1^2 + \sigma_2^2}}\right)$$

In [29]:
from scipy.stats import norm
import numpy as np

def policy(mu_1, mu_2, sigma_1, sigma_2):
    prob = 1 - norm.cdf(0, loc=mu_1-mu_2, scale=np.sqrt(sigma_1**2 + sigma_2**2))
    return prob

In [30]:
pi_0 = policy(data_test_extra['coeff_sum0'], data_test_extra['coeff_sum1'], data_test_extra['g0'], data_test_extra['g1'])


  x = np.asarray((x - loc)/scale, dtype=dtyp)


In [31]:
from scipy.special import logit

coeff_sum0_new = logit(y_pred_probs[:, 1])

# посчитаем вероятности для banner_id1

y_pred1_probs = clf.predict_proba(transformed_test_banner1)
coeff_sum1_new = logit(y_pred1_probs[:,1])

In [32]:
pi_1 = policy(coeff_sum0_new, coeff_sum1_new, data_test_extra['g0'], data_test_extra['g1'])

  x = np.asarray((x - loc)/scale, dtype=dtyp)


Наконец, посчитаем cIPS

In [33]:
c_ips = np.mean(data_test_extra['rate0'] * np.minimum(pi_1/(pi_0+1e-8), 10))

In [34]:
print('Clipped IPS:', c_ips)

Clipped IPS: 0.04197149014605779
