In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss, roc_auc_score
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from datetime import datetime
from sklearn.preprocessing import OneHotEncoder
import scipy
from sklearn.model_selection import cross_validate
from scipy.special import logit



In [2]:
all_data = pd.read_csv('data.csv')
# data = all_data.sample(500000)
data = all_data

In [3]:
data.drop(columns=["campaign_clicks"], inplace=True)

Отфильтруем данные, banner_id0 должен быть равен banner_id

In [4]:
data = data[data['banner_id0'] == data['banner_id']]

In [5]:
data.head()

Unnamed: 0,date_time,zone_id,banner_id,oaid_hash,os_id,country_id,banner_id0,rate0,g0,coeff_sum0,banner_id1,rate1,g1,coeff_sum1,impressions,clicks
1,2021-09-26 22:54:49.000000,1,1,5186611064559013950,0,1,1,0.002,0.054298,-2.657477,269,0.004,0.031942,-4.44922,1,1
2,2021-09-26 23:57:20.000000,2,2,2215519569292448030,0,0,2,0.014,0.014096,-3.824875,21,0.014,0.014906,-3.939309,1,1
3,2021-09-27 00:04:30.000000,3,3,6262169206735077204,1,1,3,0.012,0.015232,-3.461357,99,0.006,0.050671,-3.418403,1,1
4,2021-09-27 00:06:21.000000,4,4,4778985830203613115,1,0,4,0.019,0.051265,-4.009026,11464230,6.79,0.032005,-2.828797,1,1
5,2021-09-27 00:06:50.000000,5,5,2377014068362699676,2,2,5,0.004,0.337634,-3.222757,37,0.004,0.338195,-3.221755,1,1


Предобработка данных, как в 1 дз

In [6]:
def get_date_features(x):
    x_datetime = datetime.strptime(x.split('.')[0], '%Y-%m-%d %H:%M:%S')
    year = x_datetime.year
    month = x_datetime.month
    day = x_datetime.day
    hour = x_datetime.hour
    
    return [year, month, day, hour]

def split_date_time(data: pd.DataFrame):
    date_values= np.stack(data['date_time'].apply(lambda x: get_date_features(x)).values)
    data['year'] = date_values[:, 0]
    data['month'] = date_values[:, 1]
    data['day'] = date_values[:, 2]
    data['hour'] = date_values[:, 3]
    data = data.drop(columns=['date_time'])
    return data

def feature_engineering(data: pd.DataFrame, min_frequency=100):    
    data = split_date_time(data)
    
    data = data.drop(columns=['impressions', 'year'])    
    data = data.drop_duplicates()
    data = data.dropna()
    

    last_day_indices = np.logical_and((data['month'] == 10).values, (data['day'] == 2).values)
    train_data, test_data = data[np.logical_not(last_day_indices)], data[last_day_indices]
    
    X_train, y_train = train_data.drop(columns=['clicks']), train_data['clicks']
    X_test, y_test = test_data.drop(columns=['clicks']), test_data['clicks']

    # Добавим test сет, для вычисления pi_1 (подставим вместо banner_id значение banner_id1)
    X_test2, y_test2 = pd.DataFrame.copy(X_test), pd.DataFrame.copy(y_test)
    X_test2['banner_id'] = X_test2['banner_id1']
    
    # А ещё отдельно выделим значения для ips: g, coeff_sum
    data_for_ips = X_test[['g0', 'g1', 'coeff_sum0', 'coeff_sum1']]
    
    categorical_cols = ['zone_id', 'banner_id', 'os_id', 'country_id']
    
    enc = OneHotEncoder(handle_unknown='ignore', min_frequency=min_frequency)
    X_train = enc.fit_transform(X_train[categorical_cols])
    X_test = enc.transform(X_test[categorical_cols])
    X_test2 = enc.transform(X_test2[categorical_cols])

    # Я решил оставить только категориальные фичи, с one hot поверх них модель не переобучается
    # и ресемплинг не нужен => не меняем исходного распределения
    
    return (X_train, y_train), (X_test, y_test), (X_test2, y_test2), data_for_ips

def create_model(C=0.01):
    model = LogisticRegression(solver='liblinear', C=C)
    
    return model

def count_metrics(y_test, y_pred):
    return log_loss(y_test, y_pred), roc_auc_score(y_test, y_pred)

def validate_model(X, y, model):
    y_pred_proba = model.predict_proba(X)[:, 1]
    log_loss, roc_auc = count_metrics(y, y_pred_proba)
    print(f"log_loss: {log_loss}, roc_auc: {roc_auc}")
    return log_loss, roc_auc

In [7]:
(X_train, y_train), (X_test, y_test), (X_test2, y_test2), data_for_ips = feature_engineering(data)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['year'] = date_values[:, 0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['month'] = date_values[:, 1]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['day'] = date_values[:, 2]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer]

In [8]:
# C=1 - лучшее значение из 1 дз
model = create_model(C=1)
model.fit(X_train, y_train)

In [9]:
validate_model(X_test, y_test, model)
pass

log_loss: 0.13284874125486476, roc_auc: 0.7914221159996891


$$
P(X > Y) = P(X - Y > 0) = 1 - P(X - Y < 0) = 1 - F_{X-Y}(0);
$$
$$
X,Y \sim \mathcal{N}(\mu,\sigma^2). \Rightarrow X - Y \sim \mathcal{N} (\mu_1 - \mu_2, \sigma_1^2 + \sigma_2^2)
$$

In [14]:
EPS = 1e-6

def policy_prob(mu1, mu2, std1, std2):
    mu = mu1 - mu2
    std = np.sqrt(std1 ** 2 + std2 ** 2) + EPS
    cum_dist = 1 - scipy.stats.norm.cdf(0, mu, std)
    
    return cum_dist

pi_0 = policy_prob(data_for_ips['coeff_sum0'], data_for_ips['coeff_sum1'], data_for_ips['g0'], data_for_ips['g1'])

coeff_sum0_pred = logit(model.predict_proba(X_test)[:, 1])
coeff_sum1_pred = logit(model.predict_proba(X_test2)[:, 1])
pi_1 = policy_prob(coeff_sum0_pred, coeff_sum1_pred, data_for_ips['g0'], data_for_ips['g1'])

In [15]:
def get_cips(y_test, pi_0, pi_1, l=10):
    cips = np.mean(y_test * np.minimum(pi_1 / (pi_0 + EPS), l))
    return cips

get_cips(y_test, pi_0, pi_1, l=10)

0.06404814885499405