In [None]:
%pip install -q category_encoders==2.6.3 polars==0.18.6

In [1]:
import pandas as pd
import polars as pl
import numpy as np

import category_encoders as ce
from sklearn.metrics import log_loss, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV

from typing import Tuple
import datetime

import warnings
warnings.filterwarnings('ignore')

RANDOM_STATE = 42

In [2]:
data = pl.read_csv('../data/data.csv', try_parse_dates=True).sort('date_time')
data = data.drop('oaid_hash', 'banner_id0', 'banner_id1', 'rate0', 'rate1', 'g0', 'g1', 'coeff_sum0', 'coeff_sum1')
data

date_time,zone_id,banner_id,campaign_clicks,os_id,country_id,impressions,clicks
datetime[μs],i64,i64,i64,i64,i64,i64,i64
2021-09-01 00:02:49,30,596,0,0,7,1,0
2021-09-26 00:00:00,3,86,0,4,0,1,0
2021-09-26 00:00:00,1,188,2,2,15,1,0
2021-09-26 00:00:00,17,3,0,1,1,1,0
2021-09-26 00:00:00,86,61,0,0,1,1,0
2021-09-26 00:00:00,19,36,1,1,0,1,0
2021-09-26 00:00:00,41,87,0,0,3,1,0
2021-09-26 00:00:00,19,159,0,0,11,1,0
2021-09-26 00:00:00,41,29,1,3,0,1,0
2021-09-26 00:00:00,19,453,0,0,5,1,0


In [3]:
def analysis(data: pl.DataFrame):
    display(data.describe())
    
    for col in data.drop('date_time').columns:
        print(f'количество уникальных значений {col}: {data[col].n_unique()}')
        
analysis(data)

describe,date_time,zone_id,banner_id,campaign_clicks,os_id,country_id,impressions,clicks
str,str,f64,f64,f64,f64,f64,f64,f64
"""count""","""15821472""",15821472.0,15821472.0,15821472.0,15821472.0,15821472.0,15821472.0,15821472.0
"""null_count""","""0""",0.0,0.0,0.0,0.0,0.0,0.0,0.0
"""mean""",,81.526792,381.648312,0.623854,1.840605,4.346986,1.0,0.026688
"""std""",,163.244823,395.938571,9.249152,1.530005,4.317701,0.0,0.161171
"""min""","""2021-09-01 00:…",0.0,0.0,0.0,0.0,0.0,1.0,0.0
"""max""","""2021-10-02 23:…",3443.0,1632.0,829.0,10.0,16.0,1.0,1.0
"""median""",,19.0,217.0,0.0,2.0,4.0,1.0,0.0
"""25%""",,14.0,52.0,0.0,1.0,0.0,1.0,0.0
"""75%""",,60.0,611.0,0.0,3.0,7.0,1.0,0.0


количество уникальных значений zone_id: 3444
количество уникальных значений banner_id: 1633
количество уникальных значений campaign_clicks: 822
количество уникальных значений os_id: 11
количество уникальных значений country_id: 17
количество уникальных значений impressions: 1
количество уникальных значений clicks: 2


Закодируем время в виде синуса/косинуса от часа дня и дня недели

In [4]:
data = (
    data
    .with_columns([
        pl.col('date_time').apply(lambda x: x.hour).alias('hour'),
        pl.col('date_time').apply(lambda x: x.weekday()).alias('weekday'),
    ])
    .with_columns([
        pl.col('hour').apply(lambda x: np.sin(2 * np.pi * x / 24)).alias('sin_hour'),
        pl.col('hour').apply(lambda x: np.cos(2 * np.pi * x / 24)).alias('cos_hour'),
        pl.col('weekday').apply(lambda x: np.sin(2 * np.pi * x / 7)).alias('sin_weekday'),
        pl.col('weekday').apply(lambda x: np.cos(2 * np.pi * x / 7)).alias('cos_weekday'),
    ])
    .drop('hour', 'weekday')
)

Для тестирования решения будем использовать последний день, а для валидации – предпоследний

In [5]:
test_date_threshold = data['date_time'].max().replace(hour=0, minute=0, second=0, microsecond=0)
print(f'test date threshold: {test_date_threshold}')

test date threshold: 2021-10-02 00:00:00


In [6]:
train_data = data.filter(pl.col('date_time') < test_date_threshold)
test_data = data.filter(pl.col('date_time') >= test_date_threshold)
print(f'строчек в тренировочной выборке: {len(train_data)}')
print(f'строчек в тестовой выборке: {len(test_data)}')

# sanity check
assert len(train_data) + len(test_data) == len(data)

строчек в тренировочной выборке: 13692494
строчек в тестовой выборке: 2128978


Чтобы закодировать категориальные признаки, воспользуемся target encoding для признаков, у которых много уникальных значений и One-hot encoding для признаков, у которых немного уникальных значений, чтобы не раздувать память. Для этого я использовал библиотеку `category-encoders`

In [7]:
target_col = 'clicks'
drop_columns = ['date_time', 'impressions', 'clicks']

loo_encoder = ce.leave_one_out.LeaveOneOutEncoder(cols=['zone_id', 'banner_id'])
loo_encoder.fit(train_data.drop(drop_columns).to_pandas(), train_data[target_col].to_pandas())

ce_one_hot_encoder = ce.OneHotEncoder(cols=['os_id', 'country_id'])
ce_one_hot_encoder.fit(train_data.drop(drop_columns).to_pandas(), train_data[target_col].to_pandas())

OneHotEncoder(cols=['os_id', 'country_id'])

In [8]:
def feature_engineering(data: pl.DataFrame) -> Tuple[pd.DataFrame, pd.Series]:
    X, y = data.drop(drop_columns).to_pandas(), data[target_col].to_pandas()
    X = loo_encoder.transform(X, y)
    X = ce_one_hot_encoder.transform(X, y)
    return X, y

train_X, train_y = feature_engineering(train_data)
test_X, test_y = feature_engineering(test_data)

In [9]:
def create_model(**kwargs):
    return LogisticRegression(**kwargs)

In [10]:
def eval_model(model, X, y):
    y_pred = model.predict_proba(X)[:, 1]
    print(f'ROC AUC = {roc_auc_score(y, y_pred)}')
    print(f'Log loss = {log_loss(y, y_pred)}')

Для подбора оптимальных гиперпараметров воспользуемся `TimeSeriesSplit` валидацией, которая учитывает временную составляющую в данных, тогда для оценки будет использоваться скользящее окно размером 1_000_000 строк, что чуть меньше по размеру тестовой выборки

In [12]:
def cv(X: pd.DataFrame, y: pd.Series):
    tscv = TimeSeriesSplit(n_splits=3, test_size=2_000_000)
    gsearch = GridSearchCV(
        estimator=create_model(random_state=RANDOM_STATE, max_iter=50),
        cv=tscv,
        param_grid={'C': [1e-2, 1e-1, 1, 1e1]},
        scoring=['neg_log_loss', 'roc_auc'],
        verbose=3,
        refit='roc_auc',
    )
    gsearch.fit(train_X, train_y)
    display(pl.DataFrame(gsearch.cv_results_))
    return gsearch.best_estimator_

best_model = cv(train_X, train_y)
eval_model(best_model, test_X, test_y)

Fitting 3 folds for each of 4 candidates, totalling 12 fits
[CV 1/3] END C=0.01; neg_log_loss: (test=-0.113) roc_auc: (test=0.658) total time=  58.3s
[CV 2/3] END C=0.01; neg_log_loss: (test=-0.141) roc_auc: (test=0.699) total time= 1.5min
[CV 3/3] END C=0.01; neg_log_loss: (test=-0.158) roc_auc: (test=0.743) total time= 2.9min
[CV 1/3] END C=0.1; neg_log_loss: (test=-0.112) roc_auc: (test=0.684) total time= 1.3min
[CV 2/3] END C=0.1; neg_log_loss: (test=-0.139) roc_auc: (test=0.721) total time= 1.3min
[CV 3/3] END C=0.1; neg_log_loss: (test=-0.157) roc_auc: (test=0.752) total time= 3.2min
[CV 1/3] END C=1; neg_log_loss: (test=-0.112) roc_auc: (test=0.688) total time= 1.1min
[CV 2/3] END C=1; neg_log_loss: (test=-0.139) roc_auc: (test=0.726) total time= 1.3min
[CV 3/3] END C=1; neg_log_loss: (test=-0.157) roc_auc: (test=0.752) total time= 1.9min
[CV 1/3] END C=10.0; neg_log_loss: (test=-0.112) roc_auc: (test=0.689) total time= 1.2min
[CV 2/3] END C=10.0; neg_log_loss: (test=-0.139) roc

mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,params,split0_test_neg_log_loss,split1_test_neg_log_loss,split2_test_neg_log_loss,mean_test_neg_log_loss,std_test_neg_log_loss,rank_test_neg_log_loss,split0_test_roc_auc,split1_test_roc_auc,split2_test_roc_auc,mean_test_roc_auc,std_test_roc_auc,rank_test_roc_auc
f64,f64,f64,f64,object,struct[1],f64,f64,f64,f64,f64,i32,f64,f64,f64,f64,f64,i32
106.803091,49.899947,2.094338,0.138936,0.01,{0.01},-0.113335,-0.141182,-0.15793,-0.137482,0.018393,4,0.657869,0.699238,0.743278,0.700128,0.034874,4
112.792906,53.439988,2.013463,0.140529,0.1,{0.1},-0.112269,-0.138886,-0.156597,-0.135918,0.018218,3,0.68357,0.721183,0.75242,0.719058,0.028148,3
83.063315,21.536372,2.13251,0.270497,1.0,{1.0},-0.112193,-0.138789,-0.156577,-0.135853,0.018238,2,0.68836,0.726274,0.752497,0.722377,0.026328,1
117.436231,35.260751,2.307868,0.559409,10.0,{10.0},-0.112039,-0.138864,-0.15652,-0.135808,0.018287,1,0.689271,0.723848,0.75354,0.72222,0.026263,2


ROC AUC = 0.7448397266390455
Log loss = 0.1415572152890492


В качестве предложенного бейзлайна используем среднее значение таргета, для такого предсказания ожидаемо ROC-AUC = 0.5, а Log-loss выше, чем у нашего решения

In [13]:
baseline_pred = data.filter(pl.col('date_time') < test_date_threshold)[target_col].mean()
baseline_pred = np.ones_like(test_y) * baseline_pred
print(f'ROC AUC = {roc_auc_score(test_y, baseline_pred)}')
print(f'Log loss = {log_loss(test_y, baseline_pred)}')

ROC AUC = 0.5
Log loss = 0.15486198009919758
