In [1]:
import category_encoders as ce
from datetime import datetime
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss, roc_auc_score
from sklearn.model_selection import cross_validate


In [2]:
# Columns of the raw CSV that are removed immediately after loading.
unused_columns = [
    'oaid_hash',
    'banner_id0', 'banner_id1',
    'rate0', 'rate1',
    'g0', 'g1',
    'coeff_sum0', 'coeff_sum1',
]
# Load the raw file and drop the unused columns in one chained expression.
data = pd.read_csv('data.csv').drop(columns=unused_columns)
# Bare last expression: the notebook renders the frame below the cell.
data

Unnamed: 0,date_time,zone_id,banner_id,campaign_clicks,os_id,country_id,impressions,clicks
0,2021-09-27 00:01:30.000000,0,0,0,0,0,1,1
1,2021-09-26 22:54:49.000000,1,1,0,0,1,1,1
2,2021-09-26 23:57:20.000000,2,2,3,0,0,1,1
3,2021-09-27 00:04:30.000000,3,3,0,1,1,1,1
4,2021-09-27 00:06:21.000000,4,4,0,1,0,1,1
...,...,...,...,...,...,...,...,...
15821467,2021-10-02 15:51:35.000000,146,530,0,2,9,1,0
15821468,2021-09-27 22:03:14.000000,12,22,0,1,6,1,0
15821469,2021-10-02 17:41:10.000000,12,1236,0,2,0,1,0
15821470,2021-09-29 00:39:32.000000,967,21,0,0,0,1,0


In [3]:
def analysis(data: pd.DataFrame):
    """Show summary statistics and per-column unique-value counts.

    Renders ``data.describe()`` through the notebook's ``display`` and then
    prints one ``unique values <column>: <count>`` line for every column
    except ``date_time``.

    Parameters
    ----------
    data : pd.DataFrame
        Frame to summarize; must contain a ``date_time`` column (it is
        excluded from the unique-value report).
    """
    display(data.describe())
    # date_time is skipped in the unique-value report.
    reported_columns = data.drop(columns=['date_time']).columns
    for column in reported_columns:
        distinct_count = len(data[column].unique())
        print(f'unique values {column}:', distinct_count)

# Summarize the loaded frame: describe() table followed by unique-value counts per column.
analysis(data)

Unnamed: 0,zone_id,banner_id,campaign_clicks,os_id,country_id,impressions,clicks
count,15821470.0,15821470.0,15821470.0,15821470.0,15821470.0,15821472.0,15821470.0
mean,81.52679,381.6483,0.623854,1.840605,4.346986,1.0,0.02668835
std,163.2448,395.9386,9.249152,1.530005,4.317701,0.0,0.161171
min,0.0,0.0,0.0,0.0,0.0,1.0,0.0
25%,14.0,52.0,0.0,1.0,0.0,1.0,0.0
50%,19.0,217.0,0.0,2.0,4.0,1.0,0.0
75%,60.0,611.0,0.0,3.0,7.0,1.0,0.0
max,3443.0,1632.0,829.0,10.0,16.0,1.0,1.0


unique values zone_id: 3444
unique values banner_id: 1633
unique values campaign_clicks: 822
unique values os_id: 11
unique values country_id: 17
unique values impressions: 1
unique values clicks: 2


Столбец impressions можно игнорировать: он принимает единственное значение. Время разобьём на отдельные колонки (год, месяц, день, час, минута, секунда).

In [4]:
def feature_engineering(data: pd.DataFrame) -> pd.DataFrame:
    """Parse ``date_time`` and expand it into separate calendar/time columns.

    The input frame is left untouched; a new frame is returned. The function
    is safe to call again on its own output (an already-parsed ``date_time``
    is reused and a missing ``impressions`` column is tolerated), so the
    notebook cell can be re-run without restarting from the CSV load.

    Parameters
    ----------
    data : pd.DataFrame
        Frame with a ``date_time`` column holding either strings shaped like
        ``'YYYY-MM-DD HH:MM:SS[.ffffff]'`` or already-parsed datetimes.

    Returns
    -------
    pd.DataFrame
        Copy of ``data`` without ``impressions``, with ``date_time`` as
        datetimes and added ``year``, ``month``, ``day``, ``hour``,
        ``minute`` and ``second`` columns.
    """
    stamps = data['date_time']
    if not pd.api.types.is_datetime64_any_dtype(stamps):
        # Keep only the part before the first '.', i.e. discard fractional
        # seconds, then parse the whole column in one vectorized call.
        whole_seconds = stamps.str.split('.', n=1).str[0]
        stamps = pd.to_datetime(whole_seconds, format='%Y-%m-%d %H:%M:%S')

    # drop() returns a new frame, so the assignments below never touch the caller's data.
    result = data.drop(columns=['impressions'], errors='ignore')
    result['date_time'] = stamps
    for part in ('year', 'month', 'day', 'hour', 'minute', 'second'):
        result[part] = getattr(stamps.dt, part)
    return result

# Rebind `data` to the frame returned by feature_engineering (parsed date_time plus derived time columns).
data = feature_engineering(data)

Тестовую выборку отделим по последнему дню: все записи за последний день попадают в тест, остальные — в обучение.

In [5]:
# Midnight at the start of the latest calendar day present in the data.
latest_timestamp = data['date_time'].max()
last_day = latest_timestamp.replace(hour=0, minute=0, second=0, microsecond=0)
print(f'last_day {last_day}')

# Rows strictly before that midnight form the training set.
is_train_row = data['date_time'] < last_day
train_data = data[is_train_row]
print(f'train_data size {len(train_data)}')

# Rows from that midnight onward (the final day) are held out for testing.
is_test_row = data['date_time'] >= last_day
test_data = data[is_test_row]
print(f'test_data size {len(test_data)}')

last_day 2021-10-02 00:00:00
train_data size 13692494
test_data size 2128978


In [6]:
# Label column and the columns that are not model inputs
# (the raw timestamp and the label itself).
target_column = 'clicks'
drop_columns = ['date_time', target_column]

# Features / labels for the training period.
X_train, y_train = train_data.drop(columns=drop_columns), train_data[target_column]
# Features / labels for the held-out last day.
X_test, y_test = test_data.drop(columns=drop_columns), test_data[target_column]

Нужно закодировать категориальные признаки. Для 'os_id' и 'country_id' можем позволить себе one-hot-кодирование, так как у них мало уникальных значений (11 и 17). Для 'banner_id' и 'zone_id' воспользуемся target encoding.

In [None]:
# NOTE: this cell overwrites X_train / X_test with their encoded versions, so
# the cell that builds them must be re-run before this one is run again.

# One-hot encode 'os_id' and 'country_id'. The encoder is fitted on the
# training rows only.
one_hot_encoder = ce.OneHotEncoder(cols=['os_id', 'country_id'])
one_hot_encoder.fit(X_train, y_train)
X_train = one_hot_encoder.transform(X_train, y_train)
# Test rows are transformed WITHOUT labels: category_encoders treats a
# transform call that receives y as a transform of training data, and test
# labels must never reach the feature pipeline.
X_test = one_hot_encoder.transform(X_test)

# Target-encode 'zone_id' and 'banner_id', again fitted on the training rows
# only and applied to the test rows without their labels.
target_encoder = ce.TargetEncoder(cols=['zone_id', 'banner_id'])
target_encoder.fit(X_train, y_train)
X_train = target_encoder.transform(X_train, y_train)
X_test = target_encoder.transform(X_test)

В качестве бейзлайна возьмём среднее значение целевой переменной clicks на обучающей выборке.

In [55]:
# Constant predictor: the mean of the target over all rows dated before last_day.
is_before_last_day = data['date_time'] < last_day
baseline_rate = data[is_before_last_day][target_column].mean()
# Same constant score for every test row.
baseline = np.full(len(y_test), baseline_rate)
print('log loss = ', log_loss(y_test, baseline))
print('Roc Auc = ', roc_auc_score(y_test, baseline))

NameError: name 'target_column' is not defined

In [None]:
def cv(X: pd.DataFrame, y: pd.Series):
    """Choose the regularization parameter C by cross-validated ROC AUC.

    For every C in ``np.linspace(0.1, 1, 10)`` an L2-penalized liblinear
    ``LogisticRegression`` is scored with 3-fold ``cross_validate`` on the
    given data; each C and its mean test ROC AUC are printed.

    Parameters
    ----------
    X : pd.DataFrame
        Feature matrix handed to ``cross_validate``.
    y : pd.Series
        Target handed to ``cross_validate``.

    Returns
    -------
    float
        The C value from the grid whose mean test ROC AUC is highest.
    """
    grid = np.linspace(0.1, 1, 10)
    results = []
    for grid_val in grid:
        cur_model = LogisticRegression(solver='liblinear', C=grid_val, penalty='l2')
        # Score on the data passed in, not on whatever X_train / y_train
        # happen to exist in the notebook namespace.
        fold_scores = cross_validate(cur_model, X, y, scoring=['roc_auc'], cv=3)['test_roc_auc']
        results.append(np.mean(fold_scores))
        print(grid_val, results[-1])
    # Return the winning C itself; the score at argmax is not a valid
    # regularization strength.
    return float(grid[np.argmax(results)])

# Run the grid search on the training data; the value cv() returns is used as
# the C parameter of the final model.
cv_selected_value = cv(X_train, y_train)
result = LogisticRegression(C=cv_selected_value, penalty='l2', solver='liblinear')
# Fit on the full training set; as the last expression, the fitted estimator is displayed.
result.fit(X_train, y_train)

In [None]:
# Hard 0/1 class labels, kept under the original name for any later cell that uses them.
y_predicted = result.predict(X_test)
# log_loss and roc_auc_score are defined on scores/probabilities; feeding them
# hard labels gives a degenerate ROC curve and a clipped, meaningless log loss.
# Column 1 of predict_proba is the probability of the positive class (clicks == 1).
y_predicted_proba = result.predict_proba(X_test)[:, 1]
print('log loss: ', log_loss(y_test, y_predicted_proba))
print('roc auc: ', roc_auc_score(y_test, y_predicted_proba))