In [None]:
# Install the third-party packages used below at exactly pinned versions.
%pip install -q polars==0.18.6 xlearn=="0.40a1"

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.9/81.9 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.2/19.2 MB[0m [31m33.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.9/4.9 MB[0m [31m84.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for xlearn (setup.py) ... [?25l[?25hdone


In [1]:
import pandas as pd
import polars as pl
import numpy as np
import xlearn as xl
from sklearn.metrics import log_loss, roc_auc_score

import datetime
from tqdm import tqdm

import os
import math
import json
import warnings
from typing import Tuple

# NOTE(review): presumably xlearn expects the USER environment variable to exist
# (it can be missing in hosted runtimes) — confirm before removing.
os.environ['USER'] = 'test'
# Suppress every Python warning for the rest of the session.
warnings.filterwarnings('ignore')

# NOTE(review): not referenced by any cell visible here — confirm it is still needed.
RANDOM_STATE = 42

In [None]:
# Read the raw log, order it chronologically and discard the columns
# that are not used anywhere in this notebook.
data = (
    pl.read_csv('../data/data.csv', try_parse_dates=True)
    .sort('date_time')
    .drop(
        'banner_id0', 'banner_id1',
        'rate0', 'rate1',
        'g0', 'g1',
        'coeff_sum0', 'coeff_sum1',
    )
)

In [6]:
# Cyclical (sin/cos) encodings of hour-of-day and day-of-week.
# Native polars expressions are used instead of per-row Python lambdas, so the
# work is vectorised rather than calling back into Python for every row.
data = (
    data
    .with_columns([
        pl.col('date_time').dt.hour().alias('hour'),
        # polars numbers weekdays 1..7 (Monday = 1); shift to 0..6 so the value
        # matches datetime.weekday(). The cast keeps the subtraction signed.
        (pl.col('date_time').dt.weekday().cast(pl.Int32) - 1).alias('weekday'),
    ])
    .with_columns([
        (2 * np.pi * pl.col('hour') / 24).sin().alias('sin_hour'),
        (2 * np.pi * pl.col('hour') / 24).cos().alias('cos_hour'),
        (2 * np.pi * pl.col('weekday') / 7).sin().alias('sin_weekday'),
        (2 * np.pi * pl.col('weekday') / 7).cos().alias('cos_weekday'),
    ])
    # The integer helpers were only needed to derive the encodings.
    .drop('hour', 'weekday')
)

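В качестве валидационной выборки возьмём отложенный день, непосредственно предшествующий тестовому: тест — последний день в данных, валидация — предпоследний, всё остальное — обучение.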

In [7]:
# Midnight of the last day present in the data starts the test period;
# the 24 hours before it form the validation period.
test_date_threshold = (
    data['date_time']
    .max()
    .replace(hour=0, minute=0, second=0, microsecond=0)
)
val_date_threshold = test_date_threshold - datetime.timedelta(days=1)

train_data = data.filter(pl.col('date_time') < val_date_threshold)
val_data = data.filter(
    (pl.col('date_time') >= val_date_threshold)
    & (pl.col('date_time') < test_date_threshold)
)
test_data = data.filter(pl.col('date_time') >= test_date_threshold)
print(f'строчек в тренировочной выборке: {train_data.height}')
print(f'строчек в валидационной выборке: {val_data.height}')
print(f'строчек в тестовой выборке: {test_data.height}')

# sanity check: the three splits must partition the full frame
assert sum(part.height for part in (train_data, val_data, test_data)) == data.height

строчек в тренировочной выборке: 12049046
строчек в валидационной выборке: 1643448
строчек в тестовой выборке: 2128978


In [8]:
target_col = 'clicks'
drop_columns = ['date_time', 'impressions', 'clicks']

def feature_engineering(data: pl.DataFrame) -> Tuple[pd.DataFrame, pd.Series]:
    """Split a polars frame into pandas features and a pandas target.

    No feature transformation happens here on purpose: it is done in the
    next cell. The columns listed in ``drop_columns`` are removed from the
    features and the ``target_col`` column is returned as the target.

    Args:
        data: Frame containing the target column and the columns to drop.

    Returns:
        ``(X, y)`` — the remaining columns as a pandas DataFrame and the
        target column as a pandas Series.
    """
    features = data.drop(drop_columns).to_pandas()
    labels = data[target_col].to_pandas()
    return features, labels

(train_X, train_y), (val_X, val_y), (test_X, test_y) = map(
    feature_engineering, (train_data, val_data, test_data)
)

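Для факторизационной модели (FFM, field-aware factorization machines) будем использовать xlearn. Перед этим с помощью функции `convert_to_ffm` переведём данные в формат libffm: категориальные признаки кодируются целочисленными индексами, числовые записываются как есть.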

In [None]:
def convert_to_ffm(path, df, type, target, numerics, categories, features, encoder):
    """Write ``df`` to ``<path><type>_ffm.txt`` in libffm text format.

    Every output line has the form ``<label> <field>:<feature>:<value> ...``.
    A numerical column uses its field index as the feature index and its own
    value; a categorical column uses an integer code per distinct value and
    the constant value ``1``.

    Adapted from https://github.com/wngaw/blog/blob/master/xlearn_example/src/utils.py

    Args:
        path: Prefix the file name is appended to (e.g. ``'./'``).
        df: Table exposing ``to_dicts()``, which returns one
            ``{column: value}`` dict per row.
        type: Split name used in the file name; converted with ``str``.
        target: Name of the label column; written as ``int``.
        numerics: Names of the numerical columns.
        categories: Names of the categorical columns.
        features: Unused; kept so existing callers keep working.
        encoder: Mutable state shared between calls with the keys
            ``'catdict'`` (column -> 0 numerical / 1 categorical),
            ``'catcodes'`` (column -> {value: code}) and
            ``'currentcode'`` (last code handed out). Updated in place.

    Returns:
        The same ``encoder`` object, extended with any columns and
        categorical values first seen in ``df``.
    """
    print('convert_to_ffm - START')

    # Flag numerical (0) and categorical (1) fields; earlier flags win.
    field_flags = encoder['catdict']
    for column in numerics:
        field_flags.setdefault(column, 0)
    for column in categories:
        field_flags.setdefault(column, 1)

    # The field list no longer changes, so resolve it once instead of per row.
    fields = [
        (index, name, flag == 0)
        for index, (name, flag) in enumerate(field_flags.items())
    ]
    cat_codes = encoder['catcodes']

    rows = df.to_dicts()
    with open(path + str(type) + "_ffm.txt", "w") as text_file:
        for datarow in tqdm(rows, total=len(rows)):
            parts = [str(int(datarow[target]))]
            for index, name, is_numeric in fields:
                value = datarow[name]
                if is_numeric:
                    # Missing numerical values are left out of the row. Nulls
                    # arrive as None, which math.isnan() rejects with a
                    # TypeError, so test for None first.
                    if value is None or math.isnan(value):
                        continue
                    parts.append(f"{index}:{index}:{value!s}")
                else:
                    codes = cat_codes.setdefault(name, {})
                    if value not in codes:
                        # First occurrence of this value: hand out the next code.
                        encoder['currentcode'] += 1
                        codes[value] = encoder['currentcode']
                    parts.append(f"{index}:{int(codes[value])}:1")
            text_file.write(" ".join(parts) + "\n")

    return encoder

In [40]:
numerics = ['campaign_clicks', 'sin_hour', 'cos_hour', 'sin_weekday', 'cos_weekday']
categories = ['zone_id', 'banner_id', 'oaid_hash', 'os_id', 'country_id']

# Encoder state shared by all three conversions. Categorical codes are handed
# out starting after `currentcode`, so starting at len(numerics) keeps them
# clear of the feature indices used by the numerical columns.
encoder = {
    "currentcode": len(numerics),
    "catdict": {},
    "catcodes": {}
}

# One loop instead of three copy-pasted calls. The order matters: codes are
# assigned in first-seen order, so train must be converted before val and test.
for split_name, split_data in (
    ('train', train_data),
    ('val', val_data),
    ('test', test_data),
):
    encoder = convert_to_ffm(
        path='./',
        df=split_data,
        type=split_name,
        target='clicks',
        numerics=numerics,
        categories=categories,
        features=numerics + categories,
        encoder=encoder,
    )

convert_to_ffm - START


100%|██████████| 12049046/12049046 [03:23<00:00, 59327.74it/s]


convert_to_ffm - START


100%|██████████| 1643448/1643448 [00:27<00:00, 59126.69it/s]


convert_to_ffm - START


100%|██████████| 2128978/2128978 [00:36<00:00, 58089.67it/s]


Переберём гиперпараметр `k` — размерность латентных векторов FFM — и сравним качество на валидационной выборке

In [11]:
# Try several latent dimensions: each candidate is fitted on the train file
# and scored on the validation file.
for k in (2, 4, 6, 8):
    params = {
        "task": "binary",
        "lr": 1.0,
        "lambda": 1e-3,
        "k": k,
        "metric": "auc"
    }
    print(params)

    ffm_model = xl.create_ffm()
    ffm_model.setTrain("train_ffm.txt")
    ffm_model.setValidate("val_ffm.txt")
    ffm_model.fit(params, "./model.out")

    # NOTE(review): setSigmoid presumably maps raw scores into (0, 1) so that
    # log_loss receives probabilities — confirm against the xlearn docs.
    ffm_model.setTest("val_ffm.txt")
    ffm_model.setSigmoid()
    ffm_model.predict("./model.out", "./output.txt")

    # The prediction file holds one float per line, in input order.
    with open("./output.txt", 'r') as f:
        y_pred = np.array(list(map(float, f)))

    print(f'ROC AUC = {roc_auc_score(val_y, y_pred)}')
    print(f'Log loss = {log_loss(val_y, y_pred)}')

{'task': 'binary', 'lr': 1.0, 'lambda': 0.001, 'k': 2, 'metric': 'auc'}
ROC AUC = 0.8058123313718395
Log loss = 0.15070572896121728
{'task': 'binary', 'lr': 1.0, 'lambda': 0.001, 'k': 4, 'metric': 'auc'}
ROC AUC = 0.8063131346050376
Log loss = 0.15159130370529916
{'task': 'binary', 'lr': 1.0, 'lambda': 0.001, 'k': 6, 'metric': 'auc'}
ROC AUC = 0.8064604671303455
Log loss = 0.15074274307975366
{'task': 'binary', 'lr': 1.0, 'lambda': 0.001, 'k': 8, 'metric': 'auc'}
ROC AUC = 0.8061713582273058
Log loss = 0.15051741427097662


Разница получилась не очень большая, но k=6 дал лучшие результаты по метрике ROC-AUC

In [14]:
# concatenate all data that precedes the test split (train + validation) into one file
!cat train_ffm.txt val_ffm.txt > train_full_ffm.txt

In [18]:
# Refit with the chosen k on train + validation and score on the test file.
# NOTE(review): no validation file is set here, unlike in the search over k;
# confirm the number of training epochs is still what was intended.
params = {
    "task": "binary",
    "lr": 1.0,
    "lambda": 1e-3,
    "k": 6,
    "metric": "auc"
}
print(params)

ffm_model = xl.create_ffm()
ffm_model.setTrain("train_full_ffm.txt")
ffm_model.fit(params, "./model.out")

ffm_model.setTest("test_ffm.txt")
ffm_model.setSigmoid()
ffm_model.predict("./model.out", "./output.txt")

# The prediction file holds one float per line, in input order.
with open("./output.txt", 'r') as f:
    y_pred = np.array(list(map(float, f)))

print(f'ROC AUC = {roc_auc_score(test_y, y_pred)}')
print(f'Log loss = {log_loss(test_y, y_pred)}')

{'task': 'binary', 'lr': 1.0, 'lambda': 0.001, 'k': 6, 'metric': 'auc'}
ROC AUC = 0.7810819121427732
Log loss = 0.13979996266528832


Итого, включая первое задание, у нас следующие результаты

### Baseline
- ROC AUC = 0.5
- Log loss = 0.15486198009919758

### Linear regression
- ROC AUC = 0.7448397266390455
- Log loss = 0.1415572152890492

### FFM
- ROC AUC = 0.7810819121427732
- Log loss = 0.13979996266528832
