In [None]:
!pip install xlearn

In [1]:
import pandas as pd
from scipy import sparse
from tqdm import tqdm
import xlearn as xl
import numpy as np
from sklearn.metrics import roc_auc_score, log_loss
from sklearn.preprocessing import OneHotEncoder

In [2]:
from google.colab import drive
drive.mount('/content/drive')
path = "/content/drive/MyDrive/data.csv"

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [7]:
data = pd.read_csv(path)
data.head()

Unnamed: 0,date_time,zone_id,banner_id,oaid_hash,campaign_clicks,os_id,country_id,banner_id0,rate0,g0,coeff_sum0,banner_id1,rate1,g1,coeff_sum1,impressions,clicks
0,2021-09-27 00:01:30.000000,0,0,5664530014561852622,0,0,0,1240,0.067,0.035016,-7.268846,0,0.01,0.049516,-5.369901,1,1
1,2021-09-26 22:54:49.000000,1,1,5186611064559013950,0,0,1,1,0.002,0.054298,-2.657477,269,0.004,0.031942,-4.44922,1,1
2,2021-09-26 23:57:20.000000,2,2,2215519569292448030,3,0,0,2,0.014,0.014096,-3.824875,21,0.014,0.014906,-3.939309,1,1
3,2021-09-27 00:04:30.000000,3,3,6262169206735077204,0,1,1,3,0.012,0.015232,-3.461357,99,0.006,0.050671,-3.418403,1,1
4,2021-09-27 00:06:21.000000,4,4,4778985830203613115,0,1,0,4,0.019,0.051265,-4.009026,11464230,6.79,0.032005,-2.828797,1,1


In [8]:
cols = ['banner_id0', 'banner_id1', 'rate0', 'rate1', 'g0', 'g1', 'coeff_sum0', 'coeff_sum1']
data = data.drop(columns=cols)
data['date_time'] = pd.to_datetime(data['date_time'])

In [9]:
categorical_cols = ["zone_id", "banner_id", "oaid_hash" ,"os_id", "country_id", "hour"]

In [10]:
def feature_engineering(data: pd.DataFrame) -> pd.DataFrame:
    data = data.drop(columns=["impressions"])
    data = data[data['date_time'] > '2021-09-02']
    data['hour'] = data['date_time'].dt.hour
    for col in categorical_cols:
        if col == 'os_id':
            data.loc[data['os_id'] > 7, 'os_id'] = 7
        elif col != "oaid_hash":
            column = data[col].value_counts()
            total = column.sum()
            condition = column < total * 0.0001
            mask_obs = column[condition].index
            mask = data[col].isin(mask_obs)
            data.loc[mask, col] = 0

    for col in categorical_cols:
        data[col], _ = pd.factorize(data[col])
    return data

In [11]:
data = feature_engineering(data)

Разделим данные на test (последний день), val (предпоследний день), train (все остальные дни)

In [12]:
data = data.sort_values("date_time")
max_date = data["date_time"].iloc[-1].date()
data_test = data[(data['date_time'].dt.date == max_date)]
data = data[(data['date_time'].dt.date < max_date)]
max_date

datetime.date(2021, 10, 2)

In [13]:
data = data.sort_values("date_time")
max_date = data["date_time"].iloc[-1].date()
data_val = data[(data['date_time'].dt.date == max_date)]
data = data[(data['date_time'].dt.date < max_date)]
max_date

datetime.date(2021, 10, 1)

In [14]:
y_test = data_test['clicks']
y_val = data_val['clicks']

libffm format:

  label field_1:index_1:value_1 field_2:index_2:value_2 ...

In [11]:
def transform_libffm_format(filename, data, target, dict_field, current_code=0):
    with open(filename, "w") as f:
        for index, row in tqdm(data.iterrows()):
            result_row = str(row[target])
            for i, x in enumerate(categorical_cols):
                if (x not in dict_field):
                    dict_field[x] = {}
                    с += 1
                    dict_field[x][row[x]] = с
                elif (row[x] not in dict_field[x]):
                    с += 1
                    dict_field[x][row[x]] = с
                result_row += f' {i}:{dict_field[x][row[x]]}:1'
            result_row += '\n'
            f.write(result_row)
    return с, dict_field

In [12]:
с, dict_field = 0, {}
с, dict_field = transform_libffm_format("/content/drive/MyDrive/train.txt", data, 'clicks', dict_field, с)

12049045it [24:19, 8254.68it/s]


In [13]:
с, dict_field = transform_libffm_format("/content/drive/MyDrive/test.txt", data_test, 'clicks', dict_field, с)

2128978it [04:22, 8123.44it/s]


In [14]:
с, dict_field = transform_libffm_format("/content/drive/MyDrive/val.txt", data_val, 'clicks', dict_field, с)

1643448it [03:28, 7892.38it/s] 


Подберем латентный размер и регуляризацию по валидации

In [4]:
for k in [4, 8]:
    for l in [0.0001, 0.001, 0.01]:
        print(f'l = {l}, k = {k}')
        # создаем модель
        ffm_model = xl.create_ffm()
        # делаем конфигурацию
        params = {'task':'binary', 'lr': 0.1, 'lambda': l, 'k': 4, 'metric': 'auc'}
        ffm_model.setTrain("/content/drive/MyDrive/train.txt")
        ffm_model.setTest("/content/drive/MyDrive/val.txt")
        # обучение
        ffm_model.fit(params, './model.out')
        ffm_model.setSigmoid()
        # получем предикты вероятностей
        ffm_model.predict('./model.out', './output.txt')
        with open('output.txt', 'r') as f:
            y_pred_proba = np.array(list(map(float, filter(lambda s: len(s) > 0, f.read().split('\n')))))
        # считаем метрики
        roc_auc_metric = roc_auc_score(y_val, y_pred_proba)
        log_loss_metric = log_loss(y_val, y_pred_proba)
        print(f'roc_auc = {roc_auc_metric}, log_loss = {log_loss_metric}')


l = 0.0001, k = 4
roc_auc = 0.7765183462988138, log_loss = 0.1630521614218184
l = 0.001, k = 4
roc_auc = 0.7989549723056272, log_loss = 0.15292019740047677
l = 0.01, k = 4
roc_auc = 0.7702013768335266, log_loss = 0.16459710422468507
l = 0.0001, k = 8
roc_auc = 0.7625661866675255, log_loss = 0.16682106379367712
l = 0.001, k = 8
roc_auc = 0.7988224602040378, log_loss = 0.15308032857051024
l = 0.01, k = 8
roc_auc = 0.7706129984465239, log_loss = 0.16451980888263873


In [5]:
k = 4
lambda_ = 0.001

Возьмем оптимальные значения гиперпараметров, обучим и протестируем итоговую модель


In [6]:
ffm_model = xl.create_ffm()
ffm_model.setTrain("/content/drive/MyDrive/train.txt")
ffm_model.setTest("/content/drive/MyDrive/test.txt")
param = {'task':'binary', 'lr': 0.1, 'lambda': lambda_, 'k': k, 'metric': 'auc'}
ffm_model.fit(param, './model.out')
ffm_model.setSigmoid()
ffm_model.predict('./model.out', './output.txt')
with open('output.txt', 'r') as f:
    y_pred_proba = np.array(list(map(float, filter(lambda s: len(s) > 0, f.read().split('\n')))))
roc_auc_metric = roc_auc_score(y_test, y_pred_proba)
log_loss_metric = log_loss(y_test, y_pred_proba)
print(f'roc_auc = {roc_auc_metric}, log_loss = {log_loss_metric}')

roc_auc = 0.7957858048855749, log_loss = 0.13009003410844072


В прошлой работе
log_loss: 0.13487352    
AUC: 0.7795530