In [1]:
!pip install git+https://github.com/suessmann/pytorch-fm.git

Collecting git+https://github.com/suessmann/pytorch-fm.git
  Cloning https://github.com/suessmann/pytorch-fm.git to /tmp/pip-req-build-2wtxzzsa
  Running command git clone --filter=blob:none --quiet https://github.com/suessmann/pytorch-fm.git /tmp/pip-req-build-2wtxzzsa
  Resolved https://github.com/suessmann/pytorch-fm.git to commit 2ab8cf3538f974a438ceb6008567885b9129879f
  Preparing metadata (setup.py) ... [?25ldone
[?25hBuilding wheels for collected packages: torchfm
  Building wheel for torchfm (setup.py) ... [?25ldone
[?25h  Created wheel for torchfm: filename=torchfm-0.7.0-py3-none-any.whl size=20127 sha256=1297af9a9dc7380c94c3982071b751aab536b0c153933ebf711d3eb58596bdbc
  Stored in directory: /tmp/pip-ephem-wheel-cache-a2xg24gq/wheels/36/0a/03/829f109c444c9766d8b82b54b0ff313c3f0bd3f2da117872b3
Successfully built torchfm
Installing collected packages: torchfm
Successfully installed torchfm-0.7.0


In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import torch
import torch.nn as nn

from torch.utils.data import Dataset, DataLoader
from torchfm.model.ffm import FieldAwareFactorizationMachineModel

from tqdm.notebook import tqdm
from sklearn.metrics import log_loss, roc_auc_score



In [3]:
# Read the raw impressions/clicks log and preview its first rows.
DATA_PATH = '/kaggle/input/hw-recsys/data.csv'
df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0,date_time,zone_id,banner_id,oaid_hash,campaign_clicks,os_id,country_id,banner_id0,rate0,g0,coeff_sum0,banner_id1,rate1,g1,coeff_sum1,impressions,clicks
0,2021-09-27 00:01:30.000000,0,0,5664530014561852622,0,0,0,1240,0.067,0.035016,-7.268846,0,0.01,0.049516,-5.369901,1,1
1,2021-09-26 22:54:49.000000,1,1,5186611064559013950,0,0,1,1,0.002,0.054298,-2.657477,269,0.004,0.031942,-4.44922,1,1
2,2021-09-26 23:57:20.000000,2,2,2215519569292448030,3,0,0,2,0.014,0.014096,-3.824875,21,0.014,0.014906,-3.939309,1,1
3,2021-09-27 00:04:30.000000,3,3,6262169206735077204,0,1,1,3,0.012,0.015232,-3.461357,99,0.006,0.050671,-3.418403,1,1
4,2021-09-27 00:06:21.000000,4,4,4778985830203613115,0,1,0,4,0.019,0.051265,-4.009026,11464230,6.79,0.032005,-2.828797,1,1


In [4]:
# (number of rows, number of columns) of the raw frame
df.shape

(15821472, 17)

In [5]:
# Columns that are not used as model inputs anywhere in this notebook.
unused_columns = [
    "banner_id0", "banner_id1",
    "rate0", "rate1",
    "g0", "g1",
    "coeff_sum0", "coeff_sum1",
    "campaign_clicks",
]
# NOTE: in-place drop — re-running this cell on the already-reduced frame raises KeyError.
df.drop(columns=unused_columns, inplace=True)

Объединим редко встречающиеся значения категориальных признаков в одну категорию, чтобы уменьшить размер данных и ускорить обучение.

In [6]:
# Columns whose infrequent values get collapsed into one bucket by the loop that follows.
categorical_features = [
    "zone_id",
    "os_id",
    "country_id",
]


In [7]:
# For every listed column keep only the values whose frequency reaches the 0.999
# quantile of the per-value counts; all rarer values are replaced by one new id
# (current maximum of the column + 1).
for col in categorical_features:
    value_counts = df[col].value_counts()
    count_threshold = np.quantile(value_counts.values, 0.999)
    # Frequency of the value found in each row, aligned with the rows of df.
    row_counts = df[col].map(value_counts).values
    rare_bucket_id = df[col].max() + 1
    df.loc[row_counts < count_threshold, col] = rare_bucket_id

In [8]:
# Number of distinct zone ids left after rare values were merged.
df['zone_id'].nunique()

5

Перенумеруем значения категориальных признаков последовательными индексами и посчитаем размерности полей (fields).

In [9]:
all_categorical_features = ["zone_id", "banner_id", "os_id", "country_id", "oaid_hash"]

# Replace every value by its rank among the sorted distinct values of the column,
# giving contiguous ids 0..n_unique-1 for each field.
for field in all_categorical_features:
    _, codes = np.unique(df[field].to_numpy(), return_inverse=True)
    df[field] = codes

In [10]:
# Number of distinct indices per field (largest id + 1), listed in the same order
# as the feature columns appear in the data frame.
field_names = [
    "zone_id", "banner_id", "oaid_hash",
    "os_id", "country_id", "impressions",
]
field_dim = [df[name].max() + 1 for name in field_names]
field_dim

[5, 1633, 6510316, 2, 2, 2]

Теперь отделим тестовые данные от обучающих. Для теста возьмём последний доступный в данных день.

In [11]:
# Preview only: parse the timestamps without storing the result.
pd.to_datetime(df["date_time"])

0          2021-09-27 00:01:30
1          2021-09-26 22:54:49
2          2021-09-26 23:57:20
3          2021-09-27 00:04:30
4          2021-09-27 00:06:21
                   ...        
15821467   2021-10-02 15:51:35
15821468   2021-09-27 22:03:14
15821469   2021-10-02 17:41:10
15821470   2021-09-29 00:39:32
15821471   2021-09-28 07:00:18
Name: date_time, Length: 15821472, dtype: datetime64[ns]

In [12]:
# Parse the timestamps and truncate them to midnight so each row carries only its day.
df["date_time"] = pd.to_datetime(df["date_time"]).dt.normalize()

In [13]:
# Most recent day present in the data.
df["date_time"].max()

Timestamp('2021-10-02 00:00:00')

In [14]:
# Rows recorded on the last day (preview).
df.loc[df["date_time"] == '2021-10-02']

Unnamed: 0,date_time,zone_id,banner_id,oaid_hash,os_id,country_id,impressions,clicks
164,2021-10-02,4,76,5650809,1,1,1,1
166,2021-10-02,2,46,4788930,1,0,1,1
168,2021-10-02,4,76,2878122,0,1,1,1
169,2021-10-02,2,46,3653701,1,0,1,1
359,2021-10-02,2,2,3415627,1,0,1,1
...,...,...,...,...,...,...,...,...
15821452,2021-10-02,4,89,5678878,1,1,1,0
15821455,2021-10-02,4,132,287339,0,0,1,0
15821461,2021-10-02,3,52,2985717,0,1,1,0
15821467,2021-10-02,4,530,3053210,0,1,1,0


In [15]:
# Hold out the most recent day as the test set and train on everything before it.
# The cut-off is taken from the data instead of being hard-coded, so the split keeps
# meaning "last available day" if the dataset changes.
# Assumes date_time has already been normalised to whole days.
last_day = df.date_time.max()

# After dropping date_time the remaining columns become a plain array:
# all columns but the last are features, the last column is the click label.
test_data = df[df.date_time == last_day].drop('date_time', axis=1).values
train_data = df[df.date_time < last_day].drop('date_time', axis=1).values
test_data.shape, train_data.shape

((2128978, 7), (13692494, 7))

Создадим также валидационный набор: включим в него случайные 20% обучающих данных.

In [16]:
# Random 80/20 split of the pre-test rows into training and validation parts.
# NOTE: train_data is overwritten with its own subset, so running this cell
# twice shrinks the training set again.
np.random.seed(42)
shuffled_rows = np.random.permutation(len(train_data))
train_len = int(len(train_data) * 0.8)

train_rows, val_rows = shuffled_rows[:train_len], shuffled_rows[train_len:]
val_data = train_data[val_rows]
train_data = train_data[train_rows]

In [17]:
# Shapes of the training and validation arrays after the split
train_data.shape, val_data.shape

((10953995, 7), (2738499, 7))

Создадим всё, что нужно для PyTorch.

In [18]:
class ClicksDataset(Dataset):
    """Dataset over a 2-D array whose last column is the click label.

    Each item is a ``(features, click)`` pair: every column except the last
    one, and the last column, of the selected row(s).
    """

    def __init__(self, data):
        """Keep a reference to ``data``; no copy or conversion is made."""
        self.data = data

    def __len__(self):
        """Return the number of rows in the underlying array."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(features, click)`` for the row(s) selected by ``idx``."""
        selected = self.data[idx]
        # Ellipsis keeps the split on the last axis for scalar and multi-row idx alike.
        return selected[..., :-1], selected[..., -1]

In [19]:
# Wrap the training and validation arrays so a DataLoader can batch them.
train_dataset, val_dataset = (
    ClicksDataset(split) for split in (train_data, val_data)
)

In [20]:
# Settings shared by both loaders; only the training loader shuffles its rows.
loader_options = dict(batch_size=1024, num_workers=4, pin_memory=True)
train_loader = DataLoader(train_dataset, shuffle=True, **loader_options)
val_loader = DataLoader(val_dataset, shuffle=False, **loader_options)

In [21]:
# Seed PyTorch's global RNG before the model is built: without it the weight
# initialisation (and the shuffling order drawn later by the DataLoader) differs
# on every run, so the reported scores cannot be reproduced.
torch.manual_seed(42)

# Second positional argument of the model — presumably the embedding size per
# field pair (embed_dim in torchfm); confirm against the installed fork.
EMBED_DIM = 5

model = FieldAwareFactorizationMachineModel(field_dim, EMBED_DIM).cuda()
optim = torch.optim.Adam(model.parameters(), lr=3e-3)
# BCELoss takes probabilities, so the model output is assumed to be already
# sigmoid-activated — TODO confirm in the torchfm source.
criterion = torch.nn.BCELoss()

In [22]:
# One pass over the training data.
model.train()

# The context manager closes the progress bar even if the loop is interrupted.
with tqdm(train_loader) as pbar:
    for feats, clck in pbar:
        # The loader already yields tensors; .long()/.float() only make sure of
        # the dtypes the model input and BCELoss target need.
        feats = feats.long().to('cuda')
        clck = clck.float().to('cuda')

        optim.zero_grad()
        pred = model(feats)
        loss = criterion(pred, clck)
        loss.backward()
        optim.step()

        pbar.set_description(f"Training loss: {loss.item()}")

  0%|          | 0/10698 [00:00<?, ?it/s]

In [23]:
# Collect labels and predicted click probabilities for the validation set.
y_true = []
y_pred = []

# Inference only: eval mode plus no_grad, so no autograd graph is recorded per batch.
model.eval()
with torch.no_grad():
    for feats, clck in tqdm(val_loader):
        feats = feats.long().to('cuda')
        y_true += clck.tolist()
        pred = model(feats)
        y_pred += pred.cpu().tolist()

        # Every batch must contribute exactly one prediction per label.
        assert len(y_true) == len(y_pred), f'{pred.shape}, {clck.shape}'

  0%|          | 0/2675 [00:00<?, ?it/s]

Скоры на валидации

In [24]:
# Report both metrics for the labels/predictions collected in y_true / y_pred.
for label, metric in (("Log loss", log_loss), ("ROC AUC", roc_auc_score)):
    print(label, metric(y_true, y_pred))

Log loss 0.10728681771317067
ROC AUC 0.7677379801446128


In [25]:
# Dataset and sequential (unshuffled) loader over the held-out test rows.
test_dataset = ClicksDataset(test_data)
test_loader = DataLoader(
    test_dataset,
    batch_size=1024,
    shuffle=False,
    num_workers=4,
    pin_memory=True,
)

In [26]:
# Collect labels and predicted click probabilities for the test set.
# NOTE: y_true / y_pred are reused, so earlier contents of these lists are replaced.
y_true = []
y_pred = []

# Inference only: eval mode plus no_grad, so no autograd graph is recorded per batch.
model.eval()
with torch.no_grad():
    for feats, clck in tqdm(test_loader):
        feats = feats.long().to('cuda')
        y_true += clck.tolist()
        pred = model(feats)
        y_pred += pred.cpu().tolist()

        # Every batch must contribute exactly one prediction per label.
        assert len(y_true) == len(y_pred), f'{pred.shape}, {clck.shape}'

  0%|          | 0/2080 [00:00<?, ?it/s]

Скоры на тесте

In [27]:
# Report both metrics for the labels/predictions collected in y_true / y_pred.
for label, metric in (("Log loss", log_loss), ("ROC AUC", roc_auc_score)):
    print(label, metric(y_true, y_pred))

Log loss 0.14735679216798764
ROC AUC 0.7385301507265672
