- ref: https://www.kaggle.com/code/radek1/matrix-factorization-pytorch-merlin-dataloader
- ref: https://www.kaggle.com/code/cpmpml/matrix-factorization-with-gpu
- ref: https://www.kaggle.com/code/taichin/otto-rapids-tsne-and-mf

In [1]:
!pip install merlin-dataloader==0.0.2

Collecting merlin-dataloader==0.0.2
  Downloading merlin-dataloader-0.0.2.tar.gz (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.1/44.1 kB[0m [31m526.1 kB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l- \ | / - \ done
[?25h  Getting requirements to build wheel ... [?25l- done
[?25h  Preparing metadata (pyproject.toml) ... [?25l- done
[?25hCollecting merlin-core
  Downloading merlin-core-0.7.0.tar.gz (108 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m108.0/108.0 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l- \ | done
[?25h  Getting requirements to build wheel ... [?25l- done
[?25h  Preparing metadata (pyproject.toml) ... [?25l- done
  Downloading merlin-core-0.6.0.tar.gz (108 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m108.0/108.0 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Install

In [2]:
import os
import gc
import random
from tqdm.notebook import tqdm
import pprint

import cudf
import pickle
from cuml.neighbors import NearestNeighbors

import pandas as pd
import numpy as np
import tensorflow as tf

from merlin.loader.torch import Loader 
from merlin.io import Dataset

from collections import defaultdict

import torch
from torch import nn
from torch.optim import SparseAdam

from cuml import UMAP, TSNE, PCA
import torch.optim as optim

In [3]:
num_epochs=50
lr=0.01

In [4]:
def seed_everything(seed=42): 
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    tf.random.set_seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    
seed_everything()

In [5]:
class MatrixFactorization(nn.Module):
    def __init__(self, n_aids, n_factors):
        super().__init__()
        self.aid_factors = nn.Embedding(n_aids, n_factors, sparse=True)
        
    def forward(self, aid1, aid2):
        aid1 = self.aid_factors(aid1)
        aid2 = self.aid_factors(aid2)
        
        return (aid1 * aid2).sum(dim=1)
    
class AverageMeter(object):
    """Computes and stores the average and current value"""
    def __init__(self, name, fmt=':f'):
        self.name = name
        self.fmt = fmt
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count

    def __str__(self):
        fmtstr = '{name} {val' + self.fmt + '} ({avg' + self.fmt + '})'
        return fmtstr.format(**self.__dict__)

In [6]:
def make_candidates(aid_nns, session_items):
    '''make candidates by approximate nearest search'''    
    # sessionごとのcandidateの最低数
    cand_per_session = 60 

    labels = []

    for AIDs in tqdm(session_items):
        #  we will use word2vec embeddings to generate candidates!
        AIDs = list(dict.fromkeys(AIDs[::-1]))
        
        items = AIDs[:10]
        cand_per_item = cand_per_session // len(items)
                
        nns_result = []
        # and look for some neighbors! (探索対象、何個の近傍点を探すか)
        for item in items[1:]:
            added_items = list(aid_nns[item][1:cand_per_item+1])
            added_items = [i for i in added_items if i not in nns_result]
            nns_result += added_items
            
        added_items = list(aid_nns[items[0]][1:cand_per_item*2+1])
        added_items = [i for i in added_items if i not in nns_result]
        nns_result = added_items + nns_result

        #assert len(nns_result) == cand_per_session
        labels.append(nns_result)
    
    predictions = pd.DataFrame(data={'session': session_items.index, 'aid': labels})
    predictions["session"] = predictions["session"].astype('int32')
    predictions = predictions.explode('aid') 
    predictions['aid'] = predictions['aid'].astype('int32') 
    
    return predictions

In [7]:
def learning(valid_flg):
    if valid_flg:
        # input
        test_path = '../input/otto-validation/test_parquet/*'
        train_path =  '../input/otto-validation/train_parquet/*'
        
        # for val
        truncated_train_path = '/kaggle/input/otto-truncated-tr-dataset/truncated_train_sessions_for_validation.pqt'
        train_label_path = '../input/otto-truncated-tr-dataset/truncated_train_labels_for_validation.pqt'
        test_label_path = '../input/otto-validation/test_labels.parquet' 

        # output
        train_output_path = "w2vec_train_cands_for_validation.pqt"
        test_output_path = "w2vec_test_cands_for_validation.pqt"
        real_embeddings_path = "real_embeddings_for_validation.pickle"
        d2_embeddings_path = "real_embeddings_for_validation.pickle"
    
    else:
        #input
        test_path = '../input/otto-chunk-data-inparquet-format/test_parquet/*'
        train_path = '../input/otto-chunk-data-inparquet-format/train_parquet/*'
        
        # for val
        truncated_train_path = '/kaggle/input/otto-truncated-tr-dataset/truncated_train_sessions_for_full_train.pqt'
        train_label_path = '../input/otto-truncated-tr-dataset/truncated_train_labels_for_full_train.pqt'
    
        #output
        train_output_path = "w2vec_train_cands_for_full_train.pqt"
        test_output_path = "w2vec_test_cands_for_full_train.pqt"
        real_embeddings_path = "real_embeddings_for_full_train.pickle"
        d2_embeddings_path = "real_embeddings_for_full_train.pickle"
        
    print("data preparation")
    train = cudf.read_parquet(train_path)[['session', 'aid']]
    test = cudf.read_parquet(test_path)[['session', 'aid']]

    train_pairs = cudf.concat([train, test])
    train_pairs['aid_next'] = train_pairs.groupby('session').aid.shift(-1)
    train_pairs = train_pairs[['aid', 'aid_next']].dropna().reset_index(drop=True)
    cardinality_aids = max(train_pairs['aid'].max(), train_pairs['aid_next'].max())
    del train, test
        
    train_pairs.to_pandas().to_parquet('train_pairs.parquet')
    train_pairs[-10_000_000:].to_pandas().to_parquet('valid_pairs.parquet')
    del train_pairs
    
    print("learning embeddings")
    train_ds = Dataset('train_pairs.parquet')
    train_dl_merlin = Loader(train_ds, 65536, True)

    valid_ds = Dataset('valid_pairs.parquet')
    valid_dl_merlin = Loader(valid_ds, 65536, True)

    model = MatrixFactorization(cardinality_aids+1, 32)
    optimizer = SparseAdam(model.parameters(), lr=lr)
    scheduler = optim.lr_scheduler.OneCycleLR(optimizer=optimizer, pct_start=0.1, div_factor=1e3, 
                                              max_lr=lr, epochs=num_epochs, steps_per_epoch=len(train_dl_merlin))
    criterion = nn.BCEWithLogitsLoss()
    
    model.to('cuda')
    for epoch in range(num_epochs):
        for batch, _ in train_dl_merlin:
            model.train()
            losses = AverageMeter('Loss', ':.4e')
            
            aid1, aid2 = batch['aid'], batch['aid_next']
            aid1 = aid1.to('cuda')
            aid2 = aid2.to('cuda')
            output_pos = model(aid1, aid2)
            output_neg = model(aid1, aid2[torch.randperm(aid2.shape[0])])
        
            output = torch.cat([output_pos, output_neg])
            targets = torch.cat([torch.ones_like(output_pos), torch.zeros_like(output_pos)])
            loss = criterion(output, targets)
            losses.update(loss.item())
        
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            scheduler.step()
        
        model.eval()
    
        with torch.no_grad():
            accuracy = AverageMeter('accuracy')
            for batch, _ in valid_dl_merlin:
                aid1, aid2 = batch['aid'], batch['aid_next']
                output_pos = model(aid1, aid2)
                output_neg = model(aid1, aid2[torch.randperm(aid2.shape[0])])
                accuracy_batch = torch.cat([output_pos.sigmoid() > 0.5, output_neg.sigmoid() < 0.5]).float().mean()
                accuracy.update(accuracy_batch, aid1.shape[0])
            
        print(f'{epoch+1:02d}: * TrainLoss {losses.avg:.3f}  * Accuracy {accuracy.avg:.3f}')
    del train_ds, valid_ds, train_dl_merlin, valid_dl_merlin
        
    embeddings = model.aid_factors.weight.detach().cpu().numpy()
    with open(real_embeddings_path, mode="wb") as f:
        pickle.dump(embeddings, f)
    
    print("fit transform tsne")
    em_2d = TSNE(n_components=2, random_state = 224).fit_transform(embeddings)
    print('TSNE embeddings have shape',em_2d.shape)
    
    with open(d2_embeddings_path, mode="wb") as f:
        pickle.dump(em_2d, f)
    
    knn = NearestNeighbors(n_neighbors=21, metric='euclidean')
    knn.fit(embeddings)
    
    _, aid_nns = knn.kneighbors(embeddings)
    del embeddings, em_2d, knn, model
    
    return aid_nns

In [8]:
def make_outputs(aid_nns, valid_flg):
    metrics = {}
    if valid_flg:
        # input
        test_path = '../input/otto-validation/test_parquet/*'
        train_path =  '../input/otto-validation/train_parquet/*'
        
        # for val
        truncated_train_path = '/kaggle/input/otto-truncated-tr-dataset/truncated_train_sessions_for_validation.pqt'
        train_label_path = '../input/otto-truncated-tr-dataset/truncated_train_labels_for_validation.pqt'
        test_label_path = '../input/otto-validation/test_labels.parquet' 

        # output
        train_output_path = "w2vec_train_cands_for_validation.pqt"
        test_output_path = "w2vec_test_cands_for_validation.pqt"
        real_embeddings_path = "real_embeddings_for_validation.pickle"
        d2_embeddings_path = "real_embeddings_for_validation.pickle"
    
    else:
        #input
        test_path = '../input/otto-chunk-data-inparquet-format/test_parquet/*'
        train_path = '../input/otto-chunk-data-inparquet-format/train_parquet/*'
        
        # for val
        truncated_train_path = '/kaggle/input/otto-truncated-tr-dataset/truncated_train_sessions_for_full_train.pqt'
        train_label_path = '../input/otto-truncated-tr-dataset/truncated_train_labels_for_full_train.pqt'
    
        #output
        train_output_path = "w2vec_train_cands_for_full_train.pqt"
        test_output_path = "w2vec_test_cands_for_full_train.pqt"
        real_embeddings_path = "real_embeddings_for_full_train.pickle"
        d2_embeddings_path = "real_embeddings_for_full_train.pickle"
        
    print("make history for candidate inference")
    tr_train = cudf.read_parquet(truncated_train_path)
    tr_session = tr_train.to_pandas()["session"].unique()[-2000000:]
    train_session_AIDs = tr_train.to_pandas().reset_index(drop=True).groupby('session')['aid'].apply(list)
    train_session_AIDs = train_session_AIDs[train_session_AIDs.index.isin(tr_session)]
    del tr_session, tr_train
    
    print("make train candidates")
    train_candidates = make_candidates(aid_nns, train_session_AIDs)
    train_candidates.to_parquet(train_output_path)
    del train_session_AIDs, train_candidates
        
    print("make test candidates")
    test = cudf.read_parquet(test_path)
    test_session_AIDs = test.to_pandas().reset_index(drop=True).groupby('session')['aid'].apply(list)
    del test
    test_candidates = make_candidates(aid_nns, test_session_AIDs)
    test_candidates.to_parquet(test_output_path)
    del test_session_AIDs, test_candidates
        
    print("metrics check")
    train_labels = pd.read_parquet(train_label_path)
    train_labels = train_labels.rename(columns = {"aid": "ground_truth"})
    train_candidates = pd.read_parquet(train_output_path)
    train_candidates = train_candidates.groupby(["session"])["aid"].apply(list).reset_index()
    for type_ in ["clicks", "orders", "carts"]:
        target_labels = train_labels[train_labels.type==type_].copy().drop("type", axis=1)
        denominator = target_labels.shape[0]
        target_labels = target_labels.groupby(["session"])["ground_truth"].apply(list).reset_index()
        target_labels = pd.merge(target_labels, train_candidates, on = ["session"], how = "inner")
        target_labels['hits'] = target_labels.apply(lambda df: len(set(df.ground_truth).intersection(set(df.aid))), axis=1)
        metrics["train_"+type_+"_recall"] = np.sum(target_labels['hits']) / denominator
    del train_labels, target_labels, train_candidates
    
    if valid_flg:
        test_labels = pd.read_parquet(test_label_path)
        test_labels = test_labels.explode("ground_truth")
        test_candidates = pd.read_parquet(test_output_path)
        test_candidates = test_candidates.groupby(["session"])["aid"].apply(list).reset_index()
        for type_ in ["clicks", "orders", "carts"]:
            target_labels = test_labels[test_labels.type==type_].copy().drop("type", axis=1)
            denominator = target_labels.shape[0]
            target_labels = target_labels.groupby(["session"])["ground_truth"].apply(list).reset_index()
            target_labels = pd.merge(target_labels, test_candidates, on = ["session"], how = "inner")
            target_labels['hits'] = target_labels.apply(lambda df: len(set(df.ground_truth).intersection(set(df.aid))), axis=1)
            metrics["test_"+type_+"_recall"] = np.sum(target_labels['hits']) / denominator
        del test_labels, target_labels, test_candidates
    gc.collect()
    pprint.pprint(metrics)

# execution

In [9]:
aid_nns = learning(valid_flg=True)

data preparation
learning embeddings
01: * TrainLoss 0.726  * Accuracy 0.554
02: * TrainLoss 0.685  * Accuracy 0.582
03: * TrainLoss 0.643  * Accuracy 0.636
04: * TrainLoss 0.617  * Accuracy 0.673
05: * TrainLoss 0.605  * Accuracy 0.692
06: * TrainLoss 0.597  * Accuracy 0.703
07: * TrainLoss 0.594  * Accuracy 0.709
08: * TrainLoss 0.591  * Accuracy 0.714
09: * TrainLoss 0.589  * Accuracy 0.716
10: * TrainLoss 0.586  * Accuracy 0.718
11: * TrainLoss 0.585  * Accuracy 0.720
12: * TrainLoss 0.585  * Accuracy 0.721
13: * TrainLoss 0.584  * Accuracy 0.722
14: * TrainLoss 0.582  * Accuracy 0.723
15: * TrainLoss 0.581  * Accuracy 0.724
16: * TrainLoss 0.584  * Accuracy 0.724
17: * TrainLoss 0.580  * Accuracy 0.724
18: * TrainLoss 0.582  * Accuracy 0.725
19: * TrainLoss 0.581  * Accuracy 0.725
20: * TrainLoss 0.579  * Accuracy 0.725
21: * TrainLoss 0.581  * Accuracy 0.726
22: * TrainLoss 0.583  * Accuracy 0.726
23: * TrainLoss 0.580  * Accuracy 0.726
24: * TrainLoss 0.580  * Accuracy 0.726
25:

In [10]:
make_outputs(aid_nns, valid_flg=True)
del aid_nns

make history for candidate inference
make train candidates


  0%|          | 0/2000000 [00:00<?, ?it/s]

make test candidates


  0%|          | 0/1801251 [00:00<?, ?it/s]

metrics check
{'test_carts_recall': 0.07373406770118643,
 'test_clicks_recall': 0.09560965495399121,
 'test_orders_recall': 0.07655538960770139,
 'train_carts_recall': 0.03691359148200391,
 'train_clicks_recall': 0.05062116429523796,
 'train_orders_recall': 0.04061328679046307}


In [11]:
# make_outputs(valid_flg=False)