In [1]:
import cudf
import numpy as np 
import pandas as pd 

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
import scipy
from scipy import stats
import seaborn as sns
import matplotlib.pyplot as plt
import cupy
import glob
import time

In [2]:
# Load the train and test parquet files into cudf dataframes.
train_df = cudf.read_parquet('/kaggle/input/otto-full-optimized-memory-footprint/train.parquet')
test_df = cudf.read_parquet('/kaggle/input/otto-full-optimized-memory-footprint/test.parquet')

In [3]:
%time
train_pairs = cudf.concat([train_df, test_df])[['session','aid']]

del train_df, test_df

train_pairs['aid_next'] = train_pairs.groupby('session').aid.shift(-1)
train_pairs = train_pairs[['aid', 'aid_next']].dropna().reset_index(drop=True)

CPU times: user 3 µs, sys: 1 µs, total: 4 µs
Wall time: 7.39 µs


In [4]:
# Largest aid value that occurs on either side of a pair.
cardinality_aids = max(train_pairs[column].max() for column in ('aid', 'aid_next'))
cardinality_aids

1855602

In [5]:
!pip install merlin-dataloader==0.0.2

Collecting merlin-dataloader==0.0.2
  Downloading merlin-dataloader-0.0.2.tar.gz (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.1/44.1 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l- \ | / done
[?25h  Getting requirements to build wheel ... [?25l- done
[?25h  Preparing metadata (pyproject.toml) ... [?25l- done
[?25hCollecting merlin-core
  Downloading merlin-core-0.7.0.tar.gz (108 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m108.0/108.0 kB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l- \ | done
[?25h  Getting requirements to build wheel ... [?25l- done
[?25h  Preparing metadata (pyproject.toml) ... [?25l- done
  Downloading merlin-core-0.6.0.tar.gz (108 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m108.0/108.0 kB[0m [31m11.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build

In [6]:
from merlin.loader.torch import Loader 

# Hold out the last 10,000,000 pairs for validation and train on everything before them.
# Both slices are converted to pandas and written as parquet files in the working directory.
train_pairs[:-10_000_000].to_pandas().to_parquet('train_pairs.parquet')
train_pairs[-10_000_000:].to_pandas().to_parquet('valid_pairs.parquet')

In [7]:
from merlin.loader.torch import Loader 
from merlin.io import Dataset

# Wrap the training parquet file in a Merlin Dataset and build a loader over it.
train_ds = Dataset('train_pairs.parquet')
# Positional arguments: 65536 is presumably the batch size and True the shuffle flag
# -- TODO confirm against the Loader signature of merlin-dataloader 0.0.2.
train_dl_merlin = Loader(train_ds, 65536, True)

In [8]:
%time

for batch in train_dl_merlin:
    aid1, aid2 = batch[0], batch[1]

CPU times: user 4 µs, sys: 1 µs, total: 5 µs
Wall time: 9.78 µs


# Matrix Factorization

**Based on the implementation in https://www.kaggle.com/code/cpmpml/matrix-factorization-with-gpu**

In [9]:
import torch
from torch import nn

class MatrixFactorization(nn.Module):
    """Scores a pair of aids by the dot product of their embeddings.

    A single embedding table is shared by both sides of the pair.
    """

    def __init__(self, n_aids, n_factors):
        """Create the shared embedding table.

        Args:
            n_aids: number of rows in the embedding table.
            n_factors: size of each embedding vector.
        """
        super().__init__()
        # sparse=True makes the embedding produce sparse gradients.
        self.aid_factors = nn.Embedding(n_aids, n_factors, sparse=True)

    def forward(self, aid1, aid2):
        """Return one score per pair: the sum over dim 1 of the elementwise product.

        Args:
            aid1: index tensor for the first item of each pair.
            aid2: index tensor for the second item of each pair.
        """
        left_vectors = self.aid_factors(aid1)
        right_vectors = self.aid_factors(aid2)
        return torch.sum(left_vectors * right_vectors, dim=1)
    
class AverageMeter(object):
    """Tracks the latest value of a metric together with its running weighted mean."""

    def __init__(self, name, fmt=':f'):
        """Store the display name and format spec, then zero the statistics.

        Args:
            name: label used when the meter is rendered as a string.
            fmt: format spec (including the leading colon) applied to val and avg.
        """
        self.name = name
        self.fmt = fmt
        self.reset()

    def reset(self):
        """Set val, avg, sum and count back to zero."""
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        """Record `val` as the latest value, weighted by `n` in the running mean."""
        self.val = val
        self.count += n
        self.sum += val * n
        self.avg = self.sum / self.count

    def __str__(self):
        """Render as '<name> <val> (<avg>)', formatting both numbers with `fmt`."""
        template = '{{name}} {{val{0}}} ({{avg{0}}})'.format(self.fmt)
        return template.format(**vars(self))

# Wrap the held-out parquet file in a Merlin Dataset and build a loader over it.
valid_ds = Dataset('valid_pairs.parquet')
# Same positional arguments as the training loader: presumably batch size 65536
# and shuffle=True -- TODO confirm; shuffling should not be needed for validation.
valid_dl_merlin = Loader(valid_ds, 65536, True)

In [10]:
from torch.optim import SparseAdam, AdamW

# Training hyperparameters.
num_epochs=25
lr=0.01

# aids are used directly as embedding row indices, so the table needs max_aid + 1 rows;
# 64 is the embedding size.
model = MatrixFactorization(cardinality_aids+1, 64)
# The embedding is created with sparse=True, hence the sparse variant of Adam.
optimizer = SparseAdam(model.parameters(), lr=lr)
# The model outputs raw scores, so the loss applies the sigmoid itself.
criterion = nn.BCEWithLogitsLoss()

In [11]:
%%time
model.to('cuda')
for epoch in range(num_epochs):
    for batch, _ in train_dl_merlin:
        model.train()
        losses = AverageMeter('Loss', ':.4e')
            
        aid1, aid2 = batch['aid'], batch['aid_next']
        aid1 = aid1.to('cuda')
        aid2 = aid2.to('cuda')
        output_pos = model(aid1, aid2)
        output_neg = model(aid1, aid2[torch.randperm(aid2.shape[0])])
        
        output = torch.cat([output_pos, output_neg])
        targets = torch.cat([torch.ones_like(output_pos), torch.zeros_like(output_pos)])
        loss = criterion(output, targets)
        losses.update(loss.item())
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
    model.eval()
    
    with torch.no_grad():
        accuracy = AverageMeter('accuracy')
        for batch, _ in valid_dl_merlin:
            aid1, aid2 = batch['aid'], batch['aid_next']
            output_pos = model(aid1, aid2)
            output_neg = model(aid1, aid2[torch.randperm(aid2.shape[0])])
            accuracy_batch = torch.cat([output_pos.sigmoid() > 0.5, output_neg.sigmoid() < 0.5]).float().mean()
            accuracy.update(accuracy_batch, aid1.shape[0])
            
    print(f'{epoch+1:02d}: * TrainLoss {losses.avg:.3f}  * Accuracy {accuracy.avg:.3f}')

01: * TrainLoss 0.624  * Accuracy 0.666
02: * TrainLoss 0.605  * Accuracy 0.694
03: * TrainLoss 0.598  * Accuracy 0.705
04: * TrainLoss 0.594  * Accuracy 0.711
05: * TrainLoss 0.592  * Accuracy 0.715
06: * TrainLoss 0.590  * Accuracy 0.718
07: * TrainLoss 0.589  * Accuracy 0.719
08: * TrainLoss 0.588  * Accuracy 0.721
09: * TrainLoss 0.588  * Accuracy 0.722
10: * TrainLoss 0.587  * Accuracy 0.722
11: * TrainLoss 0.586  * Accuracy 0.723
12: * TrainLoss 0.585  * Accuracy 0.724
13: * TrainLoss 0.585  * Accuracy 0.724
14: * TrainLoss 0.585  * Accuracy 0.724
15: * TrainLoss 0.585  * Accuracy 0.725
16: * TrainLoss 0.584  * Accuracy 0.725
17: * TrainLoss 0.584  * Accuracy 0.725
18: * TrainLoss 0.585  * Accuracy 0.726
19: * TrainLoss 0.586  * Accuracy 0.726
20: * TrainLoss 0.585  * Accuracy 0.726
21: * TrainLoss 0.585  * Accuracy 0.726
22: * TrainLoss 0.584  * Accuracy 0.726
23: * TrainLoss 0.585  * Accuracy 0.726
24: * TrainLoss 0.584  * Accuracy 0.727
25: * TrainLoss 0.583  * Accuracy 0.727


In [12]:
#getting the embeddings
%time
embeddings = model.aid_factors.weight.detach().cpu().numpy()

from cuml.neighbors import NearestNeighbors


knn = NearestNeighbors(n_neighbors=21, metric='euclidean')
knn.fit(embeddings)

CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 7.15 µs


NearestNeighbors()

In [13]:
%time

_, aid_nns = knn.kneighbors(embeddings)

CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 7.15 µs


In [14]:
from collections import defaultdict

sample_sub = pd.read_csv('../input/otto-recommender-system//sample_submission.csv')
test = cudf.read_parquet('../input/otto-full-optimized-memory-footprint/test.parquet')

session_types = ['clicks', 'carts', 'orders']
# Collect, per test session, the list of aids and the list of event types in row order.
gr = test.reset_index(drop=True).to_pandas().groupby('session')
test_session_AIDs = gr['aid'].apply(list)
test_session_types = gr['type'].apply(list)

labels = []

# Multiplier applied to an event's weight, keyed by the value of its `type` column.
type_weight_multipliers = {0: 1, 1: 6, 2: 3}
for AIDs, types in zip(test_session_AIDs, test_session_types):
    if len(AIDs) >= 20:
        # if we have enough aids (over equals 20) we don't need to look for candidates! we just use the old logic
        # Later events get larger weights (log-spaced, base 2, shifted by -1).
        weights = np.logspace(0.1, 1, len(AIDs), base=2, endpoint=True) - 1
        aids_temp = defaultdict(lambda: 0)
        for aid, w, t in zip(AIDs, weights, types):
            aids_temp[aid] += w * type_weight_multipliers[t]

        # Highest accumulated weight first.
        sorted_aids = [k for k, v in sorted(aids_temp.items(), key=lambda item: -item[1])]
        labels.append(sorted_aids[:20])
    else:
        # here we don't have 20 aids to output -- we will use approximate nearest neighbor search and our embeddings
        # to generate candidates!
        # De-duplicate the session's aids, most recent first.
        AIDs = list(dict.fromkeys(AIDs[::-1]))

        # let's grab the most recent aid
        most_recent_aid = AIDs[0]

        # and look for some neighbors!
        # The neighbour list may contain aids already in the session (typically the
        # query aid itself); skip those so the prediction has no duplicate ids
        # wasting slots. At most len(AIDs) of the 21 neighbours are skipped, so at
        # least 20 candidates always remain in total.
        session_aids = set(AIDs)
        nns = [
            int(neighbor)
            for neighbor in aid_nns[most_recent_aid]
            if int(neighbor) not in session_aids
        ]

        labels.append((AIDs + nns)[:20])

In [15]:
# Each session's predictions become one space-separated string of aids.
labels_as_strings = [' '.join(map(str, session_labels)) for session_labels in labels]

predictions = pd.DataFrame({'session_type': test_session_AIDs.index, 'labels': labels_as_strings})

# The same predictions are emitted once per event type, with the type name
# appended to the session id as '<session>_<type>'.
prediction_dfs = []
for st in session_types:
    modified_predictions = predictions.assign(
        session_type=predictions['session_type'].astype('str') + f'_{st}'
    )
    prediction_dfs.append(modified_predictions)

submission = pd.concat(prediction_dfs).reset_index(drop=True)
submission.to_csv('submission.csv', index=False)