In [1]:
!pip install polars

Collecting polars
  Downloading polars-0.15.6-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (14.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.5/14.5 MB[0m [31m24.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: polars
Successfully installed polars-0.15.6
[0m

In [2]:
import os
import gc
import random
import pprint

import implicit 
import numpy as np
import polars as pl
import pandas as pd
from scipy import sparse
from annoy import AnnoyIndex
from tqdm.notebook import tqdm
from collections import defaultdict
from sklearn.neighbors import NearestNeighbors

In [3]:
def make_candidates(model, article2id, id2article, session_items):
    '''make candidates by approximate nearest search'''    
    # sessionごとのcandidateの最低数
    cand_per_session = 60 

    labels = []

    for AIDs in tqdm(session_items):
        #  we will use word2vec embeddings to generate candidates!
        AIDs = list(dict.fromkeys(AIDs[::-1]))
        AIDs = [article2id[i] for i in AIDs]
        
        # if session is long, consider only latest 10 items
        AIDs = AIDs[:10]

        remain = cand_per_session % len(AIDs)
        
        # itemあたりの近傍candidate
        cand_per_item = cand_per_session // len(AIDs)
                
        nns_result = []
        # and look for some neighbors! (探索対象、何個の近傍点を探すか)
        # 距離が最小となるのは自分自身なので二番目以降から取り出す        
        # from 2nd last item
        for item in AIDs[1:]:
            added_items = np.array(model.similar_items(item, cand_per_item + 1))[1:,0]
            added_items = [i for i in added_items if i not in nns_result]
            nns_result += added_items

        # candidate数が足りない分は最新アイテムを参照して加える
        # last item
        if len(nns_result) < cand_per_session:
            need_num = cand_per_session - len(nns_result)
            last_cands = np.array(model.similar_items(AIDs[0], 100))[1:,0]
            last_cands = [i for i in last_cands if i not in nns_result][:need_num]
            nns_result += last_cands
            
        assert len(nns_result) == cand_per_session
        nns_result = [id2article[i] for i in nns_result]
        labels.append(nns_result)
    
    predictions = pd.DataFrame(data={'session': session_items.index, 'aid': labels})
    predictions["session"] = predictions["session"].astype('int32')
    predictions = predictions.explode('aid') 
    predictions['aid'] = predictions['aid'].astype('int32') 
    
    return predictions

In [4]:
def make_outputs(valid_flg):
    metrics = {}
    if valid_flg:
        # input
        test_path = '../input/otto-validation/test_parquet/*'
        train_path =  '../input/otto-validation/train_parquet/*'
        
        # for val
        truncated_train_path = '/kaggle/input/otto-truncated-tr-dataset/truncated_train_sessions_for_validation.pqt'
        train_label_path = '../input/otto-truncated-tr-dataset/truncated_train_labels_for_validation.pqt'
        test_label_path = '../input/otto-validation/test_labels.parquet' 

        # output
        train_output_path = "w2vec_train_cands_for_validation.pqt"
        test_output_path = "w2vec_test_cands_for_validation.pqt"
        w2vec_pickle_path = "w2vec_model_for_validation.model"
    
    else:
        #input
        test_path = '../input/otto-chunk-data-inparquet-format/test_parquet/*'
        train_path = '../input/otto-chunk-data-inparquet-format/train_parquet/*'
        
        # for val
        truncated_train_path = '/kaggle/input/otto-truncated-tr-dataset/truncated_train_sessions_for_full_train.pqt'
        train_label_path = '../input/otto-truncated-tr-dataset/truncated_train_labels_for_full_train.pqt'
    
        #output
        train_output_path = "w2vec_train_cands_for_full_train.pqt"
        test_output_path = "w2vec_test_cands_for_full_train.pqt"
        w2vec_pickle_path = "w2vec_model_for_full_train.model"  
        
    print("data preparation")
    train = pl.read_parquet(train_path)
    test = pl.read_parquet(test_path)
    tr_train = pl.read_parquet(truncated_train_path)

    train = train.sort(["session", "ts"]).drop("ts")
    test = test.sort(["session", "ts"]).drop("ts")
    
    df = pl.concat([train, test])
    df = df.to_pandas()
    
    print("make history for candidate inference")
    tr_session = tr_train.to_pandas()["session"].unique()[-2000000:]
    train_session_AIDs = tr_train.to_pandas().reset_index(drop=True).groupby('session')['aid'].apply(list)
    train_session_AIDs = train_session_AIDs[train_session_AIDs.index.isin(tr_session)]
    del tr_session
    test_session_AIDs = test.to_pandas().reset_index(drop=True).groupby('session')['aid'].apply(list)
    del tr_train, train, test
        
    print("learning embeddings")
    customers = df["session"].unique()
    articles = df["aid"].unique()

    customer2id = {customer: index for index, customer in enumerate(customers)}
    article2id = {article: index for index, article in enumerate(articles)}

    id2customer = {index:customer for index, customer in enumerate(customers)}
    id2article = {index:article for index, article in enumerate(articles)}

    df["session"] = df["session"].map(customer2id)
    df["aid"] = df["aid"].map(article2id)

    df = df.drop_duplicates(["session", "aid", "type"])
    df["rating"] = 1

    data = df["rating"].values
    row = df["session"].values
    col = df["aid"].values
    del df
    sparse_matrix = sparse.csr_matrix((data, (row, col)), shape=(len(customers), len(articles)))

    model = implicit.bpr.BayesianPersonalizedRanking(factors=32, learning_rate=0.1, iterations=100, random_state=0)
    model.fit(sparse_matrix)

    
    print("make train candidates")
    train_candidates = make_candidates(model, article2id, id2article, train_session_AIDs)
    train_candidates.to_parquet(train_output_path)
    del train_session_AIDs, train_candidates
    
    print("make test candidates")
    test_candidates = make_candidates(model, article2id, id2article, test_session_AIDs)
    test_candidates.to_parquet(test_output_path)
    del test_session_AIDs, test_candidates
        
    print("metrics check")
    train_labels = pd.read_parquet(train_label_path)
    train_labels = train_labels.rename(columns = {"aid": "ground_truth"})
    train_candidates = pd.read_parquet(train_output_path)
    train_candidates = train_candidates.groupby(["session"])["aid"].apply(list).reset_index()
    for type_ in ["clicks", "orders", "carts"]:
        target_labels = train_labels[train_labels.type==type_].copy().drop("type", axis=1)
        denominator = target_labels.shape[0]
        target_labels = target_labels.groupby(["session"])["ground_truth"].apply(list).reset_index()
        target_labels = pd.merge(target_labels, train_candidates, on = ["session"], how = "inner")
        target_labels['hits'] = target_labels.apply(lambda df: len(set(df.ground_truth).intersection(set(df.aid))), axis=1)
        metrics["train_"+type_+"_recall"] = np.sum(target_labels['hits']) / denominator
    del train_labels, target_labels, train_candidates
    
    if valid_flg:
        test_labels = pd.read_parquet(test_label_path)
        test_labels = test_labels.explode("ground_truth")
        test_candidates = pd.read_parquet(test_output_path)
        test_candidates = test_candidates.groupby(["session"])["aid"].apply(list).reset_index()
        for type_ in ["clicks", "orders", "carts"]:
            target_labels = test_labels[test_labels.type==type_].copy().drop("type", axis=1)
            denominator = target_labels.shape[0]
            target_labels = target_labels.groupby(["session"])["ground_truth"].apply(list).reset_index()
            target_labels = pd.merge(target_labels, test_candidates, on = ["session"], how = "inner")
            target_labels['hits'] = target_labels.apply(lambda df: len(set(df.ground_truth).intersection(set(df.aid))), axis=1)
            metrics["test_"+type_+"_recall"] = np.sum(target_labels['hits']) / denominator
        del test_labels, target_labels, test_candidates
    gc.collect()
    pprint.pprint(metrics)

In [5]:
 make_outputs(False)

data preparation
make history for candidate inference
learning embeddings


  0%|          | 0/100 [00:00<?, ?it/s]

make train candidates


  0%|          | 0/1801269 [00:00<?, ?it/s]

KeyError: 9417883.0