- FastText candidate generation, without using a GPU <br>
https://www.kaggle.com/code/konradb/product-embeddings

In [1]:
!pip install polars

Collecting polars
  Downloading polars-0.15.6-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (14.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.5/14.5 MB[0m [31m28.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: polars
Successfully installed polars-0.15.6
[0m

In [2]:
import os
import gc
import glob
import random
import numpy as np
import pandas as pd
import polars as pl
from tqdm.notebook import tqdm
from gensim.models import FastText  
import pprint
import hashlib
import torch
from annoy import AnnoyIndex

In [3]:
def load_df(path):
    """Read every parquet chunk matching a glob pattern into one polars DataFrame.

    The ``ts`` column of each chunk is divided by 1000 and cast to Int32
    (presumably milliseconds -> seconds; confirm against the dataset).

    Args:
        path: glob pattern selecting the parquet chunk files.

    Returns:
        A single polars DataFrame with all chunks concatenated.

    Raises:
        FileNotFoundError: if no file matches ``path``.
    """
    # Sort so the concatenation order does not depend on filesystem listing order.
    chunk_files = sorted(glob.glob(path))
    if not chunk_files:
        raise FileNotFoundError(f"no parquet files match pattern: {path!r}")

    dfs = []
    for chunk_file in chunk_files:
        chunk = pl.read_parquet(chunk_file)
        # polars DataFrames are immutable: the converted column must be bound back
        # to `chunk`. (Attribute assignment such as `chunk.ts = ...` does not touch
        # the column, and an unassigned `with_column(...)` result is thrown away.)
        # NOTE(review): after this rescaling, rows whose original `ts` differ by less
        # than 1000 units become ties for any later sort on `ts`.
        chunk = chunk.with_columns(
            [(pl.col('ts') / 1000).cast(pl.Int32, strict=False)]
        )
        dfs.append(chunk)
    return pl.concat(dfs)

In [4]:
class CFG:
    """Notebook-wide configuration constants."""

    # Not referenced by any code visible in this notebook.
    modelname = 'efficientnet_b0'

    # Not referenced by any code visible in this notebook.
    cutoff = 2

    # Dimensionality of the FastText embeddings (passed as `vector_size`).
    emb_size = 32

In [5]:
def set_seed(seed=1029):
    """Seed the Python, NumPy and PyTorch random number generators.

    Also exports ``PYTHONHASHSEED`` and switches cuDNN to deterministic mode.

    Args:
        seed: integer seed applied to every generator.
    """
    # NOTE: the running interpreter's hash randomisation is fixed at startup, so
    # this environment variable only influences processes started afterwards.
    os.environ['PYTHONHASHSEED'] = str(seed)

    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    torch.backends.cudnn.deterministic = True

In [6]:
# w2v learning
def hashfxn(x):
    """Return a process-independent integer hash of ``x``.

    The value is the MD5 digest of ``str(x)`` (UTF-8 encoded) read as one
    big-endian unsigned integer, so it is stable across interpreter runs.
    """
    digest = hashlib.md5(str(x).encode()).digest()
    return int.from_bytes(digest, "big")

def make_w2v_model(type_, valid_flg, input_df, seed_num=42):
    """Train a FastText embedding model on per-session item sequences.

    Every session becomes one "sentence": the list of ``type_`` values of its
    rows, in the row order of ``input_df``.

    Args:
        type_: name of the column whose values are used as tokens (e.g. "aid").
        valid_flg: True trains for 5 epochs (validation run), False for 10
            (full-train run).
        input_df: polars DataFrame with a ``session`` column and a ``type_`` column.
        seed_num: seed handed to FastText. Training still uses 4 worker threads,
            so runs are not guaranteed to be bit-for-bit reproducible.

    Returns:
        The trained gensim ``FastText`` model.
    """
    sentences = (
        input_df.groupby('session')
        .agg(pl.col(type_).alias('sentence'))['sentence']
        .to_list()
    )

    n_epochs = 5 if valid_flg else 10
    model = FastText(
        vector_size=CFG.emb_size,
        window=3,
        min_count=1,
        workers=4,
        epochs=n_epochs,
        seed=seed_num,
    )

    print("fasttext start")
    model.build_vocab(sentences)
    # Train with the epoch count configured above; a hard-coded value here would
    # silently override the validation / full-train distinction.
    model.train(sentences, total_examples=len(sentences), epochs=model.epochs)
    return model

In [7]:
# annoy learning
def make_annoy_model(model, seed_num=42):
    """Build an Annoy nearest-neighbour index over the model's item vectors.

    Annoy item ids are the row positions in ``model.wv.vectors``, i.e. the same
    positions as in ``model.wv.index_to_key``.

    Args:
        model: trained embedding model exposing ``wv.vectors`` and ``wv.vector_size``.
        seed_num: seed for Annoy's tree construction.

    Returns:
        A built ``AnnoyIndex`` using euclidean distance.
    """
    # Take the dimensionality from the model itself so the index can never
    # disagree with the embeddings (and no module-level global is required).
    index = AnnoyIndex(model.wv.vector_size, 'euclidean')
    index.set_seed(seed_num)

    # Register every embedding under its vocabulary position.
    for idx, vector in enumerate(model.wv.vectors):
        index.add_item(idx, vector)

    # Build the search forest with 30 trees.
    index.build(30)

    return index

In [8]:
def make_w2v_candidates(w2vec, index, session_items):
    '''make candidates by approximate nearest search

    For every session, the most recent distinct items (at most 10) are used as
    queries against the Annoy index. Each item except the latest contributes an
    equal share of neighbours; the latest item then fills the list up to exactly
    60 candidates.

    Args:
        w2vec: trained embedding model exposing ``wv.index_to_key`` and
            ``wv.key_to_index``.
        index: Annoy index whose item ids are positions in ``wv.index_to_key``.
        session_items: pandas Series indexed by session id; each value is the
            non-empty list of that session's aids, oldest first.

    Returns:
        pandas DataFrame with int32 columns ``session`` and ``aid``, one row per
        (session, candidate) pair.
    '''
    index_to_key = w2vec.wv.index_to_key
    aid2idx = w2vec.wv.key_to_index

    # number of candidates produced for every session
    cand_per_session = 60
    # only this many of the most recent distinct items are used as queries
    max_history = 10
    # neighbours requested for the latest item when topping up the list
    last_item_search_size = 100

    def neighbours(item_idx, n):
        # The query item is normally its own nearest neighbour; drop it by id
        # rather than by position so an equidistant item cannot displace it.
        return [i for i in index.get_nns_by_item(item_idx, n) if i != item_idx]

    labels = []

    for AIDs in tqdm(session_items):
        # distinct items, most recent first, limited to the latest ones
        AIDs = list(dict.fromkeys(AIDs[::-1]))[:max_history]

        # neighbours taken per query item
        cand_per_item = cand_per_session // len(AIDs)

        nns_result = []
        seen = set()

        # every item except the most recent one contributes its nearest neighbours
        for item in AIDs[1:]:
            for cand in neighbours(aid2idx[item], cand_per_item + 1)[:cand_per_item]:
                if cand not in seen:
                    seen.add(cand)
                    nns_result.append(cand)

        # fill whatever is still missing from the neighbours of the latest item
        if len(nns_result) < cand_per_session:
            need_num = cand_per_session - len(nns_result)
            last_cands = [
                cand
                for cand in neighbours(aid2idx[AIDs[0]], last_item_search_size)
                if cand not in seen
            ][:need_num]
            nns_result += last_cands

        assert len(nns_result) == cand_per_session, (
            f"expected {cand_per_session} candidates, got {len(nns_result)}"
        )
        # translate Annoy ids back to aids
        labels.append([index_to_key[i] for i in nns_result])

    predictions = pd.DataFrame(data={'session': session_items.index, 'aid': labels})
    predictions["session"] = predictions["session"].astype('int32')
    predictions = predictions.explode('aid')
    predictions['aid'] = predictions['aid'].astype('int32')

    return predictions

# prepare data

In [9]:
# Integer code per event type (not referenced by the code visible in this notebook).
type_labels = {'clicks':0, 'carts':1, 'orders':2}
# Dimensionality used for the Annoy index; derived from the embedding size so the
# two values cannot drift apart.
n_dim = CFG.emb_size
set_seed()

In [10]:
def _recall_by_type(labels, candidates, prefix):
    """Compute candidate recall separately for clicks, orders and carts.

    Args:
        labels: pandas DataFrame with columns ``session``, ``type`` and
            ``ground_truth`` (one ground-truth aid per row).
        candidates: pandas DataFrame with columns ``session`` and ``aid``
            (one candidate per row).
        prefix: string prepended to every metric name (e.g. "train_").

    Returns:
        dict mapping ``prefix + type + "_recall"`` to the number of ground-truth
        aids found among the session's candidates divided by the number of
        ground-truth rows of that type (NaN when there are none).
    """
    cands_per_session = candidates.groupby(["session"])["aid"].apply(list).reset_index()

    recalls = {}
    for type_ in ["clicks", "orders", "carts"]:
        target_labels = labels[labels.type == type_].drop("type", axis=1)
        denominator = target_labels.shape[0]
        target_labels = target_labels.groupby(["session"])["ground_truth"].apply(list).reset_index()
        # sessions without candidates drop out here and therefore count as zero hits
        target_labels = pd.merge(target_labels, cands_per_session, on=["session"], how="inner")
        hits = sum(
            len(set(truth).intersection(cands))
            for truth, cands in zip(target_labels["ground_truth"], target_labels["aid"])
        )
        recalls[prefix + type_ + "_recall"] = hits / denominator if denominator else float("nan")
    return recalls


def make_outputs(valid_flg):
    """Train the embedding model, write candidate files and print recall metrics.

    Steps: load train/test chunks, train FastText on all sessions, build the
    Annoy index, save the model, write candidate parquet files for the truncated
    train sessions and the test sessions, then print candidate recall.

    Args:
        valid_flg: True uses the validation split (and also reports test recall
            against the validation labels); False uses the full train/test data
            and reports train recall only.

    Returns:
        None. Results are written to parquet / model files in the working
        directory and the metrics dict is pretty-printed.
    """
    metrics = {}
    if valid_flg:
        # input
        test_path = '../input/otto-validation/test_parquet/*'
        train_path =  '../input/otto-validation/train_parquet/*'

        # for val
        truncated_train_path = '/kaggle/input/otto-truncated-tr-dataset/truncated_train_sessions_for_validation.pqt'
        train_label_path = '../input/otto-truncated-tr-dataset/truncated_train_labels_for_validation.pqt'
        test_label_path = '../input/otto-validation/test_labels.parquet'

        # output
        train_output_path = "w2vec_train_cands_for_validation.pqt"
        test_output_path = "w2vec_test_cands_for_validation.pqt"
        w2vec_pickle_path = "w2vec_model_for_validation.model"

    else:
        # input
        test_path = '../input/otto-chunk-data-inparquet-format/test_parquet/*'
        train_path = '../input/otto-chunk-data-inparquet-format/train_parquet/*'

        # for val
        truncated_train_path = '/kaggle/input/otto-truncated-tr-dataset/truncated_train_sessions_for_full_train.pqt'
        train_label_path = '../input/otto-truncated-tr-dataset/truncated_train_labels_for_full_train.pqt'

        # output
        train_output_path = "w2vec_train_cands_for_full_train.pqt"
        test_output_path = "w2vec_test_cands_for_full_train.pqt"
        w2vec_pickle_path = "w2vec_model_for_full_train.model"

    print("data preparation")
    train = load_df(train_path)
    test = load_df(test_path)
    tr_train = pl.read_parquet(truncated_train_path)

    # chronological order inside each session; `ts` is only needed for sorting
    train = train.sort(["session", "ts"]).drop("ts")
    test = test.sort(["session", "ts"]).drop("ts")
    all_ = pl.concat([train, test])

    print("make history for candidate inference")
    # convert once; only the last 2,000,000 distinct train sessions get candidates
    tr_train_pd = tr_train.to_pandas()
    tr_session = tr_train_pd["session"].unique()[-2000000:]
    train_session_AIDs = (
        tr_train_pd[tr_train_pd["session"].isin(tr_session)]
        .groupby('session')['aid']
        .apply(list)
    )
    del tr_session, tr_train_pd
    test_session_AIDs = test.to_pandas().groupby('session')['aid'].apply(list)
    del tr_train, train, test

    print("learning")
    model = make_w2v_model("aid", valid_flg, all_)
    del all_

    index = make_annoy_model(model)
    model.save(w2vec_pickle_path)

    print("make train candidates")
    train_candidates = make_w2v_candidates(model, index, train_session_AIDs)
    train_candidates.to_parquet(train_output_path)
    del train_session_AIDs, train_candidates

    print("make test candidates")
    test_candidates = make_w2v_candidates(model, index, test_session_AIDs)
    test_candidates.to_parquet(test_output_path)
    del test_session_AIDs, test_candidates
    del model, index

    print("metrics check")
    train_labels = pd.read_parquet(train_label_path).rename(columns={"aid": "ground_truth"})
    metrics.update(_recall_by_type(train_labels, pd.read_parquet(train_output_path), "train_"))
    del train_labels

    if valid_flg:
        # test labels store a list of aids per row; flatten to one aid per row
        test_labels = pd.read_parquet(test_label_path).explode("ground_truth")
        metrics.update(_recall_by_type(test_labels, pd.read_parquet(test_output_path), "test_"))
        del test_labels

    gc.collect()
    pprint.pprint(metrics)

In [11]:
# make_outputs returns nothing (artifacts are written to disk, metrics are printed),
# so the result is not bound to a name.
make_outputs(valid_flg=True)

data preparation
make history for candidate inference
learning
fasttext start
make train candidates


  0%|          | 0/2000000 [00:00<?, ?it/s]

make test candidates


  0%|          | 0/1801251 [00:00<?, ?it/s]

metrics check
{'test_carts_recall': 0.011833331324668527,
 'test_clicks_recall': 0.017529708909084073,
 'test_orders_recall': 0.008384789552291088,
 'train_carts_recall': 0.005684920598807845,
 'train_clicks_recall': 0.009474270066954327,
 'train_orders_recall': 0.0048872738767240685}


In [12]:
#make_outputs(valid_flg=False)