In [1]:
import numpy as np
import pandas as pd
from gensim.models import Word2Vec
import polars as pl
import faiss
from tqdm.notebook import tqdm

### Data

In [2]:
# Data source: https://www.kaggle.com/datasets/radek1/otto-full-optimized-memory-footprint
# Both splits are read the same way, so load them in one pass (train first, then test).
train, test = (
    pl.read_parquet(parquet_path)
    for parquet_path in ('../../../data/train.parquet', '../../../data/test.parquet')
)

### Training Sequence

In [3]:
# Order the events by session and then by timestamp before collecting them,
# so every per-session list of aids is built from time-sorted rows.
aid_seq = train.sort(by = ['session', 'ts'])

# One row per session; the 'sentence' column holds that session's aids as a list.
# NOTE(review): newer polars releases rename `groupby` to `group_by` - confirm the pinned version.
sentence_expr = pl.col('aid').alias('sentence')
sentences_df = aid_seq.groupby('session').agg(sentence_expr)

In [4]:
# Preview the first rows: one row per session together with its list of aids.
sentences_df.head()

session,sentence
i32,list[i32]
0,"[1517085, 1563459, ... 161938]"
1,"[424964, 1492293, ... 497868]"
2,"[763743, 137492, ... 672473]"
3,"[1425967, 1425967, ... 925352]"
4,"[613619, 298827, ... 479396]"


In [5]:
# Convert the polars list column into plain Python lists (one list of aids per session),
# which is the corpus passed to Word2Vec.
sentence_column = sentences_df['sentence']
sentences = sentence_column.to_list()

### Training

In [6]:
# Train item2vec: every session is a "sentence" and every aid is a "word".
word2vec_model = Word2Vec(
    sentences = sentences,
    vector_size = 128,  # embedding dimensionality
    window = 10,        # context window within a session
    min_count = 1,      # do not drop aids for being infrequent
    workers = 4,        # training threads
)

In [7]:
# Vocabulary size: number of keys (aids) the model holds a vector for.
len(word2vec_model.wv)

1855603

In [8]:
# Persist the trained model to the working directory so later runs can load it instead of retraining.
word2vec_model.save('word2vec.model')

In [3]:
# Reload the model written by `word2vec_model.save('word2vec.model')`.
# NOTE(review): gensim's save/load is pickle-based - only load model files you produced yourself.
word2vec_model = Word2Vec.load('word2vec.model')

In [4]:
keyed_vectors = word2vec_model.wv

# Inverse of gensim's key -> row mapping: row position in the vector matrix -> aid.
item_dict = {row: aid for aid, row in keyed_vectors.key_to_index.items()}

# One vector per vocabulary entry, listed in key_to_index iteration order.
embeddings = [keyed_vectors.vectors[row] for row in keyed_vectors.key_to_index.values()]

In [5]:
# Work in pandas from here on and attach to every test event the vector of its aid.
# Looking up an aid that the model has no vector for raises KeyError.
keyed_vectors = word2vec_model.wv
test_embedding = test.to_pandas()
test_embedding['embedding'] = test_embedding['aid'].apply(lambda aid: keyed_vectors[aid])

In [6]:
# Preview: each test event row now carries its aid's vector in the 'embedding' column.
test_embedding.head()

Unnamed: 0,session,aid,ts,type,embedding
0,12899779,59625,1661724000,0,"[-0.0803801, -0.20465985, 0.22250593, -0.02041..."
1,12899780,1142000,1661724000,0,"[-2.5437164, -0.23740415, 2.849504, 0.86475575..."
2,12899780,582732,1661724058,0,"[-1.5751047, -0.102995045, 1.020169, 0.9091487..."
3,12899780,973453,1661724109,0,"[-0.44183218, 1.127307, 1.2233655, 1.5057317, ..."
4,12899780,736515,1661724136,0,"[2.7693973, -0.26971614, 0.49649286, -0.107219..."


### Generate Candidates

#### Use the mean embedding of a session's AIDs as the session embedding

In [41]:
# Average all aid vectors of a session element-wise into a single session vector.
embeddings_by_session = test_embedding.groupby('session')['embedding']
test_mean_session_embedding = (
    embeddings_by_session
    .apply(lambda vectors: np.mean(vectors, axis = 0))
    .reset_index()
)

In [47]:
# Row position -> session id as a string, ready to be concatenated into `session_type`.
session_dict = dict(zip(test_mean_session_embedding.session.index, test_mean_session_embedding.session.astype(str)))
# faiss works on float32 matrices, so make the dtype explicit instead of relying on inference.
session_embedding = np.array(test_mean_session_embedding.embedding.tolist(), dtype = np.float32)

# np.array() copies the vectors, so the in-place normalisation below leaves the model's weights untouched.
index_data = np.array(embeddings, dtype = np.float32)
faiss.normalize_L2(index_data)
faiss.normalize_L2(session_embedding)
# On unit-length vectors the inner product equals cosine similarity.
# The dimension is taken from the data so it always matches the trained vector size.
index = faiss.IndexHNSWFlat(index_data.shape[1], 32, faiss.METRIC_INNER_PRODUCT)

index.add(index_data)
# 20 nearest items per session; row i of both result matrices belongs to session_dict[i].
sim_matrix, candidate_ids_matrix = index.search(session_embedding, 20)

sim_index = []
# enumerate() hides the length from tqdm, so pass `total` to get a real progress bar
# instead of a bare iteration counter.
for idx, candidate_ids in tqdm(enumerate(candidate_ids_matrix), total = len(candidate_ids_matrix)):
    # Skip ids that are not index rows (faiss reports missing neighbours as -1).
    preds = ' '.join(str(item_dict[cand]) for cand in candidate_ids if cand in item_dict)
    # The same candidate list is submitted for all three event types.
    for event_type in ('clicks', 'carts', 'orders'):
        sim_index.append([session_dict[idx] + '_' + event_type, preds])

In [52]:
# Submission table: one row per "<session>_<event type>" with its space-separated candidate aids.
submission_columns = ['session_type', 'labels']
submission = pd.DataFrame(data = sim_index, columns = submission_columns)
submission.to_csv(path_or_buf = '../../../out/item2vec_mean_embedding.csv', index = False)

#### Use the embedding of the most recent AID as the session embedding

In [82]:
# Keep, per session, the vector of the last row in test_embedding's current row order.
# NOTE(review): this is the "most recent" aid only if rows are already time-ordered
# within each session - no sort by `ts` is applied to the test frame in the visible cells.
embeddings_by_session = test_embedding.groupby('session')['embedding']
test_last_session_embedding = embeddings_by_session.last().reset_index()

In [83]:
# Row position -> session id as a string, ready to be concatenated into `session_type`.
session_dict = dict(zip(test_last_session_embedding.session.index, test_last_session_embedding.session.astype(str)))
# faiss works on float32 matrices, so make the dtype explicit instead of relying on inference.
session_embedding = np.array(test_last_session_embedding.embedding.tolist(), dtype = np.float32)

# np.array() copies the vectors, so the in-place normalisation below leaves the model's weights untouched.
index_data = np.array(embeddings, dtype = np.float32)
faiss.normalize_L2(index_data)
faiss.normalize_L2(session_embedding)
# On unit-length vectors the inner product equals cosine similarity.
# The dimension is taken from the data so it always matches the trained vector size.
index = faiss.IndexHNSWFlat(index_data.shape[1], 32, faiss.METRIC_INNER_PRODUCT)

index.add(index_data)
# 20 nearest items per session; row i of both result matrices belongs to session_dict[i].
sim_matrix, candidate_ids_matrix = index.search(session_embedding, 20)

sim_index = []
# enumerate() hides the length from tqdm, so pass `total` to get a real progress bar
# instead of a bare iteration counter.
for idx, candidate_ids in tqdm(enumerate(candidate_ids_matrix), total = len(candidate_ids_matrix)):
    # Skip ids that are not index rows (faiss reports missing neighbours as -1).
    preds = ' '.join(str(item_dict[cand]) for cand in candidate_ids if cand in item_dict)
    # The same candidate list is submitted for all three event types.
    for event_type in ('clicks', 'carts', 'orders'):
        sim_index.append([session_dict[idx] + '_' + event_type, preds])

0it [00:00, ?it/s]

In [84]:
# Submission table: one row per "<session>_<event type>" with its space-separated candidate aids.
submission_columns = ['session_type', 'labels']
submission = pd.DataFrame(data = sim_index, columns = submission_columns)
submission.to_csv(path_or_buf = '../../../out/item2vec_last_embedding.csv', index = False)

#### Use the mean embedding of the 3 most recent AIDs as the session embedding

In [7]:
# Step 1: keep at most the last 3 rows of every session (in the frame's current row order).
last_three_events = test_embedding.groupby('session').tail(3)

# Step 2: average those vectors element-wise into one vector per session.
test_last_3_session_embedding = (
    last_three_events
    .groupby('session')['embedding']
    .apply(lambda vectors: np.mean(vectors, axis = 0))
    .reset_index()
)

In [8]:
# Row position -> session id as a string, ready to be concatenated into `session_type`.
session_dict = dict(zip(test_last_3_session_embedding.session.index, test_last_3_session_embedding.session.astype(str)))
# faiss works on float32 matrices, so make the dtype explicit instead of relying on inference.
session_embedding = np.array(test_last_3_session_embedding.embedding.tolist(), dtype = np.float32)

# np.array() copies the vectors, so the in-place normalisation below leaves the model's weights untouched.
index_data = np.array(embeddings, dtype = np.float32)
faiss.normalize_L2(index_data)
faiss.normalize_L2(session_embedding)
# On unit-length vectors the inner product equals cosine similarity.
# The dimension is taken from the data so it always matches the trained vector size.
index = faiss.IndexHNSWFlat(index_data.shape[1], 32, faiss.METRIC_INNER_PRODUCT)

index.add(index_data)
# 20 nearest items per session; row i of both result matrices belongs to session_dict[i].
sim_matrix, candidate_ids_matrix = index.search(session_embedding, 20)

sim_index = []
# enumerate() hides the length from tqdm, so pass `total` to get a real progress bar
# instead of a bare iteration counter.
for idx, candidate_ids in tqdm(enumerate(candidate_ids_matrix), total = len(candidate_ids_matrix)):
    # Skip ids that are not index rows (faiss reports missing neighbours as -1).
    preds = ' '.join(str(item_dict[cand]) for cand in candidate_ids if cand in item_dict)
    # The same candidate list is submitted for all three event types.
    for event_type in ('clicks', 'carts', 'orders'):
        sim_index.append([session_dict[idx] + '_' + event_type, preds])

0it [00:00, ?it/s]

In [9]:
# Submission table: one row per "<session>_<event type>" with its space-separated candidate aids.
submission_columns = ['session_type', 'labels']
submission = pd.DataFrame(data = sim_index, columns = submission_columns)
submission.to_csv(path_or_buf = '../../../out/item2vec_last_3_embedding.csv', index = False)