In [1]:
from collections import defaultdict

import numpy as np
import pandas as pd
import polars as pl
from annoy import AnnoyIndex
from gensim.models import Word2Vec

In [2]:
# Load the train and test event logs from the parquet dataset directory.
TRAIN_PATH = "../input/otto-full-optimized-memory-footprint/train.parquet"
TEST_PATH = "../input/otto-full-optimized-memory-footprint/test.parquet"

train = pl.read_parquet(TRAIN_PATH)
test = pl.read_parquet(TEST_PATH)

In [6]:
# Peek at the first rows of the training events (columns: session, aid, ts, type).
train.head(5)

session,aid,ts,type
i32,i32,i32,u8
0,1517085,1659304800,0
0,1563459,1659304904,0
0,1309446,1659367439,0
0,16246,1659367719,0
0,1781822,1659367871,0


In [3]:
# Stack train and test events, then collect each session's aids into one list
# column named "sentence" (one row per session).
sentences_df = (
    pl.concat([train, test])
    .groupby("session")
    .agg([pl.col("aid").alias("sentence")])
)

In [5]:
# Inspect a few of the per-session aid lists.
sentences_df.head(5)

session,sentence
i32,list[i32]
66848,"[1188193, 203819, 886392]"
13353536,[423010]
229984,"[244251, 1848423, ... 851141]"
1781184,"[623372, 715280, ... 12618]"
2324416,"[1853174, 185695, ... 1133767]"


In [7]:
# Materialise the "sentence" column as a plain Python list of aid lists,
# which is the input handed to Word2Vec below.
sentences = sentences_df.get_column("sentence").to_list()

In [9]:
# Train Word2Vec with sessions as sentences and aids as tokens.
# min_count=1 keeps every aid in the vocabulary, so each aid gets a vector.
w2vec_params = {"vector_size": 32, "min_count": 1, "workers": 4}
w2vec = Word2Vec(sentences=sentences, **w2vec_params)

In [13]:
# Show the first three keys (aids) of the trained vocabulary.
w2vec.wv.index_to_key[0:3]

[1460571, 485256, 108125]

In [14]:
# Map each aid to its position in the Word2Vec vocabulary; that position is
# also the row of its vector in w2vec.wv.vectors.
aid2idx = dict(
    zip(w2vec.wv.index_to_key, range(len(w2vec.wv.index_to_key)))
)

In [21]:
# Nearest-neighbour index over the aid embeddings, using euclidean distance.
# The dimensionality is read from the trained model rather than repeated as a
# literal, so it always matches the vectors added to the index.
index = AnnoyIndex(w2vec.wv.vector_size, "euclidean")

In [22]:
# Register every embedding under its vocabulary position, so an Annoy item id
# is the same integer as the corresponding aid2idx value.
for idx, vector in enumerate(w2vec.wv.vectors):
    index.add_item(idx, vector)

In [23]:
# Build the Annoy forest; items can be queried once this has run.
N_TREES = 10
index.build(N_TREES)

True

In [24]:
# Suffixes used to build the "<session>_<type>" rows of the submission.
session_types = ["clicks", "carts", "orders"]

# Convert the polars test frame to pandas once and reuse a single groupby for
# both columns (the conversion and grouping used to be done twice). Both
# resulting Series are indexed by session in the same order, so they can be
# zipped together later.
test_by_session = test.to_pandas().groupby("session")
test_session_AIDs = test_by_session["aid"].apply(list)
test_session_types = test_by_session["type"].apply(list)

In [27]:
test_session_AIDs.head()

session
12899779                                              [59625]
12899780           [1142000, 582732, 973453, 736515, 1142000]
12899781    [141736, 199008, 57315, 194067, 199008, 199008...
12899782    [1669402, 1494780, 1494780, 1494780, 1494780, ...
12899783    [255297, 1114789, 255297, 300127, 198385, 3001...
Name: aid, dtype: object

In [28]:
test_session_types.head()

session
12899779                                                  [0]
12899780                                      [0, 0, 0, 0, 0]
12899781                    [0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0]
12899782    [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ...
12899783                    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Name: type, dtype: object

In [None]:
# Build one recommendation list (at most MAX_LABELS aids) per test session and
# write it out once per entry of session_types as submission.csv.
MAX_LABELS = 20

labels = []
# Multiplier applied to an event's recency weight, keyed by its `type` code.
# Presumably 0=clicks, 1=carts, 2=orders (the order of session_types) -- confirm.
type_weight_multipliers = {0: 1, 1: 6, 2: 3}
for AIDs, types in zip(test_session_AIDs, test_session_types):
    if len(AIDs) >= MAX_LABELS:
        # Long session: rank the session's own aids, no extra candidates needed.
        # Weights rise from 2**0.1 - 1 (first event) to 1 (last event), so
        # later events count more.
        weights = np.logspace(0.1, 1, len(AIDs), base=2, endpoint=True) - 1
        aids_temp = defaultdict(float)
        for aid, w, t in zip(AIDs, weights, types):
            aids_temp[aid] += w * type_weight_multipliers[t]
        # Highest score first; the sort is stable, so ties keep first-seen order.
        sorted_aids = sorted(aids_temp, key=aids_temp.get, reverse=True)
        labels.append(sorted_aids[:MAX_LABELS])
    else:
        # Short session: keep its aids (most recent first, de-duplicated) and
        # pad with word2vec neighbours of the most recent aid.
        recent_unique = list(dict.fromkeys(AIDs[::-1]))
        most_recent_aid = recent_unique[0]
        already_listed = set(recent_unique)
        # The query item normally comes back as its own nearest neighbour, but
        # the search is approximate, so filter by membership instead of
        # dropping position 0. Filtering also removes neighbours the session
        # already contains, which would otherwise appear twice in the output.
        # MAX_LABELS + 1 results always leave enough unseen aids to fill the list.
        neighbor_idxs = index.get_nns_by_item(aid2idx[most_recent_aid], MAX_LABELS + 1)
        nns = [
            candidate
            for candidate in (w2vec.wv.index_to_key[i] for i in neighbor_idxs)
            if candidate not in already_listed
        ]
        labels.append((recent_unique + nns)[:MAX_LABELS])

# One space-separated string of aids per session.
labels_as_strings = [" ".join(str(aid) for aid in session_labels) for session_labels in labels]
predictions = pd.DataFrame(data={"session_type": test_session_AIDs.index, "labels": labels_as_strings})

# The same label list is emitted for every session type: "<session>_<type>".
prediction_dfs = []
for st in session_types:
    modified_predictions = predictions.copy()
    modified_predictions["session_type"] = modified_predictions["session_type"].astype("str") + f"_{st}"
    prediction_dfs.append(modified_predictions)
submission = pd.concat(prediction_dfs).reset_index(drop=True)
submission.to_csv("submission.csv", index=False)

In [45]:
# Scratch check: replay the short-session logic for the second test session
# and display its most recent aid.
session_pos = 1
types = test_session_types.iloc[session_pos]
AIDs = list(dict.fromkeys(reversed(test_session_AIDs.iloc[session_pos])))
most_recent_aid = AIDs[0]
most_recent_aid

1142000

In [46]:
# Scratch check: the 21 nearest Annoy item ids for the aid inspected above.
query_idx = aid2idx[most_recent_aid]
hoge = index.get_nns_by_item(query_idx, 21)

In [48]:
# Re-display the first rows of the training events for reference.
train.head(5)

session,aid,ts,type
i32,i32,i32,u8
0,1517085,1659304800,0
0,1563459,1659304904,0
0,1309446,1659367439,0
0,16246,1659367719,0
0,1781822,1659367871,0
