In [1]:
import polars as pl

# Load the full training log plus the local-validation split.
# validA holds the validation sessions' events and validB their labels
# (judging by the file names test.parquet / test_labels.parquet).
# All three paths use the same absolute /kaggle/input root so the cell does not
# depend on the kernel's current working directory.
train = pl.read_parquet('/kaggle/input/otto-full-optimized-memory-footprint/train.parquet')
validA = pl.read_parquet('/kaggle/input/otto-train-and-test-data-for-local-validation/test.parquet')
validB = pl.read_parquet('/kaggle/input/otto-train-and-test-data-for-local-validation/test_labels.parquet')

In [None]:
# Get subset of data
#
# Keep a random `fraction_of_sessions` of the *distinct* sessions in every frame.
# The ids are de-duplicated before sampling: sampling the raw `session` column
# would draw rows, so sessions with many events would be picked far more often
# and much more than the requested fraction of sessions would survive the filter.
# `.sort()` pins the order of the unique ids so the seeded sample is reproducible.
#
# NOTE: this cell overwrites `train`, `validA` and `validB`; re-running it without
# re-running the loading cell sub-samples the already reduced frames again.
fraction_of_sessions = 0.2

train_sessions = train['session'].unique().sort().sample(fraction=fraction_of_sessions, seed=42)
train = train.filter(pl.col("session").is_in(train_sessions))
# NOTE(review): only `session` is used as the sort key here; if the event order
# inside a session matters downstream, confirm the sort keeps it intact.
train = train.sort("session")

validation_sessions = validA['session'].unique().sort().sample(fraction=fraction_of_sessions, seed=42)
validA = validA.filter(pl.col("session").is_in(validation_sessions))
validA = validA.sort("session")

# The labels are restricted to the same sessions that were sampled from validA.
validB = validB.filter(pl.col("session").is_in(validation_sessions))
validB = validB.sort("session")

print(train.shape[0], validA.shape[0], validB.shape[0])

print(train, validA, validB)

Filter for click events and train a Word2Vec model on them. Because each model only sees one event type, it does not take into account interactions between clicks and the other event types.

In [None]:
%%time
# Train a Word2Vec model on a single event type: each session contributes one
# "sentence" whose tokens are the aids of its events with type == EVENT_TYPE.
EVENT_TYPE = 0

# Restrict to the chosen event type, then gather every session's aids into one list.
train_df = train.filter(pl.col("type") == EVENT_TYPE)
sentences_df = train_df.groupby('session').agg(pl.col('aid').alias('sentence'))
sentences = sentences_df.get_column('sentence').to_list()

# Write the corpus as plain text: one session per line, aids separated by spaces.
# This file is what Word2Vec is pointed at through `corpus_file` below.
with open("w2v_input.txt", "w") as f:
    for sentence in sentences:
        print(*sentence, file=f)

from gensim.models import Word2Vec

# Hyper-parameters forwarded verbatim to the Word2Vec constructor.
params = dict(
    vector_size=50,
    window=3,
    epochs=5,
    min_count=1,
    sample=1e-3,
)

# Fit on the text corpus and persist the result under the click-specific file name.
w2v_model = Word2Vec(corpus_file="w2v_input.txt", **params)
w2v_model.save("word2vec_click.model")

In [None]:
%%time
# Train a Word2Vec model on a single event type: each session contributes one
# "sentence" whose tokens are the aids of its events with type == EVENT_TYPE.
EVENT_TYPE = 1

# Restrict to the chosen event type, then gather every session's aids into one list.
train_df = train.filter(pl.col("type") == EVENT_TYPE)
sentences_df = train_df.groupby('session').agg(pl.col('aid').alias('sentence'))
sentences = sentences_df.get_column('sentence').to_list()

# Write the corpus as plain text: one session per line, aids separated by spaces.
# This file is what Word2Vec is pointed at through `corpus_file` below.
with open("w2v_input.txt", "w") as f:
    for sentence in sentences:
        print(*sentence, file=f)

from gensim.models import Word2Vec

# Hyper-parameters forwarded verbatim to the Word2Vec constructor.
params = dict(
    vector_size=50,
    window=3,
    epochs=5,
    min_count=1,
    sample=1e-3,
)

# Fit on the text corpus and persist the result under the cart-specific file name.
w2v_model = Word2Vec(corpus_file="w2v_input.txt", **params)
w2v_model.save("word2vec_cart.model")

In [None]:
%%time
# Train a Word2Vec model on a single event type: each session contributes one
# "sentence" whose tokens are the aids of its events with type == EVENT_TYPE.
# EVENT_TYPE and w2v_model are notebook-level names that later cells read, so
# after this cell they refer to this event type and to the model trained here.
EVENT_TYPE = 2

# Restrict to the chosen event type, then gather every session's aids into one list.
train_df = train.filter(pl.col("type") == EVENT_TYPE)
sentences_df = train_df.groupby('session').agg(pl.col('aid').alias('sentence'))
sentences = sentences_df.get_column('sentence').to_list()

# Write the corpus as plain text: one session per line, aids separated by spaces.
# This file is what Word2Vec is pointed at through `corpus_file` below.
with open("w2v_input.txt", "w") as f:
    for sentence in sentences:
        print(*sentence, file=f)

from gensim.models import Word2Vec

# Hyper-parameters forwarded verbatim to the Word2Vec constructor.
params = dict(
    vector_size=50,
    window=3,
    epochs=5,
    min_count=1,
    sample=1e-3,
)

# Fit on the text corpus and persist the result under the order-specific file name.
w2v_model = Word2Vec(corpus_file="w2v_input.txt", **params)
w2v_model.save("word2vec_order.model")

In [None]:
from gensim.similarities.annoy import AnnoyIndexer
import numpy as np

# Build an Annoy-backed indexer over `w2v_model`; generate_candidates passes it
# as `indexer=` to `most_similar`.
# `w2v_model` is a notebook-level name that every training cell reassigns, so when
# the notebook runs top to bottom this indexes the model saved as
# "word2vec_order.model".
# NOTE(review): the second positional argument (300) is presumably the number of
# Annoy trees — confirm against the gensim AnnoyIndexer signature.
annoy_index = AnnoyIndexer(w2v_model, 300)

from collections import Counter

# Multiplier applied to an event's recency weight, keyed by the integer code
# found in the `type` column.
type_weight = {0: 1, 1: 6, 2: 3}


def generate_candidates(df):
    """Return up to 20 candidate aids for one session, best first.

    Every event adds ``1 + recency_weight * type_weight[type]`` to the score of
    its aid, where the recency weight grows from the first to the last row of
    ``df`` (so rows are expected in chronological order). The aids seen in the
    session are ranked by that score, highest first. When fewer than 20
    distinct aids were seen, the list is topped up with the aids that are most
    often returned as Word2Vec neighbours of the session's own aids.

    Parameters
    ----------
    df : mapping-like
        Object for which ``df["aid"].to_list()`` and ``df["type"].to_list()``
        return equally long lists (e.g. a per-session polars DataFrame).

    Returns
    -------
    list[int]
        At most 20 aids: the session's own aids in descending score order,
        followed by any neighbour-based fill-ins.

    Notes
    -----
    The fill-in step reads the notebook-level ``w2v_model`` and
    ``annoy_index``. Aids that are missing from the model vocabulary are
    skipped instead of raising ``KeyError``.
    """
    max_candidates = 20

    aids = df["aid"].to_list()
    types = df["type"].to_list()
    if not aids:
        return []

    # Weights rise from 2**0.1 - 1 for the first row to 2**1 - 1 = 1 for the last.
    time_weights = np.logspace(0.1, 1, len(aids), base=2, endpoint=True) - 1
    aids_counter = {}
    for aid, w, t in zip(aids, time_weights, types):
        aids_counter[aid] = aids_counter.get(aid, 0) + 1 + w * type_weight[t]

    # Highest score first; the truncation below must keep the best aids.
    ranked = sorted(aids_counter.items(), key=lambda item: item[1], reverse=True)
    candidates = [aid for aid, _ in ranked][:max_candidates]

    missing = max_candidates - len(candidates)
    if missing <= 0:
        return candidates

    # Top up with embedding neighbours, counting how often each neighbour is
    # suggested across the session's aids.
    vectors = w2v_model.wv
    already_chosen = set(candidates)
    neighbour_votes = Counter()
    for candidate in candidates:
        key = str(candidate)  # the vocabulary was built from text tokens
        if key not in vectors.key_to_index:
            continue  # aid never occurred in the training corpus
        for neighbour, _similarity in vectors.most_similar(key, topn=max_candidates, indexer=annoy_index):
            neighbour_aid = int(neighbour)
            if neighbour_aid not in already_chosen:
                neighbour_votes[neighbour_aid] += 1

    secondary_candidates = [aid for aid, _ in neighbour_votes.most_common(missing)]
    return candidates + secondary_candidates

In [None]:
# Build one prediction list per validation session.
# EVENT_TYPE is the notebook-level value left behind by the most recently
# executed training cell.
validA = validA.filter(pl.col("type") == EVENT_TYPE)

# Sort chronologically inside every session: generate_candidates gives the largest
# recency weight to the last row it receives, so the newest event must come last.
# Both `aid` and `type` are collected because generate_candidates reads both.
validA_df = (
    validA.sort(['session', 'ts'])
    .groupby('session', maintain_order=True)
    .agg([pl.col("aid"), pl.col("type")])
)

# Hand generate_candidates a small per-session frame so that its
# df["aid"].to_list() / df["type"].to_list() accesses work.
session_preds = [
    generate_candidates(pl.DataFrame({"aid": aids, "type": types}))
    for aids, types in zip(validA_df["aid"].to_list(), validA_df["type"].to_list())
]
validA_df = validA_df.select(["session", "aid"]).with_columns(
    pl.Series("pred", session_preds, dtype=pl.List(pl.Int64))
)

# NOTE(review): this assumes the label frame encodes `type` with the same integer
# codes as the event frames — confirm against the label file's schema.
validB_df = validB.filter(pl.col("type") == EVENT_TYPE)

def get_metric(gt, pred):
    """Count how many distinct ground-truth items appear among the predictions.

    Parameters
    ----------
    gt, pred : iterable or object exposing ``to_list()``
        Ground-truth and predicted items. Plain Python iterables and objects
        with a ``to_list()`` method are both accepted; ``None`` is treated as
        empty.

    Returns
    -------
    int
        Size of the intersection of the two collections, with duplicates
        ignored, so the result can be summed across sessions.
    """
    def _as_set(values):
        # Normalise either input flavour to a set of items.
        if values is None:
            return set()
        to_list = getattr(values, "to_list", None)
        return set(to_list() if callable(to_list) else values)

    return len(_as_set(gt) & _as_set(pred))
    
# Score the predictions against the labels.
# The inner join keeps only sessions present in both frames, so sessions without
# any prediction do not count against the score.
pred_vs_gt = validA_df.join(validB_df, on="session")

# metric_nom: per-session hit count returned by get_metric.
# gt_len: number of ground-truth items of the session.
pred_vs_gt = pred_vs_gt.with_columns(
    pl.struct(pl.col(['ground_truth', 'pred'])).apply(lambda x: get_metric(x['ground_truth'], x['pred'])).alias("metric_nom"),
    pl.col("ground_truth").list.lengths().alias('gt_len'),
)

# metric_denom: the number of hits that is achievable with at most 20 predictions,
# i.e. min(gt_len, 20). Using the larger of the two values instead would deflate
# the score for every session that has fewer than 20 ground-truth items.
pred_vs_gt = pred_vs_gt.with_columns(
    pl.when(pl.col("gt_len") > 20).then(20).otherwise(pl.col("gt_len")).alias("metric_denom")
)

total_hits = sum(pred_vs_gt['metric_nom'])
total_possible = sum(pred_vs_gt['metric_denom'])
# Guard against an empty join (no overlapping sessions) instead of dividing by zero.
score = total_hits / total_possible if total_possible else float("nan")
print(f"recall@20 for EVENT_TYPE={EVENT_TYPE}: {score}")