In [1]:
import os
import json
import time
import numpy as np
import pandas as pd

import fasttext
import faiss

from tqdm.auto import tqdm

## Data

In [2]:
# Flatten the first MAX_CHUNKS chunks of the training log into one row per event
# (columns: session, aid, ts, type).
data_path = '../data/train_sessions.jsonl'
CHUNK_SIZE = 100_000  # sessions parsed per chunk
MAX_CHUNKS = 2        # only a sample of the file is loaded; raise to read more

event_columns = {
    'session': [],
    'aid': [],
    'ts': [],
    'type': [],
}

# The context manager closes the file even though we stop reading early, and
# zipping against range() stops after MAX_CHUNKS without parsing an extra chunk.
with pd.read_json(data_path, lines=True, chunksize=CHUNK_SIZE) as chunks:
    for _, chunk in zip(range(MAX_CHUNKS), chunks):
        for session, events in zip(chunk['session'].tolist(), chunk['events'].tolist()):
            for event in events:
                event_columns['session'].append(session)
                event_columns['aid'].append(event['aid'])
                event_columns['ts'].append(event['ts'])
                event_columns['type'].append(event['type'])

# Build the frame once instead of concatenating a new frame per chunk.
train_sessions = pd.DataFrame(event_columns)
train_sessions.head()

Unnamed: 0,session,aid,ts,type
0,0,1517085,1659304800025,clicks
1,0,1563459,1659304904511,clicks
2,0,1309446,1659367439426,clicks
3,0,16246,1659367719997,clicks
4,0,1781822,1659367871344,clicks


In [3]:
# Flatten the first MAX_CHUNKS chunks of the test log into one row per event
# (columns: session, aid, ts, type).
data_path = '../data/test_sessions.jsonl'
CHUNK_SIZE = 100_000  # sessions parsed per chunk
MAX_CHUNKS = 2        # only a sample of the file is loaded; raise to read more

event_columns = {
    'session': [],
    'aid': [],
    'ts': [],
    'type': [],
}

# The context manager closes the file even though we stop reading early, and
# zipping against range() stops after MAX_CHUNKS without parsing an extra chunk.
with pd.read_json(data_path, lines=True, chunksize=CHUNK_SIZE) as chunks:
    for _, chunk in zip(range(MAX_CHUNKS), chunks):
        for session, events in zip(chunk['session'].tolist(), chunk['events'].tolist()):
            for event in events:
                event_columns['session'].append(session)
                event_columns['aid'].append(event['aid'])
                event_columns['ts'].append(event['ts'])
                event_columns['type'].append(event['type'])

# Build the frame once instead of concatenating a new frame per chunk.
test_sessions = pd.DataFrame(event_columns)
test_sessions.head()

Unnamed: 0,session,aid,ts,type
0,12383433,1542913,1661551200081,clicks
1,12383434,8211,1661551200511,clicks
2,12383435,940546,1661551201055,carts
3,12383435,45443,1661551213043,clicks
4,12383435,1769360,1661551246239,clicks


## Training sequence

In [4]:
# Turn the event log into one "sentence" of item ids per session.
MIN_SEQ_LEN = 5    # sessions with fewer distinct-consecutive items are dropped
MAX_SEQ_LEN = 300  # sessions with more items than this are dropped

events_sorted = train_sessions.sort_values(["session", "ts"]).reset_index(drop=True)

# Collapse immediate repeats of the same aid. The shift is done per session so
# that the first event of a session is never compared against (and dropped
# because of) the last event of the previous session.
prev_aid = events_sorted.groupby("session")["aid"].shift(1)
events_dedup = events_sorted.loc[events_sorted["aid"] != prev_aid, ["session", "aid"]]

# The ids are written to a text file later, so they are stored as strings.
aid_seq = (
    events_dedup
    .assign(aid=events_dedup["aid"].astype(str))
    .groupby("session")["aid"]
    .agg(list)
    .reset_index()
)

# Keep only sessions whose sequence length is within [MIN_SEQ_LEN, MAX_SEQ_LEN].
seq_len = aid_seq["aid"].str.len()
aid_seq = aid_seq[seq_len.between(MIN_SEQ_LEN, MAX_SEQ_LEN)].reset_index(drop=True)
aid_seq.head()

Unnamed: 0,session,aid
0,0,"[1517085, 1563459, 1309446, 16246, 1781822, 11..."
1,1,"[424964, 1492293, 910862, 1491172, 424964, 151..."
2,2,"[763743, 137492, 504789, 137492, 795863, 37834..."
3,3,"[1425967, 1343406, 1425967, 1343406, 1815570, ..."
4,4,"[613619, 298827, 383828, 255379, 1838173, 1453..."


In [5]:
# Dump one line per session: the "__label__1" token followed by the
# space-separated item ids of that session.
with open("../data/train_aid_seq.txt", 'w') as seq_file:
    seq_file.writelines(
        " ".join(("__label__1", " ".join(session_aids))) + "\n"
        for session_aids in aid_seq.aid
    )

## Training

In [11]:
# Skip-gram item embeddings trained on the session sequences written above.
# minn = maxn = 0: per the fastText documentation this disables character
# n-grams, so every item id is treated as an atomic token.
skipgram_params = dict(
    model='skipgram',
    ws=5,      # context window size
    dim=64,    # embedding dimensionality
    epoch=5,
    lr=0.01,
    minn=0,
    maxn=0,
)
model = fasttext.train_unsupervised('../data/train_aid_seq.txt', **skipgram_params)

Read 7M words
Number of words:  259512
Number of labels: 1
Progress: 100.0% words/sec/thread:  101397 lr:  0.000000 avg.loss:  2.761024 ETA:   0h 0m 0s


## ANN index

### Method 1 (out of memory)

In [9]:
# Item-to-item similarity table: for every item in the vocabulary, look up its
# nearest neighbours in embedding space and store (trigger, neighbour, score).

# item_dict maps a row number of the embedding matrix to the item id stored in
# that row. The key is the number of rows appended so far, so the mapping stays
# aligned wherever the '</s>' token sits in model.words.
item_dict = {}
embeddings = []
for item_id in model.words:
    if item_id == '</s>':
        continue
    item_dict[len(embeddings)] = item_id
    embeddings.append(model[item_id])

# faiss works on float32 matrices; after L2 normalisation the inner product
# used by the index equals cosine similarity.
index_data = np.array(embeddings, dtype=np.float32)
faiss.normalize_L2(index_data)

# The index dimension is taken from the data so it always matches the model.
index = faiss.IndexHNSWFlat(index_data.shape[1], 32, faiss.METRIC_INNER_PRODUCT)
index.add(index_data)
sim_matrix, candidate_ids_matrix = index.search(index_data, 20)

sim_index = []
for idx, (candidate_ids, sims) in tqdm(
    enumerate(zip(candidate_ids_matrix, sim_matrix)),
    total=len(candidate_ids_matrix),
):
    # Skip the query item itself and any id that is not a known row
    # (e.g. a placeholder id returned when fewer neighbours were found).
    sim_index.extend(
        (item_dict[idx], item_dict[cand], float(sim))
        for cand, sim in zip(candidate_ids, sims)
        if cand != idx and cand in item_dict
    )

0it [00:00, ?it/s]

In [None]:
from pyspark.sql import SparkSession

# Local Spark session used only for the SQL-based scoring below.
spark = (
    SparkSession.builder
    .master("local")
    .appName("item2vec")
    .getOrCreate()
)

# Similarity table (trigger item -> similar item, score) and the test events.
sim_table = spark.createDataFrame(sim_index).toDF("trigger_id", "sim_id", "score")
sessions = spark.createDataFrame(test_sessions)

# Expose both frames to Spark SQL under the names the query refers to.
sim_table.createOrReplaceTempView("sim_table")
sessions.createOrReplaceTempView("test_sessions")
sim_table.show(5)

In [None]:
# Score candidates per test session with Spark SQL:
#   1. join every (session, aid) event to the similarity table on
#      aid = trigger_id,
#   2. sum the similarity scores per (session, sim_id),
#   3. rank the candidates of each session by summed score and keep the top 20.
# NOTE(review): the join is a LEFT JOIN, so events whose aid has no entry in
# sim_table presumably produce rows with a NULL sim_id/score that flow into the
# ranking -- confirm whether an inner join / NULL filter is intended.
# NOTE(review): test_sessions.aid comes from an integer column while trigger_id
# is built from string item ids; the comparison relies on Spark's implicit
# cast -- confirm.
querySQL = """
    WITH sim AS (
        SELECT
            *
        FROM
            sim_table
    ),
    session AS (
        SELECT
            session,
            aid
        FROM
            test_sessions
    ),
    preds AS (
        SELECT
            session,
            sim_id,
            SUM(score) AS score
        FROM (
            SELECT
                session,
                sim_id,
                score
            FROM
                session LEFT JOIN sim ON session.aid = sim.trigger_id
        ) t
        GROUP BY
            session,
            sim_id
    )
    SELECT
        session,
        sim_id,
        score
    FROM (
        SELECT
            session,
            sim_id,
            score,
            ROW_NUMBER() OVER (PARTITION BY session ORDER BY score DESC) AS rn
        FROM
            preds
    ) t
    WHERE
        rn <= 20
    ORDER BY
        session ASC,
        score DESC
"""

# Run the query and pull the result back as a pandas DataFrame.
preds = spark.sql(querySQL).toPandas()

### Method 2

In [12]:
# Compute session embeddings
session_embeddings_df = test_sessions[["session", "aid"]].copy()
session_embeddings_df["embedding"] = session_embeddings_df.aid.apply(lambda x: model[str(x)])
session_embeddings_df = session_embeddings_df \
    .groupby(["session"]) \
    .agg(lambda x: np.mean(x, axis=0))["embedding"] \
    .reset_index()

In [16]:
# Session-to-item retrieval: index all item vectors, query the index with each
# session vector and emit the same candidates for clicks, carts and orders.

# item_dict maps a row number of the embedding matrix to the item id stored in
# that row. The key is the number of rows appended so far, so the mapping stays
# aligned wherever the '</s>' token sits in model.words, and no negative key
# can ever match a "no result" id returned by the index.
item_dict = {}
embeddings = []
for item_id in model.words:
    if item_id == '</s>':
        continue
    item_dict[len(embeddings)] = item_id
    embeddings.append(model[item_id])

# Row position in session_embeddings_df -> session id as a string.
session_dict = dict(zip(session_embeddings_df.session.index, session_embeddings_df.session.astype(str)))

# faiss works on float32 matrices; force the dtype instead of relying on
# whatever the aggregation upstream happened to produce.
session_embedding = np.array(session_embeddings_df.embedding.tolist(), dtype=np.float32)
index_data = np.array(embeddings, dtype=np.float32)

# After L2 normalisation the inner product equals cosine similarity.
faiss.normalize_L2(index_data)
faiss.normalize_L2(session_embedding)

# The index dimension is taken from the data so it always matches the model.
index = faiss.IndexHNSWFlat(index_data.shape[1], 32, faiss.METRIC_INNER_PRODUCT)
index.add(index_data)
sim_matrix, candidate_ids_matrix = index.search(session_embedding, 20)

sim_index = []
for idx, candidate_ids in tqdm(enumerate(candidate_ids_matrix), total=len(candidate_ids_matrix)):
    # Ids that are not a known row are skipped.
    labels = " ".join(item_dict[cand] for cand in candidate_ids if cand in item_dict)
    session_id = session_dict[idx]
    # The same candidate list is submitted for all three event types.
    sim_index.extend(
        [session_id + suffix, labels]
        for suffix in ("_clicks", "_carts", "_orders")
    )

0it [00:00, ?it/s]

In [17]:
# Write the predictions as a two-column CSV: session_type, labels.
submission = pd.DataFrame(sim_index, columns=["session_type", "labels"])

# Create the output directory first so a fresh checkout does not fail here.
os.makedirs("../out", exist_ok=True)
submission.to_csv("../out/i2v_submission.csv", index=False)