In [None]:
import numpy as np
import pandas as pd

from pathlib import Path

# Directory that holds the `<target>.jsonl` session files loaded by `read_json`.
data_path = Path('/kaggle/input/otto-recommender-system/')

In [None]:
def read_json(target: str, max_chunks=2, chunksize=100_000, base_dir=None) -> pd.DataFrame:
    """Flatten the leading chunks of ``<target>.jsonl`` into one event-level frame.

    Each line of the file is expected to carry a ``session`` id and an
    ``events`` list whose items have ``aid``, ``ts`` and ``type`` keys; every
    event becomes one output row.

    Parameters
    ----------
    target : str
        File stem, e.g. ``'train'`` reads ``train.jsonl``.
    max_chunks : int or None, default 2
        Number of leading chunks to read. ``None`` reads the whole file.
    chunksize : int, default 100_000
        Number of JSON lines (sessions) per chunk.
    base_dir : path-like or None, default None
        Directory containing the file. ``None`` falls back to the
        module-level ``data_path``.

    Returns
    -------
    pd.DataFrame
        Columns ``session``, ``aid``, ``ts``, ``type`` with a fresh
        ``RangeIndex``. The columns are present even when no event was read.
    """
    from itertools import islice

    columns = ['session', 'aid', 'ts', 'type']
    source_dir = Path(data_path if base_dir is None else base_dir)

    # Accumulate plain Python lists across all chunks and build the frame once,
    # instead of concatenating a growing DataFrame inside the loop.
    records = {name: [] for name in columns}

    reader = pd.read_json(source_dir / f'{target}.jsonl', lines=True, chunksize=chunksize)
    try:
        # islice stops *before* parsing chunk number `max_chunks`, so no chunk
        # is read only to be thrown away.
        for chunk in islice(reader, max_chunks):
            for session, events in zip(chunk['session'].tolist(), chunk['events'].tolist()):
                for event in events:
                    records['session'].append(session)
                    records['aid'].append(event['aid'])
                    records['ts'].append(event['ts'])
                    records['type'].append(event['type'])
    finally:
        # The reader keeps the file handle open; release it even on early exit.
        reader.close()

    return pd.DataFrame(records, columns=columns)
# Load both splits through the same reader, train first and then test.
train_sessions, test_sessions = (read_json(split) for split in ('train', 'test'))

In [None]:
import hashlib
import os
from gensim.models import word2vec
from gensim.models import KeyedVectors
# NOTE(review): PYTHONHASHSEED is read when the interpreter starts, so assigning
# it here presumably only reaches child processes, not this kernel — confirm.
os.environ["PYTHONHASHSEED"] = "42"
def hashfxn(x):
    """Return the MD5 digest of ``str(x)`` interpreted as a big-endian integer.

    Unlike the built-in ``hash`` this does not vary between interpreter runs.
    """
    digest = hashlib.md5(str(x).encode()).digest()
    return int.from_bytes(digest, "big")

from tqdm.notebook import tqdm

In [None]:
def session_sentences(sessions: pd.DataFrame) -> list:
    """Turn an event-level frame into one token list ("sentence") per session.

    Every event becomes the token ``"<aid>_<type>"``. Sentences are returned in
    ascending ``session`` order and tokens keep the row order they have in
    ``sessions``.

    Parameters
    ----------
    sessions : pd.DataFrame
        Frame with ``session``, ``aid`` and ``type`` columns.

    Returns
    -------
    list of list of str
        One inner list per distinct session; empty when ``sessions`` is empty.
    """
    if sessions.empty:
        return []
    # Build every token in one vectorised pass instead of once per group.
    tokens = sessions['aid'].astype(str) + '_' + sessions['type'].astype(str)
    return (
        sessions.assign(token=tokens)
        .groupby('session')['token']
        .agg(list)
        .tolist()
    )


# Train sentences first, then test sentences.
raw_corpus = session_sentences(train_sessions) + session_sentences(test_sessions)

In [None]:
# Train a CBOW (sg=0) embedding over the "<aid>_<type>" session sentences.
# The class is reached through the imported `word2vec` module because the bare
# name `Word2Vec` is never imported in this notebook.
# workers=1: gensim does not treat -1 as "all cores" (it appears to start no
# worker threads, so nothing is trained), and a single worker is what makes the
# seed/hashfxn settings give repeatable vectors.
w2v = word2vec.Word2Vec(
    sentences=raw_corpus,
    vector_size=100,
    window=5,
    min_count=1,  # keep every token so each session token stays in the vocabulary
    sg=0,
    workers=1,
    seed=42,
    hashfxn=hashfxn,
)

In [None]:
# Write the trained word vectors to disk in the binary word2vec format.
vectors_file = 'otto_aid2vec.bin'
w2v.wv.save_word2vec_format(vectors_file, binary=True)

In [None]:
# Build the submission rows: for every test session, recommend up to 20 distinct
# aids whose tokens are most similar to the session's own tokens, and reuse the
# same recommendation for the clicks, carts and orders targets.
sub_list = []

for session, group_df in tqdm(test_sessions.groupby('session')):
    # One "<aid>_<type>" token per event, built the same way as the training corpus.
    session_tokens = (group_df['aid'].astype(str) + '_' + group_df['type']).tolist()
    results = w2v.wv.most_similar(positive=session_tokens, topn=500)

    # Several tokens can share an aid (one per event type). dict.fromkeys keeps
    # the first occurrence of each aid in similarity order.
    ranked_aids = dict.fromkeys(token.split('_')[0] for token, _score in results)
    # Always join, even when fewer than 20 distinct aids were found.
    labels = ' '.join(list(ranked_aids)[:20])

    for event_type in ('clicks', 'carts', 'orders'):
        sub_list.append([f'{session}_{event_type}', labels])