## Libraries

In [1]:
import os
import json
import time
import numpy as np
import pandas as pd

from collections import defaultdict
from collections import Counter

import fasttext

## Data

In [2]:
data_path = '../data/train_sessions.jsonl'

# Only the first MAX_TRAIN_CHUNKS chunks of 100_000 sessions are loaded,
# which keeps the notebook fast while prototyping.
MAX_TRAIN_CHUNKS = 2

# Flatten the nested JSONL (one line per session, each holding a list of
# events) into one record per event. Records are gathered in a plain list and
# converted to a DataFrame once, instead of concatenating DataFrames inside the
# loop (which re-copies all previously loaded rows on every iteration).
event_records = []

# The reader is used as a context manager so the file handle is closed even
# though we stop reading early.
with pd.read_json(data_path, lines=True, chunksize=100_000) as chunks:
    for e, chunk in enumerate(chunks):
        if e >= MAX_TRAIN_CHUNKS:
            break
        for session, events in zip(chunk['session'].tolist(), chunk['events'].tolist()):
            for event in events:
                event_records.append((session, event['aid'], event['ts'], event['type']))

# Explicit column names keep the schema stable even when no events were read.
train_sessions = pd.DataFrame(event_records, columns=['session', 'aid', 'ts', 'type'])
train_sessions.head()

Unnamed: 0,session,aid,ts,type
0,0,1517085,1659304800025,clicks
1,0,1563459,1659304904511,clicks
2,0,1309446,1659367439426,clicks
3,0,16246,1659367719997,clicks
4,0,1781822,1659367871344,clicks


In [2]:
data_path = '../data/test_sessions.jsonl'

# Only the first MAX_TEST_CHUNKS chunks of 100_000 sessions are loaded,
# which keeps the notebook fast while prototyping.
MAX_TEST_CHUNKS = 2

# Flatten the nested JSONL (one line per session, each holding a list of
# events) into one record per event. Records are gathered in a plain list and
# converted to a DataFrame once, instead of concatenating DataFrames inside the
# loop (which re-copies all previously loaded rows on every iteration).
event_records = []

# The reader is used as a context manager so the file handle is closed even
# though we stop reading early.
with pd.read_json(data_path, lines=True, chunksize=100_000) as chunks:
    for e, chunk in enumerate(chunks):
        if e >= MAX_TEST_CHUNKS:
            break
        for session, events in zip(chunk['session'].tolist(), chunk['events'].tolist()):
            for event in events:
                event_records.append((session, event['aid'], event['ts'], event['type']))

# Explicit column names keep the schema stable even when no events were read.
test_sessions = pd.DataFrame(event_records, columns=['session', 'aid', 'ts', 'type'])
test_sessions.head()

Unnamed: 0,session,aid,ts,type
0,12383433,1542913,1661551200081,clicks
1,12383434,8211,1661551200511,clicks
2,12383435,940546,1661551201055,carts
3,12383435,45443,1661551213043,clicks
4,12383435,1769360,1661551246239,clicks


## Baseline

* Training

In [12]:
# Provided by https://www.kaggle.com/code/ttahara/otto-mors-aid-frequency-baseline/notebook?scriptVersionId=109781928
#
# Frequency baseline: for every test session, rank the aids that occur in the
# session by how many times they occur (most frequent first).
# `sorted_ids_list` collects one `[session_id, sorted_aids]` entry per session.
sorted_ids_list = []

# The chunk reader gets its own name (`test_chunks`) so that the
# `test_sessions` DataFrame built in the data-loading cell is not overwritten
# by an iterator that is exhausted once this loop finishes. The `with` block
# closes the underlying file handle.
with pd.read_json('../data/test_sessions.jsonl', lines=True, chunksize=1000) as test_chunks:
    for chunk in test_chunks:
        # Columns are selected by name rather than relying on their order.
        for session_id, events in zip(chunk['session'].tolist(), chunk['events'].tolist()):

            # All aids in the session
            aid_list = [action["aid"] for action in events]

            # Sort aids by the count of aids in the session
            cnt = Counter(aid_list)
            sorted_aids = sorted(set(aid_list), key=lambda x: cnt[x], reverse=True)
            sorted_ids_list.append([session_id, sorted_aids])

* Prediction

In [16]:
# Build the submission rows: each session contributes three rows (clicks,
# carts, orders), all carrying the same space-separated list of at most 20
# aids taken from the front of the session's ranked aid list.
data_list = []
for session_id, sorted_aids in sorted_ids_list:
    sorted_aids_20_str = " ".join(str(aid) for aid in sorted_aids[:20])
    data_list.extend(
        [
            [f"{session_id}_clicks", sorted_aids_20_str],
            [f"{session_id}_carts", sorted_aids_20_str],
            [f"{session_id}_orders", sorted_aids_20_str],
        ]
    )

# Two-column frame written without the index column.
result = pd.DataFrame(data_list, columns=["session_type", "labels"])
result.to_csv("../out/submission.csv", index=False)

In [3]:
# Run the project's evaluation script as a shell command, passing the test
# labels file and the submission CSV.
# NOTE(review): the recorded output of this cell is a traceback --
# evaluate.py does `from beartype import beartype` and fails with
# "ModuleNotFoundError: No module named 'beartype'". The package must be
# available to the `python` this command launches before the cell can succeed.
!python ../src/evaluate.py --test-labels ../data/test_labels.jsonl --predictions ../out/submission.csv

Traceback (most recent call last):
  File "/Users/ruiqin/Desktop/Study/Projects/recsys/otto_recsys/notebook/../src/evaluate.py", line 6, in <module>
    from beartype import beartype
ModuleNotFoundError: No module named 'beartype'


## Item2Vec

* Training sequence

In [4]:
# Sessions whose de-duplicated sequence length falls outside this inclusive
# range are dropped.
MIN_SEQ_LEN = 5
MAX_SEQ_LEN = 300

# Order events chronologically inside each session.
events_sorted = train_sessions.sort_values(["session", "ts"]).reset_index(drop=True)

# Previous aid *within the same session*. Shifting per session (rather than
# over the whole frame) ensures the first event of a session is never compared
# with the last event of the preceding session, which would wrongly drop it
# whenever the two aids happen to be equal. The first event of each session
# gets NaN here and is therefore always kept.
prev_aid = events_sorted.groupby("session")["aid"].shift(1)

# Collapse consecutive repeats of the same aid.
events_dedup = events_sorted[events_sorted["aid"] != prev_aid]

# One row per session holding the ordered list of aids as strings
# (the sequences are later joined with spaces and written to a text file).
aid_seq = (
    events_dedup.assign(aid=events_dedup["aid"].astype(str))
    .groupby("session")["aid"]
    .agg(list)
    .reset_index()
)

# Keep only sessions whose sequence length is within the configured bounds.
seq_len = aid_seq["aid"].map(len)
aid_seq = aid_seq[seq_len.between(MIN_SEQ_LEN, MAX_SEQ_LEN)].reset_index(drop=True)
aid_seq.head()

Unnamed: 0,session,aid
0,0,"[1517085, 1563459, 1309446, 16246, 1781822, 11..."
1,1,"[424964, 1492293, 910862, 1491172, 424964, 151..."
2,2,"[763743, 137492, 504789, 137492, 795863, 37834..."
3,3,"[1425967, 1343406, 1425967, 1343406, 1815570, ..."
4,4,"[613619, 298827, 383828, 255379, 1838173, 1453..."


In [19]:
# Dump one session per line: the literal token "__label__1" followed by the
# session's aids, all separated by single spaces.
with open("../data/train_aid_seq.txt", 'w') as f:
    for tokens in aid_seq.aid:
        line = " ".join(("__label__1", " ".join(tokens)))
        f.write(line + "\n")

* Training

In [22]:
# Hyperparameters for the unsupervised fastText run, gathered in one place so
# they are easy to read and tune.
# NOTE(review): minn = maxn = 0 presumably turns off character n-grams so every
# aid is treated as one atomic token -- confirm against the fastText docs.
fasttext_params = dict(
    model='skipgram',
    ws=5,
    dim=128,
    epoch=3,
    lr=0.01,
    minn=0,
    maxn=0,
)

# Train on the sequence file written by the previous cell.
model = fasttext.train_unsupervised('../data/train_aid_seq.txt', **fasttext_params)

Read 7M words
Number of words:  259512
Number of labels: 1
Progress: 100.0% words/sec/thread:   62867 lr:  0.000000 avg.loss:  3.302127 ETA:   0h 0m 0s


In [43]:
# Map every word in the trained model's vocabulary to the value returned by
# indexing the model with that word.
embeddings = {word: model[word] for word in model.words}

* ANN index