In [None]:
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

from pathlib import Path

data_path = Path('/kaggle/input/otto-recommender-system/')

In [None]:
def read_jsonl(target: str) -> pd.DataFrame():
    sessions = pd.DataFrame()
    chunks = pd.read_json(data_path / f'{target}.jsonl', lines=True, chunksize=150)

    for e, chunk in enumerate(chunks):
        event_dict = {
            'session': [],
            'aid': [],
            'ts': [],
            'type': [],
        }
        if e < 2:
            for session, events in zip(chunk['session'].tolist(), chunk['events'].tolist()):
                for event in events:
                    event_dict['session'].append(session)
                    event_dict['aid'].append(event['aid'])
                    event_dict['ts'].append(event['ts'])
                    event_dict['type'].append(event['type'])
            chunk_session = pd.DataFrame(event_dict)
            sessions = pd.concat([sessions, chunk_session])
        else:
            break
    return sessions.reset_index(drop=True)

train= read_jsonl('train')
test= read_jsonl('test')

In [1]:
train.head()

NameError: ignored

Word2Vec

In [None]:
import hashlib
import os
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
from gensim.test.utils import common_texts
os.environ["PYTHONHASHSEED"] = str(42)
def hashfxn(x):
    return int(hashlib.md5(str(x).encode()).hexdigest(), 16)

In [None]:
raw_corpus = []
for session, group_df in tqdm(train.groupby(['session'])):
    raw_corpus.append(list(group_df['aid'].astype(str) + '_' + group_df['type']))
for session, group_df in tqdm(test.groupby(['session'])):
    raw_corpus.append(list(group_df['aid'].astype(str) + '_' + group_df['type']))

In [None]:
w2vec=Word2Vec(sentences=raw_corpus, vector_size=100, window=5, min_count=1, sg=0, workers=-1, seed=42, hashfxn=hashfxn)

With the model fully train, let us use similarity btw trained representations of our aids to create a submission.

The search functionality where we look for nearest neighbors in the embedding space is built into gensim, but it is unfortunately super slow. Let's use annoy which is much fater.

In [None]:
%%time

from annoy import AnnoyIndex

aid2idx = {aid: i for i, aid in enumerate(w2vec.wv.index_to_key)}
index = AnnoyIndex(32, 'euclidean')

for aid, idx in aid2idx.items():
    index.add_item(idx, w2vec.wv.vectors[idx])
    
index.build(10)

output submission

In [None]:
import pandas as pd
import numpy as np

from collections import defaultdict

sample_sub = pd.read_csv('../input/otto-recommender-system//sample_submission.csv')

session_types = ['clicks', 'carts', 'orders']
test_session_AIDs = test.to_pandas().reset_index(drop=True).groupby('session')['aid'].apply(list)
test_session_types = test.to_pandas().reset_index(drop=True).groupby('session')['type'].apply(list)

labels = []

type_weight_multipliers = {0: 1, 1: 6, 2: 3}
for AIDs, types in zip(test_session_AIDs, test_session_types):
    if len(AIDs) >= 20:
        # if we have enough aids (over equals 20) we don't need to look for candidates! we just use the old logic
        weights=np.logspace(0.1,1,len(AIDs),base=2, endpoint=True)-1
        aids_temp=defaultdict(lambda: 0)
        for aid,w,t in zip(AIDs,weights,types): 
            aids_temp[aid]+= w * type_weight_multipliers[t]
            
        sorted_aids=[k for k, v in sorted(aids_temp.items(), key=lambda item: -item[1])]
        labels.append(sorted_aids[:20])
    else:
        # here we don't have 20 aids to output -- we will use word2vec embeddings to generate candidates!
        AIDs = list(dict.fromkeys(AIDs[::-1]))
        
        # let's grab the most recent aid
        most_recent_aid = AIDs[0]
        
        # and look for some neighbors!
        nns = [w2vec.wv.index_to_key[i] for i in index.get_nns_by_item(aid2idx[most_recent_aid], 21)[1:]]
                        
        labels.append((AIDs+nns)[:20])

#Word2vec to generate Candidates/feature for training

Just like a covisiation matrix, for any AID Word2Vec can give us a list of AIds resembling our query AID. The output will be ordered starting with AIds that are most alike.

In order for us to visualize what is happening, let me give you a simplified example.

##Mock data

In our data, we have aids organized by session.

In [None]:
data = pl.DataFrame(data={'session' : [0,0,1,1], 'aid' : [10,20,20,30], 'type' : [0,0,1,0]})
data

NameError: ignored

We can use word2vec to generate candidates. For instance, maybe using word2ved we would generate the following candidates for the sessions in our data :

{0 : [11,20], 1 : [25,6]}

We can reshape our candidates to look as follows :

In [None]:
candidates = pl.DataFrame(data={'session' : [0,0,1,1], 'aid' : [11,20,25,6]})

cancidates

As you see, for our candidates we don't have too much information apart from session and aid. This is exactly like the output word2ved can give us

And that is okay. Our ranker can deal with that, For some rwos we will have information in this or that column, for another we won't. This is not an issue to a ranking model.

Here, out ranker will see that we don't have type information for candidates. But we will create another imnportnat column that will allow it to uniquely identify our candidates as coming from word2vec.

##1. Add ordering information to our candidates.

The order is important. A candidate appearing earlier in the list of candidates in some sense has a higher score, is more similar to the AIDs in a session.

In [None]:
candidates = candidates.with_column(pl.col('aid').cumcount().over('session').alias('word2vec_rank')+1)

candidates

##2. Merge this information onto candidates

Now, we need to take this information and add this onto our original data.

But how do we add candidates?! If we just concat these df together, we will have duplicate entry for sesion 0 for aid of 20.

What we need to do is a join but a specific kind

We want to keep the rows that are already in data, append information to them where there is a match AND create new rows if there isn't.

so, using ***outer join***

In [None]:
data = data.join(candidates, on=['session','aid'], how='outer')
