# Data

In [1]:
# !aicrowd login

In [2]:
# !aicrowd dataset download --challenge task-1-next-product-recommendation

# Packages

In [86]:
%load_ext autoreload
%autoreload 2
import sys
import logging
sys.path.append('../')
import os
from utils import *

import os
import numpy as np
import pandas as pd
from gensim.models import Word2Vec

from annoy import AnnoyIndex
import polars as pl

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Config

In [4]:
debug = False

debug_session_num = 1000

train_data_dir = '.'
test_data_dir = '.'
task = 'task1'

num_tree = 100

model_dir = '../model_training/v2'

# target locales: locales needed for task1

target_locals = ["DE", 'JP', 'UK']

w2v_model_file = os.path.join(model_dir, 'w2v.model')

annoy_index_file = os.path.join(model_dir, f'annoy_index_{str(num_tree)}_trees.index')

In [73]:
annoy_index_file

'../model_training/v2/annoy_index_100_trees.index'

In [6]:
# ! mkdir {model_dir}

# Function 

# Data Description

The Multilingual Shopping Session Dataset is a collection of **anonymized customer sessions** containing products from six different locales, namely English, German, Japanese, French, Italian, and Spanish. It consists of two main components: **user sessions** and **product attributes**. User sessions are a list of products that a user has engaged with in chronological order, while product attributes include various details like product title, price in local currency, brand, color, and description.

---

### Each product as its associated information:


**locale**: the locale code of the product (e.g., DE)

**id**: a unique for the product. Also known as Amazon Standard Item Number (ASIN) (e.g., B07WSY3MG8)

**title**: title of the item (e.g., “Japanese Aesthetic Sakura Flowers Vaporwave Soft Grunge Gift T-Shirt”)

**price**: price of the item in local currency (e.g., 24.99)

**brand**: item brand name (e.g., “Japanese Aesthetic Flowers & Vaporwave Clothing”)

**color**: color of the item (e.g., “Black”)

**size**: size of the item (e.g., “xxl”)

**model**: model of the item (e.g., “iphone 13”)

**material**: material of the item (e.g., “cotton”)

**author**: author of the item (e.g., “J. K. Rowling”)

**desc**: description about a item’s key features and benefits called out via bullet points (e.g., “Solid colors: 100% Cotton; Heather Grey: 90% Cotton, 10% Polyester; All Other Heathers …”)


## EDA 💽

In [7]:
def read_locale_data(locale, task):
    products = read_product_data().query(f'locale == "{locale}"')
    sess_train = read_train_data().query(f'locale == "{locale}"')
    sess_test = read_test_data(task).query(f'locale == "{locale}"')
    return products, sess_train, sess_test

def show_locale_info(locale, task):
    products, sess_train, sess_test = read_locale_data(locale, task)

    train_l = sess_train['prev_items'].apply(lambda sess: len(sess))
    test_l = sess_test['prev_items'].apply(lambda sess: len(sess))

    print(f"Locale: {locale} \n"
          f"Number of products: {products['id'].nunique()} \n"
          f"Number of train sessions: {len(sess_train)} \n"
          f"Train session lengths - "
          f"Mean: {train_l.mean():.2f} | Median {train_l.median():.2f} | "
          f"Min: {train_l.min():.2f} | Max {train_l.max():.2f} \n"
          f"Number of test sessions: {len(sess_test)}"
        )
    if len(sess_test) > 0:
        print(
             f"Test session lengths - "
            f"Mean: {test_l.mean():.2f} | Median {test_l.median():.2f} | "
            f"Min: {test_l.min():.2f} | Max {test_l.max():.2f} \n"
        )
    print("======================================================================== \n")

In [83]:
products = read_product_data(train_data_dir=train_data_dir)
# locale_names = products['locale'].unique()
# for locale in locale_names:
#     show_locale_info(locale, task)

In [85]:
products.shape

(1551057, 11)

In [59]:
train_sessions = read_train_data(train_data_dir=train_data_dir)
train_sessions.sample(5)

Unnamed: 0,prev_items,next_item,locale
896553,['B09XDTFKYG' 'B09XDTVVJD' 'B09XDTFKYG' 'B09XD...,B09L86P41T,DE
1222933,['B07T8FJYSC' 'B07TBKK4ND' 'B07T8FJYSC' 'B07T9...,B0896WBZ3G,JP
2280921,['B00BWZALU6' 'B08B3Z7B6K' 'B08B3Z7B6K' 'B00BW...,B07N6MFY1G,UK
2303367,['B010Q9GT2M' 'B01GGS0L1C' 'B010Q9GYYU'],B00B4B2TAS,UK
596268,['B00OLY38DM' 'B00OLY38DM'],B00A3TCFD8,DE


In [10]:
test_data_dir

'.'

In [12]:
test_sessions = read_test_data(task, test_data_dir=test_data_dir)
test_sessions.sample(5)

Unnamed: 0,prev_items,locale
169839,['B07HLK75TC' 'B01JO5I63E' 'B01JO5HVV2'],JP
312051,['B09V5KDC3B' 'B09V5MD3J6'],UK
37271,['B08NPTZ55J' 'B09Y9PSLTD' 'B09Y9PJSZL' 'B08JQ...,DE
255723,['B09MM1WRBC' 'B09XJC9B2L' 'B0813S3VDM'],UK
36307,['B08DRQR6J8' 'B09ZLQ6GZW' 'B09Q934288' 'B09Q9...,DE


In [13]:
if debug:
    train_sessions = train_sessions.sample(debug_session_num)
    test_sessions = test_sessions.sample(debug_session_num)

In [13]:
train_sessions.shape

(3606249, 3)

In [14]:
train_sessions['prev_items'] = train_sessions.apply(lambda row: process_item_lst(row), axis=1)
# test_sessions['prev_items'] = test_sessions.apply(lambda row: process_item_lst(row), axis=1)

In [15]:
test_sessions['locale'].value_counts()

locale
UK    115936
DE    104568
JP     96467
Name: count, dtype: int64

In [16]:
target_locals

['DE', 'JP', 'UK']

In [60]:
train_sessions = train_sessions[train_sessions['locale'].isin(target_locals)]

In [18]:
train_sessions.shape

(3272716, 3)

# Word2vec

In [7]:
# train_sessions['prev_items'].to_list()

In [20]:
vector_size = 32
epochs = 10
sg = 1 # 1 for skip-gram
pop_thresh = 0.82415
window = 4

sentences = train_sessions['prev_items'].to_list() + test_sessions['prev_items'].to_list()
len(sentences)

3589687

In [21]:
from gensim.models.callbacks import CallbackAny2Vec

class callback(CallbackAny2Vec):
    '''Callback to print loss after each epoch.'''

    def __init__(self):
        self.epoch = 0
        self.loss_to_be_subed = 0

    def on_epoch_end(self, model):
        loss = model.get_latest_training_loss()
        loss_now = loss - self.loss_to_be_subed
        self.loss_to_be_subed = loss
        print('Loss after epoch {}: {}'.format(self.epoch, loss_now))
        self.epoch += 1

In [22]:
w2vec = Word2Vec(sentences=sentences, vector_size=vector_size, epochs = epochs, sg=sg,
                 min_count=1, workers=14,
                 window=window,
                  compute_loss=True
              , callbacks=[callback()])

Loss after epoch 0: 11067542.0
Loss after epoch 1: 6493280.0
Loss after epoch 2: 2888668.0
Loss after epoch 3: 2580968.0
Loss after epoch 4: 2329516.0
Loss after epoch 5: 2048868.0
Loss after epoch 6: 1821566.0
Loss after epoch 7: 1773816.0
Loss after epoch 8: 1667380.0
Loss after epoch 9: 1025936.0


In [23]:
w2v_model_file

'../model_training/v2/w2v.model'

In [24]:
w2vec.save(w2v_model_file)

In [25]:
# ! ls sample_data

Generate Submission 🏋️‍♀️



Submission format:
1. The submission should be a **parquet** file with the sessions from all the locales. 
2. Predicted products ids per locale should only be a valid product id of that locale. 
3. Predictions should be added in new column named **"next_item_prediction"**.
4. Predictions should be a list of string id values

In [26]:
# def random_predicitons(locale, sess_test_locale):
#     random_state = np.random.RandomState(42)
#     products = read_product_data().query(f'locale == "{locale}"')
#     predictions = []
#     for _ in range(len(sess_test_locale)):
#         predictions.append(
#             list(products['id'].sample(PREDS_PER_SESSION, replace=True, random_state=random_state))
#         ) 
#     sess_test_locale['next_item_prediction'] = predictions
#     sess_test_locale.drop('prev_items', inplace=True, axis=1)
#     return sess_test_locale

In [27]:
# test_sessions.head()

In [28]:
# test_sessions

## Load model 

In [8]:
w2v_model_file

'../model_training/v2/w2v.model'

In [10]:
w2vec = Word2Vec.load(w2v_model_file)

In [14]:

test_sessions.shape

(316971, 2)

In [15]:
test_sessions.head()

Unnamed: 0,prev_items,locale
0,['B08V12CT4C' 'B08V1KXBQD' 'B01BVG1XJS' 'B09VC...,DE
1,['B00R9R5ND6' 'B00R9RZ9ZS' 'B00R9RZ9ZS'],DE
2,['B07YSRXJD3' 'B07G7Q5N6G' 'B08C9Q7QVK' 'B07G7...,DE
3,['B08KQBYV43' '3955350843' '3955350843' '39553...,DE
4,['B09FPTCWMC' 'B09FPTQP68' 'B08HMRY8NG' 'B08TB...,DE


In [16]:
# predictions = []
# test_locale_names = test_sessions['locale'].unique()
# for locale in test_locale_names:
#     sess_test_locale = test_sessions.query(f'locale == "{locale}"').copy()
#     predictions.append(
#         random_predicitons(locale, sess_test_locale)
#     )
# predictions = pd.concat(predictions).reset_index(drop=True)
# predictions.sample(5)

In [17]:
def get_predictions(row):
    prev_items = row['prev_items']
    # try:
    similarity_dic = w2vec.wv.most_similar(positive=prev_items, topn=100)
    res = [item for item, simi in similarity_dic] 
        # print(err)
    # except Exception as e:
        # print(e)
    # res = prev_items
    return res 

In [18]:
# # def get_session_vector(df, w2vec):
# #   aids = df.aid.unique()
# #   for i, aid in enumerate(aids):
# #     vec = w2vec.wv[aid] if i == 0 else vec + w2vec.wv[aid]
# #   vec = vec / len(aids)
# #   return vec

# # def get_close_aids(df, w2vec, index, idx2aid, n=20):
# #   session_vec = get_session_vector(df, w2vec)
# #   close_aids = get_nearest_neighbours(session_vec, index, idx2aid, n)
# #   return close_aids

# # def get_nearest_neighbours(x, index, idx2aid, n=20):
# #   indexes, distances = index.get_nns_by_vector(x, n, search_k=-1, include_distances=True)
# #   aids = [idx2aid[i] for i in indexes]
# #   df = pd.DataFrame(data={'aid' : aids, 'w2vec_dist' : distances})
# #   return df

# index = AnnoyIndex(vector_size, distance)
# aid2idx = {}

# popular_aids = test.groupby('aid', as_index=False).agg({'session' : 'count'})
# popular_aids = popular_aids.loc[popular_aids['session'] > popular_aids['session'].quantile(pop_thresh)]
# popular_aid_list = popular_aids.aid.unique()

# for i, aid in enumerate(popular_aid_list):
# aid = str(aid)
# aid2idx[aid] = i
# index.add_item(i, w2vec.wv[aid])
# idx2aid = { v : k for k, v in aid2idx.items()}
# index.build(40) # build 40 trees

# reduced_test = test.copy()
# reduced_test['aid'] = reduced_test['aid'].astype('str')
# reduced_test['aid_vector'] = reduced_test['aid'].apply(lambda x: w2vec.wv[x])

# reduced_test = reduced_test.groupby('session').apply(lambda x: get_close_aids(x, w2vec, index, idx2aid, n)).reset_index().drop(columns='level_1')


In [42]:
# df = test_sessions.sample(100000)

In [29]:
# %%time
# df['next_item_prediction'] = df.apply(lambda row: get_predictions(row), axis=1)

In [30]:
test_sessions.sample()

Unnamed: 0,prev_items,locale
227651,['B09WF5TQ81' 'B09WF5TQ81'],UK


In [21]:
from gensim.similarities.annoy import AnnoyIndexer

# 100 trees are being used in this example
annoy_index = AnnoyIndexer(w2vec, num_tree)

In [25]:
lst = ['B002ZCXPDU', 'B083MNDJLD', 'B08GR61FN6']

approximate_neighbors = w2vec.wv.most_similar(positive=lst, topn=5, indexer=annoy_index)
exact_neighbors = w2vec.wv.most_similar(positive=lst, topn=5)

In [75]:
annoy_index_file

'../model_training/v2/annoy_index_100_trees.index'

In [76]:
annoy_index.save(annoy_index_file)

In [26]:
print("Approximate: ")
print(approximate_neighbors)
print()
print('Exact: ')
print(exact_neighbors)

Approximate: 
[('B083MNDJLD', 0.990120471165223), ('B09S7FVRFF', 0.9889326698995329), ('B08P2JQ3SQ', 0.9872487178796704), ('B01KZP0J3W', 0.9831200845688705), ('B08T9KW4NZ', 0.9819146977060748)]

Exact: 
[('B09S7FVRFF', 0.9889327883720398), ('B08P2JQ3SQ', 0.9872486591339111), ('B01KZP0J3W', 0.9831200838088989), ('B08T9KW4NZ', 0.9819145798683167), ('B08RGTG3KQ', 0.9777733683586121)]


In [27]:
def get_rec(prev_items, topn=10, annoy_index=None):
    # print(prev_items)
    res = [ele.replace('[', '').replace(']', '').replace('\n', '').replace("'", '').replace(' ', '') for ele in prev_items.split(' ')]
    # print(type(res))
    try:
        if annoy_index is not None:
            similarity_dic = w2vec.wv.most_similar(positive=res, topn=topn, indexer=annoy_index)
        else:
            similarity_dic = w2vec.wv.most_similar(positive=res, topn=topn)
        res = [item for item, simi in similarity_dic] 
        # print(err)
    except Exception as e:
        print(e)
    
    return res

In [33]:
# df = test_sessions.sample(10)

In [43]:
df = test_sessions.copy()

In [44]:
df.shape

(316971, 2)

In [45]:
%%time
pl_df = pl.from_dataframe(df)
pl_df = (
    pl_df
        .with_columns(pl.col('prev_items').apply(lambda row: get_rec(row, annoy_index=annoy_index)).alias('next_item_prediction'))
)
result_df = pl_df.to_pandas()

CPU times: user 1min 55s, sys: 704 ms, total: 1min 56s
Wall time: 1min 56s


In [46]:
# pl_df.head()

In [47]:
# %%time
# pl_df = pl.from_dataframe(df)
# pl_df = (
#     pl_df
#         .with_columns(pl.col('prev_items').apply(lambda row: get_rec(row)).alias('next_item_prediction'))
# )
# result_df = pl_df.to_pandas()

In [48]:
result_df.shape

(316971, 3)

In [49]:
# test_sessions['next_item_prediction'] = test_sessions.apply(lambda row: get_predictions(row), axis=1)

In [50]:
predictions = result_df[['locale', 'next_item_prediction']]

In [51]:
predictions.head()

Unnamed: 0,locale,next_item_prediction
0,DE,"[B08496TCCQ, B0BJ6TWCQW, B096BGC3XF, B07XRQ8YS..."
1,DE,"[B00R9RZ9ZS, B00R9R5ND6, B004ZXMV4Q, B08PM4PWD..."
2,DE,"[B07G7Q5N6G, B08C9Q7QVK, B000KSLHNQ, B0B5QNFWJ..."
3,DE,"[3772476953, B09M6X8SM8, 3955350878, 395535084..."
4,DE,"[B09XTSD7XC, B0B7WRBHWH, B09MS15K58, B09CCPNPS..."


In [58]:
predictions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 316971 entries, 0 to 316970
Data columns (total 2 columns):
 #   Column                Non-Null Count   Dtype 
---  ------                --------------   ----- 
 0   locale                316971 non-null  object
 1   next_item_prediction  316971 non-null  object
dtypes: object(2)
memory usage: 4.8+ MB


# Model Eval

In [61]:
train_sessions.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3272716 entries, 0 to 3272715
Data columns (total 3 columns):
 #   Column      Dtype 
---  ------      ----- 
 0   prev_items  object
 1   next_item   object
 2   locale      object
dtypes: object(3)
memory usage: 99.9+ MB


In [63]:
sample_df = train_sessions.sample(1000)
sample_df.head(2)

Unnamed: 0,prev_items,next_item,locale
2346537,['140835473X' '1800783019' '1471197220'],1408357488,UK
82962,['B00O4E4L12' 'B09JSLTC9S' 'B08KTP6M3P' 'B08KT...,B085FLPL3V,DE


In [65]:
pl_df = pl.from_dataframe(sample_df)
pl_df = (
    pl_df
        .with_columns(pl.col('prev_items').apply(lambda row: get_rec(row, annoy_index=annoy_index, topn=3)).alias('next_item_prediction'))
)

In [66]:
pl_df

prev_items,next_item,locale,next_item_prediction
str,str,str,list[str]
"""['140835473X' …","""1408357488""","""UK""","[""1409582159"", ""0008507694"", ""1999829549""]"
"""['B00O4E4L12' …","""B085FLPL3V""","""DE""","[""B08KTP4HHH"", ""B00O4E4L12"", ""B085FLFHKP""]"
"""['B07F6YZDQ8' …","""B07F715JL6""","""DE""","[""B00I6EY3PU"", ""B08F2TCD6M"", ""B08MWQRD64""]"
"""['B06X6JLZ1T' …","""B0043W42ZU""","""DE""","[""B094NW9YS2"", ""B01BY9RTO0"", ""B088H614FL""]"
"""['B076WRVJMQ' …","""B08KJWFNLP""","""UK""","[""B0744871FX"", ""B01LQAHN0W"", ""B01MR51N92""]"
"""['B07TWFK9TZ' …","""B07SH1HJDW""","""JP""","[""B078B5CTZP"", ""B09T2LWLK5"", ""B07FQR4KFY""]"
"""['B0B73RM6Z5' …","""B07YLJMCQJ""","""JP""","[""B00EYYNWLW"", ""B00GHI037E"", ""B00QK4AUKA""]"
"""['B002PLXR8E' …","""B00KACQ00W""","""DE""","[""B07FYVTXN7"", ""B000W8UXGQ"", ""B07J5V84JJ""]"
"""['B09RX2HBX1' …","""B09RX2BBB3""","""UK""","[""B09BW3PCM8"", ""B09BW18Q12"", ""B0B59F4M5Y""]"
"""['B0B68VQCMV' …","""B0BJPYVRJF""","""JP""","[""B0BB95MQ6J"", ""B09V7T39HV"", ""B0BJDQNRLF""]"


# Validate predictions ✅ 😄

In [88]:
check_predictions(predictions, test_sessions=test_sessions, 
                  # check_products=True, product_df=products
                 )

In [54]:
# Its important that the parquet file you submit is saved with pyarrow backend
predictions.to_parquet(f'submission_{task}.parquet', engine='pyarrow')

## Submit to AIcrowd 🚀

In [55]:
# You can submit with aicrowd-cli, or upload manually on the challenge page.
!aicrowd submission create -c task-1-next-product-recommendation -f "submission_task1.parquet"

[2K[1;34msubmission_task1.parquet[0m [90m━━━━━━━━━━━━[0m [35m100.0%[0m • [32m29.3/29.3 MB[0m • [31m3.4 MB/s[0m • [36m0:00:00[0m00:01[0m00:01[0m
[?25h                                                                                 ╭─────────────────────────╮                                                                                 
                                                                                 │ [1mSuccessfully submitted![0m │                                                                                 
                                                                                 ╰─────────────────────────╯                                                                                 
[3m                                                                                       Important links                                                                                       [0m
┌──────────────────┬───────────────────────────────────────