# Data

In [1]:
# !aicrowd login

In [2]:
# !aicrowd dataset download --challenge task-1-next-product-recommendation

# Packages

In [4]:
%load_ext autoreload
%autoreload 2
import sys
import logging
sys.path.append('../')
import os
from utils import *

import os
import numpy as np
import pandas as pd
from gensim.models import Word2Vec

from annoy import AnnoyIndex
import polars as pl

# Config

In [5]:
debug = False

debug_session_num = 1000

In [6]:
train_data_dir = '.'
test_data_dir = '.'
task = 'task1'
PREDS_PER_SESSION = 100

model_dir = '../model_training/v2'

# target locales: locales needed for task1

target_locals = ["DE", 'JP', 'UK']

w2v_model_file = os.path.join(model_dir, 'w2v.model')

In [7]:
# ! mkdir {model_dir}

# Function 

# Data Description

The Multilingual Shopping Session Dataset is a collection of **anonymized customer sessions** containing products from six different locales, namely English, German, Japanese, French, Italian, and Spanish. It consists of two main components: **user sessions** and **product attributes**. User sessions are a list of products that a user has engaged with in chronological order, while product attributes include various details like product title, price in local currency, brand, color, and description.

---

### Each product has the following associated information:


**locale**: the locale code of the product (e.g., DE)

**id**: a unique identifier for the product. Also known as Amazon Standard Identification Number (ASIN) (e.g., B07WSY3MG8)

**title**: title of the item (e.g., “Japanese Aesthetic Sakura Flowers Vaporwave Soft Grunge Gift T-Shirt”)

**price**: price of the item in local currency (e.g., 24.99)

**brand**: item brand name (e.g., “Japanese Aesthetic Flowers & Vaporwave Clothing”)

**color**: color of the item (e.g., “Black”)

**size**: size of the item (e.g., “xxl”)

**model**: model of the item (e.g., “iphone 13”)

**material**: material of the item (e.g., “cotton”)

**author**: author of the item (e.g., “J. K. Rowling”)

**desc**: description of an item’s key features and benefits called out via bullet points (e.g., “Solid colors: 100% Cotton; Heather Grey: 90% Cotton, 10% Polyester; All Other Heathers …”)


## EDA 💽

In [7]:
def read_locale_data(locale, task):
    """Load products, train sessions and test sessions restricted to one locale.

    Args:
        locale: Locale code to keep; rows are selected by comparing the
            ``locale`` column against this value.
        task: Task name forwarded to ``read_test_data``.

    Returns:
        Tuple ``(products, sess_train, sess_test)`` holding the filtered frames.
    """
    # The same filter expression is applied to every frame.
    locale_filter = f'locale == "{locale}"'
    loaders = (
        read_product_data,
        read_train_data,
        lambda: read_test_data(task),
    )
    # Each frame is loaded and filtered in turn: products, train, test.
    return tuple(load().query(locale_filter) for load in loaders)

def show_locale_info(locale, task):
    """Print product/session counts and session-length statistics for a locale.

    Args:
        locale: Locale code passed to ``read_locale_data``.
        task: Task name passed to ``read_locale_data``.
    """
    def describe_lengths(lengths):
        # Shared "Mean | Median | Min | Max" fragment, trailing newline included.
        return (
            f"Mean: {lengths.mean():.2f} | Median {lengths.median():.2f} | "
            f"Min: {lengths.min():.2f} | Max {lengths.max():.2f} \n"
        )

    products, sess_train, sess_test = read_locale_data(locale, task)
    n_train = len(sess_train)
    n_test = len(sess_test)

    # len() of each 'prev_items' entry, for both splits.
    train_lengths = sess_train['prev_items'].map(len)
    test_lengths = sess_test['prev_items'].map(len)

    summary = "".join([
        f"Locale: {locale} \n",
        f"Number of products: {products['id'].nunique()} \n",
        f"Number of train sessions: {n_train} \n",
        "Train session lengths - ",
        describe_lengths(train_lengths),
        f"Number of test sessions: {n_test}",
    ])
    print(summary)

    # Test statistics are only reported when the locale has test sessions.
    if n_test > 0:
        print("Test session lengths - " + describe_lengths(test_lengths))
    print("======================================================================== \n")

In [8]:
# products = read_product_data(train_data_dir=train_data_dir)
# locale_names = products['locale'].unique()
# for locale in locale_names:
#     show_locale_info(locale, task)

In [9]:
train_sessions = read_train_data(train_data_dir=train_data_dir)
train_sessions.sample(5)

Unnamed: 0,prev_items,next_item,locale
873599,['B00EVA1FFO' 'B00EVA1H66'],B00H3APNFC,DE
2391555,['B087Q7BCRQ' 'B08DK1GS2W'],B09L7TSRWF,UK
547188,['B0713WW5XB' 'B077GS2LJ5' 'B0713WW5XB' 'B077G...,B09PBJJD9F,DE
1194408,['B09V3JGPXG' 'B09V3KZ5J7' 'B09V3HXTB7' 'B09V3...,B09V3JT6N8,JP
2159784,['B08V98Z8BK' 'B08V98Z8BK' 'B09QFPZ9B7' 'B018U...,B018FZNQ4I,UK


In [10]:
test_data_dir

'.'

In [8]:
test_sessions = read_test_data(task, test_data_dir=test_data_dir)
test_sessions.sample(5)

Unnamed: 0,prev_items,locale
285769,['B07B5S36DP' 'B083NMTVS5' 'B07B5S36DP'],UK
110917,['B07S8VWT5B' 'B06W9DXWY8' 'B06WRN4VGJ' 'B010R...,JP
217459,['B09BNX11LQ' 'B09BNWY6SB'],UK
134735,['B0891LMDBD' 'B09BV9P5WM'],JP
289671,['B08VF94DSV' 'B07SWQTPDZ' 'B08VF94DSV' 'B0BGH...,UK


In [12]:
if debug:
    train_sessions = train_sessions.sample(debug_session_num)
    test_sessions = test_sessions.sample(debug_session_num)

In [13]:
train_sessions.shape

(3606249, 3)

In [14]:
train_sessions['prev_items'] = train_sessions.apply(lambda row: process_item_lst(row), axis=1)
# test_sessions['prev_items'] = test_sessions.apply(lambda row: process_item_lst(row), axis=1)

In [15]:
test_sessions['locale'].value_counts()

locale
UK    115936
DE    104568
JP     96467
Name: count, dtype: int64

In [16]:
target_locals

['DE', 'JP', 'UK']

In [17]:
train_sessions = train_sessions[train_sessions['locale'].isin(target_locals)]

In [18]:
train_sessions.shape

(3272716, 3)

# Word2vec

In [19]:
# train_sessions['prev_items'].to_list()

In [20]:
# Word2Vec hyperparameters; they are passed to the Word2Vec constructor when the model is trained.
vector_size = 32
epochs = 10
sg = 1 # 1 for skip-gram
# NOTE(review): pop_thresh is only referenced by commented-out code in the visible notebook.
pop_thresh = 0.82415
window = 4

# Training corpus: the train sessions followed by the test sessions.
# NOTE(review): process_item_lst is only applied to train_sessions (the matching
# test_sessions line is commented out), so the test entries may still be raw
# strings rather than lists of item ids - confirm before training on them.
sentences = train_sessions['prev_items'].to_list() + test_sessions['prev_items'].to_list()
len(sentences)

3589687

In [21]:
from gensim.models.callbacks import CallbackAny2Vec

class callback(CallbackAny2Vec):
    """Report the training loss of each epoch as it finishes.

    The value returned by ``get_latest_training_loss`` is treated as a running
    total, so the total recorded at the previous epoch is subtracted to obtain
    the loss of the epoch that just ended.
    """

    def __init__(self):
        # Index of the epoch that will be reported next.
        self.epoch = 0
        # Running total observed at the end of the previous epoch.
        self.loss_to_be_subed = 0

    def on_epoch_end(self, model):
        """Print the loss accumulated during the epoch that just ended."""
        cumulative_loss = model.get_latest_training_loss()
        epoch_loss = cumulative_loss - self.loss_to_be_subed
        print('Loss after epoch {}: {}'.format(self.epoch, epoch_loss))
        # Remember the new total and move on to the next epoch index.
        self.loss_to_be_subed = cumulative_loss
        self.epoch += 1

In [22]:
# Train the item-embedding model on the session corpus built above.
# compute_loss=True is what allows the callback to read the training loss after each epoch.
# NOTE(review): workers=14 is hard-coded - confirm it suits the machine running this cell.
w2vec = Word2Vec(sentences=sentences, vector_size=vector_size, epochs = epochs, sg=sg,
                 min_count=1, workers=14,
                 window=window,
                  compute_loss=True
              , callbacks=[callback()])

Loss after epoch 0: 11067542.0
Loss after epoch 1: 6493280.0
Loss after epoch 2: 2888668.0
Loss after epoch 3: 2580968.0
Loss after epoch 4: 2329516.0
Loss after epoch 5: 2048868.0
Loss after epoch 6: 1821566.0
Loss after epoch 7: 1773816.0
Loss after epoch 8: 1667380.0
Loss after epoch 9: 1025936.0


In [23]:
w2v_model_file

'../model_training/v2/w2v.model'

In [24]:
w2vec.save(w2v_model_file)

In [25]:
# ! ls sample_data

# Generate Submission 🏋️‍♀️



Submission format:
1. The submission should be a **parquet** file with the sessions from all the locales. 
2. Predicted product ids for a locale must all be valid product ids of that locale. 
3. Predictions should be added in new column named **"next_item_prediction"**.
4. Predictions should be a list of string id values

In [26]:
# def random_predicitons(locale, sess_test_locale):
#     random_state = np.random.RandomState(42)
#     products = read_product_data().query(f'locale == "{locale}"')
#     predictions = []
#     for _ in range(len(sess_test_locale)):
#         predictions.append(
#             list(products['id'].sample(PREDS_PER_SESSION, replace=True, random_state=random_state))
#         ) 
#     sess_test_locale['next_item_prediction'] = predictions
#     sess_test_locale.drop('prev_items', inplace=True, axis=1)
#     return sess_test_locale

In [27]:
# test_sessions.head()

In [28]:
# test_sessions

## Load model 

In [9]:
w2v_model_file

'../model_training/v2/w2v.model'

In [10]:
w2vec = Word2Vec.load(w2v_model_file)

In [11]:

test_sessions.shape

(316971, 2)

In [12]:
test_sessions.head()

Unnamed: 0,prev_items,locale
0,['B08V12CT4C' 'B08V1KXBQD' 'B01BVG1XJS' 'B09VC...,DE
1,['B00R9R5ND6' 'B00R9RZ9ZS' 'B00R9RZ9ZS'],DE
2,['B07YSRXJD3' 'B07G7Q5N6G' 'B08C9Q7QVK' 'B07G7...,DE
3,['B08KQBYV43' '3955350843' '3955350843' '39553...,DE
4,['B09FPTCWMC' 'B09FPTQP68' 'B08HMRY8NG' 'B08TB...,DE


In [13]:
# predictions = []
# test_locale_names = test_sessions['locale'].unique()
# for locale in test_locale_names:
#     sess_test_locale = test_sessions.query(f'locale == "{locale}"').copy()
#     predictions.append(
#         random_predicitons(locale, sess_test_locale)
#     )
# predictions = pd.concat(predictions).reset_index(drop=True)
# predictions.sample(5)

In [14]:
def get_predictions(row, topn=100):
    """Recommend the items most similar to a session's previous items.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) whose ``'prev_items'``
            entry holds the keys passed as ``positive`` to ``most_similar``.
        topn: Number of recommendations to request. Defaults to 100, the value
            that used to be hard-coded, so existing callers are unaffected.

    Returns:
        List of item ids in the order returned by ``w2vec.wv.most_similar``.

    Raises:
        Whatever ``most_similar`` raises (for example when a key is missing
        from the model); errors are not swallowed here.
    """
    neighbours = w2vec.wv.most_similar(positive=row['prev_items'], topn=topn)
    # most_similar yields (item, similarity) pairs; only the ids are returned.
    return [item for item, _similarity in neighbours]

In [15]:
# # def get_session_vector(df, w2vec):
# #   aids = df.aid.unique()
# #   for i, aid in enumerate(aids):
# #     vec = w2vec.wv[aid] if i == 0 else vec + w2vec.wv[aid]
# #   vec = vec / len(aids)
# #   return vec

# # def get_close_aids(df, w2vec, index, idx2aid, n=20):
# #   session_vec = get_session_vector(df, w2vec)
# #   close_aids = get_nearest_neighbours(session_vec, index, idx2aid, n)
# #   return close_aids

# # def get_nearest_neighbours(x, index, idx2aid, n=20):
# #   indexes, distances = index.get_nns_by_vector(x, n, search_k=-1, include_distances=True)
# #   aids = [idx2aid[i] for i in indexes]
# #   df = pd.DataFrame(data={'aid' : aids, 'w2vec_dist' : distances})
# #   return df

# index = AnnoyIndex(vector_size, distance)
# aid2idx = {}

# popular_aids = test.groupby('aid', as_index=False).agg({'session' : 'count'})
# popular_aids = popular_aids.loc[popular_aids['session'] > popular_aids['session'].quantile(pop_thresh)]
# popular_aid_list = popular_aids.aid.unique()

# for i, aid in enumerate(popular_aid_list):
# aid = str(aid)
# aid2idx[aid] = i
# index.add_item(i, w2vec.wv[aid])
# idx2aid = { v : k for k, v in aid2idx.items()}
# index.build(40) # build 40 trees

# reduced_test = test.copy()
# reduced_test['aid'] = reduced_test['aid'].astype('str')
# reduced_test['aid_vector'] = reduced_test['aid'].apply(lambda x: w2vec.wv[x])

# reduced_test = reduced_test.groupby('session').apply(lambda x: get_close_aids(x, w2vec, index, idx2aid, n)).reset_index().drop(columns='level_1')


In [16]:
df = test_sessions.sample(1000)

In [37]:
# %%time
# df['next_item_prediction'] = df.apply(lambda row: get_predictions(row), axis=1)

In [23]:
def get_rec(prev_items, topn=10):
    """Recommend items for one session using the global ``w2vec`` model.

    Args:
        prev_items: The session, either as a string such as
            ``"['B00EVA1FFO' 'B00EVA1H66']"`` or as an iterable of item ids.
        topn: Number of recommendations to request from the model.

    Returns:
        The item ids returned by ``most_similar`` for the session items that
        the model knows. If none of the session items is known, or the lookup
        fails, the parsed session items themselves are returned as a fallback
        (the same fallback the previous implementation used).
    """
    if isinstance(prev_items, str):
        # Remove brackets and quotes, then split on any run of whitespace.
        # Unlike split(' '), this never yields empty tokens for repeated
        # spaces or line breaks inside the string.
        cleaned = prev_items.translate(str.maketrans('', '', "[]'"))
        session_items = cleaned.split()
    else:
        session_items = list(prev_items)

    # Query with known items only, so that a single unseen id does not throw
    # away the whole session.
    known_items = [item for item in session_items if item in w2vec.wv]
    if not known_items:
        return session_items

    try:
        neighbours = w2vec.wv.most_similar(positive=known_items, topn=topn)
    except (KeyError, ValueError) as err:
        # Best effort: report the problem and fall back to the session items.
        print(err)
        return session_items

    return [item for item, _similarity in neighbours]

In [24]:
%%time
# Run get_rec over the 'prev_items' column of `df` through polars, then convert back to pandas.
# NOTE(review): get_rec is called with its default topn, which is smaller than
# PREDS_PER_SESSION - confirm the intended number of predictions per session.
# NOTE(review): the predictions displayed further down carry a fresh 0..n index,
# while check_predictions compares indices with test_sessions - verify they line up.
pl_df = pl.from_dataframe(df)
pl_df = (
    pl_df
        .with_columns(pl.col('prev_items').apply(lambda row: get_rec(row)).alias('next_item_prediction'))
)
result_df = pl_df.to_pandas()

CPU times: user 2min 23s, sys: 2min 42s, total: 5min 6s
Wall time: 45 s


In [25]:
result_df.shape

(1000, 3)

In [26]:
# test_sessions['next_item_prediction'] = test_sessions.apply(lambda row: get_predictions(row), axis=1)

In [27]:
predictions = result_df[['locale', 'next_item_prediction']]

In [28]:
predictions.head()

Unnamed: 0,locale,next_item_prediction
0,UK,"[B093375R52, B08CB8TM7H, B0895XDJ4M, B08Q7WG4T..."
1,DE,"[B06VWFTT3V, B079ZV816K, B07RL66RQ9, B079ZV4V3..."
2,UK,"[B08CSC94BT, B0BGHZJCMJ, B0992B1K3K, B09R4JKCK..."
3,JP,"[B09H2PNSVG, B0BCHKRNPK, B09CYCF44J, B09MY7P1B..."
4,DE,"[B08GR2HD5X, B0742KVRJR, B00BYNCTOW, B08KNTFQX..."


# Validate predictions ✅ 😄

In [None]:
def check_predictions(predictions, check_products=False):
    """
    These tests need to pass as they will also be applied on the evaluator.

    Args:
        predictions: DataFrame with a ``locale`` column and a
            ``next_item_prediction`` column, indexed like the global
            ``test_sessions`` frame.
        check_products: When True, additionally verify that every predicted
            id belongs to the product catalogue of the session's locale.

    Raises:
        AssertionError: If the session ids of a locale differ from those in
            ``test_sessions``, or (with ``check_products``) if a predicted id
            is not a product of that locale.
    """
    # Load the catalogue once instead of once per locale.
    all_products = read_product_data() if check_products else None

    for locale in test_sessions['locale'].unique():
        sess_test = test_sessions.query(f'locale == "{locale}"')
        preds_locale = predictions[predictions['locale'] == locale]
        assert sorted(preds_locale.index.values) == sorted(sess_test.index.values), f"Session ids of {locale} doesn't match"

        if check_products:
            # This check is not done on the evaluator
            # but you can run it to verify there is no mixing of products between locales
            # Since the ground truth next item will always belong to the same locale
            # Warning - This can be slow to run
            products = all_products.query(f'locale == "{locale}"')
            valid_ids = set(products['id'])
            # Flatten row by row so that sessions with different numbers of
            # predictions (ragged lists) are handled as well.
            predicted_products = {
                item
                for session_preds in preds_locale["next_item_prediction"]
                for item in session_preds
            }
            assert predicted_products <= valid_ids, f"Invalid products in {locale} predictions"

In [None]:
check_predictions(predictions)

In [None]:
# It's important that the parquet file you submit is saved with the pyarrow backend
predictions.to_parquet(f'submission_{task}.parquet', engine='pyarrow')

## Submit to AIcrowd 🚀

In [None]:
# You can submit with aicrowd-cli, or upload manually on the challenge page.
!aicrowd submission create -c task-1-next-product-recommendation -f "submission_task1.parquet"