# Packages

In [1]:
%load_ext autoreload
%autoreload 2
import sys
import logging
base_dir = '../'
sys.path.append(base_dir)
import os
from gensim.similarities.annoy import AnnoyIndexer

from utils import *

import os
import numpy as np
import pandas as pd
from gensim.models import Word2Vec
from tqdm import tqdm
import polars as pl
from annoy import AnnoyIndex
import polars as pl
import polars as pl
from utils import *
from src.eval import model_eval
from src.config import raw_data_session_id_dir, candidate_dir

# Config

In [2]:
debug = True

if debug:
    n_rows = 1000
else:
    n_rows = None

train_data_dir = '.'
test_data_dir = '.'
task = 'task1'

num_tree = 100

model_dir = '../model_training/v2'

# target locales: locales needed for task1

target_locals = ["DE", 'JP', 'UK']

w2v_model_file = os.path.join(model_dir, 'w2v.model')

annoy_index_file = os.path.join(model_dir, f'annoy_index_{str(num_tree)}_trees.index')
    
# train_eval_result_file = os.path.join(eval_data_dir, 'train_result_w2v.parquet')

# Function 

In [3]:
# # %%time
# def w2v_rec(df, w2vec, annoy_index, topn):
#     next_item_prediction_lst = []
#     for a in tqdm(df.iterrows(), total=len(df)):
#         prev_items = a[1]['prev_items']
#         res = list(prev_items)
#         # print(res)
#         # print(type(res))
#         # print(res.shape)
#         similarity_dic = w2vec.wv.most_similar(positive=res, topn=topn, indexer=annoy_index)
#         res = [item for item, simi in similarity_dic]
#         next_item_prediction_lst.append(res)
#     df['next_item_prediction'] = next_item_prediction_lst
#     return df

def w2v_rec(target_df, w2vec, annoy_index, topn):
    target_df = (
        target_df.with_columns(
            pl.col('prev_items').apply(lambda x: 
                                       # x.to_list()
                                       list(map(list, zip(*w2vec.wv.most_similar(positive=x.to_list(),
                                                                     topn=100,
                                                                     indexer=annoy_index))))[0]
                                      ).alias('next_item_prediction'))
    )
    return target_df

# Data Description

The Multilingual Shopping Session Dataset is a collection of **anonymized customer sessions** containing products from six different locales, namely English, German, Japanese, French, Italian, and Spanish. It consists of two main components: **user sessions** and **product attributes**. User sessions are a list of products that a user has engaged with in chronological order, while product attributes include various details like product title, price in local currency, brand, color, and description.

---

### Each product as its associated information:


**locale**: the locale code of the product (e.g., DE)

**id**: a unique for the product. Also known as Amazon Standard Item Number (ASIN) (e.g., B07WSY3MG8)

**title**: title of the item (e.g., “Japanese Aesthetic Sakura Flowers Vaporwave Soft Grunge Gift T-Shirt”)

**price**: price of the item in local currency (e.g., 24.99)

**brand**: item brand name (e.g., “Japanese Aesthetic Flowers & Vaporwave Clothing”)

**color**: color of the item (e.g., “Black”)

**size**: size of the item (e.g., “xxl”)

**model**: model of the item (e.g., “iphone 13”)

**material**: material of the item (e.g., “cotton”)

**author**: author of the item (e.g., “J. K. Rowling”)

**desc**: description about a item’s key features and benefits called out via bullet points (e.g., “Solid colors: 100% Cotton; Heather Grey: 90% Cotton, 10% Polyester; All Other Heathers …”)


## EDA 💽

In [3]:
def read_locale_data(locale, task):
    products = read_product_data().query(f'locale == "{locale}"')
    sess_train = read_train_data().query(f'locale == "{locale}"')
    sess_test = read_test_data(task).query(f'locale == "{locale}"')
    return products, sess_train, sess_test

def show_locale_info(locale, task):
    products, sess_train, sess_test = read_locale_data(locale, task)

    train_l = sess_train['prev_items'].apply(lambda sess: len(sess))
    test_l = sess_test['prev_items'].apply(lambda sess: len(sess))

    print(f"Locale: {locale} \n"
          f"Number of products: {products['id'].nunique()} \n"
          f"Number of train sessions: {len(sess_train)} \n"
          f"Train session lengths - "
          f"Mean: {train_l.mean():.2f} | Median {train_l.median():.2f} | "
          f"Min: {train_l.min():.2f} | Max {train_l.max():.2f} \n"
          f"Number of test sessions: {len(sess_test)}"
        )
    if len(sess_test) > 0:
        print(
             f"Test session lengths - "
            f"Mean: {test_l.mean():.2f} | Median {test_l.median():.2f} | "
            f"Min: {test_l.min():.2f} | Max {test_l.max():.2f} \n"
        )
    print("======================================================================== \n")

In [4]:
# products = read_product_data(train_data_dir=train_data_dir)
# # locale_names = products['locale'].unique()
# # for locale in locale_names:
# #     show_locale_info(locale, task)

In [5]:
products.shape

(1551057, 11)

In [6]:
train_sessions = read_train_data(train_data_dir=train_data_dir)
train_sessions.sample(5)

Unnamed: 0,prev_items,next_item,locale
2687268,['B096Z9652W' 'B09WH4J2K7' 'B08Y71Y8YT'],B0927VPDKR,UK
1838192,['B00I95NJNS' 'B00IEA4OSW' 'B0872MYW83' 'B00I9...,B091CH9KKX,JP
464603,['B0079F3ICQ' 'B0079F3KR4' 'B08VMMH3G7' 'B08VL...,B0842N3QGB,DE
3233842,['B095KL6H7J' 'B095KKM33R' 'B095KKSF72' 'B095K...,B087GTFVJF,UK
414310,['B08FWYZL9V' 'B08FWWV3WH'],B08BGF2T1H,DE


In [7]:
test_data_dir

'.'

In [8]:
test_sessions = read_test_data(task, test_data_dir=test_data_dir)
test_sessions.sample(5)

Unnamed: 0,prev_items,locale
69546,['B0B4JWSCFT' 'B09TT4LXWS'],DE
137545,['B07P6W92FK' 'B07P8KPPDC'],JP
221672,['B0BHMHV9Z3' 'B09Q934288' 'B09Q956R8R' 'B0B5X...,UK
113348,['B0B3F181L1' 'B0BBLJ3D2K'],JP
162411,['B07XS4XN9V' 'B01MXLEVR7' 'B01N40PO2M' 'B07XP...,JP


In [9]:
train_sessions = train_sessions[train_sessions['locale'].isin(target_locals)]

In [10]:

if debug:
    train_sessions = train_sessions.sample(debug_session_num)
    test_sessions = test_sessions.sample(debug_session_num)

In [79]:
train_sessions.shape

(3272716, 3)

In [14]:
train_sessions['prev_items'] = train_sessions.apply(lambda row: process_item_lst(row), axis=1)
# test_sessions['prev_items'] = test_sessions.apply(lambda row: process_item_lst(row), axis=1)

In [15]:
test_sessions['locale'].value_counts()

locale
UK    115936
DE    104568
JP     96467
Name: count, dtype: int64

In [16]:
target_locals

['DE', 'JP', 'UK']

In [18]:
train_sessions.shape

(3272716, 3)

# Word2vec

In [7]:
# train_sessions['prev_items'].to_list()

## Train model & annnoy index 

In [20]:
vector_size = 32
epochs = 10
sg = 1 # 1 for skip-gram
pop_thresh = 0.82415
window = 4

sentences = train_sessions['prev_items'].to_list() + test_sessions['prev_items'].to_list()
len(sentences)


from gensim.models.callbacks import CallbackAny2Vec

class callback(CallbackAny2Vec):
    '''Callback to print loss after each epoch.'''

    def __init__(self):
        self.epoch = 0
        self.loss_to_be_subed = 0

    def on_epoch_end(self, model):
        loss = model.get_latest_training_loss()
        loss_now = loss - self.loss_to_be_subed
        self.loss_to_be_subed = loss
        print('Loss after epoch {}: {}'.format(self.epoch, loss_now))
        self.epoch += 1
w2vec = Word2Vec(sentences=sentences, vector_size=vector_size, epochs = epochs, sg=sg,
                 min_count=1, workers=14,
                 window=window,
                  compute_loss=True
              , callbacks=[callback()])

3589687

Loss after epoch 0: 11067542.0
Loss after epoch 1: 6493280.0
Loss after epoch 2: 2888668.0
Loss after epoch 3: 2580968.0
Loss after epoch 4: 2329516.0
Loss after epoch 5: 2048868.0
Loss after epoch 6: 1821566.0
Loss after epoch 7: 1773816.0
Loss after epoch 8: 1667380.0
Loss after epoch 9: 1025936.0


In [23]:
w2v_model_file

'../model_training/v2/w2v.model'

In [None]:

# 100 trees are being used in this example
annoy_index = AnnoyIndexer(w2vec, num_tree)

## Save Model 

In [24]:
w2vec.save(w2v_model_file)

In [None]:

annoy_index.save(annoy_index_file)

In [31]:
annoy_index_file

'../model_training/v2/annoy_index_100_trees.index'

Generate Submission 🏋️‍♀️



Submission format:
1. The submission should be a **parquet** file with the sessions from all the locales. 
2. Predicted products ids per locale should only be a valid product id of that locale. 
3. Predictions should be added in new column named **"next_item_prediction"**.
4. Predictions should be a list of string id values

In [26]:
# def random_predicitons(locale, sess_test_locale):
#     random_state = np.random.RandomState(42)
#     products = read_product_data().query(f'locale == "{locale}"')
#     predictions = []
#     for _ in range(len(sess_test_locale)):
#         predictions.append(
#             list(products['id'].sample(PREDS_PER_SESSION, replace=True, random_state=random_state))
#         ) 
#     sess_test_locale['next_item_prediction'] = predictions
#     sess_test_locale.drop('prev_items', inplace=True, axis=1)
#     return sess_test_locale

In [27]:
# test_sessions.head()

In [28]:
# test_sessions

## Load model 

In [4]:
w2v_model_file

'../model_training/v2/w2v.model'

In [5]:
w2vec = Word2Vec.load(w2v_model_file)
annoy_index = AnnoyIndexer()
annoy_index.load(annoy_index_file)

## Load data 

In [6]:
train_pl = pl.scan_parquet(os.path.join(base_dir, raw_data_session_id_dir, 'sessions_train.parquet'), n_rows=n_rows).filter(pl.col('locale').is_in(target_locals)).with_columns(pl.col('prev_items').apply(str2list))
eval_pl = pl.scan_parquet(os.path.join(base_dir, raw_data_session_id_dir, 'sessions_eval.parquet'), n_rows=n_rows).filter(pl.col('locale').is_in(target_locals)).with_columns(pl.col('prev_items').apply(str2list))

# df_sess.head(3).collect()
test_pl = pl.scan_parquet(os.path.join(base_dir, raw_data_session_id_dir, 'sessions_test_task1.parquet'), n_rows=n_rows).with_columns(pl.col('prev_items').apply(str2list))

## Model Eval

In [7]:
# %%time
# eval_df = w2v_rec(df=eval_pl.collect().to_pandas(),
#                    w2vec=w2vec, annoy_index=annoy_index, topn=100)

In [8]:
# eval_pl.schema

In [15]:
%%time
eval_pl = w2v_rec(target_df=eval_pl,
                   w2vec=w2vec, annoy_index=annoy_index, topn=100)

CPU times: user 113 µs, sys: 17 µs, total: 130 µs
Wall time: 135 µs


In [16]:
# eval_pl.head().collect()

In [17]:
# target_df = pl.from_pandas(eval_df)

In [18]:
# target_df.schema

In [19]:
eval_pl.schema

{'prev_items': Unknown,
 'next_item': Utf8,
 'locale': Utf8,
 'session_id': Int64,
 'next_item_prediction': Unknown}

In [20]:
# pl.List()

In [21]:
# eval_pl.with_columns(
#     pl.col('next_item_prediction').cast(pl.List(pl.Utf8))
# ).schema

In [22]:
# eval_pl.collect().schema

In [23]:
model_eval(target_df=eval_pl)

mrr,recall@20,recall@100
f64,f64,f64
0.034678,0.159,0.269


## Candidate Saving 

In [9]:
# train_pl.head().collect()

In [80]:
# x = ["B09SMK3R8H", "B01N4ND0F9"]
# w2vec.wv.most_similar(positive=x, topn=topn, indexer=annoy_index)

In [81]:
# train_pl.schema

In [82]:
# target_df.collect().schema

In [83]:
# # %%time
# def pd_get_rec(df, w2vec, annoy_index, topn):
#     next_item_prediction_lst = []
#     for a in tqdm(df.iterrows(), total=len(df)):
#         prev_items = a[1]['prev_items']
#         res = [ele.replace('[', '').replace(']', '').replace('\n', '').replace("'", '').replace(' ', '') for ele in prev_items.split(' ')]
#         # print(type(res))
#         similarity_dic = w2vec.wv.most_similar(positive=res, topn=topn, indexer=annoy_index)
#         res = [item for item, simi in similarity_dic]
#         next_item_prediction_lst.append(res)
#     df['next_item_prediction'] = next_item_prediction_lst
#     return df

In [85]:
test_df = w2v_rec(df=test_pl.collect().to_pandas(),
                   w2vec=w2vec, annoy_index=annoy_index, topn=100)

100%|██████████| 316971/316971 [13:38<00:00, 387.39it/s]


In [86]:
train_df = w2v_rec(df=train_pl.collect().to_pandas(),
                   w2vec=w2vec, annoy_index=annoy_index, topn=100)

100%|██████████| 2946273/2946273 [2:12:06<00:00, 371.71it/s]  


### Saving 

In [21]:
cols_to_keep = ['session_id', 'next_item_prediction']


In [20]:
pl.from_pandas(train_df).select(cols_to_keep).write_parquet(os.path.join(base_dir, candidate_dir, 'task1_train_w2v_top100.parquet'))
pl.from_pandas(test_df).select(cols_to_keep).write_parquet(os.path.join(base_dir, candidate_dir, 'task1_test_w2v_top100.parquet'))

NameError: name 'train_df' is not defined

In [22]:
pl.from_pandas(eval_df).select(cols_to_keep).write_parquet(os.path.join(base_dir, candidate_dir, 'task1_eval_w2v_top100.parquet'))


In [34]:
# def w2v_rec(target_df, w2vec, annoy_index, topn=100):
    
#     next_item_prediction_lst = []
#     for a in tqdm(df.iterrows(), total=len(df)):
#         prev_items = a[1]['prev_items']
#         res = [ele.replace('[', '').replace(']', '').replace('\n', '').replace("'", '').replace(' ', '') for ele in prev_items.split(' ')]
#         # print(type(res))
#         similarity_dic = w2vec.wv.most_similar(positive=res, topn=topn, indexer=annoy_index)
#         res = [item for item, simi in similarity_dic]
#         next_item_prediction_lst.append(res)
#     df['next_item_prediction'] = next_item_prediction_lst
    
#     return df

target_df = train_pl

# def get_w2v_rec(x):
#     res = w2vec.wv.most_similar(positive=x, topn=topn, indexer=annoy_index)
#     return res

get_vector = w2v.wv[pl.element()]

topn=100

(
    target_df
        .collect()
        .with_columns(
            pl.col('prev_items').apply(lambda x: get_w2v_rec(x=x))
            )
).head().collect()

ComputeError: ValueError: cannot do arithmetic with series of dtype: Utf8 and argument of type: <class 'str'>

In [18]:
# predictions = []
# test_locale_names = test_sessions['locale'].unique()
# for locale in test_locale_names:
#     sess_test_locale = test_sessions.query(f'locale == "{locale}"').copy()
#     predictions.append(
#         random_predicitons(locale, sess_test_locale)
#     )
# predictions = pd.concat(predictions).reset_index(drop=True)
# predictions.sample(5)

In [17]:
# def get_predictions(row):
#     prev_items = row['prev_items']
#     # try:
#     similarity_dic = w2vec.wv.most_similar(positive=prev_items, topn=100)
#     res = [item for item, simi in similarity_dic] 
#         # print(err)
#     # except Exception as e:
#         # print(e)
#     # res = prev_items
#     return res 

In [18]:
# # def get_session_vector(df, w2vec):
# #   aids = df.aid.unique()
# #   for i, aid in enumerate(aids):
# #     vec = w2vec.wv[aid] if i == 0 else vec + w2vec.wv[aid]
# #   vec = vec / len(aids)
# #   return vec

# # def get_close_aids(df, w2vec, index, idx2aid, n=20):
# #   session_vec = get_session_vector(df, w2vec)
# #   close_aids = get_nearest_neighbours(session_vec, index, idx2aid, n)
# #   return close_aids

# # def get_nearest_neighbours(x, index, idx2aid, n=20):
# #   indexes, distances = index.get_nns_by_vector(x, n, search_k=-1, include_distances=True)
# #   aids = [idx2aid[i] for i in indexes]
# #   df = pd.DataFrame(data={'aid' : aids, 'w2vec_dist' : distances})
# #   return df

# index = AnnoyIndex(vector_size, distance)
# aid2idx = {}

# popular_aids = test.groupby('aid', as_index=False).agg({'session' : 'count'})
# popular_aids = popular_aids.loc[popular_aids['session'] > popular_aids['session'].quantile(pop_thresh)]
# popular_aid_list = popular_aids.aid.unique()

# for i, aid in enumerate(popular_aid_list):
# aid = str(aid)
# aid2idx[aid] = i
# index.add_item(i, w2vec.wv[aid])
# idx2aid = { v : k for k, v in aid2idx.items()}
# index.build(40) # build 40 trees

# reduced_test = test.copy()
# reduced_test['aid'] = reduced_test['aid'].astype('str')
# reduced_test['aid_vector'] = reduced_test['aid'].apply(lambda x: w2vec.wv[x])

# reduced_test = reduced_test.groupby('session').apply(lambda x: get_close_aids(x, w2vec, index, idx2aid, n)).reset_index().drop(columns='level_1')


In [19]:
# df = test_sessions.sample(100000)

In [20]:
# %%time
# df['next_item_prediction'] = df.apply(lambda row: get_predictions(row), axis=1)

In [21]:
# test_sessions.sample()

In [22]:
# lst = ['B002ZCXPDU', 'B083MNDJLD', 'B08GR61FN6']

# approximate_neighbors = w2vec.wv.most_similar(positive=lst, topn=5, indexer=annoy_index)
# exact_neighbors = w2vec.wv.most_similar(positive=lst, topn=5)

In [23]:
# annoy_index_file

In [24]:
# print("Approximate: ")
# print(approximate_neighbors)
# print()
# print('Exact: ')
# print(exact_neighbors)

In [25]:
def get_rec(prev_items, topn=10, annoy_index=None):
    # print(prev_items)
    res = [ele.replace('[', '').replace(']', '').replace('\n', '').replace("'", '').replace(' ', '') for ele in prev_items.split(' ')]
    # print(type(res))
    try:
        if annoy_index is not None:
            similarity_dic = w2vec.wv.most_similar(positive=res, topn=topn, indexer=annoy_index)
        else:
            similarity_dic = w2vec.wv.most_similar(positive=res, topn=topn)
        res = [item for item, simi in similarity_dic] 
        # print(err)
    except Exception as e:
        print(e)
    
    return res

In [26]:
# df = test_sessions.sample(10)

In [27]:
def i2i(df):
    pl_df = pl.from_dataframe(df)
    pl_df = (
        pl_df
            .with_columns(pl.col('prev_items').apply(lambda row: get_rec(row, annoy_index=annoy_index, topn=100)).alias('next_item_prediction'))
    )
    return pl_df

In [29]:
# %%time
def pd_get_rec(df, w2vec, annoy_index, topn):
    next_item_prediction_lst = []
    for a in tqdm(df.iterrows(), total=len(df)):
        prev_items = a[1]['prev_items']
        res = [ele.replace('[', '').replace(']', '').replace('\n', '').replace("'", '').replace(' ', '') for ele in prev_items.split(' ')]
        # print(type(res))
        similarity_dic = w2vec.wv.most_similar(positive=res, topn=topn, indexer=annoy_index)
        res = [item for item, simi in similarity_dic]
        next_item_prediction_lst.append(res)
    df['next_item_prediction'] = next_item_prediction_lst
    return df

In [28]:
test_sessions.shape

(316971, 2)

In [34]:
%%time
pl_df = i2i(test_sessions.sample(100))
result_df = pl_df.to_pandas()

CPU times: user 338 ms, sys: 237 ms, total: 575 ms
Wall time: 51.8 s


In [69]:
# test_sessions.head()

In [68]:
# df

CPU times: user 1.06 ms, sys: 0 ns, total: 1.06 ms
Wall time: 900 µs


In [99]:
test_sessions = pd_get_rec(test_sessions, w2vec, annoy_index, 100)

100%|██████████| 316971/316971 [40:26<00:00, 130.61it/s] 


In [76]:
# df

In [63]:
# similarity_dic[0]

In [64]:
train_sessions.info()

<class 'pandas.core.frame.DataFrame'>
Index: 885 entries, 2866972 to 1610085
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   prev_items  885 non-null    object
 1   next_item   885 non-null    object
 2   locale      885 non-null    object
dtypes: object(3)
memory usage: 27.7+ KB


In [46]:
# pl_df.head()

In [47]:
# %%time
# pl_df = pl.from_dataframe(df)
# pl_df = (
#     pl_df
#         .with_columns(pl.col('prev_items').apply(lambda row: get_rec(row)).alias('next_item_prediction'))
# )
# result_df = pl_df.to_pandas()

In [48]:
result_df.shape

(316971, 3)

In [49]:
# test_sessions['next_item_prediction'] = test_sessions.apply(lambda row: get_predictions(row), axis=1)

In [100]:
predictions = result_df[['locale', 'next_item_prediction']]

In [101]:
predictions.head()

Unnamed: 0,locale,next_item_prediction
0,UK,"[B08M2PY6J5, B07GDRHBKQ, B07JFH3QVK, B07ZKWQQ8..."
1,UK,"[B0B61RHQWQ, B08FF24CPS, B09XTQCHMW, B08HJDGQD..."
2,UK,"[B08MFDT65P, B079R741XF, B08MFH1TTJ, B08LQBJ9C..."
3,JP,"[B07BYFCVJP, B09Y1WTR2Z, B004O4C0RY, B07FPGLLL..."
4,UK,"[1789083451, 0008534993, B08L9YYZSZ, 178294413..."


In [102]:
predictions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 2 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   locale                100 non-null    object
 1   next_item_prediction  100 non-null    object
dtypes: object(2)
memory usage: 1.7+ KB


## Model Eval

In [30]:
train_sessions.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3272716 entries, 0 to 3272715
Data columns (total 3 columns):
 #   Column      Dtype 
---  ------      ----- 
 0   prev_items  object
 1   next_item   object
 2   locale      object
dtypes: object(3)
memory usage: 99.9+ MB


In [31]:
%%time
if debug:
    num = 30
else:
    num = 30000
train_df = pd_get_rec(df=train_sessions.sample(num), w2vec=w2vec, topn=100, annoy_index=annoy_index)
# train_eval_df = pl_df.to_pandas()

100%|██████████| 300000/300000 [15:21<00:00, 325.62it/s]


CPU times: user 13min 24s, sys: 7.04 s, total: 13min 31s
Wall time: 15min 21s


In [32]:
%%time
eval_cols = ['len', 'recall@20', 'recall@100']
train_df[eval_cols] = train_df.apply(pd_get_recall_at_k, axis=1, result_type='expand')
print(train_df[eval_cols].mean())

len           100.00000
recall@20       0.17188
recall@100      0.29100
dtype: float64
CPU times: user 14.2 s, sys: 176 ms, total: 14.3 s
Wall time: 14.3 s


In [33]:
train_df.shape

(300000, 7)

In [34]:
train_df.to_parquet(f'../data/eval_data/w2v_train_eval_result_300k.parquet', engine='pyarrow')

In [78]:
# df.apply(pd_get_recall_at_k, axis=1, result_type='expand')

# Validate predictions ✅ 😄

In [104]:
test_sessions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 316971 entries, 0 to 316970
Data columns (total 3 columns):
 #   Column                Non-Null Count   Dtype 
---  ------                --------------   ----- 
 0   prev_items            316971 non-null  object
 1   locale                316971 non-null  object
 2   next_item_prediction  316971 non-null  object
dtypes: object(3)
memory usage: 7.3+ MB


In [103]:
check_predictions(predictions, test_sessions=test_sessions, 
                  # check_products=True, product_df=products
                 )

AssertionError: Session ids of DE doesn't match

In [None]:
# Its important that the parquet file you submit is saved with pyarrow backend
predictions.to_parquet(f'submission_{task}.parquet', engine='pyarrow')

## Submit to AIcrowd 🚀

In [None]:
# You can submit with aicrowd-cli, or upload manually on the challenge page.
!aicrowd submission create -c task-1-next-product-recommendation -f "submission_task1.parquet"