## Setup data and task information

In [1]:
import os
import numpy as np
import pandas as pd
from functools import lru_cache

In [2]:
train_data_dir = '../data/raw_data/'
test_data_dir = '../data/raw_data/'
task = 'task2'
PREDS_PER_SESSION = 100

In [3]:
# Cache loading of data for multiple calls

@lru_cache(maxsize=1)
def read_product_data():
    return pd.read_csv(os.path.join(train_data_dir, 'products_train.csv'))

@lru_cache(maxsize=1)
def read_train_data():
    return pd.read_csv(os.path.join(train_data_dir, 'sessions_train.csv'))

@lru_cache(maxsize=3)
def read_test_data(task):
    return pd.read_csv(os.path.join(test_data_dir, f'sessions_test_{task}.csv'))

## Data Description

The Multilingual Shopping Session Dataset is a collection of **anonymized customer sessions** containing products from six different locales, namely English, German, Japanese, French, Italian, and Spanish. It consists of two main components: **user sessions** and **product attributes**. User sessions are a list of products that a user has engaged with in chronological order, while product attributes include various details like product title, price in local currency, brand, color, and description.

---

### Each product as its associated information:


**locale**: the locale code of the product (e.g., DE)

**id**: a unique for the product. Also known as Amazon Standard Item Number (ASIN) (e.g., B07WSY3MG8)

**title**: title of the item (e.g., “Japanese Aesthetic Sakura Flowers Vaporwave Soft Grunge Gift T-Shirt”)

**price**: price of the item in local currency (e.g., 24.99)

**brand**: item brand name (e.g., “Japanese Aesthetic Flowers & Vaporwave Clothing”)

**color**: color of the item (e.g., “Black”)

**size**: size of the item (e.g., “xxl”)

**model**: model of the item (e.g., “iphone 13”)

**material**: material of the item (e.g., “cotton”)

**author**: author of the item (e.g., “J. K. Rowling”)

**desc**: description about a item’s key features and benefits called out via bullet points (e.g., “Solid colors: 100% Cotton; Heather Grey: 90% Cotton, 10% Polyester; All Other Heathers …”)


## EDA 💽

In [4]:
def read_locale_data(locale, task):
    products = read_product_data().query(f'locale == "{locale}"')
    sess_train = read_train_data().query(f'locale == "{locale}"')
    sess_test = read_test_data(task).query(f'locale == "{locale}"')
    return products, sess_train, sess_test

def show_locale_info(locale, task):
    products, sess_train, sess_test = read_locale_data(locale, task)

    train_l = sess_train['prev_items'].apply(lambda sess: len(sess))
    test_l = sess_test['prev_items'].apply(lambda sess: len(sess))

    print(f"Locale: {locale} \n"
          f"Number of products: {products['id'].nunique()} \n"
          f"Number of train sessions: {len(sess_train)} \n"
          f"Train session lengths - "
          f"Mean: {train_l.mean():.2f} | Median {train_l.median():.2f} | "
          f"Min: {train_l.min():.2f} | Max {train_l.max():.2f} \n"
          f"Number of test sessions: {len(sess_test)}"
        )
    if len(sess_test) > 0:
        print(
             f"Test session lengths - "
            f"Mean: {test_l.mean():.2f} | Median {test_l.median():.2f} | "
            f"Min: {test_l.min():.2f} | Max {test_l.max():.2f} \n"
        )
    print("======================================================================== \n")

In [5]:
products = read_product_data()
locale_names = products['locale'].unique()
for locale in locale_names:
    show_locale_info(locale, task)

Locale: DE 
Number of products: 518327 
Number of train sessions: 1111416 
Train session lengths - Mean: 57.89 | Median 40.00 | Min: 27.00 | Max 2060.00 
Number of test sessions: 0

Locale: JP 
Number of products: 395009 
Number of train sessions: 979119 
Train session lengths - Mean: 59.61 | Median 40.00 | Min: 27.00 | Max 6257.00 
Number of test sessions: 0

Locale: UK 
Number of products: 500180 
Number of train sessions: 1182181 
Train session lengths - Mean: 54.85 | Median 40.00 | Min: 27.00 | Max 2654.00 
Number of test sessions: 0

Locale: ES 
Number of products: 42503 
Number of train sessions: 89047 
Train session lengths - Mean: 48.82 | Median 40.00 | Min: 27.00 | Max 792.00 
Number of test sessions: 8177
Test session lengths - Mean: 50.23 | Median 40.00 | Min: 27.00 | Max 396.00 


Locale: FR 
Number of products: 44577 
Number of train sessions: 117561 
Train session lengths - Mean: 47.25 | Median 40.00 | Min: 27.00 | Max 687.00 
Number of test sessions: 12521
Test session l

In [6]:
products.sample(5)

Unnamed: 0,id,locale,title,price,brand,color,size,model,material,author,desc
1463884,B092MH82Q7,FR,SUSSURRO 3 Pcs Trousse d'entretien des Cuticul...,4.99,SUSSURRO,Argent,3 Unité (Lot de 1),,Acier inoxydable,,➤➤➤3 kit de manucure en 1: Ce kit manucure com...
152770,B001PT9C9K,DE,Trainings- und Ausbildungsgeschirr Side by Sid...,15.99,HUNTER,schwarz,47-69 cm,97783,Nylon,,Halsumfang individuell einstellbar und durch K...
604158,B00LIRG0Z8,JP,I live(アイリブ) ジドウテンメツタイプ [オートテールライト] ショウド&シンドウダ...,1650.0,I live(アイリブ),ブラック,幅102mm x 奥行35mm x 厚さ25mm,,プラスチック,,※ブラケットの組み替えにより縦位置、横位置での取り付けが可能
1454538,B095PNV6L6,ES,"Desatascador de Tuberías 3 metros THORQUE, Pun...",13.97,Gemini Alpha S.R.L,,3 metros,TQ000002,Acero al carbono,,[MODO DE USO] Con la compra del desatascador d...
802615,B019UQ5TKK,JP,(ジンセルフ) JINSELF アルトC デザインオカリナ 12穴 陶器 便利な3点セット ...,1799.0,JIN SELF,*ピンク*,,,陶器,,オカリナ【高さ:9cm 幅:17cm 奥行:3.7cm/重さ:227g/素材:陶器製】


In [7]:
train_sessions = read_train_data()
train_sessions.sample(5)

Unnamed: 0,prev_items,next_item,locale
682373,['B006XE8DEC' 'B08KWHS5SD'],B07P7V9SP7,DE
922147,['B01D080ILY' 'B01LXR2H1I'],B07K336J92,DE
3459835,['B07DF28TFG' 'B07DDT2PM2'],B09HTW4LJ3,FR
931773,['B01M12RE4D' 'B01LZ3DXT5' 'B01LYRVQOG' 'B01LY...,B074W8S1RN,DE
1084872,['B09VCB8WB8' 'B09HC4K3KP'],B09V82D62W,DE


In [8]:
test_sessions = read_test_data(task)
test_sessions.sample(5)

Unnamed: 0,prev_items,locale
94,['B0B4K55TTZ' 'B09HY5THQD'],ES
28082,['B0B3S5BTXH' 'B0B7SBPCYB'],IT
32641,['B07ZH7HYDX' 'B08HR1VGSR' 'B09GTTY863' 'B07Y8...,IT
31461,['B08GY7286R' 'B01N3PT6OE'],IT
5406,['B0BG8FMCMY' 'B0B1SQ3VTQ' 'B0B8Z3CLKF' 'B09XD...,ES


In [16]:
test_file = os.path.join(test_data_dir, f'sessions_test_{task}.csv')
test_file

'../data/raw_data/sessions_test_task2.csv'

In [17]:
test_df = pd.read_csv(test_file)

In [18]:
test_df.head()

Unnamed: 0,prev_items,locale
0,['B07GTS7SWK' 'B07GTS7SWK'],ES
1,['B0B33YWVHR' '849988993X' 'B09QQG85HM' 'B0BJZ...,ES
2,['B08FMPXDTJ' 'B0B4612MTM' 'B0B45YR21M' 'B0B4F...,ES
3,['B07R3W4XQ7' 'B07R3FB5B5'],ES
4,['B09V4KBWPL' 'B09G9FTLPB'],ES


In [22]:
test_df.shape

(34690, 2)

In [21]:
test_df['locale'].value_counts()

IT    13992
FR    12521
ES     8177
Name: locale, dtype: int64

## Generate Submission 🏋️‍♀️



Submission format:
1. The submission should be a **parquet** file with the sessions from all the locales. 
2. Predicted products ids per locale should only be a valid product id of that locale. 
3. Predictions should be added in new column named **"next_item_prediction"**.
4. Predictions should be a list of string id values

In [9]:
def random_predicitons(locale, sess_test_locale):
    random_state = np.random.RandomState(42)
    products = read_product_data().query(f'locale == "{locale}"')
    predictions = []
    for _ in range(len(sess_test_locale)):
        predictions.append(
            list(products['id'].sample(PREDS_PER_SESSION, replace=True, random_state=random_state))
        ) 
    sess_test_locale['next_item_prediction'] = predictions
    sess_test_locale.drop('prev_items', inplace=True, axis=1)
    return sess_test_locale

In [10]:
test_sessions = read_test_data(task)
predictions = []
test_locale_names = test_sessions['locale'].unique()
for locale in test_locale_names:
    sess_test_locale = test_sessions.query(f'locale == "{locale}"').copy()
    predictions.append(
        random_predicitons(locale, sess_test_locale)
    )
predictions = pd.concat(predictions).reset_index(drop=True)
predictions.sample(5)

Unnamed: 0,locale,next_item_prediction
23010,IT,"[B087DJZ5TK, B0784PXZ4V, B0B5RMW85P, B00AIZNCY..."
18889,FR,"[B07KJWWSGS, B09Y8D6LTH, B09D7R4SZG, B09NS944X..."
4991,ES,"[B09WH4YT7L, B084P1V7GP, B0964457BL, B0B3DXRTF..."
9936,FR,"[B097KRN5MR, B08486T5Y2, B09L7KRZNS, B09PNSL6S..."
13417,FR,"[B0B4NTM35T, B0B58XH1NB, B00R2D6ELC, B09KS2CRS..."


# Validate predictions ✅

In [11]:
def check_predictions(predictions, check_products=False):
    """
    These tests need to pass as they will also be applied on the evaluator
    """
    test_locale_names = test_sessions['locale'].unique()
    for locale in test_locale_names:
        sess_test = test_sessions.query(f'locale == "{locale}"')
        preds_locale =  predictions[predictions['locale'] == sess_test['locale'].iloc[0]]
        assert sorted(preds_locale.index.values) == sorted(sess_test.index.values), f"Session ids of {locale} doesn't match"

        if check_products:
            # This check is not done on the evaluator
            # but you can run it to verify there is no mixing of products between locales
            # Since the ground truth next item will always belong to the same locale
            # Warning - This can be slow to run
            products = read_product_data().query(f'locale == "{locale}"')
            predicted_products = np.unique( np.array(list(preds_locale["next_item_prediction"].values)) )
            assert np.all( np.isin(predicted_products, products['id']) ), f"Invalid products in {locale} predictions"

In [12]:
check_predictions(predictions)

In [13]:
test_locale_names = test_sessions['locale'].unique()

In [14]:
test_locale_names

array(['ES', 'FR', 'IT'], dtype=object)

In [15]:
# Its important that the parquet file you submit is saved with pyarrow backend
predictions.to_parquet(f'submission_{task}.parquet', engine='pyarrow')

## Submit to AIcrowd 🚀

In [None]:
# You can submit with aicrowd-cli, or upload manually on the challenge page.
!aicrowd submission create -c task-2-next-product-recommendation-for-underrepresented-languages -f "submission_task2.parquet"