# Package 

In [None]:
%load_ext autoreload
%autoreload 2
import sys
import logging
sys.path.append('../')
import os
import warnings
warnings.simplefilter('ignore')

import pickle
import gc
import re
from collections import defaultdict, Counter

import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)
from tqdm.auto import tqdm
import polars as pl

from src.eval import get_recall_at_k, pd_get_recall_at_k
from utils import *

In [1]:
# df_prod = pd.read_csv('data/products_train.csv')
# df_prod

# Config 

In [4]:
debug = False
model_for_eval = True

model_version = 'next_item_counter'


debug_session_num = 100
train_data_dir = '.'
test_data_dir = '.'
task = 'task1'

model_dir = f'../model_training/{model_version}/'

# target locales: locales needed for task1
target_locals = ["DE", 'JP', 'UK']


submit_file = f'submission_{task}_next_item_counter.parquet'

if model_for_eval:
    model_file = os.path.join(model_dir, 'model_for_eval.pkl')
else:
    model_file = os.path.join(model_dir, 'model.pkl')

In [5]:
! mkdir {model_dir}

mkdir: cannot create directory ‘../model_training/next_item_counter/’: File exists


In [6]:
model_file

'../model_training/next_item_counter/model_for_eval.pkl'

# Function 

In [7]:
def str2list(x):
    x = x.replace('[', '').replace(']', '').replace("'", '').replace('\n', ' ').replace('\r', ' ')
    l = [i for i in x.split() if i]
    return l

# Data 

In [8]:
df_sess = pd.read_csv('sessions_train.csv')
df_sess

Unnamed: 0,prev_items,next_item,locale
0,['B09W9FND7K' 'B09JSPLN1M'],B09M7GY217,DE
1,['B076THCGSG' 'B007MO8IME' 'B08MF65MLV' 'B001B...,B001B4THSA,DE
2,['B0B1LGXWDS' 'B00AZYORS2' 'B0B1LGXWDS' 'B00AZ...,B0767DTG2Q,DE
3,['B09XMTWDVT' 'B0B4MZZ8MB' 'B0B7HZ2GWX' 'B09XM...,B0B4R9NN4B,DE
4,['B09Y5CSL3T' 'B09Y5DPTXN' 'B09FKD61R8'],B0BGVBKWGZ,DE
...,...,...,...
3606244,['B086CYFSKW' 'B0874F9859' 'B086CYFSKW'],B07B5TYD76,IT
3606245,['B09NRZKZ7V' 'B08WJTPV93'],B08L1P4C3D,IT
3606246,['B085JFX7MP' 'B085JGHW8R'],B01MPWVD44,IT
3606247,['B00B0UING2' 'B00B0UING2'],B00D3HYEZ4,IT


In [9]:
df_test = pd.read_csv('sessions_test_task1.csv')
df_test

Unnamed: 0,prev_items,locale
0,['B08V12CT4C' 'B08V1KXBQD' 'B01BVG1XJS' 'B09VC...,DE
1,['B00R9R5ND6' 'B00R9RZ9ZS' 'B00R9RZ9ZS'],DE
2,['B07YSRXJD3' 'B07G7Q5N6G' 'B08C9Q7QVK' 'B07G7...,DE
3,['B08KQBYV43' '3955350843' '3955350843' '39553...,DE
4,['B09FPTCWMC' 'B09FPTQP68' 'B08HMRY8NG' 'B08TB...,DE
...,...,...
316966,['B077SZ2C3Y' 'B0B14M3VZX'],UK
316967,['B08KFHDPY9' 'B0851KTSRZ' 'B08KFHDPY9' 'B0851...,UK
316968,['B07PY1N81F' 'B07Q1Z8SQN' 'B07PY1N81F' 'B07Q1...,UK
316969,['B01MCQMORK' 'B09JYZ325W'],UK


In [86]:
df_sess = df_sess[df_sess['locale'].isin(target_locals)]

if debug:
    df_sess = df_sess.sample(debug_session_num)
    df_test = df_test.sample(debug_session_num)

In [87]:
df_sess.shape

(3272716, 3)

# Next Item Statistics 

In [30]:
next_item_dict = defaultdict(list)

for _, row in tqdm(df_sess.iterrows(), total=len(df_sess)):
    prev_items = str2list(row['prev_items'])
    if not model_for_eval:
        next_item = row['next_item']
    prev_items_length = len(prev_items)
    if prev_items_length <= 1:
        if not model_for_eval:
            next_item_dict[prev_items[0]].append(next_item)
    else:
        for i, item in enumerate(prev_items[:-1]):
            next_item_dict[item].append(prev_items[i+1])
        if not model_for_eval:
            next_item_dict[prev_items[-1]].append(next_item)

  0%|          | 0/100 [00:00<?, ?it/s]

In [32]:
# next_item_dict

In [33]:
for _, row in tqdm(df_test.iterrows(), total=len(df_test)):
    prev_items = str2list(row['prev_items'])
    prev_items_length = len(prev_items)
    if prev_items_length <= 1:
        continue
    else:
        for i, item in enumerate(prev_items[:-1]):
            next_item_dict[item].append(prev_items[i+1])

  0%|          | 0/100 [00:00<?, ?it/s]

In [72]:
next_item_map = {}
topn = 100
for item in tqdm(next_item_dict):
    counter = Counter(next_item_dict[item])
    most_common_cnt = counter.most_common(1)[0][1]
    most_common_lst = [np.array(ele) for ele in list(zip(*counter.most_common(topn)))]
    most_common_lst[1] = most_common_lst[1]/most_common_cnt
    next_item_map[item] = most_common_lst
    # next_item_map[item] = [i[0] for i in counter.most_common(100)]

  0%|          | 0/487 [00:00<?, ?it/s]

In [73]:
# (np.array([1,2, 4])/1).shape

In [74]:
# dir([1, 2])

In [75]:
# most_common_lst

In [77]:
# next_item_map

# Top200 for fallback logics

In [164]:
k = []
v = []

for item in next_item_dict:
    k.append(item)
    v.append(next_item_dict[item])
    
df_next = pd.DataFrame({'item': k, 'next_item': v})
df_next = df_next.explode('next_item').reset_index(drop=True)
df_next

Unnamed: 0,item,next_item
0,B09W9FND7K,B09JSPLN1M
1,B09W9FND7K,B09JSPLN1M
2,B09W9FND7K,B09JSPLN1M
3,B09W9FND7K,B09JSPLN1M
4,B09W9FND7K,B09W9FND7K
...,...,...
11859719,B0BFPLN8FQ,B0BF5GT13X
11859720,B09737CD6H,B08DJ9SQFY
11859721,B084GZ3QZ7,B00CBAW8SE
11859722,B081TXFKS5,B004RN5I6W


In [165]:
# df_next['next_item'].value_counts().index.tolist()[:200]

In [166]:
top200 = df_next['next_item'].value_counts().index.tolist()[:200]

In [167]:
model = {
    'top200': top200
    , 'next_item_map': next_item_map
}

# Model anlaysis

## top200 model 

In [120]:
products = read_product_data(train_data_dir=train_data_dir)


In [121]:
top200_df = pd.DataFrame({'item': top200})

In [122]:
products.head(2)

Unnamed: 0,id,locale,title,price,brand,color,size,model,material,author,desc
0,B005ZSSN10,DE,RED DRAGON Amberjack 3 - Steel Tip 22 Gramm Wo...,30.95,RED DRAGON,,,RDD0089,,,Amberjacks Steel Dartpfeile sind verfügbar in ...
1,B08PRYN6LD,DE,Simply Keto Lower Carb* Schokodrops ohne Zucke...,17.9,Simply Keto,,750 g (1er Pack),,,,🌱 NATÜRLICHE SÜSSE DURCH ERYTHRIT - Wir stelle...


In [124]:
top200_df = top200_df.merge(products, how='left', left_on='item', right_on='id')

In [125]:
top200_df.sample(2)

Unnamed: 0,item,id,locale,title,price,brand,color,size,model,material,author,desc
412,B0B85SZ4HW,B0B85SZ4HW,JP,10個セット【2022-08オミクロン株対応】日本製 抗原検査キット 抗原検査スティック 【...,3800.0,ＬｉｌｙＮａ,,,,,,［この商品について］本件抗原検査キットは、疫学調査等のための試験研究を目的とするものであり、...
274,B09HGGV5R5,B09HGGV5R5,ES,JBL WAVE 200TWS Auriculares inalámbricos intra...,53.2,Harman,Negro,in Ear,JBLW200TWSBLK,,,Auriculares sin cables con asistente de voz co...


In [127]:
top200_df['locale'].value_counts()

locale
JP    113
DE     83
UK     83
FR     56
ES     48
IT     45
Name: count, dtype: int64

## Save model 

In [168]:
model_file

'../model_training/next_item_counter/model_for_eval.pkl'

In [169]:
with open(model_file, 'wb') as f:
    pickle.dump(model, f)

# Get final result 

## Load Model 

In [10]:
with open(model_file, 'rb') as f:
    model = pickle.load(f)

In [11]:
# top200

In [12]:
# next_item_map

In [13]:
def get_rec(target_df, model):
    next_item_map = model['next_item_map']
    top200  = model['top200']
    target_df['last_item'] = target_df['prev_items'].apply(lambda x: str2list(x)[-1])
    target_df['next_item_prediction'] = target_df['last_item'].map(next_item_map)
    preds = []

    for _, row in tqdm(target_df.iterrows(), total=len(target_df)):
        pred_orig = row['next_item_prediction']
        pred = pred_orig
        prev_items = str2list(row['prev_items'])
        if type(pred) == float:
            pred = top200[:100]
        else:
            if len(pred_orig) < 100:
                for i in top200:
                    if i not in pred_orig and i not in prev_items:
                        pred.append(i)
                    if len(pred) >= 100:
                        break
            else:
                pred = pred[:100]
        preds.append(pred)
    target_df['next_item_prediction'] = preds
    print(target_df['next_item_prediction'].apply(len).describe())
    return target_df

In [14]:
model.keys()

dict_keys(['top200', 'next_item_map'])

# Model eval 

In [50]:
# df_sess.shape

In [51]:
# target_df = df_sess.sample(100000)

# target_df = df_test

In [52]:
# next_item_map = model['next_item_map']
# top200  = model['top200']
# target_df['last_item'] = target_df['prev_items'].apply(lambda x: str2list(x)[-1])
# target_df['next_item_prediction'] = target_df['last_item'].map(next_item_map)
# target_df['next_items_num'] = target_df['next_item_prediction'].apply(len)

In [53]:
# target_df.head(1)

In [54]:
# target_df['next_items_num'] = target_df['next_item_prediction'].apply(len)

In [23]:
# target_df['next_item_prediction'].isna().sum()/len(target_df)

In [55]:
target_df = pd.read_parquet(f'../data/eval_data/w2v_train_eval_result_300k.parquet')

In [56]:
target_df.shape

(300000, 7)

In [57]:
# target_df['next_item_prediction'].isna().sum()/len(target_df)

In [58]:
# target_df.loc[target_df['next_item_prediction'].isna(), 'next_item_prediction'] = ['B099NQFMG7']

In [59]:
target_df.sample(2)

Unnamed: 0,prev_items,next_item,locale,next_item_prediction,len,recall@20,recall@100
3209869,['0545703301' 'B07V48JZCQ' 'B08G4TK5WS' 'B08G4...,B08G4P2B7B,UK,"[B0913K77K3, B07NLKFC8X, B08V1CTCCM, B00BQB8GZ...",100,False,False
870010,['B08N52NSDF' 'B00GIV7J7W'],B07NVN75Y9,DE,"[B00GIV7J7W, B093W8TPFL, B07RHSWJW4, B004U8SE2...",100,False,False


In [60]:
# target_df['next_item_prediction'] = target_df['next_item_prediction'].fillna(['B09FY8KLR6'])

In [61]:
# target_df.head()

In [62]:
# target_df[~target_df['next_item_prediction'].isna()]['next_item_prediction'].apply(len).describe()

##f 

In [63]:
# target_df = 

In [64]:
# df_test = 
train_eval_df = get_rec(target_df=target_df, model=model)

  0%|          | 0/300000 [00:00<?, ?it/s]

count    300000.0
mean        100.0
std           0.0
min         100.0
25%         100.0
50%         100.0
75%         100.0
max         100.0
Name: next_item_prediction, dtype: float64


In [65]:
# train_eval_df.head()

In [66]:

eval_cols = ['len', 'recall@20', 'recall@100']
train_eval_df[eval_cols] = train_eval_df.apply(pd_get_recall_at_k, axis=1, result_type='expand')
print(train_eval_df[eval_cols].mean())

len           100.000000
recall@20       0.353117
recall@100      0.371717
dtype: float64


In [67]:
train_eval_df.shape

(300000, 8)

In [68]:
train_eval_df.to_parquet(f'../data/eval_data/next_item_counter_train_eval_300k.parquet', engine='pyarrow')

# Submit result 

In [134]:
submit_file

'submission_task1_next_item_counter.parquet'

In [22]:
df_test[['locale', 'next_item_prediction']].to_parquet(submit_file, engine='pyarrow')

In [142]:
# df_test.info()

In [None]:
target_df[['locale', 'next_item_prediction']].to_parquet(submit_file, engine='pyarrow')

In [None]:
# You can submit with aicrowd-cli, or upload manually on the challenge page.
!aicrowd submission create -c task-1-next-product-recommendation -f {submit_file}

[?25l[1;34msubmission_task1_next_item_counter.parquet[0m [90m━━━━━━━━[0m [35m0.0%[0m • [32m0.0/11…[0m • [31m?[0m • [36m-:--:--[0m
[2K[1A[2K[1;34msubmission_task1_next_item_counter.parquet[0m [90m━━━━━━━━[0m [35m0.0%[0m • [32m0.0/11…[0m • [31m?[0m • [36m-:--:--[0m
[2K[1A[2K[1;34msubmission_task1_next_item_counter.parquet[0m [90m━━━━━━━━[0m [35m0.0%[0m • [32m0.0/11…[0m • [31m?[0m • [36m-:--:--[0m
[2K[1A[2K[1;34msubmission_task1_next_item_counter.parquet[0m [90m━━━━━━━━[0m [35m0.0%[0m • [32m0.0/11…[0m • [31m?[0m • [36m-:--:--[0m
[2K[1A[2K[1;34msubmission_task1_next_item_counter.parquet[0m [90m━━━━━━━━[0m [35m0.0%[0m • [32m0.0/11…[0m • [31m?[0m • [36m-:--:--[0m
[2K[1A[2K[1;34msubmission_task1_next_item_counter.parquet[0m [90m━━━━━━━━[0m [35m0.0%[0m • [32m0.0/11…[0m • [31m?[0m • [36m-:--:--[0m
[2K[1A[2K[1;34msubmission_task1_next_item_counter.parquet[0m [90m━━━━━━[0m [35m0.0%[0m • [32m0.0/…[0m