# Basic ideas

1. check the how many times item2 is item1's next item
2. For each item1, normalized the count to [0, 1]
3. Add 0.01 -> since 0 will represent item2 has never been the next item of item1
4. Next steps
    1. [x] counts should be included 
    2. [x] train2 data should be included
    3. [x] 0 should be the min_count for each item


Example: [a, b, c] -> [ab, bc] -> 

```
[
--current_item, next_item, counts
[a, b, 1],
[b, c, 1]
]

after normalization

[
--current_item, next_item, counts
[a, b, 1.01],
[b, c, 1.01]
]
```

# Package 

In [61]:
%load_ext autoreload
%autoreload 2
import sys
import logging
base_dir = '../'
sys.path.append(base_dir)
import os
import warnings
warnings.simplefilter('ignore')

import pickle
import gc
import re
import polars as pl
from collections import defaultdict, Counter

import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)
from tqdm.auto import tqdm
import polars as pl
from utils import *
from src.eval import model_eval
from src.config import raw_data_session_id_dir, candidate_dir, model_for_eval, candidate_file_name

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Config 

In [62]:
debug = False


task = 'task2'

model_version = f'nic_{task}_v2'

# target locales: locales needed for task1
target_locals = ['ES', 'FR', 'IT']

topn = 100
if debug:
    n_rows = 1000
else:
    n_rows = None
# debug_session_num = 100
train_data_dir = '.'
test_data_dir = '.'


model_dir = f'../model_training/{model_version}/'


# if model_for_eval:
model_file = os.path.join(model_dir, f'nic_{model_for_eval}_for_eval.parquet')
submit_file = os.path.join('../data/sub_files/', f'submission_{task}_nic_{model_for_eval}_for_eval.parquet')

In [63]:
model_file

'../model_training/nic_task2_v2/nic_True_for_eval.parquet'

In [64]:
submit_file

'../data/sub_files/submission_task2_nic_True_for_eval.parquet'

In [65]:

train_cg_file = os.path.join(base_dir,
                             candidate_dir, 
            candidate_file_name.format(
                task=task
                , data_type='train'
                , model_version=model_version
                , model_for_eval=model_for_eval
                , topn=topn
            )
                            )
eval_cg_file = os.path.join(base_dir,
                            candidate_dir, 
            candidate_file_name.format(
                task=task
                , data_type='eval'
                , model_version=model_version
                , model_for_eval=model_for_eval
                , topn=topn
            )
                            )
test_cg_file = os.path.join(base_dir,
                            candidate_dir, 
            candidate_file_name.format(
                task=task
                , data_type='test'
                , model_version=model_version
                , model_for_eval=model_for_eval
                , topn=topn
            )
                            )

test4task3_file_name = os.path.join(base_dir,
                            candidate_dir, 
                                    candidate_file_name.format(
    task=task
    , data_type='test4task3'
    , model_version=model_version
    , model_for_eval=model_for_eval
    , topn=topn
))
print(train_cg_file)
print(eval_cg_file)
print(test_cg_file)
print(test4task3_file_name)

../data/candidates/task2_train_nic_task2_v2_True_top100.parquet
../data/candidates/task2_eval_nic_task2_v2_True_top100.parquet
../data/candidates/task2_test_nic_task2_v2_True_top100.parquet
../data/candidates/task2_test4task3_nic_task2_v2_True_top100.parquet


In [66]:
test_cg_file

'../data/candidates/task2_test_nic_task2_v2_True_top100.parquet'

In [67]:
! mkdir {model_dir}

mkdir: cannot create directory ‘../model_training/nic_task2_v2/’: File exists


In [68]:
model_file

'../model_training/nic_task2_v2/nic_True_for_eval.parquet'

In [69]:
submit_file

'../data/sub_files/submission_task2_nic_True_for_eval.parquet'

# Data 

In [70]:
! ls ../{raw_data_session_id_dir}

product_unique2id.json		    sessions_test_task2_phase1.parquet
products_train.parquet		    sessions_test_task3.parquet
sessions_eval.parquet		    sessions_test_task3_phase1.parquet
sessions_test_task1.parquet	    sessions_train.parquet
sessions_test_task1_phase1.parquet  sessions_train1.parquet
sessions_test_task2.parquet	    sessions_train2.parquet


In [71]:
train_pl = pl.scan_parquet(os.path.join(base_dir, raw_data_session_id_dir, 'sessions_train1.parquet'), n_rows=n_rows).filter(pl.col('locale').is_in(target_locals)).with_columns(pl.col('prev_items').apply(str2list))

train2_pl = pl.scan_parquet(os.path.join(base_dir, raw_data_session_id_dir, 'sessions_train2.parquet'), n_rows=n_rows).filter(pl.col('locale').is_in(target_locals)).with_columns(pl.col('prev_items').apply(str2list))

eval_pl = pl.scan_parquet(os.path.join(base_dir, raw_data_session_id_dir, 'sessions_eval.parquet'), n_rows=n_rows).filter(pl.col('locale').is_in(target_locals)).with_columns(pl.col('prev_items').apply(str2list))

# df_sess.head(3).collect()
test_pl = pl.scan_parquet(os.path.join(base_dir, raw_data_session_id_dir, f'sessions_test_{task}.parquet'), n_rows=n_rows).with_columns(pl.col('prev_items').apply(str2list))
test4task3_pl = pl.scan_parquet(os.path.join(base_dir, raw_data_session_id_dir, 'sessions_test_task3.parquet'), n_rows=n_rows).filter(pl.col('locale').is_in(target_locals)).with_columns(pl.col('prev_items').apply(str2list))



In [72]:
test4task3_pl.select('locale').collect().to_series().value_counts()

locale,counts
str,u32
"""IT""",10000
"""ES""",6422
"""FR""",10000


In [73]:
# test_pl.select('locale').collect().to_series().value_counts()

In [74]:
# test_pl.filter(pl.col('locale').is_null()).collect()

# Function 

In [75]:
# 'item', 'next_item_prediction', 'next_item_weight'

In [76]:
def nic_rec(target_pl, nic_model, rec_num=topn):
    # rec_num = 100
    final_cols = ['session_id', 'next_item_prediction', 'rec_num']
    target_pl = (
        target_pl
            .with_columns(
                pl.col('prev_items').arr.get(-1).alias('last_item')
            )
            .join(nic_model, how='left', left_on='last_item', right_on='item')
            .with_columns(
                pl.when(pl.col('next_item_prediction').is_null()).then([]).otherwise(pl.col('next_item_prediction').arr.head(rec_num)).alias('next_item_prediction')
            )
            .with_columns(
                pl.col('next_item_prediction').arr.lengths().alias('rec_num')
            )
            .select(
                final_cols
            )
    )#.head(3).collect()
    return target_pl

# Next Item Statistics 

In [77]:
model_for_eval

True

In [78]:
train_data = train_pl.with_columns(
    pl.col('prev_items').arr.concat(pl.col('next_item')) 
)
if not model_for_eval:
    eval_data = eval_pl.with_columns(
    pl.col('prev_items').arr.concat(pl.col('next_item')) 
)
else:
    eval_data = eval_pl
train2_data = train2_pl

test_data = test_pl

In [79]:
# model_for_eval

In [80]:
# if not model_for_eval:


cols_to_keep = ['prev_items']
next_items_pl = (
    pl.concat([train_data.select(cols_to_keep), 
               eval_data.select(cols_to_keep),
               train2_data.select(cols_to_keep),
               test_data.select(cols_to_keep)], how='vertical')
        .with_columns(
            pl.col('prev_items').arr.shift(-1).alias('next_item_lst')
            , pl.col('prev_items').arr.lengths().alias('length')
        )
        .select(
            pl.col('prev_items').arr.head(pl.col('length')-1).alias('prev')
            , pl.col('next_item_lst').arr.head(pl.col('length')-1).alias('next')
        )
        .explode(['prev','next' ])
        .groupby(['prev','next' ])
        .agg(
            pl.count().alias('cnt')
        )
        .sort(['prev', 'cnt'], descending=True)
        .with_columns(
            pl.col('cnt').max().over('prev').alias('max_count')
            , pl.lit(0).alias('min_count')
        )
        .with_columns(
            pl.when(pl.col('max_count')==pl.col('min_count')).then(1).otherwise((pl.col('cnt')-pl.col('min_count'))/(pl.col('max_count')-pl.col('min_count'))).alias('normalized_cnt')
        )
        .groupby('prev')
        .agg(
            pl.col('next').alias('next_item_prediction')
            , pl.col('cnt').alias('next_item_cnt')
            , (pl.col('normalized_cnt')).alias('next_item_weight')
        )
        .select(
            pl.col('prev').alias('item')
            , 'next_item_prediction'
            , pl.col('next_item_weight')
            , 'next_item_cnt'
        )
        
)

In [81]:
next_items_pl.head().collect()

item,next_item_prediction,next_item_weight,next_item_cnt
str,list[str],list[f64],list[u32]
"""B0B3HNKNHK""","[""B0B3H8DY1Y"", ""B08Z6X4PX6"", … ""B09M78232Q""]","[1.0, 0.2, … 0.2]","[5, 1, … 1]"
"""B08XXZ1TGL""","[""B0785JG1ZB"", ""B01EZ0X55C"", … ""B08JD5PV9F""]","[1.0, 1.0, … 1.0]","[1, 1, … 1]"
"""B08P8ZF7H4""","[""B075WVK551"", ""B08J449X8T"", … ""B08P8ZF7H4""]","[1.0, 0.5, … 0.5]","[2, 1, … 1]"
"""B0B46T2RCT""","[""B09DNZG3Z5"", ""B08N5V5QL5"", … ""B0949RD2VM""]","[1.0, 1.0, … 0.5]","[2, 2, … 1]"
"""B09TKRK8KM""","[""B09TKS3S1J"", ""B09HTYY1CJ""]","[1.0, 1.0]","[1, 1]"


In [82]:
# next_items_pl.collect()#.filter(pl.col('item')=='B09LXX1PQ9')

In [83]:
# next_item_dict = defaultdict(list)

# for _, row in tqdm(df_sess.iterrows(), total=len(df_sess)):
#     prev_items = str2list(row['prev_items'])
#     if not model_for_eval:
#         next_item = row['next_item']
#     prev_items_length = len(prev_items)
#     if prev_items_length <= 1:
#         if not model_for_eval:
#             next_item_dict[prev_items[0]].append(next_item)
#     else:
#         for i, item in enumerate(prev_items[:-1]):
#             next_item_dict[item].append(prev_items[i+1])
#         if not model_for_eval:
#             next_item_dict[prev_items[-1]].append(next_item)

In [84]:
# next_item_dict

In [85]:
# for _, row in tqdm(df_test.iterrows(), total=len(df_test)):
#     prev_items = str2list(row['prev_items'])
#     prev_items_length = len(prev_items)
#     if prev_items_length <= 1:
#         continue
#     else:
#         for i, item in enumerate(prev_items[:-1]):
#             next_item_dict[item].append(prev_items[i+1])

In [86]:
# # next_item_map = {}
# topn = 100
# item_lst = []
# common_items_lst = []
# weights_lst = []
# for item in tqdm(next_item_dict):
#     counter = Counter(next_item_dict[item])
#     most_common_cnt = counter.most_common(1)[0][1]
#     most_common_lst = list(zip(*counter.most_common(topn)))
#     most_common_lst[1] = list(np.array(most_common_lst[1])/most_common_cnt)
#     item_lst.append(item)
#     common_items_lst.append(list(most_common_lst[0]))
#     weights_lst.append(most_common_lst[1])
#     # next_item_map[item] = most_common_lst
#     # next_item_map[item] = [i[0] for i in counter.most_common(100)]

In [87]:
# next_item_df = pd.DataFrame(
#     {'item': item_lst
#     , 'next_item_prediction': common_items_lst
#      , 'next_item_weight': weights_lst
#     }
# )

In [88]:
# next_item_df.sample(10)

In [89]:
# next_items_pl.collect().filter(pl.col('item')=='B07QGW8LFT')

In [90]:
# nex_item_pl = pl.from_pandas(next_item_df).lazy().select(
#     'item'
#     , pl.col('next_item_prediction').alias('next_item_rec')
#     , 'next_item_weight'
# )

In [91]:
# with open('../model_training/next_item_counter/model.pkl', 'rb') as f:
#     model = pickle.load(f)

In [92]:
# model.keys()

In [93]:
# model['next_item_map']

In [94]:
# nex_item_pl = pl.DataFrame(
#     {
#         'item': model['next_item_map'].keys()
#         , 'next_item_rec': model['next_item_map'].values()
#     }
# ).lazy()

In [95]:
next_items_pl.collect().shape

(106037, 4)

## Save model 

In [96]:
model_file

'../model_training/nic_task2_v2/nic_True_for_eval.parquet'

In [97]:
next_items_pl.collect().write_parquet(model_file)

In [98]:
del next_items_pl

## Read Model 

In [99]:
model_file

'../model_training/nic_task2_v2/nic_True_for_eval.parquet'

In [100]:
next_items_pl = pl.scan_parquet(model_file)

In [101]:
type_dict = next_items_pl.schema

In [102]:
type_dict.keys()

dict_keys(['item', 'next_item_prediction', 'next_item_weight', 'next_item_cnt'])

In [114]:
next_items_pl.head(30).collect()

item,next_item_prediction,next_item_weight,next_item_cnt
str,list[str],list[f64],list[u32]
"""B09XDBGD34""","[""B09XD5D996"", ""B0B5TT8NMT"", … ""B09CYWCSPV""]","[1.0, 0.5, … 0.5]","[2, 1, … 1]"
"""B08GLC64T5""","[""B08GLC64T5"", ""B0924MT4HF"", … ""B09JSY824R""]","[1.0, 0.333333, … 0.333333]","[3, 1, … 1]"
"""B09B2PX814""","[""B08C88HDN6"", ""B0B461Q5T9"", … ""B09VLCK6ZF""]","[1.0, 0.464286, … 0.035714]","[28, 13, … 1]"
"""B08C7KCJF5""","[""B07WD5B99P"", ""B091CQH6VT"", … ""B08Q29FLHD""]","[1.0, 1.0, … 0.5]","[2, 2, … 1]"
"""B010X1TATC""","[""B010X1TATC"", ""B005NH4O66"", … ""B08KJKCK6M""]","[1.0, 0.333333, … 0.333333]","[3, 1, … 1]"
"""B08GS323SR""","[""B08GR24CRP""]",[1.0],[1]
"""B075KK1848""","[""B09GW6S5CB"", ""B08NQ4VJ5X"", … ""B076ZR13BJ""]","[1.0, 1.0, … 1.0]","[1, 1, … 1]"
"""B09GLW17PK""","[""B09GLW17PK""]",[1.0],[1]
"""B00W79XF9U""","[""B00W79X5ZO""]",[1.0],[3]
"""B00A2O44D8""","[""B07N3C5WRG"", ""B07DCVKCMX"", ""B07MKV4B9R""]","[1.0, 1.0, 1.0]","[1, 1, 1]"


## Model eval 

In [115]:
# train_pl.schema

In [116]:
# eval_pl.schema

In [117]:
# eval_pl.select('locale').collect().to_series().value_counts()

In [118]:
eval_candidate_pl = nic_rec(target_pl=eval_pl, nic_model=next_items_pl)#.head().collect()

In [119]:

# if_hit = pl.element().rank()
target_df = eval_pl.join(eval_candidate_pl, how='left', on='session_id')


# eval_final.head().collect()

In [120]:
model_eval(target_df=target_df)

total_sessions,mrr,recall@20,recall@100
u32,f64,f64,f64
33333,0.2605,0.4536,0.4742


In [121]:
# model_eval(target_df=target_df)

## Candidate Saving 

### Eval data

In [122]:
eval_candidate_pl.head().collect()

session_id,next_item_prediction,rec_num
i64,list[str],u32
3272722,"[""B003A61V0O"", ""B08LQY6Q9C"", … ""B005L9EZWY""]",7
3272739,"[""B08LR7B41X"", ""B01HXRQ1PA"", ""B00331LFMA""]",3
3272763,"[""B07K527Y7N"", ""B08BJNJHFG"", … ""B00008D9RK""]",27
3272776,"[""B09PNDJJZ9"", ""B0B2K87MXS"", … ""B08CH6VKHW""]",26
3272785,"[""B00OUVGTJ6"", ""B00V88L9LC"", … ""B00IOEEOCE""]",6


In [123]:
eval_candidate_pl.collect().write_parquet(eval_cg_file)

### Train & eval  

In [124]:
train_candidate_pl = nic_rec(target_pl=train2_pl, nic_model=next_items_pl)#.head().collect()

In [125]:
# eval_candidate_pl = nic_rec(target_pl=eval_pl, nic_model=next_items_pl)#.head().collect()

In [126]:
train_cg_file

'../data/candidates/task2_train_nic_task2_v2_True_top100.parquet'

In [127]:
train_candidate_pl.collect().write_parquet(train_cg_file)

### Test 

In [128]:
test_candidate_pl = nic_rec(target_pl=test_pl, nic_model=next_items_pl)#.head().collect()


In [129]:
# test_candidate_pl.head().collect()

In [130]:
test_candidate_pl.collect().write_parquet(test_cg_file)

In [131]:
test_cg_file

'../data/candidates/task2_test_nic_task2_v2_True_top100.parquet'

In [132]:
# ! ls ../data/candidates/ | grep 'test_nic'

### test4task3 

In [133]:
print(test4task3_file_name)
test4task3_cg_pl = nic_rec(target_pl=test4task3_pl, nic_model=next_items_pl)#.head().collect()
test4task3_cg_pl.collect().write_parquet(test4task3_file_name)

../data/candidates/task2_test4task3_nic_task2_v2_True_top100.parquet


## Save test result

In [49]:
# test_task2 = pl.read_csv('../data/raw_data/sessions_test_task2.csv')

In [50]:
# ! ls ../data/raw_data/ | grep task2

In [51]:
# test_task2.tail()

In [52]:
predictions = test_pl.join(test_candidate_pl, how='left', on='session_id').collect()[['locale', 'next_item_prediction']].to_pandas()

In [53]:
def check_predictions(predictions,test_sessions, check_products=False, product_df=None):
    """
    These tests need to pass as they will also be applied on the evaluator
    """
    test_locale_names = test_sessions['locale'].unique()
    for locale in test_locale_names:
        sess_test = test_sessions.query(f'locale == "{locale}"')
        preds_locale =  predictions[predictions['locale'] == sess_test['locale'].iloc[0]]
        assert sorted(preds_locale.index.values) == sorted(sess_test.index.values), f"Session ids of {locale} doesn't match"

        if check_products:
            # This check is not done on the evaluator
            # but you can run it to verify there is no mixing of products between locales
            # Since the ground truth next item will always belong to the same locale
            # Warning - This can be slow to run
            products = product_df.query(f'locale == "{locale}"')
            predicted_products = np.unique( np.array(list(preds_locale["next_item_prediction"].values)) )
            assert np.all( np.isin(predicted_products, products['id']) ), f"Invalid products in {locale} predictions"

In [54]:
check_predictions(predictions, test_sessions=test_pl.collect().to_pandas(), 
                  # check_products=True, product_df=products
                 )
# Its important that the parquet file you submit is saved with pyarrow backend
predictions.to_parquet(submit_file, engine='pyarrow')

In [55]:
submit_file

'../data/sub_files/submission_task2_nic_True_for_eval.parquet'

In [56]:
# !aicrowd submission create -c task-2-next-product-recommendation-for-underrepresented-languages -f {submit_file}

# Top200 for fallback logics

In [45]:
# next_item_df.head()

In [46]:
# next_item_df[cols].info()

In [47]:
popular_df = pd.concat([df_sess[['prev_items', 'locale']], df_test[['prev_items', 'locale']]], axis=0)

NameError: name 'df_sess' is not defined

In [None]:
popular_df.shape

In [None]:
popular_pl = pl.from_pandas(popular_df).lazy()

In [None]:
topn = 200
locale_popular_pl = (
    popular_pl
        .select(
            pl.col('prev_items').apply(str2list)#.explode().alias('item')
            , pl.col('locale')
        )
        .explode('prev_items')#.alias('item')
        .groupby(['locale', 'prev_items'])
        .agg(
            pl.count()
        )
        .with_columns(
            pl.col('count').rank(method='ordinal', descending=True).over('locale').alias('rank')
        )
        .filter(pl.col('rank')<=topn)
        .with_columns(
            pl.col('count').max().over('locale').alias('max_count')
            , pl.col('count').min().over('locale').alias('min_count')
        )
        .with_columns(
            ((pl.col('count')-pl.col('min_count'))/(pl.col('max_count')-pl.col('min_count'))).alias('weight')
        )
        .sort('locale', 'rank')
        .select(
            'locale'
            , 'prev_items'
            , 'weight'
        )
        .groupby('locale')
        .agg(
            pl.col('weight').alias('locale_popular_weight')
            , pl.col('prev_items').alias('locale_popular_rec')
        )
        # .count()#.head(3).collect())
        # .collect()
)

In [None]:
# locale_popular_pl.collect()

In [None]:
# locale_popular_pl.schema

In [None]:
# popular_df.apply(lambda x: str2list(x['prev_items']), axis=1)

In [None]:
# df_sess.head()

In [None]:
# df_test.head()

In [None]:
# # next_item_df['next_item_prediction'] = next_item_df['next_item_prediction'].astype(str)
# # next_item_df['next_item_weights'] = next_item_df['next_item_weights'].astype(str)
# cols = [
#     # 'item',
#         'next_item_prediction'
#         , 'next_item_weights'
#        ]
# next_item_pl = pl.from_pandas(next_item_df[cols])

In [None]:
# next_item_pl

In [None]:
# # k = []
# # v = []

# # for item in next_item_dict:
# #     k.append(item)
# #     v.append(next_item_dict[item])
    
# # df_next = pd.DataFrame({'item': k, 'next_item': v})
# df_next = next_item_df.explode('next_item_prediction').reset_index(drop=True)
# df_next = df_next.merge(products, how='left', left_on='item', right_on='id')
# df_next

In [None]:
# df_next['next_item'].value_counts().index.tolist()[:200]

In [None]:
# model = {
#     'top200': top200
#     , 'next_item_map': next_item_map
# }

## Save model 

In [None]:
# model_file

In [None]:
# with open(model_file, 'wb') as f:
#     pickle.dump(model, f)

# Get final result 

## Load Model 

In [19]:
# # with open(model_file, 'rb') as f:
#     model = pickle.load(f)

In [20]:
# top200

In [21]:
# next_item_map

In [22]:
# def get_rec(target_df, model):
#     next_item_map = model['next_item_map']
#     top200  = model['top200']
#     target_df['last_item'] = target_df['prev_items'].apply(lambda x: str2list(x)[-1])
#     target_df['next_item_prediction'] = target_df['last_item'].map(next_item_map)
#     preds = []

#     for _, row in tqdm(target_df.iterrows(), total=len(target_df)):
#         pred_orig = row['next_item_prediction']
#         pred = pred_orig
#         prev_items = str2list(row['prev_items'])
#         if type(pred) == float:
#             pred = top200[:100]
#         else:
#             if len(pred_orig) < 100:
#                 for i in top200:
#                     if i not in pred_orig and i not in prev_items:
#                         pred.append(i)
#                     if len(pred) >= 100:
#                         break
#             else:
#                 pred = pred[:100]
#         preds.append(pred)
#     target_df['next_item_prediction'] = preds
#     print(target_df['next_item_prediction'].apply(len).describe())
#     return target_df

In [23]:
# model.keys()

In [24]:
# model['next_item_map']

# Candidate for train data 

In [None]:
# train_pl = pl.scan_csv('sessions_train.csv')
train_pl = pl.scan_parquet('../data/eval_data/next_item_counter_train_eval_300k.parquet')

In [None]:
target_locals

In [None]:
train_pl.schema

In [None]:
train_pl = (
    train_pl
        .filter(pl.col('locale').is_in(target_locals))
        .with_columns(
            pl.col('prev_items').apply(str2list).arr.get(-1).alias('last_item')
        )
        .join(nex_item_pl, how='left', left_on='last_item', right_on='item')
        .with_columns(
            pl.when(pl.col('next_item_rec').is_null()).then([]).otherwise(pl.col('next_item_rec').arr.head(100)).alias('next_item_prediction')
        )
        .with_columns(
            pl.col('next_item_prediction').arr.lengths().alias('rec_num')
        )
        .select(
            'prev_items'
            , 'next_item'
            , 'locale'
            , 'next_item_prediction'
            , 'rec_num'
        )
)#.head(2).collect()

In [None]:
train_pl.collect().write_parquet('../data/candidates/task1_train_nic_without_pupular_top100_300k.parquet')

# Final resul 

In [None]:
def pl_rec(target_pl, locale_popular_pl, nex_item_pl):
    rec_num = 100
    target_pl = (
        target_pl
            .with_columns(
                pl.col('prev_items').apply(str2list).arr.get(-1).alias('last_item')
            )
            .join(nex_item_pl, how='left', left_on='last_item', right_on='item')
            .join(locale_popular_pl, how='left', on='locale')
            .with_columns(
                pl.when(pl.col('next_item_rec').is_null()).then([]).otherwise(pl.col('next_item_rec')).alias('next_item_rec')
            )
            .with_columns(
                pl.concat_list([pl.col('next_item_rec'), pl.col('locale_popular_rec')])
                    .alias('next_item_prediction')
                    .arr.head(rec_num)

            )
            .with_columns(
                pl.col('next_item_prediction').arr.lengths().alias('rec_num')
            )
    )#.head(3).collect()
    return target_pl

In [None]:
eval_pl = pl.scan_parquet(f'../data/eval_data/w2v_train_eval_result_300k.parquet')

In [None]:
eval_pl.schema

In [None]:
nex_item_pl.schema

In [None]:
# locale_popular_pl.head(3).collect()

In [None]:
eval_pl = pl_rec(target_pl=eval_pl, locale_popular_pl=locale_popular_pl, nex_item_pl=nex_item_pl)

In [None]:
# eval_pl.head(3).collect()

In [None]:
eval_pl.select(
    pl.col('next_item_prediction').arr.head(20).arr.contains(pl.col('next_item')).mean().alias('recall@20')
    , pl.col('next_item_prediction').arr.head(100).arr.contains(pl.col('next_item')).mean().alias('recall@100')
).collect()

In [None]:

# eval_cols = ['len', 'recall@20', 'recall@100']
# train_eval_df[eval_cols] = train_eval_df.apply(pd_get_recall_at_k, axis=1, result_type='expand')
# print(train_eval_df[eval_cols].mean())

In [None]:
# train_eval_df.shape

In [None]:
# model_version

In [None]:
eval_pl.collect().shape

In [None]:
eval_pl.collect().write_parquet(f'../data/eval_data/{model_version}_train_eval_300k.parquet', 
                      # engine='pyarrow'
                     )

# Submit result 

In [None]:
test_pl = pl.scan_csv('sessions_test_task1.csv')
test_pl = pl_rec(target_pl=test_pl, locale_popular_pl=locale_popular_pl, nex_item_pl=nex_item_pl)

In [None]:
submit_file

In [None]:
! ls -al | grep {submit_file}

In [None]:
test_pl.collect().shape

In [None]:
submit_file

In [None]:
test_pl.head(3).collect()

In [None]:
# test_pl.collect().select('locale', 'next_item_prediction').write_parquet(submit_file,
#                                                                          # engine='pyarrow'
#                                                                         )

In [None]:
# # You can submit with aicrowd-cli, or upload manually on the challenge page.
# !aicrowd submission create -c task-1-next-product-recommendation -f {submit_file}

# Rank  

In [None]:
w2v_df = pl.scan_parquet('submission_task1.parquet')

In [None]:
assert w2v_df.collect().shape[0] == test_pl.collect().shape[0]

In [None]:
w2v_df.schema

In [None]:
# test_pl.head()

In [None]:
target_df = pl.concat([test_pl.select('prev_items', 'locale', 'next_item_rec').collect(), w2v_df.select('next_item_prediction').collect()]
                    , how='horizontal' )

In [None]:
target_df.shape

In [None]:
target_df.head(10)

In [None]:
rec_num = 100
target_pl = (
    target_df
        .lazy()
        .select(
            'prev_items'
            , 'locale'
            , pl.concat_list([pl.col('next_item_rec'), pl.col('next_item_prediction')])
                .alias('next_item_prediction')
                .arr.head(rec_num)

        )
        .with_columns(
            pl.col('next_item_prediction').arr.lengths().alias('rec_num')
        )
)#.head(3).collect()

In [None]:
target_pl.schema

In [None]:
target_pl.head(6).collect()

In [None]:
# ! mkdir ../data/sub_files

In [None]:
target_pl.collect().select('locale', 'next_item_prediction').write_parquet('../data/sub_files/rank_v1.parquet',
                                                                         # engine='pyarrow'
                                                                        )

In [None]:
# # You can submit with aicrowd-cli, or upload manually on the challenge page.
# !aicrowd submission create -c task-1-next-product-recommendation -f '../data/sub_files/rank_v1.parquet'

## Rank2 

In [None]:
rec_num = 100
target_pl = (
    target_df
        .lazy()
        .select(
            'prev_items'
            , 'locale'
            , pl.concat_list([pl.col('next_item_rec').arr.head(20), pl.col('next_item_prediction')])
                .alias('next_item_prediction')
                .arr.head(rec_num)

        )
        .with_columns(
            pl.col('next_item_prediction').arr.lengths().alias('rec_num')
        )
)#.head(3).collect()

In [None]:
target_pl.collect().select('locale', 'next_item_prediction').write_parquet('../data/sub_files/rank_v2.parquet',
                                                                         # engine='pyarrow'
                                                                        )

In [None]:
# # You can submit with aicrowd-cli, or upload manually on the challenge page.
!aicrowd submission create -c task-1-next-product-recommendation -f '../data/sub_files/rank_v2.parquet'

# Debug 

In [None]:
test_pl = pl.scan_parquet(submit_file)

In [None]:
test_pl.schema

In [None]:
test_pl.head(5).collect()

In [None]:
test_pl.select(
    pl.col('next_item_prediction').arr.lengths().min()
    , pl.col('next_item_prediction').arr.lengths().max()
).collect()