# Basic ideas

1. Check how many times two item exist in the same sequence;
2. No normalization for the final score
3. Next steps
    1. [x] train2 is not included
    2. [x] normalized score should be included, too

Example: [a, b, c] -> [ab, ac, ba, bc, ca, cb] -> 

```
[
--current_item, next_item, counts
[a, b, 1],
[a, c, 1]
[a, b, 1]
[a, c, 1]
[b, a, 1]
[b, c, 1]
[c, a, 1]
[c, b, 1]
]
```

# Package 

In [3]:
%load_ext autoreload
%autoreload 2
import sys
import logging
base_dir = '../'
sys.path.append(base_dir)
import os
import warnings
warnings.simplefilter('ignore')

import pickle
import gc
import re
import polars as pl
from collections import defaultdict, Counter

import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)
from tqdm.auto import tqdm
import polars as pl
from utils import *
from src.eval import model_eval
from src.config import raw_data_session_id_dir, candidate_dir, model_for_eval, candidate_file_name

In [4]:
# df_prod = pd.read_csv('data/products_train.csv')
# df_prod

# Config 

In [5]:
debug = False


task = 'task1'
version = 'v10'
model_version = f'co_visit_{task}_{version}'

if task == 'task1':
    target_locals = ['DE', 'JP', 'UK']
elif task == 'task2':
    target_locals = ['ES', 'FR', 'IT']
else:
    assert 1 == 0


topn = 300
if debug:
    n_rows = 1000
else:
    n_rows = None
# debug_session_num = 100
train_data_dir = '.'
test_data_dir = '.'

model_dir = f'../model_training/{model_version}/'

# target locales: locales needed for task1

# if model_for_eval:
model_file = os.path.join(model_dir, f'{model_version}_{model_for_eval}_for_eval.parquet')
submit_file = os.path.join('../data/sub_files/', f'submission_{task}_{model_version}_{model_for_eval}_for_eval.parquet')

In [6]:
candidate_file_name

'{task}_{data_type}_{model_version}_{model_for_eval}_top{topn}.parquet'

In [7]:
train_file_name = candidate_file_name.format(
    task=task
    , data_type='train'
    , model_version=model_version
    , model_for_eval=model_for_eval
    , topn=topn
)
eval_file_name = candidate_file_name.format(
    task=task
    , data_type='eval'
    , model_version=model_version
    , model_for_eval=model_for_eval
    , topn=topn
)
test_file_name = candidate_file_name.format(
    task=task
    , data_type='test'
    , model_version=model_version
    , model_for_eval=model_for_eval
    , topn=topn
)

test4task3_file_name = candidate_file_name.format(
    task=task
    , data_type='test4task3'
    , model_version=model_version
    , model_for_eval=model_for_eval
    , topn=topn
)
print(train_file_name)
print(eval_file_name)
print(test_file_name)
print(test4task3_file_name)

task1_train_co_visit_task1_v10_True_top300.parquet
task1_eval_co_visit_task1_v10_True_top300.parquet
task1_test_co_visit_task1_v10_True_top300.parquet
task1_test4task3_co_visit_task1_v10_True_top300.parquet


In [8]:
! mkdir {model_dir}

mkdir: cannot create directory ‘../model_training/co_visit_task1_v10/’: File exists


In [9]:
model_file

'../model_training/co_visit_task1_v10/co_visit_task1_v10_True_for_eval.parquet'

In [10]:
submit_file

'../data/sub_files/submission_task1_co_visit_task1_v10_True_for_eval.parquet'

# Data 

In [11]:
# ! ls ../{raw_data_session_id_dir}

In [12]:
train_pl = pl.scan_parquet(os.path.join(base_dir, raw_data_session_id_dir, 'sessions_train1.parquet'), n_rows=n_rows).filter(pl.col('locale').is_in(target_locals)).with_columns(pl.col('prev_items').apply(str2list))

train2_pl = pl.scan_parquet(os.path.join(base_dir, raw_data_session_id_dir, 'sessions_train2.parquet'), n_rows=n_rows).filter(pl.col('locale').is_in(target_locals)).with_columns(pl.col('prev_items').apply(str2list))



eval_pl = pl.scan_parquet(os.path.join(base_dir, raw_data_session_id_dir, 'sessions_eval.parquet'), n_rows=n_rows).filter(pl.col('locale').is_in(target_locals)).with_columns(pl.col('prev_items').apply(str2list))

# df_sess.head(3).collect()
test_pl = pl.scan_parquet(os.path.join(base_dir, raw_data_session_id_dir, f'sessions_test_{task}.parquet'), n_rows=n_rows).with_columns(pl.col('prev_items').apply(str2list))

test4task3_pl = pl.scan_parquet(os.path.join(base_dir, raw_data_session_id_dir, 'sessions_test_task3.parquet'), n_rows=n_rows).filter(pl.col('locale').is_in(target_locals)).with_columns(pl.col('prev_items').apply(str2list))


In [13]:
test4task3_pl.select('locale').collect().to_series().value_counts()

locale,counts
str,u32
"""UK""",10000
"""JP""",10000
"""DE""",10000


# Function 

In [14]:
# 'item', 'next_item_prediction', 'next_item_weight'

In [15]:
def nic_rec(target_pl, nic_model, topn=topn):
    rec_num = topn
    def get_next_items(x):
        prev_items = x['prev_items']
        local_rec = x['next_item_prediction']
        final = [ele for ele in local_rec if ele not in prev_items]
        return final

    final_cols = ['session_id', 'next_item_prediction', 'rec_num']
    target_pl = (
        target_pl
            .with_columns(
                pl.col('prev_items').arr.get(-1).alias('last_item')
            )
            .join(nic_model, how='left', left_on='last_item', right_on='item')
            .with_columns(
                pl.when(pl.col('next_item_prediction').is_null())
                    .then([])
                    .otherwise(pl.col('next_item_prediction'))
                    .alias('next_item_prediction')
            ).with_columns(
                    pl.struct(["prev_items", "next_item_prediction"]).apply(
                        lambda x: get_next_items(x)).alias('next_item_prediction')
            )
            .with_columns(
                    pl.col('next_item_prediction').arr.head(rec_num)
            )
            .with_columns(
                pl.col('next_item_prediction').arr.lengths().alias('rec_num')
            )
            .select(
                final_cols
            )
    )#.head(3).collect()
    return target_pl

# Next Item Statistics 

In [16]:
train_data = train_pl.with_columns(
    pl.col('prev_items').arr.concat(pl.col('next_item')) 
)
if not model_for_eval:
    eval_data = eval_pl.with_columns(
    pl.col('prev_items').arr.concat(pl.col('next_item')) 
)
else:
    eval_data = eval_pl
train2_data = train2_pl

test_data = test_pl

In [17]:
cols_to_keep = ['prev_items']

all_train_data = pl.concat([train_data.select(cols_to_keep),
                            eval_data.select(cols_to_keep), 
                            train2_data.select(cols_to_keep), 
                            test_data.select(cols_to_keep)], how='vertical')

In [18]:
# all_train_data.head().collect()

In [19]:
def get_cnt(row):
    prev_items = row.to_list()
    # print(type(prev_items))
    # print(prev_items)
    length = len(prev_items)
    res = []
    for idx1, ele1 in enumerate(prev_items):
        for idx2, ele2 in enumerate(prev_items):
            following_weight = 0
            previous_weight = 0
            ele2 = prev_items[idx2]
            weight = 1
            distance = abs(idx2-idx1)
            if distance == 0:
                continue
            if idx1 < idx2:
                # combined_ele = '_'.join([ele1, ele2])
                following_weight = 1
            if idx1 > idx2:
                previous_weight = 1
            discount_weight = round(weight/distance, 2)
            following_discount_weight = round(following_weight/distance, 2)
            previous_discount_weight = round(previous_weight/distance, 2)
            
            res.append(
                # combined_ele
                [
                    # combined_ele
                ele1
                , ele2
                 , str(weight) # co-visit weight
                 , str(discount_weight) # co-visit discount weight
                 , str(following_weight)
                 , str(following_discount_weight)
                    ,str(previous_weight)
                    , str(previous_discount_weight)
                ]
                          )
    return res

In [20]:
next_items_pl = (
    all_train_data.with_columns(
        pl.col('prev_items').apply(lambda x: get_cnt(x))
    )
    .explode('prev_items')
    .select(
        pl.col('prev_items').arr.get(0).alias('current_item')
        , pl.col('prev_items').arr.get(1).alias('next_item')
        , pl.col('prev_items').arr.get(2).alias('weight').cast(pl.Float32)
        , pl.col('prev_items').arr.get(3).alias('discount_weight').cast(pl.Float32)
        , pl.col('prev_items').arr.get(4).alias('following_weight').cast(pl.Float32)
        , pl.col('prev_items').arr.get(5).alias('following_discount_weight').cast(pl.Float32)
        , pl.col('prev_items').arr.get(6).alias('previous_weight').cast(pl.Float32)
        , pl.col('prev_items').arr.get(7).alias('previous_discount_weight').cast(pl.Float32)
        
    )
    .groupby(['current_item', 'next_item'])
    .agg(
        pl.col('weight').sum()
        , pl.col('discount_weight').sum()
            , pl.col('following_weight').sum()
        , pl.col('following_discount_weight').sum()
        , pl.col('previous_weight').sum()
        , pl.col('previous_discount_weight').sum()

    ).sort(['current_item', 'weight'], descending=True)
    .groupby(['current_item'])
    .agg(
        pl.col('next_item')
        , pl.col('weight')
         , 'discount_weight' # co-visit discount weight
         , 'following_weight' 
         , 'following_discount_weight'
            ,'previous_weight'
            , 'previous_discount_weight'
    )
    .select(
        pl.col('current_item').alias('item')
        , pl.col('next_item').alias('next_item_prediction')
        , pl.col('weight').alias('next_item_weight')
         , 'discount_weight' # co-visit discount weight
         , 'following_weight' 
         , 'following_discount_weight'
            ,'previous_weight'
            , 'previous_discount_weight'
    )
)

In [42]:
# next_items_pl.head().collect()

## Save model 

In [22]:
model_file

'../model_training/co_visit_task1_v10/co_visit_task1_v10_True_for_eval.parquet'

In [23]:
%%time
next_items_pl = next_items_pl.collect()

CPU times: user 14min 16s, sys: 26.4 s, total: 14min 42s
Wall time: 11min 53s


In [24]:
next_items_pl.sample(3)

item,next_item_prediction,next_item_weight,discount_weight,following_weight,following_discount_weight,previous_weight,previous_discount_weight
str,list[str],list[f32],list[f32],list[f32],list[f32],list[f32],list[f32]
"""B0BJ1JBQPF""","[""B0BJ1JBQPF"", ""B0B5T2LP9Y"", … ""B095YBNT8D""]","[14.0, 10.0, … 1.0]","[14.0, 5.18, … 0.5]","[7.0, 3.0, … 1.0]","[7.0, 1.58, … 0.5]","[7.0, 7.0, … 0.0]","[7.0, 3.6, … 0.0]"
"""B0B8YTBSBM""","[""B0BJF5HW7C"", ""B09RQRG88D"", … ""B092MMDQ7H""]","[2.0, 2.0, … 1.0]","[0.28, 0.11, … 0.12]","[2.0, 2.0, … 1.0]","[0.28, 0.11, … 0.12]","[0.0, 0.0, … 0.0]","[0.0, 0.0, … 0.0]"
"""B07S8C8Z88""","[""B07S72KGMM"", ""B07B2Y7M9S"", … ""B09D3LYLSZ""]","[2.0, 2.0, … 1.0]","[1.5, 1.33, … 0.33]","[0.0, 1.0, … 0.0]","[0.0, 0.33, … 0.0]","[2.0, 1.0, … 1.0]","[1.5, 1.0, … 0.33]"


In [25]:
next_items_pl.select(pl.col('next_item_prediction').arr.lengths().alias('rec_num')).describe()

describe,rec_num
str,f64
"""count""",1311775.0
"""null_count""",0.0
"""mean""",33.130875
"""std""",60.890093
"""min""",1.0
"""max""",3012.0
"""median""",15.0


In [26]:
next_items_pl.write_parquet(model_file)

In [27]:
del next_items_pl

## Read Model 

In [28]:
model_file


'../model_training/co_visit_task1_v10/co_visit_task1_v10_True_for_eval.parquet'

In [29]:
next_items_pl = pl.scan_parquet(model_file)

In [30]:
type_dict = next_items_pl.schema

In [31]:
type_dict.keys()

dict_keys(['item', 'next_item_prediction', 'next_item_weight', 'discount_weight', 'following_weight', 'following_discount_weight', 'previous_weight', 'previous_discount_weight'])

In [32]:
# next_items_pl.collect().shape

In [33]:
next_items_pl.head().collect()

item,next_item_prediction,next_item_weight,discount_weight,following_weight,following_discount_weight,previous_weight,previous_discount_weight
str,list[str],list[f32],list[f32],list[f32],list[f32],list[f32],list[f32]
"""B0BDXKZM5L""","[""B0B7JM15SM"", ""B0BDXKZM5L"", … ""B08HNBFMNW""]","[23.0, 22.0, … 1.0]","[18.08, 7.739999, … 0.2]","[18.0, 11.0, … 0.0]","[14.58, 3.87, … 0.0]","[5.0, 11.0, … 1.0]","[3.5, 3.87, … 0.2]"
"""B0071MAR4O""","[""B00B79JGE4"", ""B07JJ9K81H"", … ""B09ST7573X""]","[43.0, 16.0, … 1.0]","[39.290001, 9.27, … 0.33]","[19.0, 12.0, … 0.0]","[18.200001, 7.91, … 0.0]","[24.0, 4.0, … 1.0]","[21.09, 1.36, … 0.33]"
"""B088KZ7V63""","[""B09MC6MS9D"", ""B0BFJ5NZQJ"", … ""B08B5KHYS7""]","[4.0, 2.0, … 1.0]","[2.21, 1.33, … 1.0]","[1.0, 0.0, … 0.0]","[1.0, 0.0, … 0.0]","[3.0, 2.0, … 1.0]","[1.21, 1.33, … 1.0]"
"""B0BGX9KD4D""","[""B00BM0IW7U""]",[1.0],[1.0],[1.0],[1.0],[0.0],[0.0]
"""B0832XBFG3""","[""B07W5BWZFW"", ""B0BK3K2GBV"", … ""B08GSRTL3K""]","[8.0, 6.0, … 1.0]","[5.34, 2.37, … 0.25]","[1.0, 0.0, … 0.0]","[1.0, 0.0, … 0.0]","[7.0, 6.0, … 1.0]","[4.34, 2.37, … 0.25]"


## Model eval 

In [34]:
# train_pl.schema

In [35]:
# eval_pl.schema

In [36]:
# nic_rec(target_pl=eval_pl.head(100), nic_model=next_items_pl, topn=topn).select(pl.col('next_item_prediction').arr.lengths().alias('rec_num')).collect().describe()

In [37]:
eval_candidate_pl = nic_rec(target_pl=eval_pl, nic_model=next_items_pl, topn=topn)#.head().collect()

In [38]:
# eval_candidate_pl.head().co

In [39]:

# if_hit = pl.element().rank()
target_df = eval_pl.join(eval_candidate_pl, how='left', on='session_id')


# eval_final.head().collect()

In [40]:
# target_df.select(pl.col('next_item_prediction').arr.lengths().alias('rec_num')).collect().describe()

In [41]:
%%time
# model_eval(target_df=target_df)
eval_final = (
        target_df
        .lazy()
        .with_columns(
            pl.col('next_item_prediction').cast(pl.List(pl.Utf8))
        )
        .with_columns(
            pl.concat_list([pl.col('next_item'), pl.col('next_item_prediction')]).alias('mrr')
        )
        .with_columns(
            pl.col('mrr').arr.eval(
                pl.arg_where(pl.element()==pl.element().first())
            )
        ).with_columns(
            pl.col('mrr').arr.eval(
                pl.when(pl.element()==0).then(0).otherwise(1/pl.element())
            )
        ).with_columns(
            pl.col('mrr').arr.sum()
            , pl.col('next_item_prediction').arr.head(20).arr.contains(pl.col('next_item')).mean().alias('recall@20')
            , pl.col('next_item_prediction').arr.head(100).arr.contains(pl.col('next_item')).mean().alias('recall@100')
            , pl.col('next_item_prediction').arr.head(topn).arr.contains(pl.col('next_item')).mean().alias('recall@all')


        )
)
final_res = eval_final.select(
        pl.count().alias('total_sessions')
        , pl.col('mrr').mean()
        , pl.col('recall@20').mean()
        , pl.col('recall@100').mean()
        , pl.col('recall@all').mean()

    ).collect()
final_res

CPU times: user 2min 29s, sys: 27.3 s, total: 2min 56s
Wall time: 39.1 s


total_sessions,mrr,recall@20,recall@100,recall@all
u32,f64,f64,f64,f64
326443,0.260901,0.452214,0.528515,0.545115


## Candidate Saving 

In [43]:
# nic_rec(target_pl=train_pl.head(100), nic_model=next_items_pl).select('rec_num').collect().describe()

### Train & eval data 

In [44]:
train_candidate_pl = nic_rec(target_pl=train2_pl, nic_model=next_items_pl)# .collect()

In [45]:
%%time
train_candidate_pl = train_candidate_pl.collect()
eval_candidate_pl = eval_candidate_pl.collect()

CPU times: user 1min 57s, sys: 16.1 s, total: 2min 14s
Wall time: 2min 12s


In [46]:
model_version

'co_visit_task1_v10'

In [47]:
for data_pl in [train_candidate_pl, eval_candidate_pl,
                # test_candidate_pl
               ]:
    print(data_pl.select('rec_num').describe())

shape: (7, 2)
┌────────────┬────────────┐
│ describe   ┆ rec_num    │
│ ---        ┆ ---        │
│ str        ┆ f64        │
╞════════════╪════════════╡
│ count      ┆ 1.474219e6 │
│ null_count ┆ 0.0        │
│ mean       ┆ 104.28015  │
│ std        ┆ 101.29596  │
│ min        ┆ 0.0        │
│ max        ┆ 300.0      │
│ median     ┆ 64.0       │
└────────────┴────────────┘
shape: (7, 2)
┌────────────┬────────────┐
│ describe   ┆ rec_num    │
│ ---        ┆ ---        │
│ str        ┆ f64        │
╞════════════╪════════════╡
│ count      ┆ 326443.0   │
│ null_count ┆ 0.0        │
│ mean       ┆ 104.395646 │
│ std        ┆ 101.21737  │
│ min        ┆ 0.0        │
│ max        ┆ 300.0      │
│ median     ┆ 64.0       │
└────────────┴────────────┘


In [48]:
train_candidate_pl.filter(pl.col('rec_num')>0).write_parquet(os.path.join(base_dir, candidate_dir,
                                                        train_file_name)
                                          )

In [49]:
eval_candidate_pl.filter(pl.col('rec_num')>0).write_parquet(os.path.join(base_dir, candidate_dir,
                                                       eval_file_name))

### Test data 

In [50]:
test_candidate_pl = nic_rec(target_pl=test_pl, nic_model=next_items_pl)#.head().collect()
test_candidate_pl = test_candidate_pl.collect()
test_candidate_pl.filter(pl.col('rec_num')>0).write_parquet(os.path.join(base_dir, candidate_dir, 
                                                       test_file_name))

In [51]:
eval_file_name

'task1_eval_co_visit_task1_v10_True_top300.parquet'

### test data for task3 

In [52]:
test4task3_candidate_pl = nic_rec(target_pl=test4task3_pl, nic_model=next_items_pl)#.head().collect()
test4task3_candidate_pl = test4task3_candidate_pl.collect()
test4task3_candidate_pl.filter(pl.col('rec_num')>0).write_parquet(os.path.join(base_dir, candidate_dir,test4task3_file_name))

In [53]:
os.path.join(base_dir, candidate_dir,test4task3_file_name)

'../data/candidates/task1_test4task3_co_visit_task1_v10_True_top300.parquet'

## Save test result

In [54]:
predictions = test_pl.join(test_candidate_pl, how='left', on='session_id').collect()[['locale', 'next_item_prediction']].to_pandas()

TypeError: Expected 'other' join table to be a LazyFrame, not a DataFrame

In [None]:
submit_file

In [None]:
check_predictions(predictions, test_sessions=test_pl.collect().to_pandas(), 
                  # check_products=True, product_df=products
                 )
# Its important that the parquet file you submit is saved with pyarrow backend
predictions.to_parquet(submit_file, engine='pyarrow')

In [None]:
!aicrowd submission create -c task-1-next-product-recommendation -f {submit_file}

# Validate result 

In [None]:
# train_candidate_pl = pl.scan_parquet(os.path.join(base_dir, candidate_dir,
#                                                         train_file_name))
# eval_candidate_pl = pl.scan_parquet(os.path.join(base_dir, candidate_dir,
#                                                         eval_file_name))
# test_candidate_pl = pl.scan_parquet(os.path.join(base_dir, candidate_dir,
#                                                         test_file_name))

In [None]:
# for data_pl in [train_candidate_pl, eval_candidate_pl, test_candidate_pl]:
#     print(data_pl.select('rec_num').collect().describe())