# Basic ideas 

1. Compute weighted counts of item pairs where item2 appears after item1 within the same session sequence.
2. Each pair's weight is discounted by the distance between the two items: 1/(idx2-idx1).
3. Next steps
    1. [x] normalized counts should be included 
    2. [x] train2 data should be included


Example: [a, b, c] -> [ab, ac, bc] -> 

```
[
--current_item, next_item, counts
[a, b, 1],
[a, c, 1/2],
[b, c, 1]
]
```

# Package 

In [1]:
%load_ext autoreload
%autoreload 2
import sys
import logging
base_dir = '../'
sys.path.append(base_dir)
import os
import warnings
warnings.simplefilter('ignore')

import pickle
import gc
import re
import polars as pl
from collections import defaultdict, Counter

import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)
from tqdm.auto import tqdm
import polars as pl
from utils import *
from src.eval import model_eval
from src.config import raw_data_session_id_dir, candidate_dir, model_for_eval, candidate_file_name

In [2]:
# df_prod = pd.read_csv('data/products_train.csv')
# df_prod

# Config 

In [3]:
# Notebook configuration: task selection, model version and output paths.
debug = False


task = 'task1'
version = 'v11'
model_version = f'nfi_{task}_{version}'

# target locales: locales needed for the selected task
if task == 'task1':
    target_locals = ['DE', 'JP', 'UK']
elif task == 'task2':
    target_locals = ['ES', 'FR', 'IT']
else:
    # Fail with an explicit, descriptive error: a bare `assert 1 == 0` carries no
    # message and is removed entirely when Python runs with -O.
    raise ValueError(f"Unsupported task {task!r}; expected 'task1' or 'task2'")

# Default number of candidates kept per session (used as nic_rec's rec_num default).
topn = 100
# In debug mode only the first 1000 rows of each parquet file are scanned.
n_rows = 1000 if debug else None
# debug_session_num = 100
train_data_dir = '.'
test_data_dir = '.'

model_dir = f'../model_training/{model_version}/'

# if model_for_eval:
model_file = os.path.join(model_dir, f'nic_{model_for_eval}_for_eval.parquet')
submit_file = os.path.join('../data/sub_files/', f'submission_{task}_{model_version}_{model_for_eval}_for_eval.parquet')

In [4]:
# Candidate file names for every data split; they differ only in `data_type`.
train_file_name, eval_file_name, test_file_name, test4task3_file_name = (
    candidate_file_name.format(
        task=task,
        data_type=data_type,
        model_version=model_version,
        model_for_eval=model_for_eval,
        topn=topn,
    )
    for data_type in ('train', 'eval', 'test', 'test4task3')
)
# One name per line, same output as four separate print calls.
print(train_file_name, eval_file_name, test_file_name, test4task3_file_name, sep='\n')

task1_train_nfi_task1_v11_True_top100.parquet
task1_eval_nfi_task1_v11_True_top100.parquet
task1_test_nfi_task1_v11_True_top100.parquet
task1_test4task3_nfi_task1_v11_True_top100.parquet


In [5]:
test_file_name

'task1_test_nfi_task1_v11_True_top100.parquet'

In [6]:
# Create the model output directory (including any missing parents). Unlike a
# bare `mkdir`, this does not error when the directory already exists, so the
# cell is safe under Restart & Run All.
os.makedirs(model_dir, exist_ok=True)

In [7]:
model_file

'../model_training/nfi_task1_v11/nic_True_for_eval.parquet'

In [8]:
submit_file

'../data/sub_files/submission_task1_nfi_task1_v11_True_for_eval.parquet'

# Data 

In [9]:
! ls ../{raw_data_session_id_dir}

product_unique2id.json		    sessions_test_task2_phase1.parquet
products_train.parquet		    sessions_test_task3.parquet
sessions_eval.parquet		    sessions_test_task3_phase1.parquet
sessions_test_task1.parquet	    sessions_train.parquet
sessions_test_task1_phase1.parquet  sessions_train1.parquet
sessions_test_task2.parquet	    sessions_train2.parquet


In [10]:
def _scan_sessions(file_name, restrict_locale=True):
    """Lazily scan one session parquet file from the raw-data directory.

    Reads at most `n_rows` rows, optionally keeps only rows whose `locale` is in
    `target_locals`, and converts `prev_items` with `str2list`
    (presumably string -> list of item ids; `str2list` comes from `utils`).
    """
    sessions = pl.scan_parquet(os.path.join(base_dir, raw_data_session_id_dir, file_name), n_rows=n_rows)
    if restrict_locale:
        sessions = sessions.filter(pl.col('locale').is_in(target_locals))
    return sessions.with_columns(pl.col('prev_items').apply(str2list))


train_pl = _scan_sessions('sessions_train1.parquet')

train2_pl = _scan_sessions('sessions_train2.parquet')

eval_pl = _scan_sessions('sessions_eval.parquet')

# df_sess.head(3).collect()
# The task-specific test file is the only one loaded without a locale filter.
test_pl = _scan_sessions(f'sessions_test_{task}.parquet', restrict_locale=False)
test4task3_pl = _scan_sessions('sessions_test_task3.parquet')



In [11]:
# test_phase1 = pl.scan_parquet(os.path.join(base_dir, raw_data_session_id_dir, f'sessions_test_{task}_phase1.parquet'))

In [12]:
# test_pl.select('locale').collect().to_series().value_counts()

In [13]:
# test_phase1.select('locale').collect().to_series().value_counts()

In [14]:
# test_csv = pl.scan_csv('../data/raw_data/sessions_test_task1.csv')
# test_csv_phase1 = pl.scan_csv('../data/raw_data/sessions_test_task3.csv')

In [15]:
# test_csv.select('locale').collect().to_series().value_counts()

In [16]:
# test_csv_phase1.select('locale').collect().to_series().value_counts()

In [17]:
# test_csv = pl.scan_csv('../sessions_test_task3.csv')


In [18]:
# test_csv.select('locale').collect().to_series().value_counts()

# Function 

In [19]:
# 'item', 'next_item_prediction', 'next_item_weight'

In [20]:
def nic_rec(target_pl, nic_model, rec_num=topn):
    """Look up next-item candidates for each session from its last viewed item.

    Args:
        target_pl: session frame with `session_id` and a list column `prev_items`.
        nic_model: frame keyed by `item` that provides a list column
            `next_item_prediction`.
        rec_num: maximum number of candidates kept per session. The default is
            the value of `topn` at the time this function is defined.

    Returns:
        The joined frame reduced to `session_id`, `next_item_prediction`
        (an empty list when the last item has no match in the model) and
        `rec_num` (length of the candidate list).
    """
    # The last element of the session history is the lookup key into the model.
    last_item_expr = pl.col('prev_items').arr.get(-1).alias('last_item')

    # Unmatched sessions get [] instead of null; matched ones are truncated.
    candidates = pl.col('next_item_prediction')
    capped_candidates = (
        pl.when(candidates.is_null())
        .then([])
        .otherwise(candidates.arr.head(rec_num))
        .alias('next_item_prediction')
    )
    candidate_count = pl.col('next_item_prediction').arr.lengths().alias('rec_num')

    with_last_item = target_pl.with_columns(last_item_expr)
    joined = with_last_item.join(nic_model, how='left', left_on='last_item', right_on='item')
    capped = joined.with_columns(capped_candidates)
    counted = capped.with_columns(candidate_count)
    return counted.select(['session_id', 'next_item_prediction', 'rec_num'])

# Next Item Statistics 

In [21]:
# model_for_eval

In [22]:
# train1 sessions contribute their full sequence: history plus the observed next_item.
train_data = train_pl.with_columns(pl.col('prev_items').arr.concat(pl.col('next_item')))

# When the model is built for evaluation, eval sessions contribute only their
# history; otherwise their next_item is appended as well.
eval_data = (
    eval_pl
    if model_for_eval
    else eval_pl.with_columns(pl.col('prev_items').arr.concat(pl.col('next_item')))
)

# train2 and test sessions are used as loaded (history only).
train2_data = train2_pl
test_data = test_pl

In [23]:
cols_to_keep = ['prev_items']

# Stack the item sequences of every source into one frame for pair counting.
all_train_data = pl.concat(
    [frame.select(cols_to_keep) for frame in (train_data, eval_data, train2_data, test_data)],
    how='vertical',
)

In [24]:
# all_train_data.head().collect()

In [25]:
def get_cnt(row):
    """Expand one session into distance-weighted (earlier item, later item) pairs.

    Args:
        row: object exposing `to_list()` that yields the session's items in order.

    Returns:
        A list of `[item1, item2, weight]` triples, one for every pair where
        item2 comes after item1. `weight` is `1 / (idx2 - idx1)` rounded to two
        decimals and rendered as a string. Sessions with fewer than two items
        give an empty list.
    """
    items = row.to_list()
    n_items = len(items)
    return [
        [items[left], items[right], str(round(1 / (right - left), 2))]
        for left in range(n_items - 1)
        for right in range(left + 1, n_items)
    ]
        

In [26]:
# Build the next-item model: for every item, the items observed after it in any
# session, with their summed distance-discounted weights and a per-item
# normalized version of those weights.
next_items_pl = (
    all_train_data.with_columns(
        # Each session becomes a list of [current_item, next_item, weight] triples.
        pl.col('prev_items').apply(lambda x: get_cnt(x))
    ).explode('prev_items')  # one row per triple
    .select(
        pl.col('prev_items').arr.get(0).alias('current_item')
        , pl.col('prev_items').arr.get(1).alias('next_item')

        # get_cnt emits the weight as a string, so cast it back to a float here.
        , pl.col('prev_items').arr.get(2).alias('weight').cast(pl.Float32)
    ).groupby(['current_item', 'next_item'])
    .agg(
        # Total weight of each (current_item, next_item) pair over all sessions.
        pl.col('weight').sum()
    ).sort(['current_item', 'weight'], descending=True)
    .with_columns(
        pl.col('weight').max().over('current_item').alias('max_weight')
        # The minimum is fixed at 0 rather than taken from the data.
        , pl.lit(0).alias('min_weight')
    )
    .with_columns(
        # Min-max scaling per current_item; with min_weight == 0 this is weight / max_weight.
        # When max_weight equals min_weight the normalized weight is set to 1.
        pl.when(pl.col('max_weight')==pl.col('min_weight')).then(1).otherwise((pl.col('weight')-pl.col('min_weight'))/(pl.col('max_weight')-pl.col('min_weight'))).alias('normalized_weight')
    )
    # Collapse to one row per current_item with list-valued columns.
    # NOTE(review): presumably each list keeps the preceding descending-weight
    # order (nic_rec takes the head of the list) — confirm for the polars version in use.
    .groupby(['current_item'])
    .agg(
        pl.col('next_item')
        , pl.col('weight')
        ,  pl.col('normalized_weight')
    )
    # Rename to the column names that nic_rec joins on and selects.
    .select(
        pl.col('current_item').alias('item')
        , pl.col('next_item').alias('next_item_prediction')
        , pl.col('weight').alias('next_item_weight')
        , pl.col('normalized_weight').alias('next_item_normalized_weight')
    )
)

In [None]:
next_items_pl.head().collect()

## Save model 

In [None]:
model_file

In [None]:
# Execute the lazy pipeline and persist the resulting model to `model_file`.
next_items_pl.collect().write_parquet(model_file)

In [None]:
del next_items_pl

## Read Model 

In [None]:
model_file

In [None]:
next_items_pl = pl.scan_parquet(model_file)

In [None]:
type_dict = next_items_pl.schema

In [None]:
type_dict.keys()

In [None]:
next_items_pl.head().collect()

## Model eval 

In [None]:
# train_pl.schema

In [37]:
# eval_pl.select('locale').collect()

In [38]:
eval_candidate_pl = nic_rec(target_pl=eval_pl, nic_model=next_items_pl)#.head().collect()

In [39]:

# if_hit = pl.element().rank()
# Attach the generated candidates (and rec_num) to the eval sessions by session_id;
# the result is what gets passed to model_eval.
target_df = eval_pl.join(eval_candidate_pl, how='left', on='session_id')


# eval_final.head().collect()

In [40]:
# eval_pl.select('locale').collect().to_series().value_counts()

In [44]:
model_eval(target_df=target_df)

total_sessions,mrr,recall@20,recall@100
u32,f64,f64,f64
326443,0.2345,0.4298,0.4827


In [43]:
target_df.select('rec_num').collect().describe()

describe,rec_num
str,f64
"""count""",326443.0
"""null_count""",0.0
"""mean""",48.331822
"""std""",37.862834
"""min""",0.0
"""max""",100.0
"""median""",38.0


In [None]:
# model_eval(target_df=target_df)

## Candidate Saving 

### Train & eval 

In [None]:
# "Train" candidates are generated for the train2 sessions (sessions_train2.parquet),
# not for train1: train2 entered the model with history only (train2_data = train2_pl).
train_candidate_pl = nic_rec(target_pl=train2_pl, nic_model=next_items_pl)#.head().collect()

In [46]:
eval_candidate_pl = nic_rec(target_pl=eval_pl, nic_model=next_items_pl)#.head().collect()

In [47]:
model_version

'nfi_task1_v10'

In [48]:
# Summary statistics of the number of candidates per session, for each split.
for data_pl in [train_candidate_pl, eval_candidate_pl,
                # test_candidate_pl
               ]:
    print(data_pl.select('rec_num').collect().describe())

shape: (7, 2)
┌────────────┬────────────┐
│ describe   ┆ rec_num    │
│ ---        ┆ ---        │
│ str        ┆ f64        │
╞════════════╪════════════╡
│ count      ┆ 1.474219e6 │
│ null_count ┆ 0.0        │
│ mean       ┆ 48.25649   │
│ std        ┆ 37.873834  │
│ min        ┆ 0.0        │
│ max        ┆ 100.0      │
│ median     ┆ 38.0       │
└────────────┴────────────┘
shape: (7, 2)
┌────────────┬───────────┐
│ describe   ┆ rec_num   │
│ ---        ┆ ---       │
│ str        ┆ f64       │
╞════════════╪═══════════╡
│ count      ┆ 326443.0  │
│ null_count ┆ 0.0       │
│ mean       ┆ 48.331822 │
│ std        ┆ 37.862834 │
│ min        ┆ 0.0       │
│ max        ┆ 100.0     │
│ median     ┆ 38.0      │
└────────────┴───────────┘


In [49]:
train_file_name

'task1_train_nfi_task1_v10_True_top100.parquet'

In [None]:
# Persist the train candidates; sessions without any candidate (rec_num == 0) are dropped.
train_candidate_pl.filter(pl.col('rec_num')>0).collect().write_parquet(os.path.join(base_dir, candidate_dir,
                                                        train_file_name)
                                          )

In [51]:
eval_file_name

'task1_eval_nfi_task1_v10_True_top100.parquet'

In [52]:
eval_candidate_pl.filter(pl.col('rec_num')>0).collect().write_parquet(os.path.join(base_dir, candidate_dir,
                                                       eval_file_name))

### test data 

In [53]:
test_cg_file = os.path.join(base_dir, candidate_dir, 
                                                       test_file_name)

test_cg_file

'../data/candidates/task1_test_nfi_task1_v10_True_top100.parquet'

In [54]:
test_candidate_pl = nic_rec(target_pl=test_pl, nic_model=next_items_pl)#.head().collect()

In [55]:
test_candidate_pl.filter(pl.col('rec_num')>0).collect().write_parquet(test_cg_file)

In [56]:
# test_candidate_pl.collect().shape

In [57]:
# test_candidate_pl.tail().collect()

In [58]:
# ! ls ../data/candidates/ | grep task2

### test4task3 

In [59]:
# Output path for the task-3 test candidates.
# NOTE(review): the variable is spelled `test2task3_...` while the related names
# use `test4task3` — presumably a typo; left unchanged here.
test2task3_cg_file = os.path.join(base_dir, candidate_dir, 
                                                       test4task3_file_name)

# This bare expression is not the last statement of the cell, so it displays nothing.
test2task3_cg_file
# Generate candidates for the task-3 test sessions and save those with at least one candidate.
test4task3_candidate_pl = nic_rec(target_pl=test4task3_pl, nic_model=next_items_pl)#.head().collect()
test4task3_candidate_pl.filter(pl.col('rec_num')>0).collect().write_parquet(test2task3_cg_file)

In [60]:
! ls ../data/candidates/ | grep test4task

task1_test4task3_co_visit_True_top300.parquet
task1_test4task3_next_few_items_v1_True_top100.parquet
task1_test4task3_nfi_task1_v10_True_top100.parquet
task1_test4task3_nic_True_top100.parquet
task1_test4task3_w2v_task1_v10_True_top100.parquet
task1_test4task3_w2v_v3_True_top200.parquet
task2_test4task3_co_visit_task2_True_top300.parquet
task2_test4task3_co_visit_task2_v2_True_top300.parquet
task2_test4task3_nfi_task2_True_top100.parquet
task2_test4task3_nfi_task2_v2_True_top100.parquet
task2_test4task3_nic_task2_True_top100.parquet
task2_test4task3_nic_task2_v2_True_top100.parquet
task2_test4task3_w2v_task2_True_top200.parquet
task2_test4task3_w2v_task2_v2_True_top100.parquet


## Save inference result

In [159]:
# Build the submission frame: join the (unfiltered) test candidates back onto the
# test sessions and keep only locale and next_item_prediction, as a pandas DataFrame.
# nic_rec returns an empty list, not null, for sessions whose last item is not in the model.
predictions = test_pl.join(test_candidate_pl, how='left', on='session_id').collect()[['locale', 'next_item_prediction']].to_pandas()

In [160]:
submit_file

'../data/sub_files/submission_task2_nfi_task2_True_for_eval.parquet'

In [161]:
test_candidate_pl.collect().shape

(34691, 3)

In [162]:
predictions.shape

(34691, 2)

In [163]:
predictions['locale'].value_counts()

locale
IT    13992
FR    12521
ES     8177
Name: count, dtype: int64

In [164]:
# predictions.select('locale').collect().to_series().value_counts()

In [165]:
test_pl.collect().head()

prev_items,locale,session_id
list[str],str,i64
"[""B07GTS7SWK"", ""B07GTS7SWK""]","""ES""",4331306
"[""B0B33YWVHR"", ""849988993X"", … ""B09K7TDY1H""]","""ES""",4331307
"[""B08FMPXDTJ"", ""B0B4612MTM"", … ""B0B45YR21M""]","""ES""",4331308
"[""B07R3W4XQ7"", ""B07R3FB5B5""]","""ES""",4331309
"[""B09V4KBWPL"", ""B09G9FTLPB""]","""ES""",4331310


In [166]:
predictions.head()

Unnamed: 0,locale,next_item_prediction
0,ES,"[B07GTS7SWK, B0745795KF, B074581R8Y, B074574XP..."
1,ES,"[B09M8LNB61, B09K7TDY1H, B09J4T4JF5, B09NT33LZ..."
2,ES,"[B0B4612MTM, B0B461V1RB, B0B4614XL6, B0B461KYQ..."
3,ES,"[B07R4VG9X3, B07R3W4XQ7, B07R4WKSSV, B07R3FB5B..."
4,ES,"[B09G9LF91K, B09G97SLGS, B09G99D95Q, B09G9DMQ7..."


In [167]:
check_predictions(predictions, test_sessions=test_pl.collect().to_pandas(), 
                  # check_products=True, product_df=products
                 )
# Its important that the parquet file you submit is saved with pyarrow backend


In [168]:
submit_file

'../data/sub_files/submission_task2_nfi_task2_True_for_eval.parquet'

In [169]:
predictions.to_parquet(submit_file, engine='pyarrow')

In [170]:
submit_file

'../data/sub_files/submission_task2_nfi_task2_True_for_eval.parquet'

In [171]:
# Upload the submission file with the aicrowd CLI.
# NOTE(review): the challenge slug is hard-coded to the task-2 challenge while the
# config cell sets task = 'task1' — confirm the slug matches `task` before running.
!aicrowd submission create -c task-2-next-product-recommendation-for-underrepresented-languages -f {submit_file}

[?25l[1;34msubmission_task2_nfi_task2_True_for_eval.parquet[0m [90m━━━━━[0m [35m0.0%[0m • [32m0.0/…[0m • [31m?[0m • [36m-:--:…[0m
[2K[1A[2K[1;34msubmission_task2_nfi_task2_True_for_eval.parquet[0m [90m━━━━━[0m [35m0.0%[0m • [32m0.0/…[0m • [31m?[0m • [36m-:--:…[0m
[2K[1A[2K[1;34msubmission_task2_nfi_task2_True_for_eval.parquet[0m [90m━━━━━[0m [35m0.0%[0m • [32m0.0/…[0m • [31m?[0m • [36m-:--:…[0m
[2K[1A[2K[1;34msubmission_task2_nfi_task2_True_for_eval.parquet[0m [90m━━━━━[0m [35m0.0%[0m • [32m0.0/…[0m • [31m?[0m • [36m-:--:…[0m
[2K[1A[2K[1;34msubmission_task2_nfi_task2_True_for_eval.parquet[0m [90m━━━━━[0m [35m0.0%[0m • [32m0.0/…[0m • [31m?[0m • [36m-:--:…[0m
[2K[1A[2K[1;34msubmission_task2_nfi_task2_True_for_eval.parquet[0m [90m━━━━━[0m [35m0.3%[0m • [32m0.0/…[0m • [31m?[0m • [36m-:--:…[0m
[2K[1A[2K[1;34msubmission_task2_nfi_task2_True_for_eval.parquet[0m [90m━━━━[0m [35m0.5%[0m • [32m0.0…

# Validate result 

In [14]:
# train_candidate_pl = pl.scan_parquet(os.path.join(base_dir, candidate_dir,
#                                                         train_file_name))
# eval_candidate_pl = pl.scan_parquet(os.path.join(base_dir, candidate_dir,
#                                                         eval_file_name))
# test_candidate_pl = pl.scan_parquet(os.path.join(base_dir, candidate_dir,
#                                                         test_file_name))

In [15]:
# for data_pl in [train_candidate_pl, eval_candidate_pl, test_candidate_pl]:
#     print(data_pl.select('rec_num').collect().describe())

shape: (7, 2)
┌────────────┬────────────┐
│ describe   ┆ rec_num    │
│ ---        ┆ ---        │
│ str        ┆ f64        │
╞════════════╪════════════╡
│ count      ┆ 2.946273e6 │
│ null_count ┆ 0.0        │
│ mean       ┆ 51.764814  │
│ std        ┆ 37.761931  │
│ min        ┆ 1.0        │
│ max        ┆ 100.0      │
│ median     ┆ 44.0       │
└────────────┴────────────┘
shape: (7, 2)
┌────────────┬────────────┐
│ describe   ┆ rec_num    │
│ ---        ┆ ---        │
│ str        ┆ f64        │
╞════════════╪════════════╡
│ count      ┆ 2.946273e6 │
│ null_count ┆ 0.0        │
│ mean       ┆ 51.764814  │
│ std        ┆ 37.761931  │
│ min        ┆ 1.0        │
│ max        ┆ 100.0      │
│ median     ┆ 44.0       │
└────────────┴────────────┘
shape: (7, 2)
┌────────────┬────────────┐
│ describe   ┆ rec_num    │
│ ---        ┆ ---        │
│ str        ┆ f64        │
╞════════════╪════════════╡
│ count      ┆ 2.946273e6 │
│ null_count ┆ 0.0        │
│ mean       ┆ 51.764814  │
│ std 