# Package 

In [1]:
%load_ext autoreload
%autoreload 2
import sys
import logging
base_dir = '../'
sys.path.append(base_dir)
import os
import warnings
warnings.simplefilter('ignore')

import pickle
import gc
import re
import polars as pl
from collections import defaultdict, Counter

import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)
from tqdm.auto import tqdm
import polars as pl
from utils import *
from src.eval import model_eval
from src.config import raw_data_session_id_dir, candidate_dir, model_for_eval, candidate_file_name

In [2]:
# df_prod = pd.read_csv('data/products_train.csv')
# df_prod

# Config 

In [3]:
# Debug switch: when enabled, only a small sample of rows is read.
debug = False
n_rows = 1000 if debug else None
# debug_session_num = 100

# Identification of this run.
task = 'task1'
model_version = 'next_few_items_v1'
topn = 100

# Directory settings.
train_data_dir = '.'
test_data_dir = '.'
model_dir = f'../model_training/{model_version}/'

# target locales: locales needed for task1
target_locals = ["DE", 'JP', 'UK']

# Output locations; both names carry the `model_for_eval` flag so eval and
# full-data runs do not overwrite each other.
model_file = os.path.join(model_dir, f'nic_{model_for_eval}_for_eval.parquet')
submit_file = os.path.join(
    '../data/sub_files/',
    f'submission_{task}_{model_version}_{model_for_eval}_for_eval.parquet',
)

In [5]:

def _candidate_name(data_type):
    """Fill the shared candidate-file template for one data split."""
    return candidate_file_name.format(
        task=task,
        data_type=data_type,
        model_version=model_version,
        model_for_eval=model_for_eval,
        topn=topn,
    )


train_file_name = _candidate_name('train')
eval_file_name = _candidate_name('eval')
test_file_name = _candidate_name('test')
test4task3_file_name = _candidate_name('test4task3')

# One name per line, in the same order as before.
print(
    train_file_name,
    eval_file_name,
    test_file_name,
    test4task3_file_name,
    sep='\n',
)

task1_train_next_few_items_v1_True_top100.parquet
task1_eval_next_few_items_v1_True_top100.parquet
task1_test_next_few_items_v1_True_top100.parquet
task1_test4task3_next_few_items_v1_True_top100.parquet


In [6]:
test_file_name

'task1_test_next_few_items_v1_True_top100.parquet'

In [7]:
# Create the model output directory. Unlike a bare `mkdir`, this succeeds
# when the directory already exists, so the cell can be re-run safely.
os.makedirs(model_dir, exist_ok=True)

mkdir: cannot create directory ‘../model_training/next_few_items_v1/’: File exists


In [8]:
model_file

'../model_training/next_few_items_v1/nic_True_for_eval.parquet'

In [9]:
submit_file

'../data/sub_files/submission_task1_next_few_items_v1_True_for_eval.parquet'

# Data 

In [10]:
! ls ../{raw_data_session_id_dir}

product_unique2id.json		    sessions_test_task2_phase1.parquet
products_train.parquet		    sessions_test_task3.parquet
sessions_eval.parquet		    sessions_test_task3_phase1.parquet
sessions_test_task1.parquet	    sessions_train.parquet
sessions_test_task1_phase1.parquet  sessions_train1.parquet
sessions_test_task2.parquet	    sessions_train2.parquet


In [13]:
def _scan_sessions(file_name, restrict_locale=True):
    """Lazily scan one session parquet file and run `str2list` over `prev_items`.

    The file is read from `base_dir/raw_data_session_id_dir`, limited to
    `n_rows` rows. When `restrict_locale` is true, only rows whose `locale`
    is in `target_locals` are kept.
    """
    sessions = pl.scan_parquet(
        os.path.join(base_dir, raw_data_session_id_dir, file_name),
        n_rows=n_rows,
    )
    if restrict_locale:
        sessions = sessions.filter(pl.col('locale').is_in(target_locals))
    return sessions.with_columns(pl.col('prev_items').apply(str2list))


train_pl = _scan_sessions('sessions_train.parquet')
eval_pl = _scan_sessions('sessions_eval.parquet')
# The task-specific test file is loaded without the locale filter.
test_pl = _scan_sessions(f'sessions_test_{task}.parquet', restrict_locale=False)
test4task3_pl = _scan_sessions('sessions_test_task3.parquet')



# Function 

In [14]:
# 'item', 'next_item_prediction', 'next_item_weight'

In [15]:
def nic_rec(target_pl, nic_model, rec_num=100):
    """Recommend next items for each session from the session's last item.

    The last element of ``prev_items`` is left-joined against ``nic_model`` on
    its ``item`` column. The matched ``next_item_prediction`` list is cut to at
    most ``rec_num`` entries; sessions without a match get an empty list.

    Args:
        target_pl: polars frame with ``session_id`` and list-typed
            ``prev_items`` columns.
        nic_model: polars frame with ``item`` and list-typed
            ``next_item_prediction`` columns.
        rec_num: maximum number of candidates kept per session. Defaults to
            100, the value that used to be hard-coded.

    Returns:
        A frame with the columns ``session_id``, ``next_item_prediction`` and
        ``rec_num`` (the length of the candidate list).
    """
    final_cols = ['session_id', 'next_item_prediction', 'rec_num']
    target_pl = (
        target_pl
            .with_columns(
                # The lookup key is the most recent item of the session.
                pl.col('prev_items').arr.get(-1).alias('last_item')
            )
            .join(nic_model, how='left', left_on='last_item', right_on='item')
            .with_columns(
                # Unmatched items yield null after the left join; turn that into an empty list.
                pl.when(pl.col('next_item_prediction').is_null())
                .then([])
                .otherwise(pl.col('next_item_prediction').arr.head(rec_num))
                .alias('next_item_prediction')
            )
            .with_columns(
                pl.col('next_item_prediction').arr.lengths().alias('rec_num')
            )
            .select(
                final_cols
            )
    )
    return target_pl

# Next Item Statistics 

In [12]:
# Extend each `prev_items` list with the session's `next_item`.
append_next_item = pl.col('prev_items').arr.concat(pl.col('next_item'))

train_data = train_pl.with_columns(append_next_item)
# When the model is built for evaluation, the eval sessions keep only their
# history; otherwise their next item is appended as well.
eval_data = eval_pl if model_for_eval else eval_pl.with_columns(append_next_item)
test_data = test_pl

In [13]:
cols_to_keep = ['prev_items']

# Stack the item sequences of all three splits into a single frame.
all_train_data = pl.concat(
    [frame.select(cols_to_keep) for frame in (train_data, eval_data, test_data)],
    how='vertical',
)

In [14]:
# all_train_data.head().collect()

In [15]:
def get_cnt(row):
    """Enumerate every ordered item pair in a sequence with a distance-based weight.

    `row` must expose `to_list()`. For each pair of positions i < j the result
    contains `[item_i, item_j, weight]`, where weight is `1 / (j - i)` rounded
    to two decimals and rendered as a string. Pairs are ordered by i, then j.
    Sequences with fewer than two items give an empty list.
    """
    items = row.to_list()
    n_items = len(items)
    return [
        [items[first], items[second], str(round(1 / (second - first), 2))]
        for first in range(n_items - 1)
        for second in range(first + 1, n_items)
    ]
        

In [16]:
# Build the item -> ranked next-items lookup table. The pipeline is lazy:
# nothing is computed until `.collect()` is called on it.
next_items_pl = (
    all_train_data.with_columns(
        # Replace each item sequence with the [item, later_item, weight] triples from get_cnt.
        pl.col('prev_items').apply(lambda x: get_cnt(x))
    ).explode('prev_items')
    # After the explode, each row holds a single triple; unpack it into columns.
    .select(
        pl.col('prev_items').arr.get(0).alias('current_item')
        , pl.col('prev_items').arr.get(1).alias('next_item')

        # get_cnt emits the weight as a string, so cast it back to a number.
        , pl.col('prev_items').arr.get(2).alias('weight').cast(pl.Float32)
    ).groupby(['current_item', 'next_item'])
    # Total weight of every (current_item, next_item) pair over all sequences.
    .agg(
        pl.col('weight').sum()
    ).sort(['current_item', 'weight'], descending=True)
    # NOTE(review): the ordering inside the lists collected below relies on groupby
    # keeping the sorted row order within each group — confirm for the polars version in use.
    .groupby(['current_item'])
    .agg(
        pl.col('next_item')
        , pl.col('weight')
    )
    # Rename to the column names stored in the model file.
    .select(
        pl.col('current_item').alias('item')
        , pl.col('next_item').alias('next_item_prediction')
        , pl.col('weight').alias('next_item_weight')
    )
)

In [17]:
# next_items_pl.head().collect()

## Save model 

In [18]:
model_file

'../model_training/next_few_items_v1/nic_True_for_eval.parquet'

In [19]:
next_items_pl.collect().write_parquet(model_file)

In [20]:
del next_items_pl

## Read Model 

In [16]:
model_file

'../model_training/next_few_items_v1/nic_True_for_eval.parquet'

In [17]:
next_items_pl = pl.scan_parquet(model_file)

In [18]:
type_dict = next_items_pl.schema

In [19]:
type_dict.keys()

dict_keys(['item', 'next_item_prediction', 'next_item_weight'])

## Model eval 

In [17]:
# train_pl.schema

In [18]:
# eval_pl.schema

In [19]:
eval_candidate_pl = nic_rec(target_pl=eval_pl, nic_model=next_items_pl)#.head().collect()

In [20]:

# if_hit = pl.element().rank()
target_df = eval_pl.join(eval_candidate_pl, how='left', on='session_id')


# eval_final.head().collect()

In [21]:
# target_df.schema

In [22]:
model_eval(target_df=target_df)

total_sessions,mrr,recall@20,recall@100
u32,f64,f64,f64
326443,0.2489,0.4497,0.5054


## Candidate Saving 

### Train & eval 

In [31]:
train_candidate_pl = nic_rec(target_pl=train_pl, nic_model=next_items_pl)#.head().collect()

In [32]:
# eval_candidate_pl = nic_rec(target_pl=eval_pl, nic_model=next_items_pl)#.head().collect()

In [34]:
model_version

'next_few_items_v1'

In [13]:
for data_pl in [train_candidate_pl, eval_candidate_pl,
                # test_candidate_pl
               ]:
    print(data_pl.select('rec_num').collect().describe())

NameError: name 'eval_candidate_pl' is not defined

In [None]:
train_candidate_pl.collect().write_parquet(os.path.join(base_dir, candidate_dir,
                                                        train_file_name)
                                          )

In [None]:
eval_candidate_pl.collect().write_parquet(os.path.join(base_dir, candidate_dir,
                                                       eval_file_name))

### test data 

In [18]:
test_file_name

'task1_train_next_few_items_v1_True_top100.parquet'

In [21]:
test_cg_file = os.path.join(base_dir, candidate_dir, 
                                                       test_file_name)

test_cg_file

'../data/candidates/task1_test_next_few_items_v1_True_top100.parquet'

In [23]:
test_candidate_pl = nic_rec(target_pl=test_pl, nic_model=next_items_pl)#.head().collect()



In [None]:
# Persist the test candidates; `write_parquet` needs the destination path.
test_candidate_pl.collect().write_parquet(test_cg_file)

In [26]:
# test_candidate_pl.head().collect()

### test4task3

In [20]:
test2task3_cg_file = os.path.join(base_dir, candidate_dir, 
                                                       test4task3_file_name)

print(test2task3_cg_file)
test4task3_candidate_pl = nic_rec(target_pl=test4task3_pl, nic_model=next_items_pl)#.head().collect()
test4task3_candidate_pl.collect().write_parquet(test2task3_cg_file)

../data/candidates/task1_test4task3_next_few_items_v1_True_top100.parquet


## Save test result

In [None]:
# Attach the candidates to the test sessions and keep only the submission columns.
predictions = (
    test_pl
    .join(test_candidate_pl, how='left', on='session_id')
    .collect()[['locale', 'next_item_prediction']]
    .to_pandas()
)

In [None]:
submit_file

In [None]:
# Validate the predictions against the test sessions before writing the submission.
# NOTE(review): check_predictions comes from the `utils` star import; its exact checks are not visible here.
check_predictions(predictions, test_sessions=test_pl.collect().to_pandas(), 
                  # check_products=True, product_df=products
                 )
# It's important that the parquet file you submit is saved with the pyarrow backend
predictions.to_parquet(submit_file, engine='pyarrow')

In [None]:
# !aicrowd submission create -c task-1-next-product-recommendation -f {submit_file}

# Validate result 

In [14]:
# train_candidate_pl = pl.scan_parquet(os.path.join(base_dir, candidate_dir,
#                                                         train_file_name))
# eval_candidate_pl = pl.scan_parquet(os.path.join(base_dir, candidate_dir,
#                                                         eval_file_name))
# test_candidate_pl = pl.scan_parquet(os.path.join(base_dir, candidate_dir,
#                                                         test_file_name))

In [15]:
# for data_pl in [train_candidate_pl, eval_candidate_pl, test_candidate_pl]:
#     print(data_pl.select('rec_num').collect().describe())

shape: (7, 2)
┌────────────┬────────────┐
│ describe   ┆ rec_num    │
│ ---        ┆ ---        │
│ str        ┆ f64        │
╞════════════╪════════════╡
│ count      ┆ 2.946273e6 │
│ null_count ┆ 0.0        │
│ mean       ┆ 51.764814  │
│ std        ┆ 37.761931  │
│ min        ┆ 1.0        │
│ max        ┆ 100.0      │
│ median     ┆ 44.0       │
└────────────┴────────────┘
shape: (7, 2)
┌────────────┬────────────┐
│ describe   ┆ rec_num    │
│ ---        ┆ ---        │
│ str        ┆ f64        │
╞════════════╪════════════╡
│ count      ┆ 2.946273e6 │
│ null_count ┆ 0.0        │
│ mean       ┆ 51.764814  │
│ std        ┆ 37.761931  │
│ min        ┆ 1.0        │
│ max        ┆ 100.0      │
│ median     ┆ 44.0       │
└────────────┴────────────┘
shape: (7, 2)
┌────────────┬────────────┐
│ describe   ┆ rec_num    │
│ ---        ┆ ---        │
│ str        ┆ f64        │
╞════════════╪════════════╡
│ count      ┆ 2.946273e6 │
│ null_count ┆ 0.0        │
│ mean       ┆ 51.764814  │
│ std 