# Package 

In [1]:
%load_ext autoreload
%autoreload 2
import sys
import logging
sys.path.append('../')
import os
import warnings
warnings.simplefilter('ignore')

import pickle
import gc
import re
import polars as pl
from collections import defaultdict, Counter

import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)
from tqdm.auto import tqdm
import polars as pl
from utils import *
from src.eval import get_recall_at_k, pd_get_recall_at_k

In [2]:
# df_prod = pd.read_csv('data/products_train.csv')
# df_prod

# Config 

In [3]:
debug = False
model_for_eval = False

model_version = 'next_item_counter_v2'


if debug:
    n_rows = 1000
else:
    n_rows = None
# debug_session_num = 100
train_data_dir = '.'
test_data_dir = '.'
task = 'task1'

model_dir = f'../model_training/{model_version}/'

# target locales: locales needed for task1
target_locals = ["DE", 'JP', 'UK']


submit_file = f'submission_{task}_next_item_counter.parquet'

if model_for_eval:
    model_file = os.path.join(model_dir, 'nic_model_for_eval.parquet')
else:
    model_file = os.path.join(model_dir, 'nic_model.parquet')

In [4]:
! mkdir {model_dir}

mkdir: cannot create directory ‘../model_training/next_item_counter_v2/’: File exists


In [5]:
model_file

'../model_training/next_item_counter_v2/nic_model.parquet'

# Data 

In [7]:
# products = read_product_data(train_data_dir=train_data_dir)

In [8]:
df_sess = pl.scan_csv('sessions_train.csv', n_rows=n_rows).filter(pl.col('locale').is_in(target_locals)).with_columns(pl.col('prev_items').apply(str2list))
df_sess.head(3).collect()

prev_items,next_item,locale
list[str],str,str
"[""B09W9FND7K"", ""B09JSPLN1M""]","""B09M7GY217""","""DE"""
"[""B076THCGSG"", ""B007MO8IME"", … ""B001B4TKA0""]","""B001B4THSA""","""DE"""
"[""B0B1LGXWDS"", ""B00AZYORS2"", … ""B00AZYORS2""]","""B0767DTG2Q""","""DE"""


In [9]:
df_test = pl.scan_csv('sessions_test_task1.csv', n_rows=n_rows).with_columns(pl.col('prev_items').apply(str2list))
df_test

In [10]:
# df_sess = df_sess[df_sess['locale'].isin(target_locals)]

# if debug:
#     df_sess = df_sess.sample(debug_session_num)
#     df_test = df_test.sample(debug_session_num)

# Next Item Statistics 

In [11]:
train_pl = df_sess.with_columns(
    pl.col('prev_items').arr.concat(pl.col('next_item')) 
)
test_pl = df_test

In [12]:
# test_pl.head().collect()

In [13]:
# if not model_for_eval:


cols_to_keep = ['prev_items']
next_items_pl = (
    pl.concat([train_pl.select(cols_to_keep), test_pl.select(cols_to_keep)], how='vertical')
        .with_columns(
            pl.col('prev_items').arr.shift(-1).alias('next_item_lst')
            , pl.col('prev_items').arr.lengths().alias('length')
        )
        .select(
            pl.col('prev_items').arr.head(pl.col('length')-1).alias('prev')
            , pl.col('next_item_lst').arr.head(pl.col('length')-1).alias('next')
        )
        .explode(['prev','next' ])
        .groupby(['prev','next' ])
        .agg(
            pl.count().alias('cnt')
        )
        .sort(['prev', 'cnt'], descending=True)
        .with_columns(
            pl.col('cnt').max().over('prev').alias('max_count')
            , pl.col('cnt').min().over('prev').alias('min_count')
        )
        .with_columns(
            pl.when(pl.col('max_count')==pl.col('min_count')).then(1).otherwise((pl.col('cnt')-pl.col('min_count'))/(pl.col('max_count')-pl.col('min_count'))).alias('normalized_cnt')
        )
        .groupby('prev')
        .agg(
            pl.col('next').alias('next_item_prediction')
            , pl.col('cnt').alias('next_item_cnt')
            , (pl.col('normalized_cnt')+pl.lit(0.01)).alias('next_item_weight')
        )
        .select(
            pl.col('prev').alias('item')
            , 'next_item_prediction'
            , pl.col('next_item_weight')
        )
        
)

In [14]:
# next_items_pl.collect()#.filter(pl.col('item')=='B09LXX1PQ9')

In [15]:
# next_item_dict = defaultdict(list)

# for _, row in tqdm(df_sess.iterrows(), total=len(df_sess)):
#     prev_items = str2list(row['prev_items'])
#     if not model_for_eval:
#         next_item = row['next_item']
#     prev_items_length = len(prev_items)
#     if prev_items_length <= 1:
#         if not model_for_eval:
#             next_item_dict[prev_items[0]].append(next_item)
#     else:
#         for i, item in enumerate(prev_items[:-1]):
#             next_item_dict[item].append(prev_items[i+1])
#         if not model_for_eval:
#             next_item_dict[prev_items[-1]].append(next_item)

In [16]:
# next_item_dict

In [17]:
# for _, row in tqdm(df_test.iterrows(), total=len(df_test)):
#     prev_items = str2list(row['prev_items'])
#     prev_items_length = len(prev_items)
#     if prev_items_length <= 1:
#         continue
#     else:
#         for i, item in enumerate(prev_items[:-1]):
#             next_item_dict[item].append(prev_items[i+1])

In [18]:
# # next_item_map = {}
# topn = 100
# item_lst = []
# common_items_lst = []
# weights_lst = []
# for item in tqdm(next_item_dict):
#     counter = Counter(next_item_dict[item])
#     most_common_cnt = counter.most_common(1)[0][1]
#     most_common_lst = list(zip(*counter.most_common(topn)))
#     most_common_lst[1] = list(np.array(most_common_lst[1])/most_common_cnt)
#     item_lst.append(item)
#     common_items_lst.append(list(most_common_lst[0]))
#     weights_lst.append(most_common_lst[1])
#     # next_item_map[item] = most_common_lst
#     # next_item_map[item] = [i[0] for i in counter.most_common(100)]

In [19]:
# next_item_df = pd.DataFrame(
#     {'item': item_lst
#     , 'next_item_prediction': common_items_lst
#      , 'next_item_weight': weights_lst
#     }
# )

In [20]:
# next_item_df.sample(10)

In [21]:
# next_items_pl.collect().filter(pl.col('item')=='B07QGW8LFT')

In [22]:
# nex_item_pl = pl.from_pandas(next_item_df).lazy().select(
#     'item'
#     , pl.col('next_item_prediction').alias('next_item_rec')
#     , 'next_item_weight'
# )

In [23]:
# with open('../model_training/next_item_counter/model.pkl', 'rb') as f:
#     model = pickle.load(f)

In [24]:
# model.keys()

In [25]:
# model['next_item_map']

In [26]:
# nex_item_pl = pl.DataFrame(
#     {
#         'item': model['next_item_map'].keys()
#         , 'next_item_rec': model['next_item_map'].values()
#     }
# ).lazy()

In [35]:
next_items_pl.collect().shape

(1270206, 3)

## Save model 

In [28]:
next_items_pl.collect().write_parquet(model_file)

In [29]:
del next_items_pl

## Read Model 

In [30]:
next_items_pl = pl.scan_parquet(model_file)

In [31]:
next_items_pl.schema

{'item': Utf8,
 'next_item_prediction': List(Utf8),
 'next_item_weight': List(Float64)}

# Top200 for fallback logics

In [32]:
# next_item_df.head()

In [33]:
# next_item_df[cols].info()

In [34]:
popular_df = pd.concat([df_sess[['prev_items', 'locale']], df_test[['prev_items', 'locale']]], axis=0)

TypeError: 'LazyFrame' object is not subscriptable (aside from slicing). Use 'select()' or 'filter()' instead.

In [None]:
popular_df.shape

In [None]:
popular_pl = pl.from_pandas(popular_df).lazy()

In [None]:
topn = 200
locale_popular_pl = (
    popular_pl
        .select(
            pl.col('prev_items').apply(str2list)#.explode().alias('item')
            , pl.col('locale')
        )
        .explode('prev_items')#.alias('item')
        .groupby(['locale', 'prev_items'])
        .agg(
            pl.count()
        )
        .with_columns(
            pl.col('count').rank(method='ordinal', descending=True).over('locale').alias('rank')
        )
        .filter(pl.col('rank')<=topn)
        .with_columns(
            pl.col('count').max().over('locale').alias('max_count')
            , pl.col('count').min().over('locale').alias('min_count')
        )
        .with_columns(
            ((pl.col('count')-pl.col('min_count'))/(pl.col('max_count')-pl.col('min_count'))).alias('weight')
        )
        .sort('locale', 'rank')
        .select(
            'locale'
            , 'prev_items'
            , 'weight'
        )
        .groupby('locale')
        .agg(
            pl.col('weight').alias('locale_popular_weight')
            , pl.col('prev_items').alias('locale_popular_rec')
        )
        # .count()#.head(3).collect())
        # .collect()
)

In [None]:
# locale_popular_pl.collect()

In [None]:
# locale_popular_pl.schema

In [None]:
# popular_df.apply(lambda x: str2list(x['prev_items']), axis=1)

In [None]:
# df_sess.head()

In [None]:
# df_test.head()

In [None]:
# # next_item_df['next_item_prediction'] = next_item_df['next_item_prediction'].astype(str)
# # next_item_df['next_item_weights'] = next_item_df['next_item_weights'].astype(str)
# cols = [
#     # 'item',
#         'next_item_prediction'
#         , 'next_item_weights'
#        ]
# next_item_pl = pl.from_pandas(next_item_df[cols])

In [None]:
# next_item_pl

In [None]:
# # k = []
# # v = []

# # for item in next_item_dict:
# #     k.append(item)
# #     v.append(next_item_dict[item])
    
# # df_next = pd.DataFrame({'item': k, 'next_item': v})
# df_next = next_item_df.explode('next_item_prediction').reset_index(drop=True)
# df_next = df_next.merge(products, how='left', left_on='item', right_on='id')
# df_next

In [None]:
# df_next['next_item'].value_counts().index.tolist()[:200]

In [None]:
# model = {
#     'top200': top200
#     , 'next_item_map': next_item_map
# }

## Save model 

In [None]:
# model_file

In [None]:
# with open(model_file, 'wb') as f:
#     pickle.dump(model, f)

# Get final result 

## Load Model 

In [None]:
# # with open(model_file, 'rb') as f:
#     model = pickle.load(f)

In [None]:
# top200

In [None]:
# next_item_map

In [None]:
# def get_rec(target_df, model):
#     next_item_map = model['next_item_map']
#     top200  = model['top200']
#     target_df['last_item'] = target_df['prev_items'].apply(lambda x: str2list(x)[-1])
#     target_df['next_item_prediction'] = target_df['last_item'].map(next_item_map)
#     preds = []

#     for _, row in tqdm(target_df.iterrows(), total=len(target_df)):
#         pred_orig = row['next_item_prediction']
#         pred = pred_orig
#         prev_items = str2list(row['prev_items'])
#         if type(pred) == float:
#             pred = top200[:100]
#         else:
#             if len(pred_orig) < 100:
#                 for i in top200:
#                     if i not in pred_orig and i not in prev_items:
#                         pred.append(i)
#                     if len(pred) >= 100:
#                         break
#             else:
#                 pred = pred[:100]
#         preds.append(pred)
#     target_df['next_item_prediction'] = preds
#     print(target_df['next_item_prediction'].apply(len).describe())
#     return target_df

In [None]:
# model.keys()

In [None]:
# model['next_item_map']

# Candidate for train data 

In [None]:
# train_pl = pl.scan_csv('sessions_train.csv')
train_pl = pl.scan_parquet('../data/eval_data/next_item_counter_train_eval_300k.parquet')

In [None]:
target_locals

In [None]:
train_pl.schema

In [None]:
train_pl = (
    train_pl
        .filter(pl.col('locale').is_in(target_locals))
        .with_columns(
            pl.col('prev_items').apply(str2list).arr.get(-1).alias('last_item')
        )
        .join(nex_item_pl, how='left', left_on='last_item', right_on='item')
        .with_columns(
            pl.when(pl.col('next_item_rec').is_null()).then([]).otherwise(pl.col('next_item_rec').arr.head(100)).alias('next_item_prediction')
        )
        .with_columns(
            pl.col('next_item_prediction').arr.lengths().alias('rec_num')
        )
        .select(
            'prev_items'
            , 'next_item'
            , 'locale'
            , 'next_item_prediction'
            , 'rec_num'
        )
)#.head(2).collect()

In [None]:
train_pl.collect().write_parquet('../data/candidates/task1_train_nic_without_pupular_top100_300k.parquet')

# Final resul 

In [None]:
def pl_rec(target_pl, locale_popular_pl, nex_item_pl):
    rec_num = 100
    target_pl = (
        target_pl
            .with_columns(
                pl.col('prev_items').apply(str2list).arr.get(-1).alias('last_item')
            )
            .join(nex_item_pl, how='left', left_on='last_item', right_on='item')
            .join(locale_popular_pl, how='left', on='locale')
            .with_columns(
                pl.when(pl.col('next_item_rec').is_null()).then([]).otherwise(pl.col('next_item_rec')).alias('next_item_rec')
            )
            .with_columns(
                pl.concat_list([pl.col('next_item_rec'), pl.col('locale_popular_rec')])
                    .alias('next_item_prediction')
                    .arr.head(rec_num)

            )
            .with_columns(
                pl.col('next_item_prediction').arr.lengths().alias('rec_num')
            )
    )#.head(3).collect()
    return target_pl

In [None]:
eval_pl = pl.scan_parquet(f'../data/eval_data/w2v_train_eval_result_300k.parquet')

In [None]:
eval_pl.schema

In [None]:
nex_item_pl.schema

In [None]:
# locale_popular_pl.head(3).collect()

In [None]:
eval_pl = pl_rec(target_pl=eval_pl, locale_popular_pl=locale_popular_pl, nex_item_pl=nex_item_pl)

In [None]:
# eval_pl.head(3).collect()

In [None]:
eval_pl.select(
    pl.col('next_item_prediction').arr.head(20).arr.contains(pl.col('next_item')).mean().alias('recall@20')
    , pl.col('next_item_prediction').arr.head(100).arr.contains(pl.col('next_item')).mean().alias('recall@100')
).collect()

In [None]:

# eval_cols = ['len', 'recall@20', 'recall@100']
# train_eval_df[eval_cols] = train_eval_df.apply(pd_get_recall_at_k, axis=1, result_type='expand')
# print(train_eval_df[eval_cols].mean())

In [None]:
# train_eval_df.shape

In [None]:
# model_version

In [None]:
eval_pl.collect().shape

In [None]:
eval_pl.collect().write_parquet(f'../data/eval_data/{model_version}_train_eval_300k.parquet', 
                      # engine='pyarrow'
                     )

# Submit result 

In [None]:
test_pl = pl.scan_csv('sessions_test_task1.csv')
test_pl = pl_rec(target_pl=test_pl, locale_popular_pl=locale_popular_pl, nex_item_pl=nex_item_pl)

In [None]:
submit_file

In [None]:
! ls -al | grep {submit_file}

In [None]:
test_pl.collect().shape

In [None]:
submit_file

In [None]:
test_pl.head(3).collect()

In [None]:
# test_pl.collect().select('locale', 'next_item_prediction').write_parquet(submit_file,
#                                                                          # engine='pyarrow'
#                                                                         )

In [None]:
# # You can submit with aicrowd-cli, or upload manually on the challenge page.
# !aicrowd submission create -c task-1-next-product-recommendation -f {submit_file}

# Rank  

In [None]:
w2v_df = pl.scan_parquet('submission_task1.parquet')

In [None]:
assert w2v_df.collect().shape[0] == test_pl.collect().shape[0]

In [None]:
w2v_df.schema

In [None]:
# test_pl.head()

In [None]:
target_df = pl.concat([test_pl.select('prev_items', 'locale', 'next_item_rec').collect(), w2v_df.select('next_item_prediction').collect()]
                    , how='horizontal' )

In [None]:
target_df.shape

In [None]:
target_df.head(10)

In [None]:
rec_num = 100
target_pl = (
    target_df
        .lazy()
        .select(
            'prev_items'
            , 'locale'
            , pl.concat_list([pl.col('next_item_rec'), pl.col('next_item_prediction')])
                .alias('next_item_prediction')
                .arr.head(rec_num)

        )
        .with_columns(
            pl.col('next_item_prediction').arr.lengths().alias('rec_num')
        )
)#.head(3).collect()

In [None]:
target_pl.schema

In [None]:
target_pl.head(6).collect()

In [None]:
# ! mkdir ../data/sub_files

In [None]:
target_pl.collect().select('locale', 'next_item_prediction').write_parquet('../data/sub_files/rank_v1.parquet',
                                                                         # engine='pyarrow'
                                                                        )

In [None]:
# # You can submit with aicrowd-cli, or upload manually on the challenge page.
# !aicrowd submission create -c task-1-next-product-recommendation -f '../data/sub_files/rank_v1.parquet'

## Rank2 

In [None]:
rec_num = 100
target_pl = (
    target_df
        .lazy()
        .select(
            'prev_items'
            , 'locale'
            , pl.concat_list([pl.col('next_item_rec').arr.head(20), pl.col('next_item_prediction')])
                .alias('next_item_prediction')
                .arr.head(rec_num)

        )
        .with_columns(
            pl.col('next_item_prediction').arr.lengths().alias('rec_num')
        )
)#.head(3).collect()

In [None]:
target_pl.collect().select('locale', 'next_item_prediction').write_parquet('../data/sub_files/rank_v2.parquet',
                                                                         # engine='pyarrow'
                                                                        )

In [None]:
# # You can submit with aicrowd-cli, or upload manually on the challenge page.
!aicrowd submission create -c task-1-next-product-recommendation -f '../data/sub_files/rank_v2.parquet'

# Debug 

In [None]:
test_pl = pl.scan_parquet(submit_file)

In [None]:
test_pl.schema

In [None]:
test_pl.head(5).collect()

In [None]:
test_pl.select(
    pl.col('next_item_prediction').arr.lengths().min()
    , pl.col('next_item_prediction').arr.lengths().max()
).collect()