# Packages 

In [1]:
%load_ext autoreload
%autoreload 2
import sys
import logging
base_dir = '../'

sys.path.append(base_dir)
import os
from utils import *

import os
import numpy as np
import pandas as pd
from gensim.models import Word2Vec
import pickle


from annoy import AnnoyIndex
import polars as pl
import implicit
import scipy.sparse as sps
from src.eval import get_recall_at_k, pd_get_recall_at_k
from src.config import raw_data_session_id_dir, candidate_dir, model_for_eval
from src.eval import model_eval




# Config 

In [2]:
debug = False

model_version = 'als_v2'

submit_res = False

topn = 100
if debug:
    n_rows = 1000
else:
    n_rows = None
train_data_dir = '.'
test_data_dir = '.'
task = 'task1'

model_dir = f'../model_training/{model_version}'

model_file = os.path.join(model_dir, 'als_model.pkl')

target_locals = ["DE", 'JP', 'UK']


sub_file = f'../data/sub_files/{model_version}.parque'

In [3]:
# sub_file

In [4]:
! mkdir {model_dir}

mkdir: cannot create directory ‘../model_training/als_v2’: File exists


In [5]:
model_file

'../model_training/als_v2/als_model.pkl'

# Function 

In [6]:
def get_rec(target_pl, model, user_item):
    user_lst = target_pl.select('session_id').collect().to_series().to_list()
    topn = 100

    next_items, item_similarity = model.recommend(user_lst, user_item[user_lst, :],
                                  filter_already_liked_items=True,
                                   N=topn
                                 )#[0].tolist()
    from tqdm import tqdm

    final_items = []
    for lst in tqdm(next_items):
        final_items.append([product_unique2id[ele] for ele in lst])
    eval_res = (
        target_pl.with_columns(
            pl.Series(name='next_item_prediction', values=final_items)
            , pl.Series(name='als_similarity', values=item_similarity)
            )
            # .with_columns(
            #     # pl.col("next_item_prediction").arr.eval(
            #     #     # product_unique2id[pl.element()]
            #     #     # dir(pl.element())
            #     #     # pl.element().cast(pl.Int8)#.map(product_unique2id)
            #     #                                        ).alias('test')
            # )
    )
    return eval_res

# Read data

In [7]:
train_pl = pl.scan_parquet(os.path.join(base_dir, raw_data_session_id_dir, 'sessions_train.parquet'), n_rows=n_rows).filter(pl.col('locale').is_in(target_locals)).with_columns(pl.col('prev_items').apply(str2list))

eval_pl = pl.scan_parquet(os.path.join(base_dir, raw_data_session_id_dir, 'sessions_eval.parquet'), n_rows=n_rows).filter(pl.col('locale').is_in(target_locals)).with_columns(pl.col('prev_items').apply(str2list))

# df_sess.head(3).collect()
test_pl = pl.scan_parquet(os.path.join(base_dir, raw_data_session_id_dir, 'sessions_test_task1.parquet'), n_rows=n_rows).with_columns(pl.col('prev_items').apply(str2list))
product_pl = pl.scan_parquet(
    os.path.join(base_dir, raw_data_session_id_dir, 'products_train.parquet'), 
)
with open(os.path.join(base_dir, raw_data_session_id_dir, 'product_unique2id.json'), 'rb') as f:
    product_unique2id  = pickle.load(f)

In [8]:
train_data = train_pl.with_columns(
    pl.col('prev_items').arr.concat(pl.col('next_item')) 
)
if not model_for_eval:
    eval_data = eval_pl.with_columns(
    pl.col('prev_items').arr.concat(pl.col('next_item')) 
)
else:
    eval_data = eval_pl
test_data = test_pl

# Process Training data 

In [9]:
cols_to_keep = ['prev_items', 'locale','session_id']
user_item_pl = (
    pl.concat([train_data.select(cols_to_keep), 
               eval_data.select(cols_to_keep), 
               test_data.select(cols_to_keep)], how='vertical')
        .explode('prev_items')
        .join(product_pl.select(['id', 'locale', 'unique_id']), how='left', 
              left_on=['prev_items', 'locale']
              , right_on=['id', 'locale']
             )
        .select(
            pl.col('session_id')
            , pl.col('unique_id').alias('item_id')
        )
)

user_item = user_item_pl.collect().to_pandas()

In [10]:
user_item.shape

(18395684, 2)

In [11]:
user_item.head()

Unnamed: 0,session_id,item_id
0,0,265193
1,0,83226
2,0,387776
3,1,38788
4,1,85634


# ALS Model

## Model config 

In [12]:
vec_size = 100
iterations = 20
if debug:
    iterations = 2

In [13]:
# user_item['session_id']

## training data 

In [14]:
user_item = sps.coo_matrix(
      (
          np.ones(user_item.shape[0]), # We're using a matrix of ones, but using type weights or repurchase weights could help!
          (user_item['session_id'],
          user_item['item_id'])
      ),
      dtype='int8'
    ).tocsr()


In [15]:
user_item.shape


(3923220, 1413516)

## Model Training 

In [16]:
%%time
model = implicit.als.AlternatingLeastSquares(factors=vec_size, regularization=0.1, iterations=iterations)

model.fit(user_items=user_item, show_progress=True)

  0%|          | 0/20 [00:00<?, ?it/s]

CPU times: user 6h 41min 13s, sys: 8h 4min 20s, total: 14h 45min 33s
Wall time: 35min 56s


## Save model 

In [17]:
debug

False

In [18]:
model_file

'../model_training/als_v2/als_model.pkl'

In [19]:
user_item

<3923220x1413516 sparse matrix of type '<class 'numpy.int8'>'
	with 15941444 stored elements in Compressed Sparse Row format>

In [20]:
# if not debug:
sps.save_npz(os.path.join(model_dir, "user_item.npz"), user_item)
with open(model_file, 'wb') as f:
    pickle.dump(model, f)
del model
del user_item

In [21]:
! ls {model_dir}

als_model.pkl  user_item.npz


# Load model

In [22]:
user_item = sps.load_npz(os.path.join(model_dir, "user_item.npz"))
with open(model_file, 'rb') as f:
    model = pickle.load(f)

# Model eval 

In [23]:
%%time

eval_res = get_rec(target_pl=eval_pl, model=model, user_item=user_item)

100%|██████████| 326443/326443 [00:09<00:00, 35096.29it/s]


CPU times: user 4h 58min 19s, sys: 1h 25min 47s, total: 6h 24min 6s
Wall time: 22min 10s


In [24]:
eval_res.schema

{'prev_items': Unknown,
 'next_item': Utf8,
 'locale': Utf8,
 'session_id': Int64,
 'next_item_prediction': List(Utf8),
 'als_similarity': List(Float32)}

In [25]:
%%time
model_eval(eval_res)

CPU times: user 3min 9s, sys: 1min 19s, total: 4min 28s
Wall time: 19.9 s


total_sessions,mrr,recall@20,recall@100
u32,f64,f64,f64
326443,0.003665,0.011157,0.026602


# submit res

In [26]:
%%time
test_res = get_rec(target_pl=test_pl, model=model, user_item=user_item)

100%|██████████| 316971/316971 [00:09<00:00, 33192.70it/s]


CPU times: user 4h 46min 27s, sys: 1h 18min 7s, total: 6h 4min 34s
Wall time: 13min 54s


In [27]:
predictions = test_res.collect().to_pandas()

In [28]:
check_predictions(predictions, test_sessions=test_pl.collect().to_pandas(), 
                  # check_products=True, product_df=products
                 )
# Its important that the parquet file you submit is saved with pyarrow backend
if not debug:
    predictions.to_parquet(sub_file, engine='pyarrow')

In [None]:
# # You can submit with aicrowd-cli, or upload manually on the challenge page.
!aicrowd submission create -c task-1-next-product-recommendation -f {sub_file}

[2K[1;34mals_v2.parque[0m [91m━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━[0m [35m77.1%[0m • [32m145.4/188.6 MB[0m • [31m2.9 MB/s[0m • [36m0:00:16[0m

In [38]:
test_model = implicit.als.AlternatingLeastSquares(factors=50)

In [40]:
help(test_model.fit)

Help on method fit in module implicit.cpu.als:

fit(user_items, show_progress=True, callback=None) method of implicit.cpu.als.AlternatingLeastSquares instance
    Factorizes the user_items matrix.
    
    After calling this method, the members 'user_factors' and 'item_factors' will be
    initialized with a latent factor model of the input data.
    
    The user_items matrix does double duty here. It defines which items are liked by which
    users (P_ui in the original paper), as well as how much confidence we have that the user
    liked the item (C_ui).
    
    The negative items are implicitly defined: This code assumes that positive items in the
    user_items matrix means that the user liked the item. The negatives are left unset in this
    sparse matrix: the library will assume that means Piu = 0 and Ciu = 1 for all these items.
    Negative items can also be passed with a higher confidence value by passing a negative
    value, indicating that the user disliked the item.
  