# Packages 

In [1]:
%load_ext autoreload
%autoreload 2
import sys
import logging
sys.path.append('../')
import os
from utils import *

import os
import numpy as np
import pandas as pd
from gensim.models import Word2Vec
import pickle

from annoy import AnnoyIndex
import polars as pl
import implicit
import scipy.sparse as sps




# Config 

In [2]:
debug = False

debug_session_num = 1000
train_data_dir = '.'
test_data_dir = '.'
task = 'task1'
PREDS_PER_SESSION = 100

num_tree = 100

model_dir = '../model_training/als_v1'

model_file = os.path.join(model_dir, 'als_model.pkl')

# target locales: locales needed for task1
target_locals = ["DE", 'JP', 'UK']

submit_file = f'submission_{task}_ALS.parquet'

In [3]:
! mkdir {model_dir}

mkdir: cannot create directory ‘../model_training/als_v1’: File exists


In [4]:
model_file

'../model_training/als_v1/als_model.pkl'

In [5]:
submit_file

'submission_task1_ALS.parquet'

In [6]:
train_sessions = read_train_data(train_data_dir=train_data_dir)
test_sessions = read_test_data(task, test_data_dir=test_data_dir)
train_sessions = train_sessions[train_sessions['locale'].isin(target_locals)]

if debug:
    train_sessions = train_sessions.sample(debug_session_num)
    test_sessions = test_sessions.sample(debug_session_num)
f"{train_sessions.shape}; {test_sessions.shape}"

'(3272716, 3); (316971, 2)'

In [7]:
train_sessions['type'] = 'train'
test_sessions['type'] = 'test'
train_sessions.reset_index(inplace=True)
test_sessions.reset_index(inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_sessions['type'] = 'train'


In [8]:
train_sessions.head()

Unnamed: 0,index,prev_items,next_item,locale,type
0,0,['B09W9FND7K' 'B09JSPLN1M'],B09M7GY217,DE,train
1,1,['B076THCGSG' 'B007MO8IME' 'B08MF65MLV' 'B001B...,B001B4THSA,DE,train
2,2,['B0B1LGXWDS' 'B00AZYORS2' 'B0B1LGXWDS' 'B00AZ...,B0767DTG2Q,DE,train
3,3,['B09XMTWDVT' 'B0B4MZZ8MB' 'B0B7HZ2GWX' 'B09XM...,B0B4R9NN4B,DE,train
4,4,['B09Y5CSL3T' 'B09Y5DPTXN' 'B09FKD61R8'],B0BGVBKWGZ,DE,train


In [9]:
test_sessions.head()

Unnamed: 0,index,prev_items,locale,type
0,0,['B08V12CT4C' 'B08V1KXBQD' 'B01BVG1XJS' 'B09VC...,DE,test
1,1,['B00R9R5ND6' 'B00R9RZ9ZS' 'B00R9RZ9ZS'],DE,test
2,2,['B07YSRXJD3' 'B07G7Q5N6G' 'B08C9Q7QVK' 'B07G7...,DE,test
3,3,['B08KQBYV43' '3955350843' '3955350843' '39553...,DE,test
4,4,['B09FPTCWMC' 'B09FPTQP68' 'B08HMRY8NG' 'B08TB...,DE,test


In [10]:
test_sessions.columns

Index(['index', 'prev_items', 'locale', 'type'], dtype='object')

In [11]:
cols_to_keep = ['index', 'type', 'prev_items', 'locale']

In [12]:
original_train_test_df = pd.concat([train_sessions[cols_to_keep], test_sessions[cols_to_keep]], 
                          axis=0)

In [13]:
original_train_test_df.head()

Unnamed: 0,index,type,prev_items,locale
0,0,train,['B09W9FND7K' 'B09JSPLN1M'],DE
1,1,train,['B076THCGSG' 'B007MO8IME' 'B08MF65MLV' 'B001B...,DE
2,2,train,['B0B1LGXWDS' 'B00AZYORS2' 'B0B1LGXWDS' 'B00AZ...,DE
3,3,train,['B09XMTWDVT' 'B0B4MZZ8MB' 'B0B7HZ2GWX' 'B09XM...,DE
4,4,train,['B09Y5CSL3T' 'B09Y5DPTXN' 'B09FKD61R8'],DE


In [14]:
train_test_pl = pl.from_dataframe(original_train_test_df)

In [15]:
def process_prev_item(prev_items):
    # print(prev_items)
    res = [ele.replace('[', '').replace(']', '').replace('\n', '').replace("'", '').replace(' ', '') for ele in prev_items.split(' ')]
    return res

In [16]:
train_test_pl = (
    train_test_pl
        .with_columns(pl.col('prev_items').apply(lambda row: process_prev_item(row)))
        .with_columns(pl.col('index').cast(pl.Utf8))
        .with_columns((pl.col('index')+pl.col('type')).alias('user'))
        # .with_columns(pl.col('prev_items').explode())
)

In [17]:
train_test_df = train_test_pl.to_pandas()

In [18]:
train_test_df.head()

Unnamed: 0,index,type,prev_items,locale,user
0,0,train,"[B09W9FND7K, B09JSPLN1M]",DE,0train
1,1,train,"[B076THCGSG, B007MO8IME, B08MF65MLV, B001B4TKA0]",DE,1train
2,2,train,"[B0B1LGXWDS, B00AZYORS2, B0B1LGXWDS, B00AZYORS...",DE,2train
3,3,train,"[B09XMTWDVT, B0B4MZZ8MB, B0B7HZ2GWX, B09XMTWDV...",DE,3train
4,4,train,"[B09Y5CSL3T, B09Y5DPTXN, B09FKD61R8]",DE,4train


In [19]:
train_test_df.head()

Unnamed: 0,index,type,prev_items,locale,user
0,0,train,"[B09W9FND7K, B09JSPLN1M]",DE,0train
1,1,train,"[B076THCGSG, B007MO8IME, B08MF65MLV, B001B4TKA0]",DE,1train
2,2,train,"[B0B1LGXWDS, B00AZYORS2, B0B1LGXWDS, B00AZYORS...",DE,2train
3,3,train,"[B09XMTWDVT, B0B4MZZ8MB, B0B7HZ2GWX, B09XMTWDV...",DE,3train
4,4,train,"[B09Y5CSL3T, B09Y5DPTXN, B09FKD61R8]",DE,4train


In [20]:
train_test_df['user'] = train_test_df['user'].astype("category")
train_test_df['user_id'] = train_test_df['user'].cat.codes

In [21]:
test_sessions['index'] = test_sessions['index'].astype(str)
test_sessions = test_sessions.merge(train_test_df[['index', 'type', 'user_id']], how='left', on=['index', 'type'])

In [22]:
train_test_df = train_test_df.explode('prev_items')
train_test_df.shape

(15449411, 6)

In [23]:
test_sessions.shape

(316971, 5)

In [24]:
train_test_df.head()

Unnamed: 0,index,type,prev_items,locale,user,user_id
0,0,train,B09W9FND7K,DE,0train,1
0,0,train,B09JSPLN1M,DE,0train,1
1,1,train,B076THCGSG,DE,1train,1222223
1,1,train,B007MO8IME,DE,1train,1222223
1,1,train,B08MF65MLV,DE,1train,1222223


In [25]:


train_test_df['prev_items'] = train_test_df['prev_items'].astype("category")
train_test_df['item_id'] = train_test_df['prev_items'].cat.codes

In [26]:
train_test_df.head()

Unnamed: 0,index,type,prev_items,locale,user,user_id,item_id
0,0,train,B09W9FND7K,DE,0train,1,1043498
0,0,train,B09JSPLN1M,DE,0train,1,917910
1,1,train,B076THCGSG,DE,1train,1222223,310647
1,1,train,B007MO8IME,DE,1train,1222223,103874
1,1,train,B08MF65MLV,DE,1train,1222223,678646


In [27]:
len(train_test_df)

15449411

In [28]:
itemid2item = dict(zip(train_test_df['item_id'], train_test_df['prev_items']))

In [29]:
len(itemid2item)

1271674

In [30]:
max(itemid2item.keys())

1271673

In [31]:
# itemid2item = 
# train_test_df.groupby(['prev_items', 'item_id'])['user'].count().reset_index()

In [32]:
# itemid2item

# ALS Model

In [33]:
user_item = sps.coo_matrix(
      (
          np.ones(train_test_df.shape[0]), # We're using a matrix of ones, but using type weights or repurchase weights could help!
          (train_test_df['user_id'],
          train_test_df['item_id'])
      ),
      dtype='int8'
    ).tocsr()


In [34]:
user_item.shape


(3589687, 1271674)

In [None]:
model = implicit.als.AlternatingLeastSquares(factors=100, regularization=0.1, iterations=20)

model.fit(user_items=user_item, show_progress=True)

  0%|          | 0/20 [00:00<?, ?it/s]

In [None]:
def get_rec(user_id, model, user_item, itemid2item, topn=100):
    item_ids = model.recommend(user_id, 
                              user_item[user_id, :],
                              filter_already_liked_items=True,
                               N=topn
                             )[0].tolist()
    # print(item_ids)
    items = [itemid2item[key] for key in item_ids]
    return items
    

In [None]:
assert model.item_factors.shape[0] == len(itemid2item)

In [None]:
# itemid2item[7550]

In [None]:
# itemid2item


In [None]:
# help(model.recommend)

In [None]:
# original_train_test_df

In [None]:
# test_prediction = train_test_df[(train_test_df['type']=='test')]['item_id'].unique()

In [None]:
test_sessions.shape

In [None]:
%%time
predictions = (
    pl.from_dataframe(test_sessions)
        .with_columns(pl.col('user_id').apply(lambda row: get_rec(row, model=model, user_item=user_item
                                                                 , itemid2item=itemid2item)).alias('next_item_prediction'))
).to_pandas()[['locale', 'next_item_prediction']]

In [None]:
# predictions.head()

## Save model 

In [None]:
with open(model_file, 'wb') as f:
    pickle.dump(model, f)

In [None]:
del model

In [None]:
with open(model_file, 'rb') as f:
    model = pickle.load(f)

In [None]:
%%time
train_df = (
    pl.from_dataframe(train_sessions.sample(300000))
        .with_columns(pl.col('user_id').apply(lambda row: get_rec(row, model=model, user_item=user_item
                                                                 , itemid2item=itemid2item)).alias('next_item_prediction'))
).to_pandas()#[['locale', 'next_item_prediction']]

# Rec & Save Result 

In [51]:
check_predictions(predictions, test_sessions=test_sessions, 
                  # check_products=True, product_df=products
                 )
# Its important that the parquet file you submit is saved with pyarrow backend
predictions.to_parquet(submit_file, engine='pyarrow')

# Submission 

In [None]:
# You can submit with aicrowd-cli, or upload manually on the challenge page.
!aicrowd submission create -c task-1-next-product-recommendation -f {submit_file}

[2K[1;34msubmission_task1_ALS.parquet[0m [90m━━━━━━━━━━[0m [35m0.1%[0m • [32m0.1/51.7  [0m • [31m341.0 kB/s[0m • [36m0:02:32[0m0m
[2K[1A[2K[1;34msubmission_task1_ALS.parquet[0m [90m━━━━━━━━━━[0m [35m0.1%[0m • [32m0.1/51.7  [0m • [31m390.9 kB/s[0m • [36m0:02:13[0m
[2K[1A[2K[1;34msubmission_task1_ALS.parquet[0m [90m━━━━━━━━━━[0m [35m0.1%[0m • [32m0.1/51.7  [0m • [31m439.3 kB/s[0m • [36m0:01:58[0m
[2K[1A[2K[1;34msubmission_task1_ALS.parquet[0m [90m━━━━━━━━━━[0m [35m0.2%[0m • [32m0.1/51.7  [0m • [31m486.7 kB/s[0m • [36m0:01:47[0m
[2K[1A[2K[1;34msubmission_task1_ALS.parquet[0m [90m━━━━━━━━━━[0m [35m0.2%[0m • [32m0.1/51.7  [0m • [31m532.5 kB/s[0m • [36m0:01:37[0m
[2K[1A[2K[1;34msubmission_task1_ALS.parquet[0m [90m━━━━━━━━━━[0m [35m0.2%[0m • [32m0.1/51.7  [0m • [31m576.7 kB/s[0m • [36m0:01:30[0m
[2K[1A[2K[1;34msubmission_task1_ALS.parquet[0m [90m━━━━━━━━━━[0m [35m0.2%[0m • [32m0.1/51.7  [0m • [

# Example 

In [None]:
# # -*- coding: utf-8 -*-
# """
# Created on Sun Jun 23 22:20:58 2019

# @author: himansh
# """
# #import libraries
# import sys
# import pandas as pd
# import numpy as np
# import scipy.sparse as sparse
# from scipy.sparse.linalg import spsolve
# import random



# from sklearn.preprocessing import MinMaxScaler

# import implicit 
# from datetime import datetime, timedelta


# #Data Preprocessing
# def create_data(datapath,start_date,end_date):
#     df=pd.read_csv(datapath)
#     df=df.assign(date=pd.Series(datetime.fromtimestamp(a/1000).date() for a in df.timestamp))
#     df=df.sort_values(by='date').reset_index(drop=True) # for some reasons RetailRocket did NOT sort data by date
#     df=df[(df.date>=datetime.strptime(start_date,'%Y-%m-%d').date())&(df.date<=datetime.strptime(end_date,'%Y-%m-%d').date())]
#     df=df[['visitorid','itemid','event']]
#     return df


In [None]:
# #Download the kaggle RetailRocket data and give the events.csv file path
# datapath= 'events.csv'
# data=create_data(datapath,'2015-5-3','2015-5-3')
# data['visitorid'] = data['visitorid'].astype("category")
# data['itemid'] = data['itemid'].astype("category")
# data['visitor_id'] = data['visitorid'].cat.codes
# data['item_id'] = data['itemid'].cat.codes

# data['event']=data['event'].astype('category')
# data['event']=data['event'].cat.codes


In [None]:
# sparse_item_user = sparse.csr_matrix((data['event'].astype(float), (data['item_id'], data['visitor_id'])))
# sparse_user_item = sparse.csr_matrix((data['event'].astype(float), (data['visitor_id'], data['item_id'])))


In [None]:



# #Building the model
# model = implicit.als.AlternatingLeastSquares(factors=20, regularization=0.1, iterations=20)


# # ???
# # alpha_val = 40
# # data_conf = (sparse_item_user * alpha_val).astype('double')


In [None]:
# ###USING THE MODEL

# #Get Recommendations
# user_id =   14
# recommended = model.recommend(user_id, 
#                               sparse_user_item[user_id],
#                               filter_already_liked_items=True
#                              )[0].tolist()
# print(recommended)




# # #Get similar items
# # item_id = 7
# # n_similar = 3
# # similar = model.similar_items(item_id, n_similar)
# # print(similar)

In [None]:
# user_item = sps.coo_matrix(
#       (
#           np.ones(data.shape[0]), # We're using a matrix of ones, but using type weights or repurchase weights could help!
#           (data['visitor_id'],
#           data['item_id'])
#       ),
#       dtype='int8'
#     ).tocsr()
# user_item.todense().shape
# new_model = implicit.als.AlternatingLeastSquares(factors=100, regularization=0.1, iterations=20)

# new_model.fit(user_items=user_item, show_progress=True)


In [None]:
# user_id =   14
# recommended = model.recommend(user_id, 
#                               user_item[user_id, :],
#                               filter_already_liked_items=True
#                              )[0].tolist()
# print(recommended)