In [1]:
import pandas as pd
import numpy as np
import scipy as sp
import requests
from tqdm.auto import tqdm
from scipy.stats import mode 
from pprint import pprint
import warnings
warnings.filterwarnings("ignore")

from rectools import Columns
from rectools.dataset import Interactions, Dataset
from rectools.metrics import Precision, Recall, MeanInvUserFreq, Serendipity, calc_metrics, MAP
from rectools.model_selection import TimeRangeSplitter
from rectools.models import ImplicitItemKNNWrapperModel, PopularModel, PureSVDModel

from implicit.nearest_neighbours import CosineRecommender, TFIDFRecommender, BM25Recommender
from implicit.als import AlternatingLeastSquares

import dill

pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', 200)

In [2]:
N = 20

# üé¨ Get KION dataset 

<a href="https://ods.ai/competitions/competition-recsys-21/data"> Dataset description [ru] </a>

In [3]:
# download dataset by chunks
url = "https://storage.yandexcloud.net/itmo-recsys-public-data/kion_train.zip"

req = requests.get(url, stream=True)

with open('../data/kion_train.zip', "wb") as fd:
    total_size_in_bytes = int(req.headers.get('Content-Length', 0))
    progress_bar = tqdm(desc='kion dataset download', total=total_size_in_bytes, unit='iB', unit_scale=True)
    for chunk in req.iter_content(chunk_size=2 ** 20):
        progress_bar.update(len(chunk))
        fd.write(chunk)

In [4]:
!cd ../data && unzip kion_train.zip && cd -

# EDA

In [3]:
interactions = pd.read_csv('../data/kion_train/interactions.csv')
users = pd.read_csv('../data/kion_train/users.csv')
items = pd.read_csv('../data/kion_train/items.csv')

In [4]:
# rename columns, convert timestamp
interactions.rename(columns={'last_watch_dt': Columns.Datetime,
                            'total_dur': Columns.Weight}, 
                    inplace=True) 

interactions['datetime'] = pd.to_datetime(interactions['datetime'])

## interactions

In [5]:
pd.concat([interactions.head(), interactions.tail()])

Unnamed: 0,user_id,item_id,datetime,weight,watched_pct
0,176549,9506,2021-05-11,4250,72.0
1,699317,1659,2021-05-29,8317,100.0
2,656683,7107,2021-05-09,10,0.0
3,864613,7638,2021-07-05,14483,100.0
4,964868,9506,2021-04-30,6725,100.0
5476246,648596,12225,2021-08-13,76,0.0
5476247,546862,9673,2021-04-13,2308,49.0
5476248,697262,15297,2021-08-20,18307,63.0
5476249,384202,16197,2021-04-19,6203,100.0
5476250,319709,4436,2021-08-15,3921,45.0


In [6]:
print(f"Interactions dataframe shape: {interactions.shape}")
print(f"Unique users in interactions: {interactions['user_id'].nunique():_}")
print(f"Unique items in interactions: {interactions['item_id'].nunique():_}")

Interactions dataframe shape: (5476251, 5)
Unique users in interactions: 962_179
Unique items in interactions: 15_706


In [7]:
max_date = interactions['datetime'].max()
min_date = interactions['datetime'].min()

print(f"min date in interactions: {min_date}")
print(f"max date in interactions: {max_date}")

min date in interactions: 2021-03-13 00:00:00
max date in interactions: 2021-08-22 00:00:00


In [8]:
interactions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5476251 entries, 0 to 5476250
Data columns (total 5 columns):
 #   Column       Dtype         
---  ------       -----         
 0   user_id      int64         
 1   item_id      int64         
 2   datetime     datetime64[ns]
 3   weight       int64         
 4   watched_pct  float64       
dtypes: datetime64[ns](1), float64(1), int64(3)
memory usage: 208.9 MB


In [9]:
rec_interactions = Interactions(interactions)

## users

In [10]:
pd.concat([users.head(), users.tail()])

Unnamed: 0,user_id,age,income,sex,kids_flg
0,973171,age_25_34,income_60_90,–ú,1
1,962099,age_18_24,income_20_40,–ú,0
2,1047345,age_45_54,income_40_60,–ñ,0
3,721985,age_45_54,income_20_40,–ñ,0
4,704055,age_35_44,income_60_90,–ñ,0
840192,339025,age_65_inf,income_0_20,–ñ,0
840193,983617,age_18_24,income_20_40,–ñ,1
840194,251008,,,,0
840195,590706,,,–ñ,0
840196,166555,age_65_inf,income_20_40,–ñ,0


In [11]:
print(f"Users dataframe shape {users.shape}")
print(f"Unique users: {users['user_id'].nunique():_}")

Users dataframe shape (840197, 5)
Unique users: 840_197


## items

In [12]:
pd.concat([items.head(3), items.tail(3)])

Unnamed: 0,item_id,content_type,title,title_orig,release_year,genres,countries,for_kids,age_rating,studios,directors,actors,description,keywords
0,10711,film,–ü–æ–≥–æ–≤–æ—Ä–∏ —Å –Ω–µ–π,Hable con ella,2002.0,"–¥—Ä–∞–º—ã, –∑–∞—Ä—É–±–µ–∂–Ω—ã–µ, –¥–µ—Ç–µ–∫—Ç–∏–≤—ã, –º–µ–ª–æ–¥—Ä–∞–º—ã",–ò—Å–ø–∞–Ω–∏—è,,16.0,,–ü–µ–¥—Ä–æ –ê–ª—å–º–æ–¥–æ–≤–∞—Ä,"–ê–¥–æ–ª—å—Ñ–æ –§–µ—Ä–Ω–∞–Ω–¥–µ—Å, –ê–Ω–∞ –§–µ—Ä–Ω–∞–Ω–¥–µ—Å, –î–∞—Ä–∏–æ –ì—Ä–∞–Ω–¥–∏–Ω–µ—Ç—Ç–∏, –î–∂–µ—Ä–∞–ª—å–¥–∏–Ω –ß–∞–ø–ª–∏–Ω, –ï–ª–µ–Ω–∞ –ê–Ω–∞–π—è, –ö–∞—ç—Ç–∞–Ω–æ –í–µ–ª–æ–∑–æ, –õ–µ–æ–Ω–æ—Ä –£–æ—Ç–ª–∏–Ω–≥, –õ–æ–ª–∞ –î—É—ç–Ω—å—è—Å, –õ–æ–ª–µ—Å –õ–µ–æ–Ω, –ú–∞–ª—É –ê–π—Ä–æ–¥–æ, –ú–∞—Ä–∏–æ–ª–∞ –§—É—ç–Ω—Ç–µ—Å, –ü–∞—Å –í–µ–≥–∞, –ü–∏–Ω–∞ –ë–∞—É—à, –†–æ...",–ú–µ–ª–æ–¥—Ä–∞–º–∞ –ª–µ–≥–µ–Ω–¥–∞—Ä–Ω–æ–≥–æ –ü–µ–¥—Ä–æ –ê–ª—å–º–æ–¥–æ–≤–∞—Ä–∞ ¬´–ü–æ–≥–æ–≤–æ—Ä–∏ —Å –Ω–µ–π¬ª –≤ 2003 –≥–æ–¥—É –ø–æ–ª—É—á–∏–ª–∞ –ø—Ä–µ–º–∏—é ¬´–û—Å–∫–∞—Ä¬ª –∑–∞ –ª—É—á—à–∏–π —Å—Ü–µ–Ω–∞—Ä–∏–π. –ñ—É—Ä–Ω–∞–ª–∏—Å—Ç –ú–∞—Ä–∫–æ –±–µ—Ä–µ—Ç –∏–Ω—Ç–µ—Ä–≤—å—é —É –∑–Ω–∞–º–µ–Ω–∏—Ç–æ–π –∂–µ–Ω—â–∏–Ω—ã-—Ç–æ—Ä–µ—Ä–æ –õ–∏–¥–∏–∏ –∏ –≤—Å–∫–æ—Ä–µ –≤–ª—é–±–ª—è–µ...,"–ü–æ–≥–æ–≤–æ—Ä–∏, –Ω–µ–π, 2002, –ò—Å–ø–∞–Ω–∏—è, –¥—Ä—É–∑—å—è, –ª—é–±–æ–≤—å, —Å–∏–ª—å–Ω—ã–µ, –∂–µ–Ω—â–∏–Ω—ã, –ø—Ä–µ–æ–¥–æ–ª–µ–Ω–∏–µ, —Ç—Ä—É–¥–Ω–æ—Å—Ç–µ–π, –æ—Ç–Ω–æ—à–µ–Ω–∏—è, –¥—Ä—É–∂–±–∞, –æ—Ç–Ω–æ—à–µ–Ω–∏—è, –ø–∞—Ä–µ, –æ—Ç–Ω–æ—à–µ–Ω–∏—è, –º—É–∂—á–∏–Ω—ã, –∂–µ–Ω—â–∏–Ω—ã, —Ä–æ–º–∞–Ω—Ç–∏—á–µ—Å–∫–∏–µ, –æ—Ç–Ω–æ—à–µ–Ω–∏—è, –ø–æ—Ç–µ—Ä—è, –±–ª–∏–∑–∫–∏—Ö,..."
1,2508,film,–ì–æ–ª—ã–µ –ø–µ—Ä—Ü—ã,Search Party,2014.0,"–∑–∞—Ä—É–±–µ–∂–Ω—ã–µ, –ø—Ä–∏–∫–ª—é—á–µ–Ω–∏—è, –∫–æ–º–µ–¥–∏–∏",–°–®–ê,,16.0,,–°–∫–æ—Ç –ê—Ä–º—Å—Ç—Ä–æ–Ω–≥,"–ê–¥–∞–º –ü–∞–ª–ª–∏, –ë—Ä–∞–π–∞–Ω –•–∞—Å–∫–∏, –î–∂.–ë. –°–º—É–≤, –î–∂–µ–π—Å–æ–Ω –ú–∞–Ω—Ü—É–∫–∞—Å, –î–∂–æ–Ω –ì–ª–µ–π—Å–µ—Ä, –ö–∞—Ä–ª –ì—Ä–∏–Ω, –ö—Ä–∏—Å—Ç–µ–Ω –†–∏—Ç—Ç–µ—Ä, –õ—ç–Ω—Å –†–µ–¥–¥–∏–∫, –ú–æ—Ä–∏—Å –ö–æ–º—Ç, –ü–∞—Ç—Ä–∏–∫ –ö–µ—Ä–Ω—Å, –†–µ–±–µ–∫–∫–∞ –ö–æ–ª–ª–∏–Ω–∑, –†–æ–∑–∞ –°–∞–ª–∞–∑–∞—Ä, –†–æ—Å—Å –ü. –ö—É–∫, –°—Ç–µ—Ñ—Ñ–∏ –ì—Ä–æ—É—Ç, –¢–∏...","–£–º–æ—Ä–∏—Ç–µ–ª—å–Ω–∞—è —Å–æ–≤—Ä–µ–º–µ–Ω–Ω–∞—è –∫–æ–º–µ–¥–∏—è –Ω–∞ –ø–æ–ø—É–ª—è—Ä–Ω—É—é —Ç–µ–º—É –æ —Ç–æ–º, –∫–∞–∫ –Ω–µ –Ω–∞–¥–æ –æ—Ç–º–µ—á–∞—Ç—å –º–∞–ª—å—á–∏—à–Ω–∏–∫. –ì–ª–∞–≤–Ω—ã–π –≥–µ—Ä–æ–π —É—Å–≤–æ–∏–ª, —á—Ç–æ –Ω–µ –Ω–∞–¥–æ –∑–≤–∞—Ç—å –Ω–∞ —Å–≤–∞–¥—å–±—É —Å–≤–æ–µ–≥–æ –¥—Ä—É–≥–∞ –î–∂–µ–π—Å–æ–Ω–∞, –∏–∑-–∑–∞ –∫–æ—Ç–æ—Ä–æ–≥–æ –æ–Ω –≤–º–µ—Å—Ç–æ —Å–≤–∞–¥–µ...","–ì–æ–ª—ã–µ, –ø–µ—Ä—Ü—ã, 2014, –°–®–ê, –¥—Ä—É–∑—å—è, —Å–≤–∞–¥—å–±—ã, –ø—Ä–µ–æ–¥–æ–ª–µ–Ω–∏–µ, —Ç—Ä—É–¥–Ω–æ—Å—Ç–µ–π, —Ä–∞—Å—Å—Ç–∞–≤–∞–Ω–∏—è, –æ—Ç–Ω–æ—à–µ–Ω–∏—è, –¥—Ä—É–∂–±–∞, —Ä–∏—Å–∫, –Ω–µ–¥–æ—Ä–∞–∑—É–º–µ–Ω–∏–µ, –º—É–∂—Å–∫–∞—è, –¥—Ä—É–∂–±–∞, –º–∞–ª—å—á–∏—à–Ω–∏–∫–∏, –¥–µ–≤–∏—á–Ω–∏–∫–∏"
2,10716,film,–¢–∞–∫—Ç–∏—á–µ—Å–∫–∞—è —Å–∏–ª–∞,Tactical Force,2011.0,"–∫—Ä–∏–º–∏–Ω–∞–ª, –∑–∞—Ä—É–±–µ–∂–Ω—ã–µ, —Ç—Ä–∏–ª–ª–µ—Ä—ã, –±–æ–µ–≤–∏–∫–∏, –∫–æ–º–µ–¥–∏–∏",–ö–∞–Ω–∞–¥–∞,,16.0,,–ê–¥–∞–º –ü. –ö–∞–ª—Ç—Ä–∞—Ä–æ,"–ê–¥—Ä–∏–∞–Ω –•–æ–ª–º—Å, –î–∞—Ä—Ä–µ–Ω –®–∞–ª–∞–≤–∏, –î–∂–µ—Ä—Ä–∏ –í–∞—Å—Å–µ—Ä–º–∞–Ω, –î—ç–Ω –†–∏–∑–∑—É—Ç–æ, –ö–µ–Ω–¥–µ—Å –ò–ª—ç–π–Ω –ö–∞–ª—Ç—Ä–∞—Ä–æ, –ö–∏—Ç –î–∂–∞—Ä–¥–∏–Ω, –õ–µ–∫—Å–∞ –î–æ–π–≥, –ú–∞–π–∫–ª –î–∂–µ–π –£–∞–π—Ç, –ú–∞–π–∫–ª –®—ç–Ω–∫—Å, –ú–∞–π–∫–ª –≠–∫–ª—É–Ω–¥, –ü–∏—Ç–µ—Ä –ë—Ä–∞–π–∞–Ω—Ç, –ü–∏—Ç–µ—Ä –ö–µ–Ω—Ç, –°—Ç–∏–≤ –ë–∞—á–∏—á, –°—Ç–∏–≤ ...","–ü—Ä–æ—Ñ–µ—Å—Å–∏–æ–Ω–∞–ª—å–Ω—ã–π —Ä–µ—Å—Ç–ª–µ—Ä –°—Ç–∏–≤ –û—Å—Ç–∏–Ω (¬´–í—Å–µ –∏–ª–∏ –Ω–∏—á–µ–≥–æ¬ª) –∏ —Ç–µ–º–Ω–æ–∫–æ–∂–∏–π –º–∞—á–æ –ú–∞–π–∫–ª –î–∂–µ–π –£–∞–π—Ç (¬´–¢–µ–º–Ω—ã–π —Ä—ã—Ü–∞—Ä—å¬ª) –≤ –∏–Ω—Ç—Ä–∏–≥—É—é—â–µ–º –∫—Ä–∏–º–∏–Ω–∞–ª—å–Ω–æ–º –±–æ–µ–≤–∏–∫–µ. –í —Ü–µ–Ω—Ç—Ä–µ —Å—é–∂–µ—Ç–∞ ‚Äì –∫–æ–º–∞–Ω–¥–∞ —Å–ø–µ—Ü–Ω–∞–∑–æ–≤—Ü–µ–≤, –∫–æ—Ç–æ—Ä–∞—è –æ–∫–∞–∑–∞...","–¢–∞–∫—Ç–∏—á–µ—Å–∫–∞—è, —Å–∏–ª–∞, 2011, –ö–∞–Ω–∞–¥–∞, –±–∞–Ω–¥–∏—Ç—ã, –≥–∞–Ω–≥—Å—Ç–µ—Ä—ã, –ø—Ä–µ—Å—Ç—É–ø–ª–µ–Ω–∏—è, –ø—Ä–µ–æ–¥–æ–ª–µ–Ω–∏–µ, —Ç—Ä—É–¥–Ω–æ—Å—Ç–µ–π, —É–±–∏–π—Å—Ç–≤–∞, —É–±–∏–π—Ü—ã, –Ω–∞—Å—Ç–æ—è—â–∏–µ, –º—É–∂—á–∏–Ω—ã, —Ä–∏—Å–∫, –Ω–µ–¥–æ—Ä–∞–∑—É–º–µ–Ω–∏–µ, —Å–∏–ª—ã, –ø—Ä–∞–≤–æ–ø–æ—Ä—è–¥–∫–∞, –±–æ—Ä—å–±–∞, –∑–∞, –≤—ã–∂–∏–≤–∞–Ω–∏–µ, —Å–ø..."
15960,10632,series,–°–≥–æ–≤–æ—Ä,Hassel,2017.0,"–¥—Ä–∞–º—ã, —Ç—Ä–∏–ª–ª–µ—Ä—ã, –∫—Ä–∏–º–∏–Ω–∞–ª",–†–æ—Å—Å–∏—è,0.0,18.0,,"–≠—à—Ä–µ—Ñ –†–µ–π–±—Ä—É–∫, –ê–º–∏—Ä –ö–∞–º–¥–∏–Ω, –≠—Ä–∏–∫ –≠–≥–µ—Ä","–û–ª–∞ –†–∞–ø–∞—Å, –ê–ª–∏–µ—Ç—Ç –û—Ñ–µ–π–º, –£–∏–ª—å–º–∞ –õ–∏–¥–µ–Ω, –®–∞–Ω—Ç–∏ –†–æ–Ω–∏, –¢–æ–º–∞ –•–æ–ª–º–∏–Ω","–ö—Ä–∏–º–∏–Ω–∞–ª—å–Ω–∞—è –¥—Ä–∞–º–∞ –ø–æ –º–æ—Ç–∏–≤–∞–º —Ä–æ–º–∞–Ω–æ–≤ –æ —à–≤–µ–¥—Å–∫–æ–º –¥–µ—Ç–µ–∫—Ç–∏–≤–µ –†–æ–ª–∞–Ω–¥–µ –•–∞—Å—Å–µ–ª–µ. –°—Ä–µ–¥—å –±–µ–ª–∞ –¥–Ω—è —É–±–∏—Ç –ø–æ–ª–∏—Ü–µ–π—Å–∫–∏–π, –∏ –Ω–∏—Ç–∏ –≤ —ç—Ç–æ–º –¥–µ–ª–µ –≤–µ–¥—É—Ç –ø—Ä—è–º–æ –≤ –∫–æ—Ä–∏–¥–æ—Ä—ã –≤–ª–∞—Å—Ç–∏. –†–∞—Å—Å–ª–µ–¥–æ–≤–∞–Ω–∏–µ–º –∑–∞–Ω–∏–º–∞–µ—Ç—Å—è –¥–µ—Ç–µ–∫—Ç–∏–≤ –•–∞...","–°–≥–æ–≤–æ—Ä, 2017, –†–æ—Å—Å–∏—è"
15961,4538,series,–°—Ä–µ–¥–∏ –∫–∞–º–Ω–µ–π,Darklands,2019.0,"–¥—Ä–∞–º—ã, —Å–ø–æ—Ä—Ç, –∫—Ä–∏–º–∏–Ω–∞–ª",–†–æ—Å—Å–∏—è,0.0,18.0,,"–ú–∞—Ä–∫ –û‚Äô–ö–æ–Ω–Ω–æ—Ä, –ö–æ–Ω–æ—Ä –ú–∞–∫–ú–∞—Ö–æ–Ω","–î—ç–π–Ω –£–∞–π—Ç –û‚Äô–•–∞—Ä–∞, –¢–æ–º–∞—Å –ö—ç–π–Ω-–ë–∏—Ä–Ω, –î–∂—É–¥–∏—Ç –†–æ–¥–¥–∏, –ú–∞—Ä–∫ –û‚Äô–•–∞–ª–ª–æ—Ä–∞–Ω, –î–∂–∏–º–º–∏ –°–º–æ–ª–ª—Ö–æ—Ä–Ω","–°–µ–º–Ω–∞–¥—Ü–∞—Ç–∏–ª–µ—Ç–Ω–∏–π –î—ç–º–∏–µ–Ω –º–µ—á—Ç–∞–µ—Ç –≤—ã—Ä–≤–∞—Ç—å—Å—è –∑–∞ –ø—Ä–µ–¥–µ–ª—ã —Å–≤–æ–µ–≥–æ —Ä–∞–π–æ–Ω–∞ –∏ —Å—Ç–∞—Ç—å –ø—Ä–æ—Ñ–µ—Å—Å–∏–æ–Ω–∞–ª—å–Ω—ã–º –±–æ–π—Ü–æ–º. –ö–æ–≥–¥–∞ –µ–≥–æ –∫—É–º–∏—Ä –∏ —Å—Ç–∞—Ä—à–∏–π –±—Ä–∞—Ç –∏—Å—á–µ–∑–∞–µ—Ç, –ø–∞—Ä–µ–Ω—å –ø–æ–ø–∞–¥–∞–µ—Ç –≤ —á—É–∂–¥—ã–π –µ–º—É –º–∏—Ä –Ω–∞—Å–∏–ª–∏—è, –Ω–∞—Ä–∫–æ—Ç–∏–∫–æ–≤ –∏ ...","–°—Ä–µ–¥–∏, –∫–∞–º–Ω–µ–π, 2019, –†–æ—Å—Å–∏—è"
15962,3206,series,–ì–æ—à–∞,,2019.0,–∫–æ–º–µ–¥–∏–∏,–†–æ—Å—Å–∏—è,0.0,16.0,,–ú–∏—Ö–∞–∏–ª –ú–∏—Ä–æ–Ω–æ–≤,"–ú–∫—Ä—Ç—ã—á –ê—Ä–∑—É–º–∞–Ω—è–Ω, –í–∏–∫—Ç–æ—Ä–∏—è –†—É–Ω—Ü–æ–≤–∞","–î–æ–±—Ä–æ–¥—É—à–Ω—ã–π –ì–æ—à–∞ –Ω–µ –º–æ–∂–µ—Ç –≤—ã–π—Ç–∏ –∏–∑ –¥–æ–º–∞, —á—Ç–æ–±—ã –Ω–µ –ø–æ–ø–∞—Å—Ç—å –≤ –Ω–µ–ª–µ–ø—É—é –∏ –∫—É—Ä—å—ë–∑–Ω—É—é –∏—Å—Ç–æ—Ä–∏—é. –ù–æ –¥–∞–∂–µ –Ω–µ—É–¥–∞—á–Ω–∏–∫–∏ –º–µ—á—Ç–∞—é—Ç –æ –ª—é–±–≤–∏, –∏ –Ω–∞—à –≥–µ—Ä–æ–π ‚Äî –Ω–µ –∏—Å–∫–ª—é—á–µ–Ω–∏–µ, –≤–µ–¥—å –æ–ø—Ç–∏–º–∏–∑–º–∞ –µ–º—É –Ω–µ –∑–∞–Ω–∏–º–∞—Ç—å.","–ì–æ—à–∞, 2019, –†–æ—Å—Å–∏—è"


In [13]:
print(f"Items dataframe shape {items.shape}")
print(f"Unique item_id: {items['item_id'].nunique():_}")

Items dataframe shape (15963, 14)
Unique item_id: 15_963


# Define Blending model

In [14]:
class BlendingModel():
    
    def __init__(self, model1, model2):
        self._model1 = model1
        self._model2 = model2
        
    def fit(self, X):
        self._model1.fit(X)
        self._model2.fit(X)
    
    def recommend(self, *args, **kwargs):
        reco1 = self._model1.recommend(*args, **kwargs)
        reco2 = self._model2.recommend(*args, **kwargs)
        mreco = pd.concat((reco1,reco2)).sort_values(["user_id", "rank"])
        mreco = mreco.drop_duplicates(subset=["user_id", "item_id"], keep="first")
        reco_groups = mreco.groupby("user_id")
        reco = reco_groups.head(kwargs["k"])
        reco["rank"] = np.concatenate([list(range(x)) for x in reco.groupby("user_id")["user_id"].count()])
        return reco

# Split

In [15]:
# setting for cv 
n_folds = 7
unit = "W"
n_units = 1

last_date = interactions[Columns.Datetime].max().normalize()
start_date = last_date - pd.Timedelta(n_folds * n_units + 1, unit=unit)  
print(f"Start date and last date of the test fold: {start_date, last_date}")

Start date and last date of the test fold: (Timestamp('2021-06-27 00:00:00'), Timestamp('2021-08-22 00:00:00'))


### Test fold borders

In [16]:
periods = n_folds + 1
freq = f"{n_units}{unit}"
print(
    f"start_date: {start_date}\n"
    f"last_date: {last_date}\n"
    f"periods: {periods}\n"
    f"freq: {freq}\n"
)
    
date_range = pd.date_range(start=start_date, periods=periods, freq=freq, tz=last_date.tz)
print(f"Test fold borders: {date_range.values.astype('datetime64[D]')}")

# generator of folds
cv = TimeRangeSplitter(
    date_range=date_range,
    filter_already_seen=True,
    filter_cold_items=True,
    filter_cold_users=True,
)
print(f"Real number of folds: {cv.get_n_splits(rec_interactions)}")

start_date: 2021-06-27 00:00:00
last_date: 2021-08-22 00:00:00
periods: 8
freq: 1W

Test fold borders: ['2021-06-27' '2021-07-04' '2021-07-11' '2021-07-18' '2021-07-25'
 '2021-08-01' '2021-08-08' '2021-08-15']
Real number of folds: 7


In [17]:
metrics = {
    "prec@10": Precision(k=10),
    "recall@10": Recall(k=10),
    "map@10": MAP(k=10),
    "novelty": MeanInvUserFreq(k=10),
    "serendipity": Serendipity(k=10),
}

In [18]:
# few simple models to compare
models = {
    "cosine_itemknn": CosineRecommender(K=N),
    "tfidf_itemknn": TFIDFRecommender(K=N),
    "bm25_itemknn": BM25Recommender(K=N),
    "pure_svd": PureSVDModel(),
    "blend_cosine_tfidf": BlendingModel(
        ImplicitItemKNNWrapperModel(CosineRecommender(K=N)),
        ImplicitItemKNNWrapperModel(TFIDFRecommender(K=N)),
    ),
    "blend_cosine_bm25": BlendingModel(
        ImplicitItemKNNWrapperModel(CosineRecommender(K=N)),
        ImplicitItemKNNWrapperModel(BM25Recommender(K=N)),
    ),
    "blend_cosine_pure_svd": BlendingModel(
        ImplicitItemKNNWrapperModel(CosineRecommender(K=N)),
        PureSVDModel(),
    ),
    "blend_tfidf_bm25": BlendingModel(
        ImplicitItemKNNWrapperModel(TFIDFRecommender(K=N)),
        ImplicitItemKNNWrapperModel(BM25Recommender(K=N)),
    ),
    "blend_tfidf_pure_svd": BlendingModel(
        ImplicitItemKNNWrapperModel(TFIDFRecommender(K=N)),
        PureSVDModel(),
    ),
    "blend_bm25_pure_svd": BlendingModel(
        ImplicitItemKNNWrapperModel(BM25Recommender(K=N)),
        PureSVDModel(),
    ),
}

# Model training by fold

In [19]:
%%time

results = []

fold_iterator = cv.split(rec_interactions, collect_fold_stats=True)

for i_fold, (train_ids, test_ids, fold_info) in tqdm(enumerate(fold_iterator), total=cv.get_n_splits(rec_interactions)):
    print(f"\n==================== Fold {i_fold}")
    pprint(fold_info)

    df_train = interactions.iloc[train_ids].copy()
    df_test = interactions.iloc[test_ids][Columns.UserItem].copy()

    interactions_train  = Dataset.construct(
        interactions_df=df_train,
        user_features_df=None,
        item_features_df=None
    )
    
    catalog = df_train[Columns.Item].unique()
    
    for model_name, model in models.items():
        if model_name == "pure_svd":
            model = model
        elif model_name.startswith("blend_"):
            modle = model
        else:
            model = ImplicitItemKNNWrapperModel(model=model)
        model.fit(interactions_train)
        
        recs_itemknn = model.recommend(
            df_test['user_id'].unique(), 
            dataset=interactions_train, 
            k=N, 
            filter_viewed=False  # False - same items to every user
        )
        metric_values = calc_metrics(
            metrics,
            reco=recs_itemknn,
            interactions=df_test,
            prev_interactions=df_train,
            catalog=catalog,
        )
    
        fold = {"fold": i_fold, "model": model_name}
        fold.update(metric_values)
        pprint(fold)
        results.append(fold)
        

  0%|          | 0/7 [00:00<?, ?it/s]


{'End date': Timestamp('2021-07-04 00:00:00', freq='W-SUN'),
 'Start date': Timestamp('2021-06-27 00:00:00', freq='W-SUN'),
 'Test': 237414,
 'Test items': 5947,
 'Test users': 98930,
 'Train': 2533586,
 'Train items': 14092,
 'Train users': 536802}
{'fold': 0,
 'map@10': 0.03075359459250865,
 'model': 'cosine_itemknn',
 'novelty': 8.560670884128767,
 'prec@10': 0.02810573132517942,
 'recall@10': 0.17016937028416568,
 'serendipity': 3.5860072345484765e-05}
{'fold': 0,
 'map@10': 0.03762342914064018,
 'model': 'tfidf_itemknn',
 'novelty': 7.207676609557101,
 'prec@10': 0.03577580107146467,
 'recall@10': 0.20843044146161446,
 'serendipity': 5.41154388287347e-05}
{'fold': 0,
 'map@10': 0.07084579142326884,
 'model': 'bm25_itemknn',
 'novelty': 4.429813839871021,
 'prec@10': 0.060503386232689776,
 'recall@10': 0.347113748952724,
 'serendipity': 2.134204136402579e-05}
{'fold': 0,
 'map@10': 0.07638439498385285,
 'model': 'pure_svd',
 'novelty': 5.864203775918098,
 'prec@10': 0.045365409885

# üëå Metrics 

`Metrics by fold`



In [20]:
df_metrics = pd.DataFrame(results)
df_metrics

Unnamed: 0,fold,model,prec@10,recall@10,map@10,novelty,serendipity
0,0,cosine_itemknn,0.028106,0.170169,0.030754,8.560671,0.000036
1,0,tfidf_itemknn,0.035776,0.208430,0.037623,7.207677,0.000054
2,0,bm25_itemknn,0.060503,0.347114,0.070846,4.429814,0.000021
3,0,pure_svd,0.045365,0.277388,0.076384,5.864204,0.000012
4,0,blend_cosine_tfidf,0.034223,0.200607,0.044698,8.064863,0.000047
...,...,...,...,...,...,...,...
65,6,blend_cosine_bm25,0.029396,0.156011,0.051011,6.055953,0.000044
66,6,blend_cosine_pure_svd,0.022850,0.124286,0.054601,6.440598,0.000039
67,6,blend_tfidf_bm25,0.032134,0.169321,0.055993,5.428684,0.000065
68,6,blend_tfidf_pure_svd,0.025901,0.139440,0.058389,5.852272,0.000058


In [21]:
df_metrics.to_pickle("../data/df_metrics.pickle")

## Metrics mean by fold
`we can compare two models`

In [22]:
df_metrics.groupby('model').mean()[metrics.keys()]

Unnamed: 0_level_0,prec@10,recall@10,map@10,novelty,serendipity
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
blend_bm25_pure_svd,0.038568,0.214104,0.090257,4.805587,2.2e-05
blend_cosine_bm25,0.036278,0.202373,0.063879,6.181315,3.7e-05
blend_cosine_pure_svd,0.028058,0.160783,0.068,6.660195,3.4e-05
blend_cosine_tfidf,0.027161,0.153928,0.04889,7.640792,6.9e-05
blend_tfidf_bm25,0.039191,0.216983,0.069049,5.567993,5.4e-05
blend_tfidf_pure_svd,0.031021,0.175763,0.07174,6.09987,4.9e-05
bm25_itemknn,0.04218,0.231087,0.062185,4.049365,2.4e-05
cosine_itemknn,0.021999,0.127819,0.030119,8.109853,5.3e-05
pure_svd,0.028351,0.160402,0.061973,5.31631,1.4e-05
tfidf_itemknn,0.028537,0.160412,0.037118,6.827194,8e-05


## Metrics std by fold

`If a diff between model metrics less than an std value => there is no significant difference observed`

- For instance, for the serendipity metric there is no such difference between cosine_itemknn and tfidf_itemknn model results

In [23]:
df_metrics.groupby('model').std()[metrics.keys()]

Unnamed: 0_level_0,prec@10,recall@10,map@10,novelty,serendipity
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
blend_bm25_pure_svd,0.007259,0.049631,0.023675,0.276969,2e-06
blend_cosine_bm25,0.007176,0.049029,0.015224,0.194977,7e-06
blend_cosine_pure_svd,0.005433,0.038019,0.018824,0.267323,6e-06
blend_cosine_tfidf,0.005165,0.036234,0.011837,0.189257,1.5e-05
blend_tfidf_bm25,0.007829,0.053007,0.015623,0.177851,1e-05
blend_tfidf_pure_svd,0.005616,0.039193,0.019101,0.265492,9e-06
bm25_itemknn,0.01065,0.069899,0.014546,0.179575,3e-06
cosine_itemknn,0.004735,0.033637,0.007421,0.203701,1.1e-05
pure_svd,0.00849,0.05874,0.016332,0.307155,2e-06
tfidf_itemknn,0.005393,0.037941,0.008137,0.171582,1.8e-05


# Fit best model (bm25_itemknn)

In [24]:
dataset  = Dataset.construct(
    interactions_df=interactions,
    user_features_df=None,
    item_features_df=None
)

In [25]:
model = ImplicitItemKNNWrapperModel(model=BM25Recommender(K=N))
model.fit(dataset)

<rectools.models.implicit_knn.ImplicitItemKNNWrapperModel at 0x7fea069aafa0>

In [26]:
with open('../data/bm25_itemknn.dill', 'wb') as f:
    dill.dump(model, f)

# Fit model for cold users

In [27]:
popular_model = PopularModel(add_cold=True)
popular_model.fit(dataset)

<rectools.models.popular.PopularModel at 0x7fea06257160>

In [28]:
with open('../data/popular_model.dill', 'wb') as f:
    dill.dump(model, f)

# Offline

In [29]:
with open("../data/bm25_itemknn.dill", "rb") as f:
    model = dill.load(f)

In [30]:
users_ids = interactions['user_id'].unique()

In [31]:
offline_recs = model.recommend(
    users_ids, 
    dataset=dataset, 
    k=20, 
    filter_viewed=False  # False - same items to every user
)

In [32]:
offline_recs = offline_recs.groupby('user_id').agg({'item_id': list})

In [33]:
offline_recs_list = [ None ] * (max(users_ids) + 1)
for i, item in tqdm(offline_recs.iterrows(), total = len(users_ids)):
    offline_recs_list[item.name] = item["item_id"]

  0%|          | 0/962179 [00:00<?, ?it/s]

In [34]:
with open("../data/popular_model.dill", "rb") as f:
    popular_model = dill.load(f)

In [35]:
popular_model_recs = list(popular_model.recommend(
    [0], 
    dataset=dataset, 
    k=20, 
    filter_viewed=False  # False - same items to every user
)['item_id'])

In [36]:
with open('../data/offline_bm25_itemknn.dill', 'wb') as f:
    dill.dump(
        {
            "recs": offline_recs_list,
            "popular_recs": popular_model_recs,
        },
        f,
    )