# Dependencies

In [43]:
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
import dill
from implicit.nearest_neighbours import CosineRecommender,\
                                        TFIDFRecommender
import warnings
warnings.filterwarnings("ignore")

from rectools import Columns
from rectools.dataset import Dataset
from rectools.metrics import MAP, calc_metrics

from userknn import UserKnn

# Dataset

In [2]:
interactions = pd.read_csv('kion_train/interactions.csv')
users = pd.read_csv('kion_train/users.csv')
items = pd.read_csv('kion_train/items.csv')

In [3]:
# rename columns, convert timestamp
interactions.rename(columns={'last_watch_dt': Columns.Datetime,
                            'total_dur': Columns.Weight}, 
                    inplace=True) 

interactions['datetime'] = pd.to_datetime(interactions['datetime'])

In [4]:
pd.concat([interactions.head(1), interactions.tail(1)])

Unnamed: 0,user_id,item_id,datetime,weight,watched_pct
0,176549,9506,2021-05-11,4250,72.0
5476250,319709,4436,2021-08-15,3921,45.0


In [5]:
pd.concat([users.head(1), users.tail(1)])

Unnamed: 0,user_id,age,income,sex,kids_flg
0,973171,age_25_34,income_60_90,М,1
840196,166555,age_65_inf,income_20_40,Ж,0


In [6]:
pd.concat([items.head(1), items.tail(1)])

Unnamed: 0,item_id,content_type,title,title_orig,release_year,genres,countries,for_kids,age_rating,studios,directors,actors,description,keywords
0,10711,film,Поговори с ней,Hable con ella,2002.0,"драмы, зарубежные, детективы, мелодрамы",Испания,,16.0,,Педро Альмодовар,"Адольфо Фернандес, Ана Фернандес, Дарио Гранди...",Мелодрама легендарного Педро Альмодовара «Пого...,"Поговори, ней, 2002, Испания, друзья, любовь, ..."
15962,3206,series,Гоша,,2019.0,комедии,Россия,0.0,16.0,,Михаил Миронов,"Мкртыч Арзуманян, Виктория Рунцова","Добродушный Гоша не может выйти из дома, чтобы...","Гоша, 2019, Россия"


# Baseline

## Train test data

In [7]:
# train test split 
# test = last 1 week 
from rectools.model_selection import TimeRangeSplitter
from rectools.dataset.interactions import Interactions


n_folds = 1
unit = "W"
n_units = 1
periods = n_folds + 1
freq = f"{n_units}{unit}"

last_date = interactions[Columns.Datetime].max().normalize()
start_date = last_date - pd.Timedelta(n_folds * n_units + 1, unit=unit)  
print(f"Start date and last date of the test fold: {start_date, last_date}")
    
date_range = pd.date_range(start=start_date, periods=periods, freq=freq, tz=last_date.tz)
print(f"Test fold borders: {date_range.values.astype('datetime64[D]')}")

# generator of folds
cv = TimeRangeSplitter(
    date_range=date_range,
    filter_already_seen=True,
    filter_cold_items=True,
    filter_cold_users=True,
)

interactions_s = Interactions(interactions)
print(f"Real number of folds: {cv.get_n_splits(interactions_s)}")

Start date and last date of the test fold: (Timestamp('2021-08-08 00:00:00'), Timestamp('2021-08-22 00:00:00'))
Test fold borders: ['2021-08-08' '2021-08-15']
Real number of folds: 1


In [8]:
# we have just 1 test fold - no need to iterate over fold
(train_ids, test_ids, fold_info) = cv.split(interactions_s, collect_fold_stats=True).__next__()

In [9]:
train_ids

array([      0,       1,       2, ..., 5476245, 5476247, 5476249])

In [10]:
test_ids

array([      6,      33,      56, ..., 5476229, 5476230, 5476240])

In [11]:
train = interactions.loc[train_ids]
test = interactions.loc[test_ids]

## Model

In [20]:
# base knn
with open('base_userknn.dill', 'rb') as f:
    userknn = dill.load(f)

base_userknn = UserKnn(userknn)
base_userknn.is_fitted = True
base_userknn.fit(train)

In [22]:
recos = base_userknn.predict(test)

In [34]:
catalog = train[Columns.Item].unique()

In [47]:
metric = MAP(k=10)

In [48]:
metric.calc(recos, test)

0.0019812212483438455