## Домашнее задание `максимум 25 баллов (бывшее 10)`

## Критерии оценки 
`❗️Оцениваться будет значение метрики + ревью кода + реализация сервиса.` 

Вы можете сделать НЕ ВСЕ пункты и все равно получить 25 баллов. Получение > 25 баллов будет расцениваться как 25.


### 1. Побейте метрику на лидерборде `map@10 = 0.063` для userKnn модели с семинара (`4 балла`)


### 2. Предоставьте ноутбук(и) с экспериментами (`16 баллов`)

Что можно сделать:
   - сделать кол-во рекомендаций не меньше N (`2 балла`)
   - наличие тюнинга гиперпараметров (например, векторного расстояния или типов kNN моделей (implicit/rectools/...)) (`4 балла`)
   - другие варианты ранжирования айтемов похожих пользователей (`2 балла`)
   - эксперименты с оффлайн валидацией (`2 балла`)
   - в тесте вас ждут холодные пользователи. Сделайте рекомендации для них (обратите внимание на <a href="https://rectools.readthedocs.io/en/latest/api/rectools.models.popular.html"> rectools.models.popular</a>) (`2 балла`)
   - блендинг моделей (`4 балла`)


### 3. Оберните модель в сервис.
- **предпочтительный онлайн вариант**: обучаете модель в ноутбуке, сохраняете обученную модель (pickle, dill), при запуске сервиса ее поднимаете и запрашиваете рекомендации "на лету" (`9 баллов`)
- или оффлайн вариант: предварительно посчитайте рекомендации для всех пользователей, сохраните и запрашивайте их (`4 балла`)
   

### Хороший код ДЗ это:
- комментарии и объяснения. В ipynb пользуйтесь силой маркдауна. 
В скриптах пишите комментарии и докстринг. 
- легкая читаемость и воспроизводимость
- стандарт PEP8 
- обоснование схемы валидации
- анализ метрики качества 

## Imports

In [1]:
from pathlib import Path
from collections import Counter
import optuna
from rectools.models import ImplicitALSWrapperModel, LightFMWrapperModel, PopularModel
from implicit.nearest_neighbours import CosineRecommender, BM25Recommender, TFIDFRecommender
from rectools.models.implicit_knn import ImplicitItemKNNWrapperModel
from rectools.dataset import Dataset
from rectools import Columns
from rectools.metrics import Precision, Recall, MAP, calc_metrics, MeanInvUserFreq, Serendipity
from rectools.model_selection import TimeRangeSplitter

import dill
import json
from pprint import pprint
import numpy as np
import scipy as sp
import time
import pandas as pd
import warnings

warnings.filterwarnings('ignore')

## Поднимаем данные

In [2]:
DATA_PATH = Path("data_original")

In [3]:
users = pd.read_csv(DATA_PATH / 'users.csv')
items = pd.read_csv(DATA_PATH / 'items.csv')
interactions = pd.read_csv(DATA_PATH / 'interactions.csv')

In [4]:
# Align the raw column names with the names held in rectools' `Columns`,
# then parse the watch date into a proper datetime column.
interactions = interactions.rename(
    columns={
        'last_watch_dt': Columns.Datetime,
        'total_dur': Columns.Weight,
    }
)

interactions['datetime'] = pd.to_datetime(interactions['datetime'])

In [5]:
interactions.head()

Unnamed: 0,user_id,item_id,datetime,weight,watched_pct
0,176549,9506,2021-05-11,4250,72.0
1,699317,1659,2021-05-29,8317,100.0
2,656683,7107,2021-05-09,10,0.0
3,864613,7638,2021-07-05,14483,100.0
4,964868,9506,2021-04-30,6725,100.0


In [6]:
print(f"Interactions dataframe shape: {interactions.shape}")
print(f"Unique users in interactions: {interactions['user_id'].nunique():_}")
print(f"Unique items in interactions: {interactions['item_id'].nunique():_}")

Interactions dataframe shape: (5476251, 5)
Unique users in interactions: 962_179
Unique items in interactions: 15_706


In [7]:
max_date = interactions[Columns.Datetime].max()
min_date = interactions[Columns.Datetime].min()

print(f"min date in interactions: {min_date}")
print(f"max date in interactions: {max_date}")

min date in interactions: 2021-03-13 00:00:00
max date in interactions: 2021-08-22 00:00:00


## Разбиваем на train/test

In [8]:
dataset = Dataset.construct(
    interactions_df=interactions,
    user_features_df=None,
    item_features_df=None
)

In [9]:
# Time-based validation layout: one fold, with borders spaced one week apart.
n_folds = 1
unit = "W"
n_units = 1
periods = n_folds + 1  # n_folds windows need n_folds + 1 border dates
freq = f"{n_units}{unit}"

last_date = interactions[Columns.Datetime].max().normalize()
# NOTE(review): the extra "+ 1" puts start_date two weeks before last_date, and
# with periods=2 the printed borders are 2021-08-08 and 2021-08-15, so the final
# week of data (up to 2021-08-22) looks unused by the fold, although the message
# below labels (start_date, last_date) as the test fold — confirm this is intended.
start_date = last_date - pd.Timedelta(n_folds * n_units + 1, unit=unit)  
print(f"Start date and last date of the test fold: {start_date, last_date}")
    
date_range = pd.date_range(start=start_date, periods=periods, freq=freq, tz=last_date.tz)
print(f"Test fold borders: {date_range.values.astype('datetime64[D]')}")

# generator of folds
cv = TimeRangeSplitter(
    date_range=date_range,
    filter_already_seen=True,
    filter_cold_items=True,
    filter_cold_users=True,
)
print(f"Real number of folds: {cv.get_n_splits(dataset.interactions)}")


Start date and last date of the test fold: (Timestamp('2021-08-08 00:00:00'), Timestamp('2021-08-22 00:00:00'))
Test fold borders: ['2021-08-08' '2021-08-15']
Real number of folds: 1


In [10]:
# Only one fold is configured, so take the first split straight from the generator.
train_ids, test_ids, fold_info = next(
    cv.split(dataset.interactions, collect_fold_stats=True)
)

In [11]:
train_ids

array([      0,       1,       2, ..., 5476245, 5476247, 5476249])

In [12]:
# The splitter yields row numbers, so select rows by position rather than by
# index label; on the default RangeIndex both give the same rows, but positional
# selection stays correct if `interactions` is ever filtered or re-indexed.
train = interactions.iloc[train_ids]
test = interactions.iloc[test_ids]

## Сравнение с ItemKnn

In [13]:
item_knn = ImplicitItemKNNWrapperModel(model=CosineRecommender(K=30))
item_knn.fit(dataset);

In [14]:
# take a look at the recommended items by the simple itemknn model
# NOTE(review): item_knn was fitted on a dataset built from ALL interactions,
# so the test-period interactions were visible at fit time — treat this output
# as a sanity check rather than an offline evaluation.
recs_itemknn = item_knn.recommend(
    test['user_id'].unique(), 
    dataset=dataset, 
    k=10, 
    filter_viewed=False  # NOTE(review): presumably keeps already-viewed items in the output — confirm against rectools docs
)

In [15]:
recs_itemknn.head()

Unnamed: 0,user_id,item_id,score,rank
0,1016458,10440,20449.246972,1
1,1016458,4038,14580.000128,2
2,1016458,12192,8147.508352,3
3,1016458,1986,8044.704301,4
4,1016458,734,8043.999968,5


## Обучение userKnn с различными мерами расстояния

### Подготовка данных

In [16]:
max_date = interactions[Columns.Datetime].max()

In [17]:
interactions[Columns.Weight] = np.where(interactions['watched_pct'] > 10, 3, 1)

In [18]:
# Hold out the last 7 days (counted back from max_date) as the test period;
# everything strictly before that boundary is used for training.
test_start = max_date - pd.Timedelta(days=7)
event_time = interactions[Columns.Datetime]
train = interactions[event_time < test_start].copy()
test = interactions[event_time >= test_start].copy()

print(f"train: {train.shape}")
print(f"test: {test.shape}")

train: (4985269, 5)
test: (490982, 5)


In [19]:
# отфильтруем холодных пользователей из теста
cold_users = set(test[Columns.User]) - set(train[Columns.User])

In [20]:
# Keep only the test rows of users that also appear in train.
test = test[~test[Columns.User].isin(cold_users)].copy()

### Подготовка фичей

#### Пользовательские фичи

In [21]:
# The share of missing values is small, so they are replaced with a placeholder.
# NOTE(review): fillna is applied to the whole frame, so a missing value in a
# numeric column (e.g. kids_flg) would also become the string 'Unknown' — confirm.
users.fillna('Unknown', inplace=True)

In [22]:
# оставляем у df users только тех, кто попал в train
users = users.loc[users[Columns.User].isin(train[Columns.User])].copy()

In [23]:
users

Unnamed: 0,user_id,age,income,sex,kids_flg
0,973171,age_25_34,income_60_90,М,1
1,962099,age_18_24,income_20_40,М,0
3,721985,age_45_54,income_20_40,Ж,0
4,704055,age_35_44,income_60_90,Ж,0
5,1037719,age_45_54,income_60_90,М,0
...,...,...,...,...,...
840188,312839,age_65_inf,income_60_90,Ж,0
840189,191349,age_45_54,income_40_60,М,1
840190,393868,age_25_34,income_20_40,М,0
840192,339025,age_65_inf,income_0_20,Ж,0


In [24]:
# Reshape the selected user attributes into long (id, value, feature) form:
# one frame per attribute, stacked on top of each other. The result is later
# passed to Dataset.construct as user_features_df.
user_features_frames = [
    users.reindex(columns=[Columns.User, feature_name])
    .rename(columns={Columns.User: "id", feature_name: "value"})
    .assign(feature=feature_name)
    for feature_name in ("sex", "age", "income")
]
user_features = pd.concat(user_features_frames)
user_features.head()

Unnamed: 0,id,value,feature
0,973171,М,sex
1,962099,М,sex
3,721985,Ж,sex
4,704055,Ж,sex
5,1037719,М,sex


#### Фичи фильмов

In [25]:
# keep only the items that occur in the train interactions
items = items.loc[items[Columns.Item].isin(train[Columns.Item])].copy()

#### Жанр

In [26]:
# Turn the comma-separated genre string into a list of lower-case genres.
items["genre"] = (
    items["genres"]
    .str.lower()
    .str.replace(", ", ",", regex=False)
    .str.split(",")
)
# One row per (item, genre) pair in long (id, value, feature) form.
genre_feature = (
    items[["item_id", "genre"]]
    .explode("genre")
    .rename(columns={"item_id": "id", "genre": "value"})
    .assign(feature="genre")
)
genre_feature.head()

Unnamed: 0,id,value,feature
0,10711,драмы,genre
0,10711,зарубежные,genre
0,10711,детективы,genre
0,10711,мелодрамы,genre
1,2508,зарубежные,genre


#### Содержание

In [27]:
# Content type of every item in long (id, value, feature) form.
content_feature = (
    items.reindex(columns=[Columns.Item, "content_type"])
    .rename(columns={Columns.Item: "id", "content_type": "value"})
    .assign(feature="content_type")
)
content_feature.head()

Unnamed: 0,id,value,feature
0,10711,film,content_type
1,2508,film,content_type
2,10716,film,content_type
3,7868,film,content_type
4,16268,film,content_type


#### Режиссёр

In [28]:
# Turn the comma-separated directors string into a list of lower-case names.
items["director"] = (
    items["directors"]
    .str.lower()
    .str.replace(", ", ",", regex=False)
    .str.split(",")
)
# One row per (item, director) pair in long (id, value, feature) form.
director_feature = (
    items[["item_id", "director"]]
    .explode("director")
    .rename(columns={"item_id": "id", "director": "value"})
    .assign(feature="director")
)
director_feature.head()

Unnamed: 0,id,value,feature
0,10711,педро альмодовар,director
1,2508,скот армстронг,director
2,10716,адам п. калтраро,director
3,7868,эндрю хэй,director
4,16268,виктор садовский,director


#### Страна

In [29]:
# Turn the comma-separated countries string into a list of lower-case names.
items["country"] = (
    items["countries"]
    .str.lower()
    .str.replace(", ", ",", regex=False)
    .str.split(",")
)
# One row per (item, country) pair in long (id, value, feature) form.
country_feature = (
    items[["item_id", "country"]]
    .explode("country")
    .rename(columns={"item_id": "id", "country": "value"})
    .assign(feature="country")
)
country_feature.head()

Unnamed: 0,id,value,feature
0,10711,испания,country
1,2508,сша,country
2,10716,канада,country
3,7868,великобритания,country
4,16268,ссср,country


#### Год Выпуска

In [30]:
# Release year of every item in long (id, value, feature) form.
year_feature = (
    items.reindex(columns=[Columns.Item, "release_year"])
    .rename(columns={Columns.Item: "id", "release_year": "value"})
    .assign(feature="release_year")
)
year_feature.head()

Unnamed: 0,id,value,feature
0,10711,2002.0,release_year
1,2508,2014.0,release_year
2,10716,2011.0,release_year
3,7868,2015.0,release_year
4,16268,1978.0,release_year


In [31]:
# Объединяем фичи
item_features = pd.concat((genre_feature, content_feature, country_feature, year_feature, director_feature))
item_features

Unnamed: 0,id,value,feature
0,10711,драмы,genre
0,10711,зарубежные,genre
0,10711,детективы,genre
0,10711,мелодрамы,genre
1,2508,зарубежные,genre
...,...,...,...
15960,10632,амир камдин,director
15960,10632,эрик эгер,director
15961,4538,марк о’коннор,director
15961,4538,конор макмахон,director


In [32]:
# Metric classes to evaluate, keyed by the label used in the result tables.
metrics_name = {
    'Precision': Precision,
    'Recall': Recall,
    'MAP': MAP,
}

# Instantiate every metric for each cut-off k = 1..10, keyed as "<name>@<k>".
metrics = {
    f'{name}@{top_k}': metric_cls(k=top_k)
    for name, metric_cls in metrics_name.items()
    for top_k in range(1, 11)
}


pprint(metrics)

{'MAP@1': MAP(k=1, divide_by_k=False),
 'MAP@10': MAP(k=10, divide_by_k=False),
 'MAP@2': MAP(k=2, divide_by_k=False),
 'MAP@3': MAP(k=3, divide_by_k=False),
 'MAP@4': MAP(k=4, divide_by_k=False),
 'MAP@5': MAP(k=5, divide_by_k=False),
 'MAP@6': MAP(k=6, divide_by_k=False),
 'MAP@7': MAP(k=7, divide_by_k=False),
 'MAP@8': MAP(k=8, divide_by_k=False),
 'MAP@9': MAP(k=9, divide_by_k=False),
 'Precision@1': Precision(k=1),
 'Precision@10': Precision(k=10),
 'Precision@2': Precision(k=2),
 'Precision@3': Precision(k=3),
 'Precision@4': Precision(k=4),
 'Precision@5': Precision(k=5),
 'Precision@6': Precision(k=6),
 'Precision@7': Precision(k=7),
 'Precision@8': Precision(k=8),
 'Precision@9': Precision(k=9),
 'Recall@1': Recall(k=1),
 'Recall@10': Recall(k=10),
 'Recall@2': Recall(k=2),
 'Recall@3': Recall(k=3),
 'Recall@4': Recall(k=4),
 'Recall@5': Recall(k=5),
 'Recall@6': Recall(k=6),
 'Recall@7': Recall(k=7),
 'Recall@8': Recall(k=8),
 'Recall@9': Recall(k=9)}


### Обучение

In [33]:
dataset = Dataset.construct(
    interactions_df=train,
    user_features_df=user_features,
    cat_user_features=["sex", "age", "income"],
    item_features_df=item_features,
    cat_item_features=["genre", "content_type", "director", "country", "release_year"],
)
TEST_USERS = test[Columns.User].unique()

In [34]:
dataset.interactions.df

Unnamed: 0,user_id,item_id,weight,datetime
0,0,0,3.0,2021-05-11
1,1,1,3.0,2021-05-29
2,2,2,1.0,2021-05-09
3,3,3,3.0,2021-07-05
4,4,0,3.0,2021-04-30
...,...,...,...,...
5476244,69627,219,3.0,2021-08-02
5476245,40052,132,1.0,2021-05-12
5476246,896790,318,1.0,2021-08-13
5476247,206604,2546,3.0,2021-04-13


In [35]:
K_RECOS = 10

model = ImplicitItemKNNWrapperModel(model=BM25Recommender(K=100, K1=0.05, B=0.1, num_threads=2))
model.fit(dataset)
recos = model.recommend(
    users=TEST_USERS,
    dataset=dataset,
    k=K_RECOS,
    filter_viewed=True,
)


In [36]:
model.predict(
    users=[123],
    dataset=dataset,
    k=K_RECOS,
    filter_viewed=True,
)

array([ 7571,  7582, 16166,  1105, 10761,  3182,  9506, 11756,  5411,
       13018])

In [37]:
# Collect the metric values of the fitted model into a one-entry result list.
results = []
model_quality = {'model': 'BM25Recommender'}
metric_values = calc_metrics(metrics, recos, test, train)
model_quality.update(metric_values)
results.append(model_quality)

# Metrics as rows, models as columns. The model name becomes the index BEFORE
# transposing, so the frame holds only numeric metric values: transposing a
# frame that still contains the string 'model' column would turn every column
# into object dtype.
df_quality = pd.DataFrame(results).set_index('model').T

In [38]:
df_quality.style.highlight_max(color='lightgreen', axis=1)

model,BM25Recommender
Precision@1,0.09055
Recall@1,0.04635
Precision@2,0.076266
Recall@2,0.076927
Precision@3,0.066919
Recall@3,0.099395
Precision@4,0.0601
Recall@4,0.117729
Precision@5,0.054182
Recall@5,0.131566


In [39]:
dill_file = Path().cwd().parent / 'service' / 'models'

with open(dill_file / 'BM25Recommender_0.095432.dill', 'wb') as f:
    dill.dump(model, f)

In [40]:
dill_file = Path().cwd().parent / 'service' / 'data'

with open(dill_file / 'dataset_BM25Recommender_0.095432.dill', 'wb') as f:
    dill.dump(dataset, f)

### Подбор гиперпараметров

In [41]:
# Build the dataset from the train split together with the user and item
# features (same arguments as the Dataset.construct call in the training section).
dataset = Dataset.construct(
    interactions_df=train,
    user_features_df=user_features,
    cat_user_features=["sex", "age", "income"],
    item_features_df=item_features,
    cat_item_features=["genre", "content_type", "director", "country", "release_year"],
)
# Users that receive recommendations in every trial.
TEST_USERS = test[Columns.User].unique()

# One metrics dictionary per finished trial, appended by `objective`.
results_opto = []

def objective(trial):
    """Optuna objective: fit one item-kNN variant and return its MAP@10.

    The trial picks the recommender type and its hyperparameters. The model is
    fitted on the module-level ``dataset``, recommendations are produced for
    ``TEST_USERS`` and scored with ``calc_metrics`` against ``test`` / ``train``.
    All metric values of the trial are appended to ``results_opto``.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        The ``MAP@10`` value of the trial (the maximised quantity).

    Raises:
        ValueError: if the sampled recommender name is not handled below.
    """
    # number of recommendations per user
    k_recos = 10

    reco_model = trial.suggest_categorical(
        "reco_model",
        ["BM25Recommender", "CosineRecommender", "TFIDFRecommender"],
    )
    model_quality_opto = {"model": f"{reco_model}_{trial.number}"}

    # `step` is passed by keyword, as in Optuna's documented signature.
    if reco_model == "BM25Recommender":
        K = trial.suggest_int("K", 100, 500, step=50)
        K1 = trial.suggest_float("K1", 0.01, 0.09)
        B = trial.suggest_float("B", 0.01, 0.5)
        model = ImplicitItemKNNWrapperModel(
            model=BM25Recommender(K=K, K1=K1, B=B, num_threads=2)
        )
    elif reco_model == "CosineRecommender":
        K = trial.suggest_int("K", 50, 200, step=50)
        model = ImplicitItemKNNWrapperModel(model=CosineRecommender(K=K))
    elif reco_model == "TFIDFRecommender":
        # The previous upper bound of 100 is not reachable from 10 with step 20;
        # Optuna truncates such a range to [10, 90], so the bound is now explicit.
        K = trial.suggest_int("K", 10, 90, step=20)
        model = ImplicitItemKNNWrapperModel(model=TFIDFRecommender(K=K))
    else:
        raise ValueError(f"Unknown reco_model: {reco_model}")

    # fit the model and recommend for the test users
    model.fit(dataset)
    recos = model.recommend(
        users=TEST_USERS,
        dataset=dataset,
        k=k_recos,
        filter_viewed=True,
    )

    # compute the metrics and remember them for the comparison table
    metric_values = calc_metrics(metrics, recos, test, train)
    model_quality_opto.update(metric_values)
    results_opto.append(model_quality_opto)

    return metric_values.get('MAP@10')  # maximised metric

In [42]:
# Run the hyperparameter search. The sampler is seeded so that repeated runs
# explore the same sequence of trials (TPE is also Optuna's default sampler).
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=100)

print(f"Number of finished trials: {len(study.trials)}")
trial = study.best_trial
print(f"Best trial: {trial}")

[32m[I 2022-12-13 20:00:36,790][0m A new study created in memory with name: no-name-a2f18b05-6ca8-4859-809f-7bf727738b60[0m
[32m[I 2022-12-13 20:00:50,876][0m Trial 0 finished with value: 0.07964252966443905 and parameters: {'reco_model': 'CosineRecommender', 'K': 200}. Best is trial 0 with value: 0.07964252966443905.[0m
[32m[I 2022-12-13 20:01:03,586][0m Trial 1 finished with value: 0.06800383347073066 and parameters: {'reco_model': 'BM25Recommender', 'K': 150, 'K1': 0.07782610712770957, 'B': 0.42335558083017355}. Best is trial 0 with value: 0.07964252966443905.[0m
[32m[I 2022-12-13 20:01:16,077][0m Trial 2 finished with value: 0.0792681615423525 and parameters: {'reco_model': 'CosineRecommender', 'K': 100}. Best is trial 0 with value: 0.07964252966443905.[0m
[32m[I 2022-12-13 20:01:30,130][0m Trial 3 finished with value: 0.08751019619860949 and parameters: {'reco_model': 'BM25Recommender', 'K': 250, 'K1': 0.014417799251195396, 'B': 0.3133982798478412}. Best is trial 3 w

[32m[I 2022-12-13 20:09:27,030][0m Trial 34 finished with value: 0.08852555865128656 and parameters: {'reco_model': 'BM25Recommender', 'K': 300, 'K1': 0.018896228646936138, 'B': 0.07792793549841882}. Best is trial 14 with value: 0.08855889234956228.[0m
[32m[I 2022-12-13 20:09:43,049][0m Trial 35 finished with value: 0.08837041325461713 and parameters: {'reco_model': 'BM25Recommender', 'K': 300, 'K1': 0.018729772554718745, 'B': 0.12503218944666827}. Best is trial 14 with value: 0.08855889234956228.[0m
[32m[I 2022-12-13 20:09:58,545][0m Trial 36 finished with value: 0.08839237900075846 and parameters: {'reco_model': 'BM25Recommender', 'K': 250, 'K1': 0.0287105329612647, 'B': 0.07898392049781393}. Best is trial 14 with value: 0.08855889234956228.[0m
[32m[I 2022-12-13 20:10:14,627][0m Trial 37 finished with value: 0.0860888920613562 and parameters: {'reco_model': 'BM25Recommender', 'K': 300, 'K1': 0.04200426059652389, 'B': 0.17879238124601513}. Best is trial 14 with value: 0.088

[32m[I 2022-12-13 20:18:16,274][0m Trial 68 finished with value: 0.08839438052871892 and parameters: {'reco_model': 'BM25Recommender', 'K': 350, 'K1': 0.016059904787650514, 'B': 0.14017752677490192}. Best is trial 65 with value: 0.08857039912509447.[0m
[32m[I 2022-12-13 20:18:31,983][0m Trial 69 finished with value: 0.07964252966443905 and parameters: {'reco_model': 'CosineRecommender', 'K': 200}. Best is trial 65 with value: 0.08857039912509447.[0m
[32m[I 2022-12-13 20:18:49,494][0m Trial 70 finished with value: 0.08857807227974675 and parameters: {'reco_model': 'BM25Recommender', 'K': 400, 'K1': 0.010198495975612048, 'B': 0.09523066628575887}. Best is trial 70 with value: 0.08857807227974675.[0m
[32m[I 2022-12-13 20:19:06,359][0m Trial 71 finished with value: 0.08853594998326636 and parameters: {'reco_model': 'BM25Recommender', 'K': 400, 'K1': 0.01148155556925561, 'B': 0.0962187743368087}. Best is trial 70 with value: 0.08857807227974675.[0m
[32m[I 2022-12-13 20:19:25,07

'Number of finished trials: 100'
('Best trial: FrozenTrial(number=70, values=[0.08857807227974675], '
 'datetime_start=datetime.datetime(2022, 12, 13, 20, 18, 31, 984417), '
 'datetime_complete=datetime.datetime(2022, 12, 13, 20, 18, 49, 494521), '
 "params={'reco_model': 'BM25Recommender', 'K': 400, 'K1': "
 "0.010198495975612048, 'B': 0.09523066628575887}, "
 "distributions={'reco_model': "
 "CategoricalDistribution(choices=('BM25Recommender', 'CosineRecommender', "
 "'TFIDFRecommender')), 'K': IntDistribution(high=500, log=False, low=100, "
 "step=50), 'K1': FloatDistribution(high=0.09, log=False, low=0.01, "
 "step=None), 'B': FloatDistribution(high=0.5, log=False, low=0.01, "
 'step=None)}, user_attrs={}, system_attrs={}, intermediate_values={}, '
 'trial_id=70, state=TrialState.COMPLETE, value=None)')


In [43]:
# Metrics as rows, one column per trial. The model name becomes the index
# BEFORE transposing, so the frame holds only numeric metric values:
# transposing a frame that still contains the string 'model' column would turn
# every column into object dtype.
df_quality = pd.DataFrame(results_opto).set_index('model').T

In [44]:
df_quality = df_quality.T.drop_duplicates().T

In [45]:
df_quality.style.highlight_max(color='lightgreen', axis=1)

model,CosineRecommender_0,BM25Recommender_1,CosineRecommender_2,BM25Recommender_3,TFIDFRecommender_4,BM25Recommender_5,CosineRecommender_7,CosineRecommender_8,BM25Recommender_9,TFIDFRecommender_10,BM25Recommender_11,BM25Recommender_12,BM25Recommender_13,BM25Recommender_14,BM25Recommender_15,BM25Recommender_16,TFIDFRecommender_17,BM25Recommender_18,BM25Recommender_19,BM25Recommender_21,BM25Recommender_22,BM25Recommender_23,BM25Recommender_24,BM25Recommender_25,BM25Recommender_26,BM25Recommender_27,BM25Recommender_28,BM25Recommender_31,BM25Recommender_32,BM25Recommender_33,BM25Recommender_34,BM25Recommender_35,BM25Recommender_36,BM25Recommender_37,BM25Recommender_39,BM25Recommender_41,BM25Recommender_42,BM25Recommender_43,BM25Recommender_44,BM25Recommender_46,BM25Recommender_47,BM25Recommender_48,BM25Recommender_50,BM25Recommender_51,BM25Recommender_52,BM25Recommender_53,BM25Recommender_54,BM25Recommender_56,BM25Recommender_57,BM25Recommender_58,BM25Recommender_59,BM25Recommender_60,BM25Recommender_61,BM25Recommender_62,BM25Recommender_63,BM25Recommender_64,BM25Recommender_65,BM25Recommender_67,BM25Recommender_68,BM25Recommender_70,BM25Recommender_71,BM25Recommender_72,BM25Recommender_73,BM25Recommender_74,BM25Recommender_75,BM25Recommender_76,BM25Recommender_77,BM25Recommender_78,BM25Recommender_79,BM25Recommender_81,BM25Recommender_82,BM25Recommender_83,BM25Recommender_84,BM25Recommender_85,BM25Recommender_86,BM25Recommender_88,BM25Recommender_89,BM25Recommender_90,BM25Recommender_91,BM25Recommender_92,BM25Recommender_93,BM25Recommender_94,BM25Recommender_95,BM25Recommender_96,BM25Recommender_97,BM25Recommender_98,BM25Recommender_99
Precision@1,0.083315,0.060936,0.08285,0.090724,0.083323,0.079631,0.082443,0.083257,0.086957,0.079473,0.090442,0.090235,0.089795,0.090699,0.089845,0.08982,0.083364,0.090592,0.083423,0.091081,0.090973,0.090998,0.088318,0.089795,0.090882,0.090799,0.090691,0.090334,0.09089,0.089753,0.090998,0.090998,0.091064,0.088642,0.090143,0.090923,0.091123,0.091214,0.091123,0.081307,0.090882,0.090708,0.066546,0.090915,0.090691,0.090069,0.090351,0.090915,0.089778,0.090683,0.090442,0.090865,0.090741,0.090824,0.089563,0.091147,0.090674,0.090077,0.091048,0.090932,0.09094,0.090957,0.090932,0.09099,0.090733,0.091006,0.090285,0.091023,0.091081,0.090965,0.090558,0.091189,0.091048,0.089422,0.090899,0.090741,0.091056,0.089646,0.09104,0.090882,0.09094,0.090666,0.090923,0.090567,0.09099,0.090691,0.089579
Recall@1,0.041663,0.028675,0.04155,0.046455,0.041703,0.039631,0.041475,0.041644,0.043962,0.040013,0.047335,0.04736,0.04718,0.047478,0.045798,0.047162,0.041859,0.046452,0.041822,0.047407,0.047182,0.047261,0.044845,0.047167,0.046905,0.046666,0.046474,0.047339,0.047417,0.047149,0.047396,0.04723,0.047245,0.044994,0.046029,0.047473,0.047402,0.04748,0.047411,0.04063,0.047143,0.047045,0.031595,0.047472,0.047458,0.047294,0.047342,0.047039,0.047155,0.047034,0.047355,0.047107,0.047443,0.047442,0.047079,0.047441,0.047468,0.047276,0.047239,0.04749,0.047407,0.047414,0.047407,0.047236,0.04745,0.047407,0.047327,0.047378,0.047261,0.047357,0.0474,0.047465,0.047239,0.047061,0.047419,0.046574,0.047344,0.047116,0.047399,0.047446,0.04748,0.047422,0.04738,0.046386,0.04735,0.047458,0.045593
Precision@2,0.070209,0.057929,0.069923,0.076399,0.070408,0.067176,0.06918,0.070026,0.072847,0.067417,0.076444,0.076366,0.076299,0.076498,0.075909,0.076295,0.0704,0.07654,0.070059,0.076631,0.076743,0.076648,0.07466,0.076345,0.076801,0.076685,0.076498,0.076345,0.076494,0.076307,0.076631,0.076743,0.076706,0.075237,0.0762,0.076453,0.076668,0.076681,0.076664,0.068446,0.076822,0.076805,0.06099,0.076461,0.076453,0.076316,0.076353,0.076751,0.076278,0.076801,0.076432,0.076785,0.07644,0.076465,0.076,0.076664,0.076469,0.076324,0.076722,0.07644,0.076507,0.076498,0.076511,0.076639,0.07644,0.076498,0.076378,0.076644,0.076681,0.076577,0.076478,0.076685,0.076722,0.075876,0.076502,0.076681,0.076614,0.076042,0.07661,0.076457,0.076486,0.076428,0.076631,0.076415,0.076644,0.076453,0.075507
Recall@2,0.068332,0.054757,0.067977,0.077129,0.068666,0.065474,0.06741,0.068111,0.072268,0.065669,0.07795,0.077899,0.07782,0.078015,0.076279,0.077801,0.068677,0.077288,0.06891,0.077962,0.077843,0.077892,0.074529,0.077861,0.077769,0.077592,0.077225,0.077886,0.077942,0.077809,0.078001,0.077918,0.077904,0.075308,0.076757,0.077919,0.07803,0.077977,0.078036,0.067006,0.077863,0.077854,0.058319,0.077941,0.077908,0.077825,0.077886,0.077785,0.077803,0.077844,0.07794,0.07782,0.077895,0.077927,0.077464,0.07797,0.077994,0.077851,0.077927,0.077934,0.077914,0.077892,0.077914,0.077876,0.077892,0.077908,0.077905,0.078012,0.077873,0.077949,0.077968,0.077959,0.077921,0.077459,0.077955,0.077547,0.07789,0.07752,0.077975,0.077919,0.077944,0.077866,0.077984,0.077139,0.078015,0.077913,0.075662
Precision@3,0.061083,0.053405,0.060829,0.067124,0.061196,0.059316,0.060204,0.060978,0.063766,0.058663,0.068177,0.0681,0.068,0.068164,0.066739,0.068042,0.061382,0.067198,0.061567,0.068318,0.068103,0.068244,0.06516,0.067984,0.06771,0.067574,0.06714,0.068152,0.06823,0.068039,0.068241,0.068119,0.068211,0.066233,0.066783,0.06818,0.068288,0.068205,0.068305,0.060389,0.067876,0.067821,0.055673,0.06818,0.068175,0.068053,0.06815,0.067715,0.068011,0.067829,0.068191,0.067854,0.068172,0.068194,0.068,0.068288,0.06813,0.068039,0.068194,0.068186,0.068241,0.068247,0.068238,0.068219,0.068164,0.068233,0.06813,0.068255,0.068216,0.068213,0.068169,0.068216,0.068191,0.068006,0.068224,0.06745,0.068222,0.067992,0.068269,0.068188,0.068208,0.06818,0.068266,0.067124,0.068285,0.068169,0.066562
Recall@3,0.087136,0.075424,0.08663,0.099795,0.087242,0.085234,0.085821,0.086909,0.093121,0.084054,0.10268,0.102632,0.102549,0.102634,0.098916,0.102588,0.087719,0.099921,0.088919,0.10254,0.102042,0.102282,0.095823,0.102474,0.101126,0.100742,0.099783,0.10268,0.102636,0.102582,0.102545,0.102015,0.102185,0.097874,0.099136,0.102546,0.102512,0.102306,0.102563,0.086974,0.101449,0.101391,0.079073,0.102571,0.102623,0.102531,0.102679,0.101171,0.10254,0.101408,0.102722,0.101447,0.102576,0.10262,0.102615,0.102459,0.102573,0.102511,0.102185,0.102576,0.102631,0.102643,0.102634,0.10224,0.102558,0.102615,0.102656,0.102492,0.102241,0.102577,0.102677,0.102304,0.102171,0.102631,0.102627,0.10036,0.102278,0.102575,0.102684,0.102567,0.102597,0.102612,0.102599,0.099796,0.102593,0.102613,0.098536
Precision@4,0.055452,0.050222,0.055195,0.060119,0.05607,0.054087,0.054622,0.055437,0.057997,0.053894,0.060961,0.06099,0.060932,0.061007,0.059978,0.060939,0.056296,0.06019,0.055676,0.060955,0.060661,0.060733,0.059121,0.060928,0.06042,0.060403,0.060154,0.060986,0.06099,0.060943,0.06092,0.060688,0.060737,0.059536,0.059993,0.061009,0.060949,0.060785,0.060951,0.054807,0.060623,0.060565,0.052112,0.061015,0.061011,0.060984,0.060984,0.060461,0.06093,0.060565,0.060965,0.060584,0.060995,0.060982,0.060914,0.060943,0.060992,0.060934,0.060725,0.061015,0.060976,0.060974,0.06098,0.06071,0.060997,0.060968,0.060999,0.060924,0.060723,0.060976,0.06097,0.060781,0.060719,0.060909,0.061005,0.060314,0.060741,0.060903,0.060924,0.061007,0.061005,0.061009,0.060936,0.060121,0.060955,0.061009,0.059845
Recall@4,0.10384,0.094156,0.103211,0.117883,0.105375,0.102129,0.102367,0.103754,0.111655,0.10215,0.120837,0.120879,0.120794,0.120957,0.117181,0.120806,0.105803,0.118069,0.105804,0.120429,0.119572,0.119787,0.114738,0.120778,0.118722,0.118628,0.117997,0.120897,0.120852,0.120811,0.120472,0.119676,0.119756,0.115866,0.117324,0.1209,0.120437,0.119999,0.120457,0.103789,0.11937,0.119195,0.09806,0.120924,0.121006,0.120839,0.120882,0.118849,0.12078,0.119198,0.120851,0.11927,0.120943,0.120899,0.120823,0.120346,0.120965,0.120738,0.119744,0.120928,0.120804,0.120785,0.120814,0.119745,0.120952,0.120725,0.12091,0.12046,0.11975,0.120702,0.120905,0.119983,0.119737,0.120858,0.12087,0.118382,0.119918,0.120777,0.120626,0.120892,0.120874,0.120977,0.120552,0.117831,0.120514,0.121006,0.116655
Precision@5,0.051172,0.047604,0.05095,0.054267,0.051984,0.05059,0.05042,0.051087,0.052893,0.049886,0.054654,0.054639,0.054639,0.054645,0.053987,0.054642,0.05211,0.05432,0.051487,0.054672,0.054582,0.054605,0.053361,0.054622,0.054413,0.05444,0.054297,0.054627,0.05468,0.054644,0.05473,0.054584,0.054589,0.053603,0.054091,0.054664,0.054728,0.054594,0.054727,0.050941,0.054592,0.054524,0.049066,0.054667,0.054632,0.05464,0.054629,0.054415,0.054627,0.054504,0.054645,0.054549,0.054645,0.054659,0.054644,0.054637,0.054637,0.05464,0.054587,0.054664,0.054674,0.054685,0.054677,0.054604,0.054647,0.054692,0.054662,0.054743,0.054594,0.054693,0.054659,0.054609,0.054587,0.054639,0.054682,0.054413,0.054596,0.054649,0.054712,0.054665,0.054669,0.054645,0.054725,0.054255,0.054753,0.054632,0.053835
Recall@5,0.118492,0.111228,0.117932,0.131899,0.121219,0.119506,0.117126,0.118324,0.126341,0.117276,0.134239,0.13424,0.13441,0.134221,0.130824,0.134409,0.121545,0.132067,0.122211,0.133783,0.133282,0.13348,0.128345,0.134337,0.132556,0.13249,0.131949,0.134211,0.134082,0.134419,0.134074,0.133394,0.133413,0.129185,0.131157,0.134092,0.133998,0.133509,0.13401,0.120626,0.13318,0.132957,0.115021,0.134101,0.134094,0.134274,0.134223,0.132553,0.134361,0.132905,0.134228,0.133058,0.134086,0.13409,0.134432,0.133638,0.134189,0.134284,0.133413,0.134101,0.134045,0.134076,0.134055,0.13347,0.13409,0.134077,0.134297,0.134078,0.133442,0.134074,0.134247,0.133514,0.133411,0.134467,0.134069,0.132385,0.133483,0.13442,0.134062,0.134076,0.134063,0.134107,0.134072,0.131849,0.134131,0.134094,0.130135


## Обучение модели с лучшими параметрами

### Подготавливаем матрицы

### Создаем маппинг для users и items

In [46]:
dataset.interactions.df

Unnamed: 0,user_id,item_id,weight,datetime
0,0,0,3.0,2021-05-11
1,1,1,3.0,2021-05-29
2,2,2,1.0,2021-05-09
3,3,3,3.0,2021-07-05
4,4,0,3.0,2021-04-30
...,...,...,...,...
5476244,69627,219,3.0,2021-08-02
5476245,40052,132,1.0,2021-05-12
5476246,896790,318,1.0,2021-08-13
5476247,206604,2546,3.0,2021-04-13


In [47]:
users_inv_mapping = dict(enumerate(train['user_id'].unique()))
users_mapping = {v: k for k, v in users_inv_mapping.items()}


items_inv_mapping = dict(enumerate(train['item_id'].unique()))
items_mapping = {v: k for k, v in items_inv_mapping.items()}

In [48]:
print(f"users_mapping amount: {len(users_mapping)}")
print(f"items_mapping amount: {len(items_mapping)}")

users_mapping amount: 896791
items_mapping amount: 15565


### Получаем разреженную матрицу

In [44]:
def _encode_ids(values, mapping, col_name):
    """Translate a column of ids into matrix indices and the axis length."""
    if mapping is None:
        # No mapping supplied: the ids are used as indices as they are.
        indices = values
    else:
        indices = values.map(mapping.get)
        if indices.isna().any():
            raise ValueError(
                f"column '{col_name}' contains ids that are missing from the mapping"
            )
    indices = indices.to_numpy(dtype=np.int64)
    mapped_size = max(mapping.values()) + 1 if mapping else 0
    observed_size = int(indices.max()) + 1 if indices.size else 0
    return indices, max(mapped_size, observed_size)


def get_coo_matrix(df,
                   user_col='user_id',
                   item_col='item_id',
                   weight_col=None,
                   users_mapping=None,
                   items_mapping=None):
    """Build a sparse user-item interaction matrix in COO format.

    Args:
        df: interactions frame with one row per user-item interaction.
        user_col: name of the column holding user ids.
        item_col: name of the column holding item ids.
        weight_col: column with interaction weights; when falsy every
            interaction gets weight 1.
        users_mapping: dict from user id to row index; when ``None`` the
            values of ``user_col`` are used as row indices directly.
        items_mapping: dict from item id to column index; when ``None`` the
            values of ``item_col`` are used as column indices directly.

    Returns:
        ``scipy.sparse.coo_matrix`` of dtype float32. Its shape covers every
        index present in the mappings, so matrices built from different
        subsets of the data with the same mappings have the same shape.

    Raises:
        ValueError: if ``df`` contains an id that is absent from a mapping.
    """
    if weight_col:
        weights = df[weight_col].to_numpy(dtype=np.float32)
    else:
        weights = np.ones(len(df), dtype=np.float32)

    rows, n_rows = _encode_ids(df[user_col], users_mapping, user_col)
    cols, n_cols = _encode_ids(df[item_col], items_mapping, item_col)

    return sp.sparse.coo_matrix((weights, (rows, cols)), shape=(n_rows, n_cols))
interaction_matrix = get_coo_matrix(train, weight_col='weight',
                                    users_mapping=users_mapping, 
                                    items_mapping=items_mapping)

### Обучаем UserKNN

In [None]:
# NOTE(review): these hyperparameters (K=50, K1≈0.0126, B≈0.0529) differ from the
# best Optuna trial printed above (K=400, K1≈0.0102, B≈0.0952) — confirm which
# set is meant by "best parameters".
# The recommender is fitted directly on the matrix built by get_coo_matrix.
userknn = BM25Recommender(K=50, K1=0.012556305101247701, B=0.05289835164246949, num_threads=2)
userknn.fit(interaction_matrix)

In [None]:
dill_file = Path().cwd().parent / 'service' / 'models'

with open(dill_file / 'userknn_BM25Recommender.dill', 'wb') as f:
    dill.dump(userknn, f)

In [None]:
dill_file = Path().cwd().parent / 'service' / 'data'

with open(dill_file / 'dataset_userknn_BM25Recommender.dill', 'wb') as f:
    dill.dump(dataset, f)

## Inference UserKNN

### Предподготовка для ускорения инференса по одному пользователю

In [47]:
dill_file = Path().cwd().parent / 'service' / 'models'

with open(dill_file / 'userknn_BM25Recommender.dill', 'rb') as f:
    userknn = dill.load(f)

In [48]:
dill_file = Path().cwd().parent / 'service' / 'data'

with open(dill_file / 'dataset_userknn_BM25Recommender.dill', 'rb') as f:
    dataset = dill.load(f)

In [None]:
# NOTE(review): dataset.interactions.df appears to hold rectools' internal ids
# (its printed preview shows user/item ids 0, 1, 2, ...), whereas users_mapping
# and items_mapping were built from the raw ids — confirm both id spaces agree
# before using this frame together with the mapper below.
train = dataset.interactions.df

In [36]:
user_id = 6064
K_RECOS = 10
N = 50
# Number of interactions per item ("document frequency").
cnt = Counter(train['item_id'].values)
idf = pd.DataFrame.from_dict(cnt, orient='index', columns=['doc_freq']).reset_index()
# n is the total number of interaction rows in train.
n = train.shape[0]
# idf = log((1 + n) / (1 + doc_freq) + 1), computed for the whole column at once
# instead of calling a Python lambda per row.
idf['idf'] = np.log((1 + n) / (1 + idf['doc_freq']) + 1)

def generate_implicit_recs_mapper(model, N, users_mapping, users_inv_mapping):
    """Create a function that returns the N nearest neighbours of a user.

    Args:
        model: fitted implicit recommender exposing ``similar_items``.
        N: number of neighbours requested from the model.
        users_mapping: dict from external user id to the model's internal index.
        users_inv_mapping: dict from internal index back to external user id.

    Returns:
        A callable ``user -> (neighbour_ids, similarities)`` where both
        elements are lists of equal length and ids are external user ids.
        The callable raises ``KeyError`` for a user missing from
        ``users_mapping``.
    """
    def _recs_mapper(user):
        internal_id = users_mapping[user]
        recs = model.similar_items(internal_id, N=N)
        if isinstance(recs, tuple) and len(recs) == 2 and not isinstance(recs[0], tuple):
            # Newer implicit releases return a pair of arrays: (ids, scores).
            neighbour_ids, similarities = recs
        else:
            # Older implicit releases return a list of (id, score) pairs.
            neighbour_ids = [neighbour for neighbour, _ in recs]
            similarities = [score for _, score in recs]
        return (
            [users_inv_mapping[neighbour] for neighbour in neighbour_ids],
            list(similarities),
        )
    return _recs_mapper

In [37]:
mapper = generate_implicit_recs_mapper(
    userknn, 
    N=N,
    users_mapping=users_mapping,
    users_inv_mapping=users_inv_mapping
)

NameError: name 'userknn' is not defined

In [51]:
watched = train.groupby('user_id').agg({'item_id': list})
watched.head()

Unnamed: 0_level_0,item_id
user_id,Unnamed: 1_level_1
0,"[7102, 14359, 15297, 6006, 9728, 12192]"
1,[10440]
2,"[7571, 3541, 15266, 13867, 12841, 10770, 4475,..."
3,"[12192, 9728, 16406, 15719, 10440, 3475, 4151,..."
4,"[4700, 6317]"


In [52]:
# Plain dict "user id as string" -> list of watched item ids, used for fast
# lookups in make_reco_fast. Built directly from the grouped frame instead of
# serialising the whole frame to JSON and parsing it back; the keys stay
# strings because make_reco_fast looks them up as f"{user_id}".
wwatched = {str(user): watched_items for user, watched_items in watched['item_id'].items()}

In [38]:
# user_id = 33
def make_reco_fast(user_id, mapper, k_recos):
    """Recommend items for one user from the histories of similar users.

    Uses the module-level ``wwatched`` lookup (user id as string -> watched
    item ids) and the ``idf`` frame (columns ``index`` and ``idf``).

    Args:
        user_id: external id of the user to recommend for.
        mapper: callable returning ``(similar_user_ids, similarities)`` for a user.
        k_recos: maximum number of items to return.

    Returns:
        Array with at most ``k_recos`` unique item ids, ordered by
        ``similarity * idf`` in descending order.
    """
    similar_users, similarities = mapper(user_id)
    # Drop the user itself by id rather than by position: with unnormalised
    # similarity scores the user is not guaranteed to be its own top neighbour.
    neighbours = [
        (neighbour, similarity)
        for neighbour, similarity in zip(similar_users, similarities)
        if neighbour != user_id
    ]
    if not neighbours:
        return np.array([], dtype=object)

    recs = pd.DataFrame({
        'similar_user_id': [neighbour for neighbour, _ in neighbours],
        'similarity': [similarity for _, similarity in neighbours],
        # wwatched is keyed by the string form of the user id
        'item_id': [wwatched.get(f"{neighbour}") for neighbour, _ in neighbours],
    })
    # one row per (similar user, watched item)
    recs = recs.explode('item_id')
    recs = recs.sort_values(['similarity'], ascending=False)
    recs = recs.merge(idf[['index', 'idf']],
                      left_on='item_id',
                      right_on='index',
                      how='left').drop(['index'], axis=1)
    recs['rank_idf'] = recs['similarity'] * recs['idf']
    recs = recs.sort_values(['rank_idf'], ascending=False)
    # rows without an item or without an idf value cannot be ranked
    recs = recs.dropna()
    return recs['item_id'].unique()[:k_recos]

In [39]:
def make_reco(user_id, mapper, k_recos):
    """Recommend items for one user from the histories of similar users.

    Reads the module-level frames ``train``, ``watched`` and ``idf``.

    Args:
        user_id: id of the user to recommend for.
        mapper: callable returning ``(similar_user_ids, similarities)`` for a user.
        k_recos: maximum number of items to return.

    Returns:
        Array of item ids taken from the first ``k_recos`` rows after ordering
        by ``similarity * idf`` in descending order.
    """
    # One-row frame with the user, or an empty frame if the user is not in train.
    # NOTE(review): for an empty frame the tuple unpacking on the next statement
    # has nothing to unpack and would raise — confirm unknown users never get here.
    recs = pd.DataFrame({
        'user_id': train[train['user_id'] == user_id]['user_id'].unique()
    })
    recs['similar_user_id'], recs['similarity'] = zip(*recs['user_id'].map(mapper))


    # explode lists to get vertical representation
    recs = recs.set_index('user_id').apply(pd.Series.explode).reset_index()

    # delete recommendations of itself 
    recs = recs[~(recs['user_id'] == recs['similar_user_id'])]
    
    # join watched items
    recs = recs.merge(watched, left_on=['similar_user_id'], right_on=['user_id'], how='left')
    recs = recs.explode('item_id')
    # NOTE(review): the previous comment here announced dropping duplicate
    # user_id-item_id pairs, but no de-duplication is performed below, so the
    # returned top-k may contain the same item more than once — confirm.
    recs = recs.sort_values(['user_id', 'similarity'], ascending=False)
    # attach the idf weight of every candidate item
    recs = recs.merge(idf[['index', 'idf']], 
                        left_on='item_id',
                        right_on='index',
                        how='left').drop(['index'], axis=1)
    recs['rank_idf'] = recs['similarity'] * recs['idf']
    recs = recs.sort_values(['user_id', 'rank_idf'], ascending=False)
    # position of every row inside its user's ordered list, starting at 1
    recs['rank'] = recs.groupby('user_id').cumcount() + 1 
    return recs[recs['rank'] <= k_recos]['item_id'].values

In [55]:
user_id = 100
make_reco(user_id, mapper, K_RECOS)

array([7833, 9951, 10211, 12134, 2620, 1871, 9047, 9517, 9025, 13856],
      dtype=object)

In [56]:
make_reco_fast(user_id, mapper, K_RECOS)

array([7833, 9951, 2620, 12134, 10211, 1871, 9047, 9517, 13856, 9025],
      dtype=object)